/** * Conversion functions. They deal with data-types in 3 ways, always making local copies. * In order to allow performance testing, there are 3 functions: * - one copying directly from one memory location to another one using the * data-type copy function. * - one which uses 2 convertors created with the same data-type * - and one using 2 convertors created from different data-types. * */ static int local_copy_ddt_count( ompi_datatype_t* pdt, int count ) { void *pdst, *psrc; TIMER_DATA_TYPE start, end; long total_time; size_t length; length = compute_buffer_length(pdt, count); pdst = malloc(length); psrc = malloc(length); for( size_t i = 0; i < length; i++ ) ((char*)psrc)[i] = i % 128 + 32; memset(pdst, 0, length); cache_trash(); /* make sure the cache is useless */ GET_TIME( start ); if( OMPI_SUCCESS != ompi_datatype_copy_content_same_ddt( pdt, count, pdst, psrc ) ) { printf( "Unable to copy the datatype in the function local_copy_ddt_count." " Is the datatype committed?\n" ); } GET_TIME( end ); total_time = ELAPSED_TIME( start, end ); printf( "direct local copy in %ld microsec\n", total_time ); free(pdst); free(psrc); return OMPI_SUCCESS; }
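/*
 * Illustrative only (not part of the original benchmark): a trivial driver that
 * repeats the direct-copy measurement above several times, since a single timing
 * of ompi_datatype_copy_content_same_ddt is easily skewed by cache and allocator
 * effects. It assumes the caller has already committed the datatype.
 */
static int local_copy_ddt_count_repeated( ompi_datatype_t* pdt, int count, int repetitions )
{
    for( int i = 0; i < repetitions; i++ ) {
        /* each run allocates fresh buffers and trashes the cache before timing */
        if( OMPI_SUCCESS != local_copy_ddt_count(pdt, count) ) {
            return OMPI_ERROR;
        }
    }
    return OMPI_SUCCESS;
}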
static int mca_coll_ml_reduce_unpack(mca_coll_ml_collective_operation_progress_t *coll_op) { int ret; /* need to put in more */ int count = coll_op->variable_fn_params.count; ompi_datatype_t *dtype = coll_op->variable_fn_params.dtype; void *dest = (void *)((uintptr_t)coll_op->full_message.dest_user_addr + (uintptr_t)coll_op->fragment_data.offset_into_user_buffer); void *src = (void *)((uintptr_t)coll_op->fragment_data.buffer_desc->data_addr + (size_t)coll_op->variable_fn_params.rbuf_offset); ret = ompi_datatype_copy_content_same_ddt(dtype, (int32_t) count, (char *) dest, (char *) src); if (ret < 0) { return OMPI_ERROR; } if (coll_op->variable_fn_params.root_flag) { ML_VERBOSE(1,("In reduce unpack %d", *(int *)((unsigned char*) src))); } ML_VERBOSE(10, ("sbuf addr %p, sbuf offset %d, sbuf val %lf, rbuf addr %p, rbuf offset %d, rbuf val %lf.", coll_op->variable_fn_params.sbuf, coll_op->variable_fn_params.sbuf_offset, *(double *) ((unsigned char *) coll_op->variable_fn_params.sbuf + (size_t) coll_op->variable_fn_params.sbuf_offset), coll_op->variable_fn_params.rbuf, coll_op->variable_fn_params.rbuf_offset, *(double *) ((unsigned char *) coll_op->variable_fn_params.rbuf + (size_t) coll_op->variable_fn_params.rbuf_offset))); return OMPI_SUCCESS; }
/* * reduce_lin_intra * * Function: - reduction * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_self_reduce_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { if (MPI_IN_PLACE == sbuf) { return MPI_SUCCESS; } else { return ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } }
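/*
 * For context (not from the original source): the self component is used for
 * single-process communicators, where MPI_Reduce degenerates into the local
 * datatype copy shown above. A minimal stand-alone sketch of that case:
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int in = 42, out = 0;
    MPI_Init(&argc, &argv);
    /* a single rank: the "reduction" is just a copy of 'in' into 'out' */
    MPI_Reduce(&in, &out, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_SELF);
    printf("reduced value: %d\n", out);
    MPI_Finalize();
    return 0;
}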
int ompi_osc_sm_fetch_and_op(void *origin_addr, void *result_addr, struct ompi_datatype_t *dt, int target, OPAL_PTRDIFF_TYPE target_disp, struct ompi_op_t *op, struct ompi_win_t *win) { ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; void *remote_address; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "fetch_and_op: 0x%lx, %s, %d, %d, %s, 0x%lx", (unsigned long) origin_addr, dt->name, target, (int) target_disp, op->o_name, (unsigned long) win)); remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; opal_atomic_lock(&module->node_states[target].accumulate_lock); /* fetch */ ompi_datatype_copy_content_same_ddt(dt, 1, (char*) result_addr, (char*) remote_address); if (op == &ompi_mpi_op_no_op.op) goto done; /* op */ if (op == &ompi_mpi_op_replace.op) { ompi_datatype_copy_content_same_ddt(dt, 1, (char*) remote_address, (char*) origin_addr); } else { ompi_op_reduce(op, origin_addr, remote_address, 1, dt); } done: opal_atomic_unlock(&module->node_states[target].accumulate_lock); return OMPI_SUCCESS; }
int ompi_osc_sm_compare_and_swap(void *origin_addr, void *compare_addr, void *result_addr, struct ompi_datatype_t *dt, int target, OPAL_PTRDIFF_TYPE target_disp, struct ompi_win_t *win) { ompi_osc_sm_module_t *module = (ompi_osc_sm_module_t*) win->w_osc_module; void *remote_address; size_t size; OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output, "compare_and_swap: 0x%lx, %s, %d, %d, 0x%lx", (unsigned long) origin_addr, dt->name, target, (int) target_disp, (unsigned long) win)); remote_address = ((char*) (module->bases[target])) + module->disp_units[target] * target_disp; ompi_datatype_type_size(dt, &size); opal_atomic_lock(&module->node_states[target].accumulate_lock); /* fetch */ ompi_datatype_copy_content_same_ddt(dt, 1, (char*) result_addr, (char*) remote_address); /* compare */ if (0 == memcmp(result_addr, compare_addr, size)) { /* set */ ompi_datatype_copy_content_same_ddt(dt, 1, (char*) remote_address, (char*) origin_addr); } opal_atomic_unlock(&module->node_states[target].accumulate_lock); return OMPI_SUCCESS; }
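/*
 * For context (not from the original source): a minimal sketch of the MPI-level
 * calls that end up in ompi_osc_sm_fetch_and_op / ompi_osc_sm_compare_and_swap
 * when the sm one-sided component is selected. Window setup and synchronization
 * are kept deliberately simple.
 */
#include <mpi.h>

int main(int argc, char *argv[])
{
    long *base, result = 0, one = 1, compare = 0, swap = 7;
    MPI_Win win;

    MPI_Init(&argc, &argv);
    MPI_Win_allocate(sizeof(long), sizeof(long), MPI_INFO_NULL,
                     MPI_COMM_WORLD, &base, &win);
    *base = 0;                            /* initialize the local window memory */
    MPI_Barrier(MPI_COMM_WORLD);          /* everyone initialized before any RMA */
    MPI_Win_lock_all(0, win);

    /* atomically add 1 to the value at rank 0 and fetch the previous value */
    MPI_Fetch_and_op(&one, &result, MPI_LONG, 0, 0, MPI_SUM, win);
    MPI_Win_flush(0, win);

    /* atomically replace the value at rank 0 with 'swap' if it equals 'compare' */
    MPI_Compare_and_swap(&swap, &compare, &result, MPI_LONG, 0, 0, win);
    MPI_Win_flush(0, win);

    MPI_Win_unlock_all(win);
    MPI_Win_free(&win);
    MPI_Finalize();
    return 0;
}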
/** * This is a generic implementation of the reduce protocol. It uses the tree * provided as an argument and executes all operations using a segment of * count times a datatype. * For the last communication it will update the count in order to limit * the number of datatypes to the original count (original_count). * * Note that for non-commutative operations we cannot save a memory copy * for the first block: thus we must copy sendbuf to accumbuf on intermediate nodes * to keep the optimized loop happy. */ int ompi_coll_tuned_reduce_generic( void* sendbuf, void* recvbuf, int original_count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, ompi_coll_tree_t* tree, int count_by_segment, int max_outstanding_reqs ) { char *inbuf[2] = {NULL, NULL}, *inbuf_free[2] = {NULL, NULL}; char *accumbuf = NULL, *accumbuf_free = NULL; char *local_op_buffer = NULL, *sendtmpbuf = NULL; ptrdiff_t extent, lower_bound, segment_increment; size_t typelng; ompi_request_t* reqs[2] = {MPI_REQUEST_NULL, MPI_REQUEST_NULL}; int num_segments, line, ret, segindex, i, rank; int recvcount, prevcount, inbi; /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_get_extent( datatype, &lower_bound, &extent ); ompi_datatype_type_size( datatype, &typelng ); num_segments = (original_count + count_by_segment - 1) / count_by_segment; segment_increment = (ptrdiff_t)count_by_segment * extent; sendtmpbuf = (char*) sendbuf; if( sendbuf == MPI_IN_PLACE ) { sendtmpbuf = (char *)recvbuf; } OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:reduce_generic count %d, msg size %ld, segsize %ld, max_requests %d", original_count, (unsigned long)((ptrdiff_t)num_segments * (ptrdiff_t)segment_increment), (unsigned long)segment_increment, max_outstanding_reqs)); rank = ompi_comm_rank(comm); /*printf("Tree of rank %d - ", rank); printf("Parent : %d - ", tree->tree_prev); printf("Child : "); for (i = 0; i < tree->tree_nextsize; i++) printf("%d ", tree->tree_next[i]); printf("\n");*/ /* non-leaf nodes - wait for children to send me data & forward up (if needed) */ if( tree->tree_nextsize > 0 ) { ptrdiff_t true_lower_bound, true_extent, real_segment_size; ompi_datatype_get_true_extent( datatype, &true_lower_bound, &true_extent ); /* handle a non-existent recv buffer (i.e. it is NULL) and protect the recv buffer on non-root nodes */ accumbuf = (char*)recvbuf; if( (NULL == accumbuf) || (root != rank) ) { /* Allocate temporary accumulator buffer. 
*/ accumbuf_free = (char*)malloc(true_extent + (ptrdiff_t)(original_count - 1) * extent); if (accumbuf_free == NULL) { line = __LINE__; ret = -1; goto error_hndl; } accumbuf = accumbuf_free - lower_bound; } /* If this is a non-commutative operation we must copy sendbuf to the accumbuf, in order to simplfy the loops */ if (!ompi_op_is_commute(op)) { ompi_datatype_copy_content_same_ddt(datatype, original_count, (char*)accumbuf, (char*)sendtmpbuf); } /* Allocate two buffers for incoming segments */ real_segment_size = true_extent + (ptrdiff_t)(count_by_segment - 1) * extent; inbuf_free[0] = (char*) malloc(real_segment_size); if( inbuf_free[0] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[0] = inbuf_free[0] - lower_bound; /* if there is chance to overlap communication - allocate second buffer */ if( (num_segments > 1) || (tree->tree_nextsize > 1) ) { inbuf_free[1] = (char*) malloc(real_segment_size); if( inbuf_free[1] == NULL ) { line = __LINE__; ret = -1; goto error_hndl; } inbuf[1] = inbuf_free[1] - lower_bound; } /* reset input buffer index and receive count */ inbi = 0; recvcount = 0; /* for each segment */ for( segindex = 0; segindex <= num_segments; segindex++ ) { prevcount = recvcount; /* recvcount - number of elements in current segment */ recvcount = count_by_segment; if( segindex == (num_segments-1) ) recvcount = original_count - (ptrdiff_t)count_by_segment * (ptrdiff_t)segindex; /* for each child */ for( i = 0; i < tree->tree_nextsize; i++ ) { /** * We try to overlap communication: * either with next segment or with the next child */ /* post irecv for current segindex on current child */ if( segindex < num_segments ) { void* local_recvbuf = inbuf[inbi]; if( 0 == i ) { /* for the first step (1st child per segment) and * commutative operations we might be able to irecv * directly into the accumulate buffer so that we can * reduce(op) this with our sendbuf in one step as * ompi_op_reduce only has two buffer pointers, * this avoids an extra memory copy. * * BUT if the operation is non-commutative or * we are root and are USING MPI_IN_PLACE this is wrong! */ if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_recvbuf = accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } ret = MCA_PML_CALL(irecv(local_recvbuf, recvcount, datatype, tree->tree_next[i], MCA_COLL_BASE_TAG_REDUCE, comm, &reqs[inbi])); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl;} } /* wait for previous req to complete, if any. if there are no requests reqs[inbi ^1] will be MPI_REQUEST_NULL. 
*/ /* wait on data from last child for previous segment */ ret = ompi_request_wait_all( 1, &reqs[inbi ^ 1], MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } local_op_buffer = inbuf[inbi ^ 1]; if( i > 0 ) { /* our first operation is to combine our own [sendbuf] data * with the data we recvd from down stream (but only * the operation is commutative and if we are not root and * not using MPI_IN_PLACE) */ if( 1 == i ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment; } } /* apply operation */ ompi_op_reduce(op, local_op_buffer, accumbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, recvcount, datatype ); } else if ( segindex > 0 ) { void* accumulator = accumbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; if( tree->tree_nextsize <= 1 ) { if( (ompi_op_is_commute(op)) && !((MPI_IN_PLACE == sendbuf) && (rank == tree->tree_root)) ) { local_op_buffer = sendtmpbuf + (ptrdiff_t)(segindex-1) * (ptrdiff_t)segment_increment; } } ompi_op_reduce(op, local_op_buffer, accumulator, prevcount, datatype ); /* all reduced on available data this step (i) complete, * pass to the next process unless you are the root. */ if (rank != tree->tree_root) { /* send combined/accumulated data to parent */ ret = MCA_PML_CALL( send( accumulator, prevcount, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } } /* we stop when segindex = number of segments (i.e. we do num_segment+1 steps for pipelining */ if (segindex == num_segments) break; } /* update input buffer index */ inbi = inbi ^ 1; } /* end of for each child */ } /* end of for each segment */ /* clean up */ if( inbuf_free[0] != NULL) free(inbuf_free[0]); if( inbuf_free[1] != NULL) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); } /* leaf nodes Depending on the value of max_outstanding_reqs and the number of segments we have two options: - send all segments using blocking send to the parent, or - avoid overflooding the parent nodes by limiting the number of outstanding requests to max_oustanding_reqs. TODO/POSSIBLE IMPROVEMENT: If there is a way to determine the eager size for the current communication, synchronization should be used only when the message/segment size is smaller than the eager size. */ else { /* If the number of segments is less than a maximum number of oustanding requests or there is no limit on the maximum number of outstanding requests, we send data to the parent using blocking send */ if ((0 == max_outstanding_reqs) || (num_segments <= max_outstanding_reqs)) { segindex = 0; while ( original_count > 0) { if (original_count < count_by_segment) { count_by_segment = original_count; } ret = MCA_PML_CALL( send((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } segindex++; original_count -= count_by_segment; } } /* Otherwise, introduce flow control: - post max_outstanding_reqs non-blocking synchronous send, - for remaining segments - wait for a ssend to complete, and post the next one. - wait for all outstanding sends to complete. 
*/ else { int creq = 0; ompi_request_t **sreq = NULL; sreq = (ompi_request_t**) calloc( max_outstanding_reqs, sizeof(ompi_request_t*) ); if (NULL == sreq) { line = __LINE__; ret = -1; goto error_hndl; } /* post first group of requests */ for (segindex = 0; segindex < max_outstanding_reqs; segindex++) { ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[segindex]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } original_count -= count_by_segment; } creq = 0; while ( original_count > 0 ) { /* wait on a posted request to complete */ ret = ompi_request_wait(&sreq[creq], MPI_STATUS_IGNORE); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } sreq[creq] = MPI_REQUEST_NULL; if( original_count < count_by_segment ) { count_by_segment = original_count; } ret = MCA_PML_CALL( isend((char*)sendbuf + (ptrdiff_t)segindex * (ptrdiff_t)segment_increment, count_by_segment, datatype, tree->tree_prev, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_SYNCHRONOUS, comm, &sreq[creq]) ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } creq = (creq + 1) % max_outstanding_reqs; segindex++; original_count -= count_by_segment; } /* Wait on the remaining requests to complete */ ret = ompi_request_wait_all( max_outstanding_reqs, sreq, MPI_STATUSES_IGNORE ); if (ret != MPI_SUCCESS) { line = __LINE__; goto error_hndl; } /* free requests */ free(sreq); } } return OMPI_SUCCESS; error_hndl: /* error handler */ OPAL_OUTPUT (( ompi_coll_tuned_stream, "ERROR_HNDL: node %d file %s line %d error %d\n", rank, __FILE__, line, ret )); if( inbuf_free[0] != NULL ) free(inbuf_free[0]); if( inbuf_free[1] != NULL ) free(inbuf_free[1]); if( accumbuf_free != NULL ) free(accumbuf_free); return ret; }
/* * reduce_intra_in_order_binary * * Function: Logarithmic reduce operation for non-commutative operations. * Acecpts: same as MPI_Reduce() * Returns: MPI_SUCCESS or error code */ int ompi_coll_tuned_reduce_intra_in_order_binary( void *sendbuf, void *recvbuf, int count, ompi_datatype_t* datatype, ompi_op_t* op, int root, ompi_communicator_t* comm, mca_coll_base_module_t *module, uint32_t segsize, int max_outstanding_reqs ) { int ret, rank, size, io_root, segcount = count; void *use_this_sendbuf = NULL, *use_this_recvbuf = NULL; size_t typelng; mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_in_order_binary rank %d ss %5d", rank, segsize)); COLL_TUNED_UPDATE_IN_ORDER_BINTREE( comm, tuned_module ); /** * Determine number of segments and number of elements * sent per operation */ ompi_datatype_type_size( datatype, &typelng ); COLL_TUNED_COMPUTED_SEGCOUNT( segsize, typelng, segcount ); /* An in-order binary tree must use root (size-1) to preserve the order of operations. Thus, if root is not rank (size - 1), then we must handle 1. MPI_IN_PLACE option on real root, and 2. we must allocate temporary recvbuf on rank (size - 1). Note that generic function must be careful not to switch order of operations for non-commutative ops. */ io_root = size - 1; use_this_sendbuf = sendbuf; use_this_recvbuf = recvbuf; if (io_root != root) { ptrdiff_t tlb, text, lb, ext; char *tmpbuf = NULL; ompi_datatype_get_extent(datatype, &lb, &ext); ompi_datatype_get_true_extent(datatype, &tlb, &text); if ((root == rank) && (MPI_IN_PLACE == sendbuf)) { tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } ompi_datatype_copy_content_same_ddt(datatype, count, (char*)tmpbuf, (char*)recvbuf); use_this_sendbuf = tmpbuf; } else if (io_root == rank) { tmpbuf = (char *) malloc(text + (ptrdiff_t)(count - 1) * ext); if (NULL == tmpbuf) { return MPI_ERR_INTERN; } use_this_recvbuf = tmpbuf; } } /* Use generic reduce with in-order binary tree topology and io_root */ ret = ompi_coll_tuned_reduce_generic( use_this_sendbuf, use_this_recvbuf, count, datatype, op, io_root, comm, module, data->cached_in_order_bintree, segcount, max_outstanding_reqs ); if (MPI_SUCCESS != ret) { return ret; } /* Clean up */ if (io_root != root) { if (root == rank) { /* Receive result from rank io_root to recvbuf */ ret = MCA_PML_CALL(recv(recvbuf, count, datatype, io_root, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { return ret; } if (MPI_IN_PLACE == sendbuf) { free(use_this_sendbuf); } } else if (io_root == rank) { /* Send result from use_this_recvbuf to root */ ret = MCA_PML_CALL(send(use_this_recvbuf, count, datatype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { return ret; } free(use_this_recvbuf); } } return MPI_SUCCESS; }
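/*
 * Illustrative only: COLL_TUNED_COMPUTED_SEGCOUNT above converts the requested
 * segment size in bytes (segsize) into a number of datatype elements per segment.
 * A plain-function approximation of that idea (not the actual macro):
 */
#include <stddef.h>

static int segcount_from_segsize(size_t segsize, size_t typelng, int count)
{
    int segcount = count;                         /* default: everything in one segment */
    if ((segsize >= typelng) && (segsize < typelng * (size_t)count)) {
        segcount = (int)(segsize / typelng);      /* whole elements that fit into segsize */
        if (segcount < 1) {
            segcount = 1;
        }
    }
    return segcount;
}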
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int ompi_coll_base_reduce_intra_basic_linear(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t extent, dsize, gap = 0; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp_free = NULL; char *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } dsize = opal_datatype_span(&dtype->super, count, &gap); ompi_datatype_type_extent(dtype, &extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp_free = (char*)malloc(dsize); if (NULL == inplace_temp_free) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp_free - gap; } if (size > 1) { free_buffer = (char*)malloc(dsize); if (NULL == free_buffer) { if (NULL != inplace_temp_free) { free(inplace_temp_free); } return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - gap; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp_free) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, rbuf); free(inplace_temp_free); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
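/*
 * For context (not from the original source): the temporary buffers above are sized
 * with opal_datatype_span(), i.e. the true memory footprint of 'count' elements plus
 * the gap before the first element, rather than a plain count * extent. The MPI-level
 * analogue of that sizing rule is roughly:
 */
#include <mpi.h>

static MPI_Aint datatype_span(MPI_Datatype dtype, int count)
{
    MPI_Aint lb, extent, true_lb, true_extent;
    MPI_Type_get_extent(dtype, &lb, &extent);
    MPI_Type_get_true_extent(dtype, &true_lb, &true_extent);
    /* footprint of 'count' elements: one true extent plus (count - 1) strides */
    return true_extent + (MPI_Aint)(count - 1) * extent;
}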
int mca_coll_base_alltoallv_intra_basic_inplace(const void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err=MPI_SUCCESS; ompi_request_t **preq, **reqs; char *allocated_buffer, *tmp_buffer; size_t max_size, rdtype_size; OPAL_PTRDIFF_TYPE ext, gap = 0; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); ompi_datatype_type_size(rdtype, &rdtype_size); /* If only one process, we're done. */ if (1 == size || 0 == rdtype_size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { size_t size = opal_datatype_span(&rdtype->super, rcounts[i], &gap); max_size = size > max_size ? size : max_size; } /* The gap will always be the same as we are working on the same datatype */ /* Allocate a temporary buffer */ allocated_buffer = calloc (max_size, 1); if (NULL == allocated_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } tmp_buffer = allocated_buffer - gap; /* Initiate all send/recv to/from others. */ reqs = preq = coll_base_comm_get_reqs(base_module->base_data, 2); if( NULL == reqs ) { err = OMPI_ERR_OUT_OF_RESOURCE; goto error_hndl; } /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { preq = reqs; if (i == rank && rcounts[j]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && rcounts[i]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (allocated_buffer); if( MPI_SUCCESS != err ) { ompi_coll_base_free_reqs(reqs, 2 ); } /* All done */ return err; }
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, j, size, rank, err = MPI_SUCCESS, line; OPAL_PTRDIFF_TYPE ext, gap; ompi_request_t *req; char *allocated_buffer = NULL, *tmp_buffer; size_t max_size; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); max_size = opal_datatype_span(&rdtype->super, rcount, &gap); /* Initiate all send/recv to/from others. */ /* Allocate a temporary buffer */ allocated_buffer = calloc (max_size, 1); if( NULL == allocated_buffer) { err = OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto error_hndl; } tmp_buffer = allocated_buffer - gap; max_size = ext * rcount; /* in-place alltoall slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { if (i == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + j * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(send ((char *) tmp_buffer, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else if (j == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + i * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, &req)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(send ((char *) tmp_buffer, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait ( &req, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ if( NULL != allocated_buffer ) free (allocated_buffer); if( MPI_SUCCESS != err ) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); (void)line; // silence compiler warning } /* All done */ return err; }
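/*
 * For context (not from the original source): this routine implements the
 * MPI_IN_PLACE form of MPI_Alltoall, where the send arguments are ignored and the
 * receive buffer holds both the outgoing and the incoming data. A minimal sketch
 * of the calling side:
 */
#include <mpi.h>
#include <stdlib.h>

int main(int argc, char *argv[])
{
    int rank, size;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* one int destined for every rank, already laid out in rank order */
    int *buf = malloc((size_t)size * sizeof(int));
    for (int i = 0; i < size; i++) buf[i] = rank * 100 + i;

    /* in-place form: send count and type are ignored */
    MPI_Alltoall(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, buf, 1, MPI_INT, MPI_COMM_WORLD);

    free(buf);
    MPI_Finalize();
    return 0;
}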
int ompi_coll_base_alltoall_intra_bruck(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, k, line = -1, rank, size, err = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; OPAL_PTRDIFF_TYPE sext, rext, span, gap; struct ompi_datatype_t *new_ddt; if (MPI_IN_PLACE == sbuf) { return mca_coll_base_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:alltoall_intra_bruck rank %d", rank)); err = ompi_datatype_type_extent (sdtype, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_type_extent (rdtype, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } span = opal_datatype_span(&sdtype->super, (int64_t)size * scount, &gap); displs = (int *) malloc(size * sizeof(int)); if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; } blen = (int *) malloc(size * sizeof(int)); if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; } /* tmp buffer allocation for message data */ tmpbuf_free = (char *)malloc(span); if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; } tmpbuf = tmpbuf_free - gap; /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), tmpbuf, ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } if (rank != 0) { err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount, tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, (char*) sbuf); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } } /* perform communication step */ for (distance = 1; distance < size; distance<<=1) { sendto = (rank + distance) % size; recvfrom = (rank - distance + size) % size; k = 0; /* create indexed datatype */ for (i = 1; i < size; i++) { if (( i & distance) == distance) { displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; blen[k] = scount; k++; } } /* Set indexes and displacements */ err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Commit the new datatype */ err = ompi_datatype_commit(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Sendreceive */ err = ompi_coll_base_sendrecv ( tmpbuf, 1, new_ddt, sendto, MCA_COLL_BASE_TAG_ALLTOALL, rbuf, 1, new_ddt, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Copy back new data from recvbuf to tmpbuf */ err = ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } /* free ddt */ err = ompi_datatype_destroy(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* end of for (distance = 1... 
*/ /* Step 3 - local rotation - */ for (i = 0; i < size; i++) { err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount, ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } } /* Step 4 - clean up */ if (tmpbuf != NULL) free(tmpbuf_free); if (displs != NULL) free(displs); if (blen != NULL) free(blen); return OMPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); (void)line; // silence compiler warning if (tmpbuf != NULL) free(tmpbuf_free); if (displs != NULL) free(displs); if (blen != NULL) free(blen); return err; }
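/*
 * Illustrative only: in the Bruck exchange above, the round with stride 'distance'
 * moves exactly the blocks whose index has that bit set ((i & distance) == distance),
 * which is what the indexed datatype describes. A stand-alone sketch that prints the
 * block schedule for an assumed communicator size of 8:
 */
#include <stdio.h>

int main(void)
{
    const int size = 8;   /* assumed communicator size for the illustration */
    for (int distance = 1; distance < size; distance <<= 1) {
        printf("round distance=%d exchanges blocks:", distance);
        for (int i = 1; i < size; i++) {
            if ((i & distance) == distance) {
                printf(" %d", i);
            }
        }
        printf("\n");
    }
    return 0;
}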
/* * exscan_intra * * Function: - basic exscan operation * Accepts: - same arguments as MPI_Exscan() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_exscan_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int size, rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *reduce_buffer = NULL; rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* For MPI_IN_PLACE, just adjust send buffer to point to * receive buffer. */ if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } /* If we're rank 0, then just send our sbuf to the next rank, and * we are done. */ if (0 == rank) { return MCA_PML_CALL(send(sbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_EXSCAN, MCA_PML_BASE_SEND_STANDARD, comm)); } /* If we're the last rank, then just receive the result from the * prior rank, and we are done. */ else if ((size - 1) == rank) { return MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_EXSCAN, comm, MPI_STATUS_IGNORE)); } /* Otherwise, get the result from the prior rank, combine it with my * data, and send it to the next rank */ /* Get a temporary buffer to perform the reduction into. Rationale * for malloc'ing this size is provided in coll_basic_reduce.c. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } reduce_buffer = free_buffer - lb; err = ompi_datatype_copy_content_same_ddt(dtype, count, reduce_buffer, (char*)sbuf); /* Receive the reduced value from the prior rank */ err = MCA_PML_CALL(recv(rbuf, count, dtype, rank - 1, MCA_COLL_BASE_TAG_EXSCAN, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto error; } /* Now reduce the prior rank's result with my source buffer. The source * buffer had been previously copied into the temporary reduce_buffer. */ ompi_op_reduce(op, rbuf, reduce_buffer, count, dtype); /* Send my result off to the next rank */ err = MCA_PML_CALL(send(reduce_buffer, count, dtype, rank + 1, MCA_COLL_BASE_TAG_EXSCAN, MCA_PML_BASE_SEND_STANDARD, comm)); /* Error */ error: free(free_buffer); /* All done */ return err; }
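/*
 * For context (not from the original source): a minimal MPI-level use of the exscan
 * above. Rank r receives the reduction over ranks 0..r-1; the result on rank 0 is
 * undefined, which matches the code path that only sends from rank 0.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, partial = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Exscan(&rank, &partial, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    if (rank > 0) printf("rank %d: sum of ranks below me = %d\n", rank, partial);
    MPI_Finalize();
    return 0;
}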
static int mca_coll_tuned_alltoallv_intra_basic_inplace(void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; int i, j, size, rank, err = MPI_SUCCESS; MPI_Request *preq; char *tmp_buffer; size_t max_size; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); for (i = 0, max_size = 0 ; i < size ; ++i) { size_t size = ext * rcounts[i]; max_size = size > max_size ? size : max_size; } /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } /* in-place alltoallv slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = tuned_module->tuned_data->mcct_reqs; if (i == rank && rcounts[j]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j] * ext, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtype, j, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && rcounts[i]) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i] * ext); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i] * ext, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtype, i, MCA_COLL_BASE_TAG_ALLTOALLV, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, tuned_module->tuned_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } /* Free the requests. */ mca_coll_tuned_free_reqs(tuned_module->tuned_data->mcct_reqs, 2); } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); /* All done */ return err; }
/* * reduce_log_intra * * Function: - reduction using O(log N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code * * * Performing reduction on each dimension of the hypercube. * An example for 8 procs (dimensions = 3): * * Stage 1, reduce on X dimension, 1 -> 0, 3 -> 2, 5 -> 4, 7 -> 6 * * 6----<---7 proc_0: 0+1 * /| /| proc_1: 1 * / | / | proc_2: 2+3 * / | / | proc_3: 3 * 4----<---5 | proc_4: 4+5 * | 2--< |---3 proc_5: 5 * | / | / proc_6: 6+7 * | / | / proc_7: 7 * |/ |/ * 0----<---1 * * Stage 2, reduce on Y dimension, 2 -> 0, 6 -> 4 * * 6--------7 proc_0: 0+1+2+3 * /| /| proc_1: 1 * v | / | proc_2: 2+3 * / | / | proc_3: 3 * 4--------5 | proc_4: 4+5+6+7 * | 2--- |---3 proc_5: 5 * | / | / proc_6: 6+7 * | v | / proc_7: 7 * |/ |/ * 0--------1 * * Stage 3, reduce on Z dimension, 4 -> 0 * * 6--------7 proc_0: 0+1+2+3+4+5+6+7 * /| /| proc_1: 1 * / | / | proc_2: 2+3 * / | / | proc_3: 3 * 4--------5 | proc_4: 4+5+6+7 * | 2--- |---3 proc_5: 5 * v / | / proc_6: 6+7 * | / | / proc_7: 7 * |/ |/ * 0--------1 * * */ int mca_coll_basic_reduce_log_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, size, rank, vrank; int err, peer, dim, mask; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *free_rbuf = NULL; char *pml_buffer = NULL; char *snd_buffer = NULL; char *rcv_buffer = (char*)rbuf; char *inplace_temp = NULL; /* JMS Codearound for now -- if the operations is not communative, * just call the linear algorithm. Need to talk to Edgar / George * about fixing this algorithm here to work with non-communative * operations. */ if (!ompi_op_is_commute(op)) { return mca_coll_basic_reduce_lin_intra(sbuf, rbuf, count, dtype, op, root, comm, module); } /* Some variables */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); vrank = ompi_op_is_commute(op) ? (rank - root + size) % size : rank; dim = comm->c_cube_dim; /* Allocate the incoming and resulting message buffers. See lengthy * rationale above. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; /* read the comment about commutative operations (few lines down * the page) */ if (ompi_op_is_commute(op)) { rcv_buffer = pml_buffer; } /* Allocate sendbuf in case the MPI_IN_PLACE option has been used. See lengthy * rationale above. */ if (MPI_IN_PLACE == sbuf) { inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } sbuf = inplace_temp - lb; err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, (char*)rbuf); } snd_buffer = (char*)sbuf; if (rank != root && 0 == (vrank & 1)) { /* root is the only one required to provide a valid rbuf. * Assume rbuf is invalid for all other ranks, so fix it up * here to be valid on all non-leaf ranks */ free_rbuf = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_rbuf) { err = OMPI_ERR_OUT_OF_RESOURCE; goto cleanup_and_return; } rbuf = free_rbuf - lb; } /* Loop over cube dimensions. High processes send to low ones in the * dimension. */ for (i = 0, mask = 1; i < dim; ++i, mask <<= 1) { /* A high-proc sends to low-proc and stops. 
*/ if (vrank & mask) { peer = vrank & ~mask; if (ompi_op_is_commute(op)) { peer = (peer + root) % size; } err = MCA_PML_CALL(send(snd_buffer, count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } snd_buffer = (char*)rbuf; break; } /* A low-proc receives, reduces, and moves to a higher * dimension. */ else { peer = vrank | mask; if (peer >= size) { continue; } if (ompi_op_is_commute(op)) { peer = (peer + root) % size; } /* Most of the time (all except the first one for commutative * operations) we receive in the user provided buffer * (rbuf). But the exception is here to allow us to dont have * to copy from the sbuf to a temporary location. If the * operation is commutative we dont care in which order we * apply the operation, so for the first time we can receive * the data in the pml_buffer and then apply to operation * between this buffer and the user provided data. */ err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, peer, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { goto cleanup_and_return; } /* Perform the operation. The target is always the user * provided buffer We do the operation only if we receive it * not in the user buffer */ if (snd_buffer != sbuf) { /* the target buffer is the locally allocated one */ ompi_op_reduce(op, rcv_buffer, pml_buffer, count, dtype); } else { /* If we're commutative, we don't care about the order of * operations and we can just reduce the operations now. * If we are not commutative, we have to copy the send * buffer into a temp buffer (pml_buffer) and then reduce * what we just received against it. */ if (!ompi_op_is_commute(op)) { ompi_datatype_copy_content_same_ddt(dtype, count, pml_buffer, (char*)sbuf); ompi_op_reduce(op, rbuf, pml_buffer, count, dtype); } else { ompi_op_reduce(op, sbuf, pml_buffer, count, dtype); } /* now we have to send the buffer containing the computed data */ snd_buffer = pml_buffer; /* starting from now we always receive in the user * provided buffer */ rcv_buffer = (char*)rbuf; } } } /* Get the result to the root if needed. */ err = MPI_SUCCESS; if (0 == vrank) { if (root == rank) { ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, snd_buffer); } else { err = MCA_PML_CALL(send(snd_buffer, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); } } else if (rank == root) { err = MCA_PML_CALL(recv(rcv_buffer, count, dtype, 0, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (rcv_buffer != rbuf) { ompi_op_reduce(op, rcv_buffer, rbuf, count, dtype); } } cleanup_and_return: if (NULL != inplace_temp) { free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } if (NULL != free_rbuf) { free(free_rbuf); } /* All done */ return err; }
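/*
 * Illustrative only: the dimension loop above walks a hypercube whose dimension
 * (comm->c_cube_dim) is the smallest d with 2^d >= size. A stand-alone sketch of
 * the dimension and per-stage peer computation, for assumed values of size/vrank:
 */
#include <stdio.h>

static int cube_dim(int size)
{
    int dim = 0;
    for (int n = 1; n < size; n <<= 1) dim++;   /* smallest dim with 2^dim >= size */
    return dim;
}

int main(void)
{
    const int size = 8, vrank = 5;   /* assumed values for the illustration */
    int dim = cube_dim(size);
    for (int i = 0, mask = 1; i < dim; ++i, mask <<= 1) {
        if (vrank & mask) {
            printf("stage %d: vrank %d sends to vrank %d and stops\n",
                   i, vrank, vrank & ~mask);
            break;
        } else if ((vrank | mask) < size) {
            printf("stage %d: vrank %d receives from vrank %d\n",
                   i, vrank, vrank | mask);
        }
    }
    return 0;
}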
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int ompi_coll_tuned_reduce_intra_basic_linear(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL, *pml_buffer = NULL; char *inplace_temp = NULL, *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); OPAL_OUTPUT((ompi_coll_tuned_stream,"coll:tuned:reduce_intra_basic_linear rank %d", rank)); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } /* see discussion in ompi_coll_basic_reduce_lin_intra about extent and true extent */ /* for reducing buffer allocation lengths.... */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp - lb; } if (size > 1) { free_buffer = (char*)malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
/* * scan * * Function: - basic scan operation * Accepts: - same arguments as MPI_Scan() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_scan_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int size, rank, err; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If I'm rank 0, just copy into the receive buffer */ if (0 == rank) { if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { return err; } } } /* Otherwise receive previous buffer and reduce. */ else { /* Allocate a temporary buffer. Rationale for this size is * listed in coll_basic_reduce.c. Use this temporary buffer to * receive into, later. */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - true_lb; /* Copy the send buffer into the receive buffer. */ if (MPI_IN_PLACE != sbuf) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } } /* Receive the prior answer */ err = MCA_PML_CALL(recv(pml_buffer, count, dtype, rank - 1, MCA_COLL_BASE_TAG_SCAN, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Perform the operation */ ompi_op_reduce(op, pml_buffer, rbuf, count, dtype); /* All done */ if (NULL != free_buffer) { free(free_buffer); } } /* Send result to next process. */ if (rank < (size - 1)) { return MCA_PML_CALL(send(rbuf, count, dtype, rank + 1, MCA_COLL_BASE_TAG_SCAN, MCA_PML_BASE_SEND_STANDARD, comm)); } /* All done */ return MPI_SUCCESS; }
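/*
 * For context (not from the original source): the inclusive-scan counterpart of the
 * exscan example shown earlier; rank r receives the reduction over ranks 0..r.
 */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char *argv[])
{
    int rank, prefix = 0;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Scan(&rank, &prefix, 1, MPI_INT, MPI_SUM, MPI_COMM_WORLD);
    printf("rank %d: inclusive prefix sum = %d\n", rank, prefix);
    MPI_Finalize();
    return 0;
}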
/* Todo: gather_intra_generic, gather_intra_binary, gather_intra_chain, * gather_intra_pipeline, segmentation? */ int ompi_coll_base_gather_intra_binomial(const void *sbuf, int scount, struct ompi_datatype_t *sdtype, void *rbuf, int rcount, struct ompi_datatype_t *rdtype, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int line = -1, i, rank, vrank, size, total_recv = 0, err; char *ptmp = NULL, *tempbuf = NULL; ompi_coll_tree_t* bmtree; MPI_Status status; MPI_Aint sextent, slb, strue_lb, strue_extent; MPI_Aint rextent, rlb, rtrue_lb, rtrue_extent; mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; mca_coll_base_comm_t *data = base_module->base_data; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d", rank)); /* create the binomial tree */ COLL_BASE_UPDATE_IN_ORDER_BMTREE( comm, base_module, root ); bmtree = data->cached_in_order_bmtree; ompi_datatype_get_extent(sdtype, &slb, &sextent); ompi_datatype_get_true_extent(sdtype, &strue_lb, &strue_extent); vrank = (rank - root + size) % size; if (rank == root) { ompi_datatype_get_extent(rdtype, &rlb, &rextent); ompi_datatype_get_true_extent(rdtype, &rtrue_lb, &rtrue_extent); if (0 == root){ /* root on 0, just use the recv buffer */ ptmp = (char *) rbuf; if (sbuf != MPI_IN_PLACE) { err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } else { /* root is not on 0, allocate temp buffer for recv, * rotate data at the end */ tempbuf = (char *) malloc(rtrue_extent + ((ptrdiff_t)rcount * (ptrdiff_t)size - 1) * rextent); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - rtrue_lb; if (sbuf != MPI_IN_PLACE) { /* copy from sbuf to temp buffer */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, rcount, rdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } else { /* copy from rbuf to temp buffer */ err = ompi_datatype_copy_content_same_ddt(rdtype, rcount, ptmp, (char *)rbuf + (ptrdiff_t)rank * rextent * (ptrdiff_t)rcount); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } } total_recv = rcount; } else if (!(vrank % 2)) { /* other non-leaf nodes, allocate temp buffer for data received from * children, the most we need is half of the total data elements due * to the property of binimoal tree */ tempbuf = (char *) malloc(strue_extent + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sextent); if (NULL == tempbuf) { err= OMPI_ERR_OUT_OF_RESOURCE; line = __LINE__; goto err_hndl; } ptmp = tempbuf - strue_lb; /* local copy to tempbuf */ err = ompi_datatype_sndrcv((void *)sbuf, scount, sdtype, ptmp, scount, sdtype); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } /* use sdtype,scount as rdtype,rdcount since they are ignored on * non-root procs */ rdtype = sdtype; rcount = scount; rextent = sextent; total_recv = rcount; } else { /* leaf nodes, no temp buffer needed, use sdtype,scount as * rdtype,rdcount since they are ignored on non-root procs */ ptmp = (char *) sbuf; total_recv = scount; } if (!(vrank % 2)) { /* all non-leaf nodes recv from children */ for (i = 0; i < bmtree->tree_nextsize; i++) { int mycount = 0, vkid; /* figure out how much data I have to send to this child */ vkid = (bmtree->tree_next[i] - root + size) % size; mycount = vkid - vrank; if (mycount > (size - vkid)) mycount = size - vkid; mycount *= 
rcount; OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d recv %d mycount = %d", rank, bmtree->tree_next[i], mycount)); err = MCA_PML_CALL(recv(ptmp + total_recv*rextent, (ptrdiff_t)rcount * size - total_recv, rdtype, bmtree->tree_next[i], MCA_COLL_BASE_TAG_GATHER, comm, &status)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } total_recv += mycount; } } if (rank != root) { /* all nodes except root send to parents */ OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "ompi_coll_base_gather_intra_binomial rank %d send %d count %d\n", rank, bmtree->tree_prev, total_recv)); err = MCA_PML_CALL(send(ptmp, total_recv, sdtype, bmtree->tree_prev, MCA_COLL_BASE_TAG_GATHER, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } } if (rank == root) { if (root != 0) { /* rotate received data on root if root != 0 */ err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)(size - root), (char *)rbuf + rextent * (ptrdiff_t)root * (ptrdiff_t)rcount, ptmp); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } err = ompi_datatype_copy_content_same_ddt(rdtype, (ptrdiff_t)rcount * (ptrdiff_t)root, (char *) rbuf, ptmp + rextent * (ptrdiff_t)rcount * (ptrdiff_t)(size-root)); if (MPI_SUCCESS != err) { line = __LINE__; goto err_hndl; } free(tempbuf); } } else if (!(vrank % 2)) { /* other non-leaf nodes */ free(tempbuf); } return MPI_SUCCESS; err_hndl: if (NULL != tempbuf) free(tempbuf); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); return err; }
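/*
 * Illustrative only: on a non-leaf rank the gather above computes, per child, how
 * many ranks' worth of data that child's binomial subtree delivers (mycount before
 * it is scaled by rcount). The same arithmetic for an assumed 8-rank tree rooted at
 * vrank 0, whose children are vranks 1, 2 and 4:
 */
#include <stdio.h>

int main(void)
{
    const int size = 8, vrank = 0;      /* assumed values for the illustration */
    const int vkids[] = { 1, 2, 4 };    /* children of vrank 0 in an 8-rank binomial tree */
    for (int i = 0; i < 3; i++) {
        int vkid = vkids[i];
        int mycount = vkid - vrank;                         /* span of the child's subtree */
        if (mycount > size - vkid) mycount = size - vkid;   /* capped at the end of the tree */
        printf("child vrank %d contributes the data of %d rank(s)\n", vkid, mycount);
    }
    return 0;
}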
/* MPI_IN_PLACE all to all algorithm. TODO: implement a better one. */ int mca_coll_base_alltoall_intra_basic_inplace(const void *rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { mca_coll_base_module_t *base_module = (mca_coll_base_module_t*) module; int i, j, size, rank, err = MPI_SUCCESS, line; MPI_Request *preq; char *tmp_buffer; size_t max_size; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ ompi_datatype_type_extent (rdtype, &ext); max_size = ext * rcount; /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } /* in-place alltoall slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { for (j = i+1 ; j < size ; ++j) { /* Initiate all send/recv to/from others. */ preq = coll_base_comm_get_reqs(base_module->base_data, size * 2); if (i == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + j * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * j, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, j, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else if (j == rank) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtype, rcount, tmp_buffer, (char *) rbuf + i * max_size); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + max_size * i, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } err = MCA_PML_CALL(isend ((char *) tmp_buffer, rcount, rdtype, i, MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, base_module->base_data->mcct_reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { line = __LINE__; goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); if( MPI_SUCCESS != err ) { OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); ompi_coll_base_free_reqs(base_module->base_data->mcct_reqs, 2); } /* All done */ return err; }
static int mca_coll_basic_alltoallw_intra_inplace(const void *rbuf, const int *rcounts, const int *rdisps, struct ompi_datatype_t * const *rdtypes, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, j, size, rank, err=MPI_SUCCESS, max_size; MPI_Request *preq, *reqs = NULL; char *tmp_buffer; ptrdiff_t ext; /* Initialize. */ size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); /* If only one process, we're done. */ if (1 == size) { return MPI_SUCCESS; } /* Find the largest receive amount */ for (i = 0, max_size = 0 ; i < size ; ++i) { ompi_datatype_type_extent (rdtypes[i], &ext); ext *= rcounts[i]; max_size = ext > max_size ? ext : max_size; } /* Allocate a temporary buffer */ tmp_buffer = calloc (max_size, 1); if (NULL == tmp_buffer) { return OMPI_ERR_OUT_OF_RESOURCE; } reqs = coll_base_comm_get_reqs( module->base_data, 2); /* in-place alltoallw slow algorithm (but works) */ for (i = 0 ; i < size ; ++i) { size_t msg_size_i; ompi_datatype_type_size(rdtypes[i], &msg_size_i); msg_size_i *= rcounts[i]; for (j = i+1 ; j < size ; ++j) { size_t msg_size_j; ompi_datatype_type_size(rdtypes[j], &msg_size_j); msg_size_j *= rcounts[j]; /* Initiate all send/recv to/from others. */ preq = reqs; if (i == rank && msg_size_j != 0) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtypes[j], rcounts[j], tmp_buffer, (char *) rbuf + rdisps[j]); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[j], rcounts[j], rdtypes[j], j, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[j], rdtypes[j], j, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else if (j == rank && msg_size_i != 0) { /* Copy the data into the temporary buffer */ err = ompi_datatype_copy_content_same_ddt (rdtypes[i], rcounts[i], tmp_buffer, (char *) rbuf + rdisps[i]); if (MPI_SUCCESS != err) { goto error_hndl; } /* Exchange data with the peer */ err = MCA_PML_CALL(irecv ((char *) rbuf + rdisps[i], rcounts[i], rdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } err = MCA_PML_CALL(isend ((void *) tmp_buffer, rcounts[i], rdtypes[i], i, MCA_COLL_BASE_TAG_ALLTOALLW, MCA_PML_BASE_SEND_STANDARD, comm, preq++)); if (MPI_SUCCESS != err) { goto error_hndl; } } else { continue; } /* Wait for the requests to complete */ err = ompi_request_wait_all (2, reqs, MPI_STATUSES_IGNORE); if (MPI_SUCCESS != err) { goto error_hndl; } } } error_hndl: /* Free the temporary buffer */ free (tmp_buffer); if( MPI_SUCCESS != err ) { /* Free the requests. */ if( NULL != reqs ) { ompi_coll_base_free_reqs(reqs, 2); } } /* All done */ return err; }
int ompi_coll_tuned_alltoall_intra_bruck(void *sbuf, int scount, struct ompi_datatype_t *sdtype, void* rbuf, int rcount, struct ompi_datatype_t *rdtype, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, k, line = -1, rank, size, err = 0, weallocated = 0; int sendto, recvfrom, distance, *displs = NULL, *blen = NULL; char *tmpbuf = NULL, *tmpbuf_free = NULL; ptrdiff_t rlb, slb, tlb, sext, rext, tsext; struct ompi_datatype_t *new_ddt; #ifdef blahblah mca_coll_tuned_module_t *tuned_module = (mca_coll_tuned_module_t*) module; mca_coll_tuned_comm_t *data = tuned_module->tuned_data; #endif if (MPI_IN_PLACE == sbuf) { return mca_coll_tuned_alltoall_intra_basic_inplace (rbuf, rcount, rdtype, comm, module); } size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_tuned_stream, "coll:tuned:alltoall_intra_bruck rank %d", rank)); err = ompi_datatype_get_extent (sdtype, &slb, &sext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_true_extent(sdtype, &tlb, &tsext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } err = ompi_datatype_get_extent (rdtype, &rlb, &rext); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } #ifdef blahblah /* try and SAVE memory by using the data segment hung off the communicator if possible */ if (data->mcct_num_reqs >= size) { /* we have enought preallocated for displments and lengths */ displs = (int*) data->mcct_reqs; blen = (int *) (displs + size); weallocated = 0; } else { /* allocate the buffers ourself */ #endif displs = (int *) malloc(size * sizeof(int)); if (displs == NULL) { line = __LINE__; err = -1; goto err_hndl; } blen = (int *) malloc(size * sizeof(int)); if (blen == NULL) { line = __LINE__; err = -1; goto err_hndl; } weallocated = 1; #ifdef blahblah } #endif /* tmp buffer allocation for message data */ tmpbuf_free = (char *) malloc(tsext + ((ptrdiff_t)scount * (ptrdiff_t)size - 1) * sext); if (tmpbuf_free == NULL) { line = __LINE__; err = -1; goto err_hndl; } tmpbuf = tmpbuf_free - slb; /* Step 1 - local rotation - shift up by rank */ err = ompi_datatype_copy_content_same_ddt (sdtype, (int32_t) ((ptrdiff_t)(size - rank) * (ptrdiff_t)scount), tmpbuf, ((char*) sbuf) + (ptrdiff_t)rank * (ptrdiff_t)scount * sext); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } if (rank != 0) { err = ompi_datatype_copy_content_same_ddt (sdtype, (ptrdiff_t)rank * (ptrdiff_t)scount, tmpbuf + (ptrdiff_t)(size - rank) * (ptrdiff_t)scount* sext, (char*) sbuf); if (err<0) { line = __LINE__; err = -1; goto err_hndl; } } /* perform communication step */ for (distance = 1; distance < size; distance<<=1) { sendto = (rank + distance) % size; recvfrom = (rank - distance + size) % size; k = 0; /* create indexed datatype */ for (i = 1; i < size; i++) { if (( i & distance) == distance) { displs[k] = (ptrdiff_t)i * (ptrdiff_t)scount; blen[k] = scount; k++; } } /* Set indexes and displacements */ err = ompi_datatype_create_indexed(k, blen, displs, sdtype, &new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Commit the new datatype */ err = ompi_datatype_commit(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Sendreceive */ err = ompi_coll_tuned_sendrecv ( tmpbuf, 1, new_ddt, sendto, MCA_COLL_BASE_TAG_ALLTOALL, rbuf, 1, new_ddt, recvfrom, MCA_COLL_BASE_TAG_ALLTOALL, comm, MPI_STATUS_IGNORE, rank ); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } /* Copy back new data from recvbuf to tmpbuf */ err = 
ompi_datatype_copy_content_same_ddt(new_ddt, 1,tmpbuf, (char *) rbuf); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } /* free ddt */ err = ompi_datatype_destroy(&new_ddt); if (err != MPI_SUCCESS) { line = __LINE__; goto err_hndl; } } /* end of for (distance = 1... */ /* Step 3 - local rotation - */ for (i = 0; i < size; i++) { err = ompi_datatype_copy_content_same_ddt (rdtype, (int32_t) rcount, ((char*)rbuf) + ((ptrdiff_t)((rank - i + size) % size) * (ptrdiff_t)rcount * rext), tmpbuf + (ptrdiff_t)i * (ptrdiff_t)rcount * rext); if (err < 0) { line = __LINE__; err = -1; goto err_hndl; } } /* Step 4 - clean up */ if (tmpbuf != NULL) free(tmpbuf_free); if (weallocated) { if (displs != NULL) free(displs); if (blen != NULL) free(blen); } return OMPI_SUCCESS; err_hndl: OPAL_OUTPUT((ompi_coll_tuned_stream, "%s:%4d\tError occurred %d, rank %2d", __FILE__, line, err, rank)); if (tmpbuf != NULL) free(tmpbuf_free); if (weallocated) { if (displs != NULL) free(displs); if (blen != NULL) free(blen); } return err; }
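/*
 * Hypothetical standalone sketch (no MPI) of the Bruck communication phase
 * above: at each distance d (a power of two) the indexed datatype packs
 * every block whose index has bit d set, i.e. the "(i & distance) ==
 * distance" test.  The communicator size below is an assumed example value.
 */
#include <stdio.h>

int main(void)
{
    const int size = 8;   /* assumed communicator size */

    for (int distance = 1; distance < size; distance <<= 1) {
        printf("distance %d exchanges blocks:", distance);
        for (int i = 1; i < size; ++i)
            if ((i & distance) == distance)
                printf(" %d", i);
        printf("\n");
    }
    /* prints: 1 3 5 7 / 2 3 6 7 / 4 5 6 7 -- each block moves once per set
       bit of its index, giving the log2(size) steps of the algorithm */
    return 0;
}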
static int mca_coll_ml_reduce_frag_progress(mca_coll_ml_collective_operation_progress_t *coll_op) { /* local variables */ void *buf; size_t dt_size; int ret, frag_len, count; ptrdiff_t lb, extent; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc; mca_coll_ml_collective_operation_progress_t *new_op; mca_coll_ml_module_t *ml_module = OP_ML_MODULE(coll_op); ret = ompi_datatype_get_extent(coll_op->variable_fn_params.dtype, &lb, &extent); if (ret < 0) { return OMPI_ERROR; } dt_size = (size_t) extent; /* Keep the pipeline filled with fragments */ while (coll_op->fragment_data.message_descriptor->n_active < coll_op->fragment_data.message_descriptor->pipeline_depth) { /* If an active fragment happens to have completed the collective during * a hop into the progress engine, then don't launch a new fragment, * instead break and return. */ if (coll_op->fragment_data.message_descriptor->n_bytes_scheduled == coll_op->fragment_data.message_descriptor->n_bytes_total) { break; } /* Get an ml buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(OP_ML_MODULE(coll_op)); if (NULL == src_buffer_desc) { /* If there exist outstanding fragments, then break out * and let an active fragment deal with this later, * there are no buffers available. */ if (0 < coll_op->fragment_data.message_descriptor->n_active) { return OMPI_SUCCESS; } else { /* It is useless to call progress from here, since * ml progress can't be executed as result ml memsync * call will not be completed and no memory will be * recycled. So we put the element on the list, and we will * progress it later when memsync will recycle some memory*/ /* The fragment is already on list and * the we still have no ml resources * Return busy */ if (coll_op->pending & REQ_OUT_OF_MEMORY) { ML_VERBOSE(10,("Out of resources %p", coll_op)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } coll_op->pending |= REQ_OUT_OF_MEMORY; opal_list_append(&((OP_ML_MODULE(coll_op))->waiting_for_memory_list), (opal_list_item_t *)coll_op); ML_VERBOSE(10,("Out of resources %p adding to pending queue", coll_op)); return OMPI_ERR_TEMP_OUT_OF_RESOURCE; } } /* Get a new collective descriptor and initialize it */ new_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[ML_SMALL_DATA_REDUCE], coll_op->fragment_data.message_descriptor->src_user_addr, coll_op->fragment_data.message_descriptor->dest_user_addr, coll_op->fragment_data.message_descriptor->n_bytes_total, coll_op->fragment_data.message_descriptor->n_bytes_scheduled); ML_VERBOSE(1,(" In Reduce fragment progress %d %d ", coll_op->fragment_data.message_descriptor->n_bytes_total, coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(new_op, src_buffer_desc->buffer_index, src_buffer_desc); new_op->fragment_data.current_coll_op = coll_op->fragment_data.current_coll_op; new_op->fragment_data.message_descriptor = coll_op->fragment_data.message_descriptor; /* set the task setup callback */ new_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; /* We need this address for pointer arithmetic in memcpy */ buf = (void*)coll_op->fragment_data.message_descriptor->src_user_addr; /* calculate the number of data types in this packet */ count = (coll_op->fragment_data.message_descriptor->n_bytes_total - coll_op->fragment_data.message_descriptor->n_bytes_scheduled < ((size_t) OP_ML_MODULE(coll_op)->small_message_thresholds[BCOL_REDUCE]/4 )? 
(coll_op->fragment_data.message_descriptor->n_bytes_total - coll_op->fragment_data.message_descriptor->n_bytes_scheduled) / dt_size : (size_t) coll_op->variable_fn_params.count); /* calculate the fragment length */ frag_len = count * dt_size; ret = ompi_datatype_copy_content_same_ddt(coll_op->variable_fn_params.dtype, count, (char *) src_buffer_desc->data_addr, (char *) ((uintptr_t) buf + (uintptr_t) coll_op->fragment_data.message_descriptor->n_bytes_scheduled)); if (ret < 0) { return OMPI_ERROR; } /* if root unpack the data */ if (ompi_comm_rank(ml_module->comm) == coll_op->global_root ) { new_op->process_fn = mca_coll_ml_reduce_unpack; new_op->variable_fn_params.root_flag = true; } else { new_op->process_fn = NULL; new_op->variable_fn_params.root_flag = false; } new_op->variable_fn_params.root_route = coll_op->variable_fn_params.root_route; /* Setup fragment specific data */ new_op->fragment_data.message_descriptor->n_bytes_scheduled += frag_len; new_op->fragment_data.buffer_desc = src_buffer_desc; new_op->fragment_data.fragment_size = frag_len; (new_op->fragment_data.message_descriptor->n_active)++; /* Set in Reduce Buffer arguments */ ML_SET_VARIABLE_PARAMS_BCAST(new_op, OP_ML_MODULE(new_op), count, coll_op->variable_fn_params.dtype, src_buffer_desc, 0, (ml_module->payload_block->size_buffer - ml_module->data_offset)/2, frag_len, src_buffer_desc->data_addr); new_op->variable_fn_params.buffer_size = frag_len; new_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; new_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; new_op->variable_fn_params.root = coll_op->variable_fn_params.root; new_op->global_root = coll_op->global_root; new_op->variable_fn_params.op = coll_op->variable_fn_params.op; new_op->variable_fn_params.hier_factor = coll_op->variable_fn_params.hier_factor; new_op->sequential_routine.current_bcol_status = SEQ_TASK_PENDING; MCA_COLL_ML_SET_NEW_FRAG_ORDER_INFO(new_op); ML_VERBOSE(10,("FFFF Contig + fragmentation [0-sk, 1-lk, 3-su, 4-lu] %d %d %d\n", new_op->variable_fn_params.buffer_size, new_op->fragment_data.fragment_size, new_op->fragment_data.message_descriptor->n_bytes_scheduled)); /* initialize first coll */ new_op->sequential_routine.seq_task_setup(new_op); /* append this collective !! */ OPAL_THREAD_LOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); opal_list_append(&mca_coll_ml_component.sequential_collectives, (opal_list_item_t *)new_op); OPAL_THREAD_UNLOCK(&(mca_coll_ml_component.sequential_collectives_mutex)); } return OMPI_SUCCESS; }
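/*
 * Hypothetical sketch of the per-fragment count selection above: if the
 * bytes still to be scheduled fit under the usable threshold (a quarter of
 * the ML buffer), the final fragment carries only what is left; otherwise
 * it carries the full per-fragment element count.  All numbers are made-up
 * example values, not defaults of the ml component.
 */
#include <stdio.h>

int main(void)
{
    size_t n_bytes_total     = 80000;  /* whole reduce payload              */
    size_t n_bytes_scheduled = 78000;  /* already handed to earlier frags   */
    size_t dt_size           = 8;      /* bytes per element                 */
    size_t threshold         = 8192;   /* small_message_thresholds[...]     */
    size_t full_frag_count   = 256;    /* variable_fn_params.count          */

    size_t remaining = n_bytes_total - n_bytes_scheduled;
    size_t count     = (remaining < threshold / 4) ? remaining / dt_size
                                                   : full_frag_count;
    size_t frag_len  = count * dt_size;

    printf("this fragment: %zu elements, %zu bytes\n", count, frag_len);
    return 0;
}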
/* * reduce_lin_intra * * Function: - reduction using O(N) algorithm * Accepts: - same as MPI_Reduce() * Returns: - MPI_SUCCESS or error code */ int mca_coll_basic_reduce_lin_intra(void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int i, rank, err, size; ptrdiff_t true_lb, true_extent, lb, extent; char *free_buffer = NULL; char *pml_buffer = NULL; char *inplace_temp = NULL; char *inbuf; /* Initialize */ rank = ompi_comm_rank(comm); size = ompi_comm_size(comm); /* If not root, send data to the root. */ if (rank != root) { err = MCA_PML_CALL(send(sbuf, count, dtype, root, MCA_COLL_BASE_TAG_REDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); return err; } /* Root receives and reduces messages. Allocate buffer to receive * messages. This comment applies to all collectives in this basic * module where we allocate a temporary buffer. For the next few * lines of code, it's tremendously complicated how we decided that * this was the Right Thing to do. Sit back and enjoy. And prepare * to have your mind warped. :-) * * Recall some definitions (I always get these backwards, so I'm * going to put them here): * * extent: the length from the lower bound to the upper bound -- may * be considerably larger than the buffer required to hold the data * (or smaller! But it's easiest to think about when it's larger). * * true extent: the exact number of bytes required to hold the data * in the layout pattern in the datatype. * * For example, consider the following buffer (just talking about * LB, extent, and true extent -- extrapolate for UB; i.e., assume * the UB equals exactly where the data ends): * * A B C * -------------------------------------------------------- * | | | * -------------------------------------------------------- * * There are multiple cases: * * 1. A is what we give to MPI_Send (and friends), and A is where * the data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-A * - LB: 0 * * A C * -------------------------------------------------------- * | | * -------------------------------------------------------- * <=======================extent=========================> * <======================true extent=====================> * * 2. A is what we give to MPI_Send (and friends), B is where the * data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-B * - LB: positive * * A B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <===============true extent=============> * * 3. B is what we give to MPI_Send (and friends), A is where the * data starts, and C is where the data ends. In this case: * * - extent: C-A * - true extent: C-A * - LB: negative * * A B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <======================true extent=====================> * * 4. MPI_BOTTOM is what we give to MPI_Send (and friends), B is * where the data starts, and C is where the data ends. 
In this * case: * * - extent: C-MPI_BOTTOM * - true extent: C-B * - LB: [potentially very large] positive * * MPI_BOTTOM B C * -------------------------------------------------------- * | | User buffer | * -------------------------------------------------------- * <=======================extent=========================> * <===============true extent=============> * * So in all cases, for a temporary buffer, all we need to malloc() * is a buffer of size true_extent. We therefore need to know two * pointer values: what value to give to MPI_Send (and friends) and * what value to give to free(), because they might not be the same. * * Clearly, what we give to free() is exactly what was returned from * malloc(). That part is easy. :-) * * What we give to MPI_Send (and friends) is a bit more complicated. * Let's take the 4 cases from above: * * 1. If A is what we give to MPI_Send and A is where the data * starts, then clearly we give to MPI_Send what we got back from * malloc(). * * 2. If B is what we get back from malloc, but we give A to * MPI_Send, then the buffer range [A,B) represents "dead space" * -- no data will be put there. So it's safe to give B-LB to * MPI_Send. More specifically, the LB is positive, so B-LB is * actually A. * * 3. If A is what we get back from malloc, and B is what we give to * MPI_Send, then the LB is negative, so A-LB will actually equal * B. * * 4. Although this seems like the weirdest case, it's actually * quite similar to case #2 -- the pointer we give to MPI_Send is * smaller than the pointer we got back from malloc(). * * Hence, in all cases, we give (return_from_malloc - LB) to MPI_Send. * * This works fine and dandy if we only have (count==1), which we * rarely do. ;-) So we really need to allocate (true_extent + * ((count - 1) * extent)) to get enough space for the rest. This may * be more than is necessary, but it's ok. * * Simple, no? :-) * */ ompi_datatype_get_extent(dtype, &lb, &extent); ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; inplace_temp = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == inplace_temp) { return OMPI_ERR_OUT_OF_RESOURCE; } rbuf = inplace_temp - lb; } if (size > 1) { free_buffer = (char*)malloc(true_extent + (count - 1) * extent); if (NULL == free_buffer) { if (NULL != inplace_temp) { free(inplace_temp); } return OMPI_ERR_OUT_OF_RESOURCE; } pml_buffer = free_buffer - lb; } /* Initialize the receive buffer. */ if (rank == (size - 1)) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); } else { err = MCA_PML_CALL(recv(rbuf, count, dtype, size - 1, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); } if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } /* Loop receiving and calling reduction function (C or Fortran). */ for (i = size - 2; i >= 0; --i) { if (rank == i) { inbuf = (char*)sbuf; } else { err = MCA_PML_CALL(recv(pml_buffer, count, dtype, i, MCA_COLL_BASE_TAG_REDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != err) { if (NULL != free_buffer) { free(free_buffer); } return err; } inbuf = pml_buffer; } /* Perform the reduction */ ompi_op_reduce(op, inbuf, rbuf, count, dtype); } if (NULL != inplace_temp) { err = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)sbuf, inplace_temp); free(inplace_temp); } if (NULL != free_buffer) { free(free_buffer); } /* All done */ return MPI_SUCCESS; }
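/*
 * Hypothetical numeric sketch of the sizing rule the long comment above
 * explains: a temporary buffer needs true_extent + (count - 1) * extent
 * bytes, free() gets the pointer malloc() returned, and the PML gets that
 * pointer minus lb.  The datatype numbers are made up to mimic case 2
 * (positive lower bound).
 */
#include <stddef.h>
#include <stdio.h>

int main(void)
{
    ptrdiff_t lb = 4, extent = 16, true_extent = 12;   /* assumed datatype */
    int count = 3;

    size_t alloc_len = true_extent + (count - 1) * extent;

    printf("malloc() size            : %zu bytes\n", alloc_len);
    printf("pointer given to free()  : p (as returned by malloc)\n");
    printf("pointer given to the PML : p - lb = p - %td\n", lb);
    return 0;
}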
static inline __opal_attribute_always_inline__ int parallel_reduce_start (void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, int root, struct ompi_communicator_t *comm, mca_coll_ml_module_t *ml_module, ompi_request_t **req, int small_data_reduce, int large_data_reduce) { ptrdiff_t lb, extent; size_t pack_len, dt_size; mca_bcol_base_payload_buffer_desc_t *src_buffer_desc = NULL; mca_coll_ml_collective_operation_progress_t * coll_op = NULL; bool contiguous = ompi_datatype_is_contiguous_memory_layout(dtype, count); mca_coll_ml_component_t *cm = &mca_coll_ml_component; int ret, n_fragments = 1, frag_len, pipeline_depth, n_dts_per_frag, rank; if (MPI_IN_PLACE == sbuf) { sbuf = rbuf; } ret = ompi_datatype_get_extent(dtype, &lb, &extent); if (ret < 0) { return OMPI_ERROR; } rank = ompi_comm_rank (comm); dt_size = (size_t) extent; pack_len = count * dt_size; /* We use a separate recieve and send buffer so only half the buffer is usable. */ if (pack_len < (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { /* The len of the message can not be larger than ML buffer size */ assert(pack_len <= ml_module->payload_block->size_buffer); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); ML_VERBOSE(10,("Using small data reduce (threshold = %d)", REDUCE_SMALL_MESSAGE_THRESHOLD)); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[small_data_reduce], sbuf, rbuf, pack_len, 0); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); coll_op->variable_fn_params.rbuf = src_buffer_desc->data_addr; coll_op->variable_fn_params.sbuf = src_buffer_desc->data_addr; coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; coll_op->variable_fn_params.src_desc = src_buffer_desc; coll_op->variable_fn_params.count = count; ret = ompi_datatype_copy_content_same_ddt(dtype, count, (void *) (uintptr_t) src_buffer_desc->data_addr, (char *) sbuf); if (ret < 0) { return OMPI_ERROR; } } else if (cm->enable_fragmentation || !contiguous) { ML_VERBOSE(1,("Using Fragmented Reduce ")); /* fragment the data */ /* check for retarded application programming decisions */ if (dt_size > (size_t) ml_module->small_message_thresholds[BCOL_REDUCE] / 4) { ML_ERROR(("Sorry, but we don't support datatypes that large")); return OMPI_ERROR; } /* calculate the number of data types that can fit per ml-buffer */ n_dts_per_frag = ml_module->small_message_thresholds[BCOL_REDUCE] / (4 * dt_size); /* calculate the number of fragments */ n_fragments = (count + n_dts_per_frag - 1) / n_dts_per_frag; /* round up */ /* calculate the actual pipeline depth */ pipeline_depth = n_fragments < cm->pipeline_depth ? 
n_fragments : cm->pipeline_depth; /* calculate the fragment size */ frag_len = n_dts_per_frag * dt_size; /* allocate an ml buffer */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[small_data_reduce], sbuf,rbuf, pack_len, 0 /* offset for first pack */); MCA_COLL_IBOFFLOAD_SET_ML_BUFFER_INFO(coll_op, src_buffer_desc->buffer_index, src_buffer_desc); coll_op->variable_fn_params.sbuf = (void *) src_buffer_desc->data_addr; coll_op->variable_fn_params.rbuf = (void *) src_buffer_desc->data_addr; coll_op->fragment_data.message_descriptor->n_active = 1; coll_op->full_message.n_bytes_scheduled = frag_len; coll_op->full_message.fragment_launcher = mca_coll_ml_reduce_frag_progress; coll_op->full_message.pipeline_depth = pipeline_depth; coll_op->fragment_data.current_coll_op = small_data_reduce; coll_op->fragment_data.fragment_size = frag_len; coll_op->variable_fn_params.count = n_dts_per_frag; /* seems fishy */ coll_op->variable_fn_params.buffer_size = frag_len; coll_op->variable_fn_params.src_desc = src_buffer_desc; /* copy into the ml-buffer */ ret = ompi_datatype_copy_content_same_ddt(dtype, n_dts_per_frag, (char *) src_buffer_desc->data_addr, (char *) sbuf); if (ret < 0) { return OMPI_ERROR; } } else { ML_VERBOSE(1,("Using zero-copy ptp reduce")); coll_op = mca_coll_ml_alloc_op_prog_single_frag_dag(ml_module, ml_module->coll_ml_reduce_functions[large_data_reduce], sbuf, rbuf, pack_len, 0); coll_op->variable_fn_params.userbuf = coll_op->variable_fn_params.sbuf = sbuf; coll_op->variable_fn_params.rbuf = rbuf; /* The ML buffer is used for testing. Later, when we * switch to use knem/mmap/portals this should be replaced * appropriately */ src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); while (NULL == src_buffer_desc) { opal_progress(); src_buffer_desc = mca_coll_ml_alloc_buffer(ml_module); } coll_op->variable_fn_params.buffer_index = src_buffer_desc->buffer_index; coll_op->variable_fn_params.src_desc = src_buffer_desc; coll_op->variable_fn_params.count = count; } coll_op->process_fn = (rank != root) ? NULL : mca_coll_ml_reduce_unpack; /* Set common parts */ coll_op->fragment_data.buffer_desc = src_buffer_desc; coll_op->variable_fn_params.dtype = dtype; coll_op->variable_fn_params.op = op; /* NTH: the root, root route, and root flag are set in the task setup */ /* Fill in the function arguments */ coll_op->variable_fn_params.sbuf_offset = 0; coll_op->variable_fn_params.rbuf_offset = (ml_module->payload_block->size_buffer - ml_module->data_offset)/2; /* Keep track of the global root of this operation */ coll_op->global_root = root; coll_op->variable_fn_params.sequence_num = OPAL_THREAD_ADD32(&(ml_module->collective_sequence_num), 1); coll_op->sequential_routine.current_active_bcol_fn = 0; /* set the task setup callback */ coll_op->sequential_routine.seq_task_setup = mca_coll_ml_reduce_task_setup; /* Reduce requires the schedule to be fixed. If we use other (changing) schedule, the operation might result in different result. 
*/ coll_op->coll_schedule->component_functions = coll_op->coll_schedule-> comp_fn_arr[coll_op->coll_schedule->topo_info->route_vector[root].level]; /* Launch the collective */ ret = mca_coll_ml_launch_sequential_collective (coll_op); if (OMPI_SUCCESS != ret) { ML_VERBOSE(10, ("Failed to launch reduce collective")); return ret; } *req = &coll_op->full_message.super; return OMPI_SUCCESS; }
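/*
 * Hypothetical sketch of the fragmentation arithmetic in
 * parallel_reduce_start above: only a quarter of the reduce threshold is
 * usable per buffer, so compute how many elements fit in one fragment, how
 * many fragments are needed, and the pipeline depth actually used.  The
 * threshold and sizes are made-up example values.
 */
#include <stdio.h>

int main(void)
{
    size_t dt_size            = 8;      /* e.g. an 8-byte element          */
    int    count              = 10000;  /* elements to reduce              */
    int    reduce_threshold   = 8192;   /* small_message_thresholds[...]   */
    int    max_pipeline_depth = 4;      /* cm->pipeline_depth              */

    int n_dts_per_frag = reduce_threshold / (4 * (int) dt_size);
    int n_fragments    = (count + n_dts_per_frag - 1) / n_dts_per_frag;
    int pipeline_depth = n_fragments < max_pipeline_depth ?
                         n_fragments : max_pipeline_depth;
    int frag_len       = n_dts_per_frag * (int) dt_size;

    printf("%d elements/fragment, %d fragments, pipeline depth %d, "
           "%d bytes/fragment\n",
           n_dts_per_frag, n_fragments, pipeline_depth, frag_len);
    return 0;
}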
/* * ompi_coll_base_allreduce_intra_recursivedoubling * * Function: Recursive doubling algorithm for allreduce operation * Accepts: Same as MPI_Allreduce() * Returns: MPI_SUCCESS or error code * * Description: Implements recursive doubling algorithm for allreduce. * Original (non-segmented) implementation is used in MPICH-2 * for small and intermediate size messages. * The algorithm preserves order of operations so it can * be used both by commutative and non-commutative operations. * * Example on 7 nodes: * Initial state * # 0 1 2 3 4 5 6 * [0] [1] [2] [3] [4] [5] [6] * Initial adjustment step for non-power of two nodes. * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1] [2+3] [4+5] [6] * Step 1 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [4+5+] [4+5+] * [2+3+] [2+3+] [6 ] [6 ] * Step 2 * old rank 1 3 5 6 * new rank 0 1 2 3 * [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] * Final adjustment step for non-power of two nodes * # 0 1 2 3 4 5 6 * [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] [0+1+] * [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] [2+3+] * [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] [4+5+] * [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] [6 ] * */ int ompi_coll_base_allreduce_intra_recursivedoubling(const void *sbuf, void *rbuf, int count, struct ompi_datatype_t *dtype, struct ompi_op_t *op, struct ompi_communicator_t *comm, mca_coll_base_module_t *module) { int ret, line, rank, size, adjsize, remote, distance; int newrank, newremote, extra_ranks; char *tmpsend = NULL, *tmprecv = NULL, *tmpswap = NULL, *inplacebuf = NULL; ptrdiff_t true_lb, true_extent, lb, extent; ompi_request_t *reqs[2] = {NULL, NULL}; size = ompi_comm_size(comm); rank = ompi_comm_rank(comm); OPAL_OUTPUT((ompi_coll_base_framework.framework_output, "coll:base:allreduce_intra_recursivedoubling rank %d", rank)); /* Special case for size == 1 */ if (1 == size) { if (MPI_IN_PLACE != sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, (char*)rbuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } return MPI_SUCCESS; } /* Allocate and initialize temporary send buffer */ ret = ompi_datatype_get_extent(dtype, &lb, &extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } ret = ompi_datatype_get_true_extent(dtype, &true_lb, &true_extent); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } inplacebuf = (char*) malloc(true_extent + (ptrdiff_t)(count - 1) * extent); if (NULL == inplacebuf) { ret = -1; line = __LINE__; goto error_hndl; } if (MPI_IN_PLACE == sbuf) { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)rbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } else { ret = ompi_datatype_copy_content_same_ddt(dtype, count, inplacebuf, (char*)sbuf); if (ret < 0) { line = __LINE__; goto error_hndl; } } tmpsend = (char*) inplacebuf; tmprecv = (char*) rbuf; /* Determine nearest power of two less than or equal to size */ adjsize = opal_next_poweroftwo (size); adjsize >>= 1; /* Handle non-power-of-two case: - Even ranks less than 2 * extra_ranks send their data to (rank + 1), and sets new rank to -1. 
- Odd ranks less than 2 * extra_ranks receive data from (rank - 1), apply the appropriate operation, and set new rank to rank/2 - Everyone else sets new rank to rank - extra_ranks */ extra_ranks = size - adjsize; if (rank < (2 * extra_ranks)) { if (0 == (rank % 2)) { ret = MCA_PML_CALL(send(tmpsend, count, dtype, (rank + 1), MCA_COLL_BASE_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } newrank = -1; } else { ret = MCA_PML_CALL(recv(tmprecv, count, dtype, (rank - 1), MCA_COLL_BASE_TAG_ALLREDUCE, comm, MPI_STATUS_IGNORE)); if (MPI_SUCCESS != ret) { line = __LINE__; goto error_hndl; } /* tmpsend = tmprecv (op) tmpsend */ ompi_op_reduce(op, tmprecv, tmpsend, count, dtype); newrank = rank >> 1; } } else {
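/*
 * Hypothetical sketch of the non-power-of-two adjustment described above,
 * reproducing the 7-rank example from the function's comment block: adjsize
 * is the largest power of two <= size, the first 2 * extra_ranks ranks pair
 * up (even ranks drop out with newrank -1), and the survivors are compacted
 * into new ranks 0..adjsize-1.
 */
#include <stdio.h>

int main(void)
{
    int size = 7;   /* the example from the comment above */

    int adjsize = 1;
    while (adjsize * 2 <= size)          /* largest power of two <= size */
        adjsize *= 2;
    int extra_ranks = size - adjsize;

    for (int rank = 0; rank < size; ++rank) {
        int newrank;
        if (rank < 2 * extra_ranks)
            newrank = (rank % 2 == 0) ? -1 : rank >> 1;   /* even: idle */
        else
            newrank = rank - extra_ranks;
        printf("rank %d -> newrank %d\n", rank, newrank);
    }
    /* prints -1, 0, -1, 1, -1, 2, 3: old ranks 1, 3, 5, 6 become 0..3 */
    return 0;
}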
/** * All-reduce - subgroup in communicator */ OMPI_DECLSPEC int comm_allgather_pml(void *src_buf, void *dest_buf, int count, ompi_datatype_t *dtype, int my_rank_in_group, int n_peers, int *ranks_in_comm,ompi_communicator_t *comm) { /* local variables */ int rc=OMPI_SUCCESS,msg_cnt; int pair_rank,exchange,extra_rank, n_extra_nodes,n_extra; int proc_block,extra_start,extra_end,iovec_len; int remote_data_start_rank,remote_data_end_rank; int local_data_start_rank; netpatterns_pair_exchange_node_t my_exchange_node; size_t message_extent,current_data_extent,current_data_count; size_t dt_size; OPAL_PTRDIFF_TYPE dt_extent; char *src_buf_current; char *dest_buf_current; struct iovec send_iov[2] = {{0,0},{0,0}}, recv_iov[2] = {{0,0},{0,0}}; ompi_request_t *requests[4]; /* get size of data needed - same layout as user data, so that * we can apply the reudction routines directly on these buffers */ rc = ompi_datatype_type_size(dtype, &dt_size); if( OMPI_SUCCESS != rc ) { goto Error; } rc = ompi_datatype_type_extent(dtype, &dt_extent); if( OMPI_SUCCESS != rc ) { goto Error; } message_extent = dt_extent*count; /* place my data in the correct destination buffer */ rc=ompi_datatype_copy_content_same_ddt(dtype,count, (char *)dest_buf+my_rank_in_group*message_extent, (char *)src_buf); if( OMPI_SUCCESS != rc ) { goto Error; } /* 1 process special case */ if(1 == n_peers) { return OMPI_SUCCESS; } /* get my reduction communication pattern */ memset(&my_exchange_node, 0, sizeof(netpatterns_pair_exchange_node_t)); rc = netpatterns_setup_recursive_doubling_tree_node(n_peers, my_rank_in_group, &my_exchange_node); if(OMPI_SUCCESS != rc){ return rc; } n_extra_nodes=n_peers-my_exchange_node.n_largest_pow_2; /* get the data from the extra sources */ if(0 < my_exchange_node.n_extra_sources) { if ( EXCHANGE_NODE == my_exchange_node.node_type ) { /* ** Receive data from extra node */ extra_rank=my_exchange_node.rank_extra_source; /* receive the data into the correct location - will use 2 * messages in the recursive doubling phase */ dest_buf_current=(char *)dest_buf+message_extent*extra_rank; rc=MCA_PML_CALL(recv(dest_buf_current, count,dtype,ranks_in_comm[extra_rank], -OMPI_COMMON_TAG_ALLREDUCE, comm, MPI_STATUSES_IGNORE)); if( 0 > rc ) { goto Error; } } else { /* ** Send data to "partner" node */ extra_rank=my_exchange_node.rank_extra_source; src_buf_current=(char *)src_buf; rc=MCA_PML_CALL(send(src_buf_current, count,dtype,ranks_in_comm[extra_rank], -OMPI_COMMON_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if( 0 > rc ) { goto Error; } } } current_data_extent=message_extent; current_data_count=count; src_buf_current=(char *)dest_buf+my_rank_in_group*message_extent; proc_block=1; local_data_start_rank=my_rank_in_group; /* loop over data exchanges */ for(exchange=0 ; exchange < my_exchange_node.n_exchanges ; exchange++) { /* is the remote data read */ pair_rank=my_exchange_node.rank_exchanges[exchange]; msg_cnt=0; /* * Power of 2 data segment */ /* post non-blocking receive */ if(pair_rank > my_rank_in_group ){ recv_iov[0].iov_base=src_buf_current+current_data_extent; recv_iov[0].iov_len=current_data_extent; iovec_len=1; remote_data_start_rank=local_data_start_rank+proc_block; remote_data_end_rank=remote_data_start_rank+proc_block-1; } else { recv_iov[0].iov_base=src_buf_current-current_data_extent; recv_iov[0].iov_len=current_data_extent; iovec_len=1; remote_data_start_rank=local_data_start_rank-proc_block; remote_data_end_rank=remote_data_start_rank+proc_block-1; } /* the data from the non power of 2 ranks */ 
if(remote_data_start_rank<n_extra_nodes) { /* figure out how much data is at the remote rank */ /* last rank with data */ extra_start=remote_data_start_rank; extra_end=remote_data_end_rank; if(extra_end >= n_extra_nodes ) { /* if last rank exceeds the ranks with extra data, * adjust this. */ extra_end=n_extra_nodes-1; } /* get the number of ranks whos data is to be grabbed */ n_extra=extra_end-extra_start+1; recv_iov[1].iov_base=(char *)dest_buf+ (extra_start+my_exchange_node.n_largest_pow_2)*message_extent; recv_iov[1].iov_len=n_extra*count; iovec_len=2; } rc=MCA_PML_CALL(irecv(recv_iov[0].iov_base, current_data_count,dtype,ranks_in_comm[pair_rank], -OMPI_COMMON_TAG_ALLREDUCE, comm,&(requests[msg_cnt]))); if( 0 > rc ) { goto Error; } msg_cnt++; if(iovec_len > 1 ) { rc=MCA_PML_CALL(irecv(recv_iov[1].iov_base, recv_iov[1].iov_len,dtype,ranks_in_comm[pair_rank], -OMPI_COMMON_TAG_ALLREDUCE, comm,&(requests[msg_cnt]))); if( 0 > rc ) { goto Error; } msg_cnt++; } /* post non-blocking send */ send_iov[0].iov_base=src_buf_current; send_iov[0].iov_len=current_data_extent; iovec_len=1; /* the data from the non power of 2 ranks */ if(local_data_start_rank<n_extra_nodes) { /* figure out how much data is at the remote rank */ /* last rank with data */ extra_start=local_data_start_rank; extra_end=extra_start+proc_block-1; if(extra_end >= n_extra_nodes ) { /* if last rank exceeds the ranks with extra data, * adjust this. */ extra_end=n_extra_nodes-1; } /* get the number of ranks whos data is to be grabbed */ n_extra=extra_end-extra_start+1; send_iov[1].iov_base=(char *)dest_buf+ (extra_start+my_exchange_node.n_largest_pow_2)*message_extent; send_iov[1].iov_len=n_extra*count; iovec_len=2; } rc=MCA_PML_CALL(isend(send_iov[0].iov_base, current_data_count,dtype,ranks_in_comm[pair_rank], -OMPI_COMMON_TAG_ALLREDUCE,MCA_PML_BASE_SEND_STANDARD, comm,&(requests[msg_cnt]))); if( 0 > rc ) { goto Error; } msg_cnt++; if( iovec_len > 1 ) { rc=MCA_PML_CALL(isend(send_iov[1].iov_base, send_iov[1].iov_len,dtype,ranks_in_comm[pair_rank], -OMPI_COMMON_TAG_ALLREDUCE,MCA_PML_BASE_SEND_STANDARD, comm,&(requests[msg_cnt]))); if( 0 > rc ) { goto Error; } msg_cnt++; } /* prepare the source buffer for the next iteration */ if(pair_rank < my_rank_in_group ){ src_buf_current-=current_data_extent; local_data_start_rank-=proc_block; } proc_block*=2; current_data_extent*=2; current_data_count*=2; /* wait on send and receive completion */ ompi_request_wait_all(msg_cnt,requests,MPI_STATUSES_IGNORE); } /* copy data in from the "extra" source, if need be */ if(0 < my_exchange_node.n_extra_sources) { if ( EXTRA_NODE == my_exchange_node.node_type ) { /* ** receive the data ** */ extra_rank=my_exchange_node.rank_extra_source; rc=MCA_PML_CALL(recv(dest_buf, count*n_peers,dtype,ranks_in_comm[extra_rank], -OMPI_COMMON_TAG_ALLREDUCE, comm,MPI_STATUSES_IGNORE)); if(0 > rc ) { goto Error; } } else { /* send the data to the pair-rank outside of the power of 2 set ** of ranks */ extra_rank=my_exchange_node.rank_extra_source; rc=MCA_PML_CALL(send(dest_buf, count*n_peers,dtype,ranks_in_comm[extra_rank], -OMPI_COMMON_TAG_ALLREDUCE, MCA_PML_BASE_SEND_STANDARD, comm)); if( 0 > rc ) { goto Error; } } } netpatterns_cleanup_recursive_doubling_tree_node(&my_exchange_node); /* return */ return OMPI_SUCCESS; Error: return rc; }
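/*
 * Hypothetical sketch of the power-of-two core of comm_allgather_pml above:
 * for a power-of-two group the netpatterns exchange schedule reduces to
 * recursive doubling, where at step k the partner is rank XOR 2^k and the
 * contiguous block of ranks whose data a process holds doubles.  The real
 * code also ships the non-power-of-two "extra" ranks' data as a second
 * iovec; rank and n_peers below are assumed example values.
 */
#include <stdio.h>

int main(void)
{
    int n_peers = 8, rank = 5;            /* assumed example values */
    int block_start = rank, block_len = 1;

    for (int dist = 1; dist < n_peers; dist <<= 1) {
        int partner = rank ^ dist;
        printf("exchange with %d: currently hold ranks [%d..%d]\n",
               partner, block_start, block_start + block_len - 1);
        if (partner < rank)               /* received block sits below ours */
            block_start -= block_len;
        block_len *= 2;                   /* held data doubles every step   */
    }
    printf("done: hold ranks [%d..%d]\n",
           block_start, block_start + block_len - 1);
    return 0;
}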