/**
 * Request-based accumulate.
 *
 * Looks up the synchronization object and peer for target_rank, allocates a
 * request and hands the operation to ompi_osc_rdma_rget_accumulate_internal()
 * with no result buffer (NULL, 0, NULL).
 *
 * @param origin_addr      origin buffer
 * @param origin_count     number of origin_datatype elements
 * @param origin_datatype  datatype of the origin buffer
 * @param target_rank      rank of the target
 * @param target_disp      displacement at the target
 * @param target_count     number of target_datatype elements
 * @param target_datatype  datatype at the target
 * @param op               accumulate operation
 * @param win              window the operation is issued on
 * @param request          set to the new request on success only
 *
 * @return OMPI_SUCCESS on success, OMPI_ERR_RMA_SYNC when no synchronization
 *         object is found for target_rank, otherwise the error reported by
 *         the internal accumulate call.
 */
int ompi_osc_rdma_raccumulate (const void *origin_addr, int origin_count, struct ompi_datatype_t *origin_datatype,
                               int target_rank, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                               struct ompi_datatype_t *target_datatype, struct ompi_op_t *op,
                               struct ompi_win_t *win, struct ompi_request_t **request)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_osc_rdma_request_t *acc_req;
    ompi_osc_rdma_sync_t *epoch;
    ompi_osc_rdma_peer_t *peer;
    int rc;

    /* without a synchronization object for the target there is nothing to do */
    epoch = ompi_osc_rdma_module_sync_lookup (module, target_rank, &peer);
    if (OPAL_UNLIKELY(NULL == epoch)) {
        return OMPI_ERR_RMA_SYNC;
    }

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "racc: 0x%lx, %d, %s, %d, 0x%lx, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name,
                         target_rank, (unsigned long) target_disp, target_count,
                         target_datatype->name, op->o_name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, acc_req);

    /* plain accumulate: no result buffer, count or datatype */
    rc = ompi_osc_rdma_rget_accumulate_internal (epoch, origin_addr, origin_count, origin_datatype,
                                                 NULL, 0, NULL, peer, target_rank, target_disp,
                                                 target_count, target_datatype, op, acc_req);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        /* the request never reaches the caller; give it back */
        OMPI_OSC_RDMA_REQUEST_RETURN(acc_req);
        return rc;
    }

    *request = &acc_req->super;

    return OMPI_SUCCESS;
}
/**
 * Request-based accumulate (window-based request allocation variant).
 *
 * Allocates a request, completes it immediately when either count is zero,
 * and otherwise forwards the operation to ompi_osc_rdma_accumulate_w_req().
 *
 * @param origin_addr   origin buffer
 * @param origin_count  number of origin_dt elements
 * @param origin_dt     datatype of the origin buffer
 * @param target        rank of the target
 * @param target_disp   displacement at the target
 * @param target_count  number of target_dt elements
 * @param target_dt     datatype at the target
 * @param op            accumulate operation
 * @param win           window the operation is issued on
 * @param request       set to the new request on success only
 *
 * @return OMPI_SUCCESS, OMPI_ERR_OUT_OF_RESOURCE when no request could be
 *         allocated, or the error returned by ompi_osc_rdma_accumulate_w_req().
 */
int ompi_osc_rdma_raccumulate(void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                              int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                              struct ompi_datatype_t *target_dt, struct ompi_op_t *op,
                              struct ompi_win_t *win, struct ompi_request_t **request)
{
    ompi_osc_rdma_request_t *acc_req;
    int rc;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "raccumulate: 0x%lx, %d, %s, %d, %d, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_dt->name,
                         target, (int) target_disp, target_count, target_dt->name,
                         op->o_name, win->w_name));

    OMPI_OSC_RDMA_REQUEST_ALLOC(win, acc_req);
    if (NULL == acc_req) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (0 != origin_count && 0 != target_count) {
        /* there is data to move: issue the accumulate on behalf of the request */
        acc_req->type = OMPI_OSC_RDMA_HDR_TYPE_ACC;

        rc = ompi_osc_rdma_accumulate_w_req (origin_addr, origin_count, origin_dt, target,
                                             target_disp, target_count, target_dt, op, win,
                                             acc_req);
        if (OMPI_SUCCESS != rc) {
            OMPI_OSC_RDMA_REQUEST_RETURN(acc_req);
            return rc;
        }
    } else {
        /* short-circuit: an empty transfer is complete as soon as it is requested */
        ompi_osc_rdma_request_complete (acc_req, MPI_SUCCESS);
    }

    *request = (ompi_request_t *) acc_req;

    return OMPI_SUCCESS;
}
/**
 * Start a compare-and-swap implemented with a BTL get.
 *
 * Waits until no accumulate is in flight to the peer, marks the peer as
 * accumulating, allocates a fragment large enough for the aligned target
 * region, takes the accumulate lock when the peer is not held exclusively and
 * then issues a btl_get whose completion callback is
 * ompi_osc_rdma_cas_get_complete().
 *
 * @param sync            synchronization object the operation belongs to
 * @param source_buffer   value to store at the target (saved in the request)
 * @param compare_buffer  value to compare against (saved in the request)
 * @param result_buffer   where the fetched value is delivered
 * @param datatype        datatype of the single element being swapped
 * @param peer            target peer
 * @param target_address  address of the element at the target
 * @param target_handle   registration handle covering target_address
 *
 * @return OMPI_SUCCESS once the get has been started,
 *         OMPI_ERR_OUT_OF_RESOURCE when no fragment is available, or the
 *         error returned by btl_get.
 */
static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer,
                            void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
                            uint64_t target_address, mca_btl_base_registration_handle_t *target_handle)
{
    ompi_osc_rdma_module_t *module = sync->module;
    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
    unsigned long offset, aligned_len, len = datatype->super.size;
    ompi_osc_rdma_frag_t *frag = NULL;
    ompi_osc_rdma_request_t *request;
    char *ptr = NULL;
    int ret;

    OPAL_THREAD_LOCK(&module->lock);
    /* to ensure order wait until the previous accumulate completes */
    while (ompi_osc_rdma_peer_is_accumulating (peer)) {
        OPAL_THREAD_UNLOCK(&module->lock);
        ompi_osc_rdma_progress (module);
        OPAL_THREAD_LOCK(&module->lock);
    }

    peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
    OPAL_THREAD_UNLOCK(&module->lock);

    /* the get must start on an aligned address and cover an aligned length */
    offset = target_address & btl_alignment_mask;
    aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;

    ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;

        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                             "Could not allocate an rdma fragment for get accumulate. Falling back on point-to-point"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* the request is allocated only once the fragment is available so that the
     * early return above has nothing to release (it used to leak the request) */
    OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);

    request->internal = true;
    request->type = OMPI_OSC_RDMA_TYPE_CSWAP;
    request->sync = sync;

    if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
        (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
    }

    /* set up the request */
    request->frag = frag;
    request->origin_addr = (void *) source_buffer;
    request->ctx = (void *) target_handle;
    request->result_addr = result_buffer;
    request->compare_addr = compare_buffer;
    request->result_dt = datatype;
    request->offset = (ptrdiff_t) offset;
    request->target_address = target_address;
    request->len = len;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get..."));

    ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, target_address,
                                         frag->handle, target_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
                                         ompi_osc_rdma_cas_get_complete, request, NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* NOTE(review): only the fragment is released here. The accumulate lock
         * taken above, the ACCUMULATING peer flag and the request are left as
         * they are; undoing them needs helpers that are not visible in this
         * function -- confirm and complete the unwinding. */
        ompi_osc_rdma_frag_complete (frag);
        return ret;
    }

    ompi_osc_rdma_sync_rdma_inc (sync);

    return OMPI_SUCCESS;
}
static inline int ompi_osc_rdma_gacc_master (ompi_osc_rdma_sync_t *sync, const void *source_buffer, int source_count, ompi_datatype_t *source_datatype, void *result_buffer, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; struct iovec source_iovec[OMPI_OSC_RDMA_DECODE_MAX], target_iovec[OMPI_OSC_RDMA_DECODE_MAX]; const size_t acc_limit = (mca_osc_rdma_component.buffer_size >> 3); uint32_t source_primitive_count, target_primitive_count; opal_convertor_t source_convertor, target_convertor; uint32_t source_iov_count, target_iov_count; uint32_t source_iov_index, target_iov_index; ompi_datatype_t *source_primitive, *target_primitive; /* needed for opal_convertor_raw but not used */ size_t source_size, target_size; ompi_osc_rdma_request_t *subreq; size_t result_position; ptrdiff_t lb, extent; int ret, acc_len; bool done; (void) ompi_datatype_get_extent (target_datatype, &lb, &extent); target_address += lb; /* fast path for accumulate on built-in types */ if (OPAL_LIKELY((!source_count || ompi_datatype_is_predefined (source_datatype)) && ompi_datatype_is_predefined (target_datatype) && (!result_count || ompi_datatype_is_predefined (result_datatype)) && (target_datatype->super.size * target_count <= acc_limit))) { if (NULL == request) { OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request); request->internal = true; request->type = result_datatype ? 
OMPI_OSC_RDMA_TYPE_GET_ACC : OMPI_OSC_RDMA_TYPE_ACC; } if (source_datatype) { (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); source_buffer = (void *)((intptr_t) source_buffer + lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); result_buffer = (void *)((intptr_t) result_buffer + lb); } ret = ompi_osc_rdma_gacc_contig (sync, source_buffer, source_count, source_datatype, result_buffer, result_count, result_datatype, peer, target_address, target_handle, target_count, target_datatype, op, request); if (OPAL_LIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } if (source_datatype) { /* the convertors will handle the lb */ (void) ompi_datatype_get_extent (source_datatype, &lb, &extent); source_buffer = (void *)((intptr_t) source_buffer - lb); } if (result_datatype) { (void) ompi_datatype_get_extent (result_datatype, &lb, &extent); result_buffer = (void *)((intptr_t) result_buffer - lb); } } /* the convertor will handle lb from here */ (void) ompi_datatype_get_extent (target_datatype, &lb, &extent); target_address -= lb; /* get the primitive datatype info */ ret = ompi_osc_base_get_primitive_type_info (target_datatype, &target_primitive, &target_primitive_count); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { /* target datatype is not made up of a single basic datatype */ return ret; } if (source_datatype) { ret = ompi_osc_base_get_primitive_type_info (source_datatype, &source_primitive, &source_primitive_count); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { /* target datatype is not made up of a single basic datatype */ return ret; } if (OPAL_UNLIKELY(source_primitive != target_primitive)) { return MPI_ERR_TYPE; } } /* prepare convertors for the source and target. these convertors will be used to determine the * contiguous segments within the source and target. 
*/ /* the source may be NULL if using MPI_OP_NO_OP with MPI_Get_accumulate */ if (source_datatype) { OBJ_CONSTRUCT(&source_convertor, opal_convertor_t); ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &source_datatype->super, source_count, source_buffer, 0, &source_convertor); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } } /* target_datatype can never be NULL */ OBJ_CONSTRUCT(&target_convertor, opal_convertor_t); ret = opal_convertor_copy_and_prepare_for_send (ompi_mpi_local_convertor, &target_datatype->super, target_count, (void *) (intptr_t) target_address, 0, &target_convertor); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { return ret; } if (request) { /* keep the request from completing until all the transfers have started */ request->outstanding_requests = 1; } target_iov_index = 0; target_iov_count = 0; result_position = 0; do { /* decode segments of the source data */ source_iov_count = OMPI_OSC_RDMA_DECODE_MAX; source_iov_index = 0; /* opal_convertor_raw returns done when it has reached the end of the data */ if (!source_datatype) { done = true; source_iovec[0].iov_len = (size_t) -1; source_iovec[0].iov_base = NULL; source_iov_count = 1; } else { done = opal_convertor_raw (&source_convertor, source_iovec, &source_iov_count, &source_size); } /* loop on the target segments until we have exhaused the decoded source data */ while (source_iov_index != source_iov_count) { if (target_iov_index == target_iov_count) { /* decode segments of the target buffer */ target_iov_count = OMPI_OSC_RDMA_DECODE_MAX; target_iov_index = 0; (void) opal_convertor_raw (&target_convertor, target_iovec, &target_iov_count, &target_size); } /* we already checked that the target was large enough. 
this should be impossible */ assert (0 != target_iov_count); /* determine how much to put in this operation */ acc_len = min(target_iovec[target_iov_index].iov_len, source_iovec[source_iov_index].iov_len); acc_len = min((size_t) acc_len, acc_limit); /* execute the get */ OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, subreq); subreq->internal = true; subreq->parent_request = request; if (request) { (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, 1); } if (result_datatype) { /* prepare a convertor for this part of the result */ opal_convertor_copy_and_prepare_for_recv (ompi_mpi_local_convertor, &result_datatype->super, result_count, result_buffer, 0, &subreq->convertor); opal_convertor_set_position (&subreq->convertor, &result_position); subreq->type = OMPI_OSC_RDMA_TYPE_GET_ACC; } else { subreq->type = OMPI_OSC_RDMA_TYPE_ACC; } OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "target index = %d, target = {%p, %lu}, source_index = %d, source = {%p, %lu}, result = %p, result position = %lu, " "acc_len = %d, count = %lu", target_iov_index, target_iovec[target_iov_index].iov_base, (unsigned long) target_iovec[target_iov_index].iov_len, source_iov_index, source_iovec[source_iov_index].iov_base, (unsigned long) source_iovec[source_iov_index].iov_len, result_buffer, (unsigned long) result_position, acc_len, (unsigned long)(acc_len / target_primitive->super.size))); ret = ompi_osc_rdma_gacc_contig (sync, source_iovec[source_iov_index].iov_base, acc_len / target_primitive->super.size, target_primitive, NULL, 0, NULL, peer, (uint64_t) (intptr_t) target_iovec[target_iov_index].iov_base, target_handle, acc_len / target_primitive->super.size, target_primitive, op, subreq); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { if (OPAL_UNLIKELY(OMPI_ERR_OUT_OF_RESOURCE != ret)) { /* something bad happened. 
need to figure out how to handle these errors */ return ret; } /* progress and try again */ ompi_osc_rdma_progress (module); continue; } /* adjust io vectors */ target_iovec[target_iov_index].iov_len -= acc_len; source_iovec[source_iov_index].iov_len -= acc_len; target_iovec[target_iov_index].iov_base = (void *)((intptr_t) target_iovec[target_iov_index].iov_base + acc_len); source_iovec[source_iov_index].iov_base = (void *)((intptr_t) source_iovec[source_iov_index].iov_base + acc_len); result_position += acc_len; source_iov_index += !source_datatype || (0 == source_iovec[source_iov_index].iov_len); target_iov_index += (0 == target_iovec[target_iov_index].iov_len); } } while (!done); if (request) { /* release our reference so the request can complete */ (void) OPAL_THREAD_ADD32 (&request->outstanding_requests, -1); } if (source_datatype) { opal_convertor_cleanup (&source_convertor); OBJ_DESTRUCT(&source_convertor); } opal_convertor_cleanup (&target_convertor); OBJ_DESTRUCT(&target_convertor); return OMPI_SUCCESS; }
/**
 * Common implementation of get / rget.
 *
 * Builds a get header (followed by the packed target datatype description, or
 * by its length when the description is sent as a separate message) in a
 * fragment for the target and posts the receive that lands the data in the
 * origin buffer.
 *
 * @param origin_addr   buffer that receives the data
 * @param origin_count  number of origin_dt elements
 * @param origin_dt     datatype of the origin buffer
 * @param target        rank of the target
 * @param target_disp   displacement at the target
 * @param target_count  number of target_dt elements
 * @param target_dt     datatype at the target
 * @param win           window the operation is issued on
 * @param release_req   stored in the request's `internal` field; when false
 *                      the fragment is flushed before returning
 * @param request       set to the new request when the operation was started
 *
 * @return OMPI_SUCCESS, OMPI_ERR_RMA_SYNC outside of an access epoch,
 *         OMPI_ERR_OUT_OF_RESOURCE when no request or fragment is available,
 *         or the first error hit while setting up the operation.
 */
static inline int ompi_osc_rdma_rget_internal (void *origin_addr, int origin_count, struct ompi_datatype_t *origin_dt,
                                               int target, OPAL_PTRDIFF_TYPE target_disp, int target_count,
                                               struct ompi_datatype_t *target_dt, struct ompi_win_t *win,
                                               bool release_req, struct ompi_request_t **request)
{
    int ret, finish_ret, tag;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    bool is_long_datatype = false;
    ompi_osc_rdma_frag_t *frag;
    ompi_osc_rdma_header_get_t *header;
    size_t ddt_len, frag_len;
    char *ptr;
    const void *packed_ddt;
    ompi_osc_rdma_request_t *rdma_request;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "get: 0x%lx, %d, %s, %d, %d, %d, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_dt->name, target,
                         (int) target_disp, target_count, target_dt->name, win->w_name));

    if (!ompi_osc_rdma_check_access_epoch (module, target)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* gets are always request based, so that we know where to land the data */
    OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
    if (NULL == rdma_request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rdma_request->internal = release_req;

    /* short-circuit case */
    if (0 == origin_count || 0 == target_count) {
        ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
        *request = &rdma_request->super;
        return OMPI_SUCCESS;
    }

    /* optimize self communication. TODO: optimize local communication */
    if (ompi_comm_rank (module->comm) == target) {
        *request = &rdma_request->super;
        return ompi_osc_rdma_get_self (origin_addr, origin_count, origin_dt, target_disp,
                                       target_count, target_dt, module, rdma_request);
    }

    rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET;
    rdma_request->origin_addr = origin_addr;
    rdma_request->origin_count = origin_count;

    /* Compute datatype length. Note that the datatype description
     * must fit in a single frag */
    ddt_len = ompi_datatype_pack_description_length(target_dt);

    OPAL_THREAD_LOCK(&module->lock);

    frag_len = sizeof(ompi_osc_rdma_header_get_t) + ddt_len;
    ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
    if (OMPI_SUCCESS != ret) {
        /* allocate space for the header plus space to store ddt_len. the header
         * written below is a get header, so size the fragment for that type */
        frag_len = sizeof(ompi_osc_rdma_header_get_t) + 8;
        ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            /* NOTE(review): rdma_request is not handed back on this path; the
             * release helper is not referenced by this function -- confirm. */
            OPAL_THREAD_UNLOCK(&module->lock);
            return OMPI_ERR_OUT_OF_RESOURCE;
        }

        is_long_datatype = true;
    }

    /* take the datatype reference only once the operation can no longer fail on
     * fragment allocation, so the early return above does not leak it */
    OBJ_RETAIN(origin_dt);
    rdma_request->origin_dt = origin_dt;

    tag = get_tag (module);

    /* for bookkeeping the get is "outgoing" */
    ompi_osc_signal_outgoing (module, target, 1);

    /* flush will be called at the end of this function. make sure the post message has
     * arrived. */
    if (!release_req && module->sc_group) {
        while (0 != module->num_post_msgs) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    header = (ompi_osc_rdma_header_get_t*) ptr;
    header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET;
    header->base.flags = 0;
    header->len = frag_len;
    header->count = target_count;
    header->displacement = target_disp;
    header->tag = tag;
    ptr += sizeof(ompi_osc_rdma_header_get_t);

    do {
        ret = ompi_datatype_get_pack_description(target_dt, &packed_ddt);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (is_long_datatype) {
            /* the datatype does not fit in an eager message. send it separately */
            header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE;

            OBJ_RETAIN(target_dt);

            ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target,
                                            tag, module->comm, ompi_osc_rdma_dt_send_complete,
                                            target_dt);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }

            /* only the description length travels in the fragment */
            *((uint64_t *) ptr) = ddt_len;
            ptr += 8;
        } else {
            memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
            ptr += ddt_len;
        }

        /* TODO -- store the request somewhere so we can cancel it on error */
        rdma_request->outstanding_requests = 1;
        ret = ompi_osc_rdma_irecv_w_cb (origin_addr, origin_count, origin_dt, target, tag,
                                        module->comm, NULL, ompi_osc_rdma_req_comm_complete,
                                        rdma_request);
    } while (0);

    if (OMPI_SUCCESS == ret) {
        /* the header is marked valid only when everything above succeeded */
        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
        *request = &rdma_request->super;
    }

    OPAL_THREAD_LOCK(&module->lock);
    finish_ret = ompi_osc_rdma_frag_finish(module, frag);
    if (OMPI_SUCCESS == ret) {
        /* report the frag_finish result only when nothing failed earlier. it used
         * to overwrite a setup error, so a failed get could return success with
         * *request never written */
        ret = finish_ret;
    }

    if (!release_req) {
        /* need to flush now in case the caller decides to wait on the request */
        ompi_osc_rdma_frag_flush_target (module, target);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}
/**
 * Compare-and-swap on a single element of the target window.
 *
 * For a remote target the function packs a cswap header, the packed datatype
 * description and the origin and compare values into one fragment and posts
 * the receive that delivers the previous target value into result_addr. The
 * local rank is handled by ompi_osc_rdma_cas_self().
 *
 * @param origin_addr   value to store at the target
 * @param compare_addr  value the target is compared against
 * @param result_addr   receives the value found at the target
 * @param dt            datatype of the single element
 * @param target        rank of the target
 * @param target_disp   displacement at the target
 * @param win           window the operation is issued on
 *
 * @return OMPI_SUCCESS, OMPI_ERR_RMA_SYNC outside of an access epoch,
 *         OMPI_ERR_OUT_OF_RESOURCE when no request or fragment is available,
 *         or the first error hit while setting up the operation.
 */
int ompi_osc_rdma_compare_and_swap (void *origin_addr, void *compare_addr, void *result_addr,
                                    struct ompi_datatype_t *dt, int target, OPAL_PTRDIFF_TYPE target_disp,
                                    struct ompi_win_t *win)
{
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target);
    ompi_osc_rdma_frag_t *frag;
    ompi_osc_rdma_header_cswap_t *header;
    size_t ddt_len, payload_len, frag_len;
    ompi_osc_rdma_request_t *request;
    const void *packed_ddt;
    int ret, finish_ret, tag;
    char *ptr;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "cswap: 0x%lx, 0x%lx, 0x%lx, %s, %d, %d, %s",
                         (unsigned long) origin_addr, (unsigned long) compare_addr,
                         (unsigned long) result_addr, dt->name, target, (int) target_disp,
                         win->w_name));

    if (!ompi_osc_rdma_check_access_epoch (module, target)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* optimize self case. TODO: optimize local case */
    if (ompi_comm_rank (module->comm) == target) {
        return ompi_osc_rdma_cas_self (origin_addr, compare_addr, result_addr, dt, target_disp,
                                       module);
    }

    /* compare-and-swaps are always request based, so that we know where to land the data */
    OMPI_OSC_RDMA_REQUEST_ALLOC(win, request);
    if (NULL == request) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    request->type = OMPI_OSC_RDMA_HDR_TYPE_CSWAP;
    request->origin_addr = origin_addr;
    request->internal = true;

    /* Compute datatype and payload lengths. Note that the datatype description
     * must fit in a single frag. It should be small in this case. */
    ddt_len = ompi_datatype_pack_description_length(dt);

    /* we need to send both the origin and compare buffers */
    payload_len = dt->super.size * 2;

    OPAL_THREAD_LOCK(&module->lock);

    frag_len = sizeof(ompi_osc_rdma_header_cswap_t) + ddt_len + payload_len;
    ret = ompi_osc_rdma_frag_alloc(module, target, frag_len, &frag, &ptr);
    if (OMPI_SUCCESS != ret) {
        /* NOTE(review): the request is not handed back on this path; the release
         * helper is not referenced by this function -- confirm. */
        OPAL_THREAD_UNLOCK(&module->lock);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* take the datatype reference only after the fragment exists so the early
     * return above does not leak it */
    OBJ_RETAIN(dt);
    request->origin_dt = dt;

    tag = get_tag (module);
    ompi_osc_signal_outgoing (module, target, 1);

    header = (ompi_osc_rdma_header_cswap_t *) ptr;
    header->base.type = OMPI_OSC_RDMA_HDR_TYPE_CSWAP;
    /* the VALID flag is added below, once the fragment is completely set up */
    header->base.flags = 0;
    header->len = frag_len;
    header->displacement = target_disp;
    header->tag = tag;
    ptr += sizeof(ompi_osc_rdma_header_cswap_t);

    do {
        /* packed_ddt is only meaningful when this call succeeds; its result used
         * to be ignored before the memcpy below */
        ret = ompi_datatype_get_pack_description(dt, &packed_ddt);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
        ptr += ddt_len;

        /* pack the origin and compare data */
        osc_rdma_copy_for_send (ptr, dt->super.size, origin_addr, proc, 1, dt);
        ptr += dt->super.size;
        osc_rdma_copy_for_send (ptr, dt->super.size, compare_addr, proc, 1, dt);

        request->outstanding_requests = 1;
        ret = ompi_osc_rdma_irecv_w_cb (result_addr, 1, dt, target, tag, module->comm,
                                        NULL, ompi_osc_rdma_req_comm_complete, request);
    } while (0);

    if (OMPI_SUCCESS == ret) {
        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
    }

    /* Always finish the fragment, as the get and get-accumulate paths do: on
     * failure it goes out without the VALID flag instead of being abandoned
     * half-built (the previous code returned without finishing it). */
    finish_ret = ompi_osc_rdma_frag_finish(module, frag);
    OPAL_THREAD_UNLOCK(&module->lock);

    /* a setup error takes precedence over the frag_finish result */
    return (OMPI_SUCCESS != ret) ? ret : finish_ret;
}
/**
 * Common implementation of get-accumulate / rget-accumulate.
 *
 * Builds an accumulate header in a fragment for the target and posts the
 * receive for the result. Depending on how much fits in the fragment, the
 * origin data travels inside it (eager) or as a separate message (long
 * message), and the packed target datatype description travels inside it or
 * as a separate message (long datatype).
 *
 * @param origin_addr      origin buffer
 * @param origin_count     number of origin_datatype elements
 * @param origin_datatype  datatype of the origin buffer
 * @param result_addr      buffer that receives the previous target contents
 * @param result_count     number of result_datatype elements
 * @param result_datatype  datatype of the result buffer
 * @param target_rank      rank of the target
 * @param target_disp      displacement at the target
 * @param target_count     number of target_datatype elements
 * @param target_datatype  datatype at the target
 * @param op               accumulate operation
 * @param win              window the operation is issued on
 * @param release_req      stored in the request's `internal` field; when
 *                         false the fragment is flushed before returning
 * @param request          set to the new request when the operation started
 *
 * @return OMPI_SUCCESS, OMPI_ERR_RMA_SYNC outside of an access epoch,
 *         OMPI_ERR_OUT_OF_RESOURCE when no request or fragment is available,
 *         or the first error hit while setting up the operation.
 */
static inline int ompi_osc_rdma_rget_accumulate_internal (void *origin_addr, int origin_count,
                                                          struct ompi_datatype_t *origin_datatype,
                                                          void *result_addr, int result_count,
                                                          struct ompi_datatype_t *result_datatype,
                                                          int target_rank, MPI_Aint target_disp,
                                                          int target_count,
                                                          struct ompi_datatype_t *target_datatype,
                                                          struct ompi_op_t *op, struct ompi_win_t *win,
                                                          bool release_req, struct ompi_request_t **request)
{
    int ret, finish_ret;
    ompi_osc_rdma_module_t *module = GET_MODULE(win);
    ompi_proc_t *proc = ompi_comm_peer_lookup(module->comm, target_rank);
    bool is_long_datatype = false;
    bool is_long_msg = false;
    ompi_osc_rdma_frag_t *frag;
    ompi_osc_rdma_header_acc_t *header;
    size_t ddt_len, payload_len, frag_len;
    char *ptr;
    const void *packed_ddt;
    int tag;
    ompi_osc_rdma_request_t *rdma_request;

    OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                         "rget_acc: 0x%lx, %d, %s, 0x%lx, %d, %s, 0x%x, %d, %d, %s, %s, %s",
                         (unsigned long) origin_addr, origin_count, origin_datatype->name,
                         (unsigned long) result_addr, result_count, result_datatype->name,
                         target_rank, (int) target_disp, target_count, target_datatype->name,
                         op->o_name, win->w_name));

    if (!ompi_osc_rdma_check_access_epoch (module, target_rank)) {
        return OMPI_ERR_RMA_SYNC;
    }

    /* get_accumulates are always request based, so that we know where to land the data */
    OMPI_OSC_RDMA_REQUEST_ALLOC(win, rdma_request);
    if (OPAL_UNLIKELY(NULL == rdma_request)) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    rdma_request->internal = release_req;

    /* short-circuit case. note that origin_count may be 0 if op is MPI_NO_OP */
    if (0 == result_count || 0 == target_count) {
        ompi_osc_rdma_request_complete (rdma_request, MPI_SUCCESS);
        *request = &rdma_request->super;
        return OMPI_SUCCESS;
    }

    /* optimize the self case. TODO: optimize the local case */
    if (ompi_comm_rank (module->comm) == target_rank) {
        *request = &rdma_request->super;
        return ompi_osc_rdma_gacc_self (origin_addr, origin_count, origin_datatype,
                                        result_addr, result_count, result_datatype,
                                        target_disp, target_count, target_datatype,
                                        op, module, rdma_request);
    }

    rdma_request->type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC;
    rdma_request->origin_addr = origin_addr;
    rdma_request->origin_count = origin_count;

    /* Compute datatype and payload lengths. Note that the datatype description
     * must fit in a single frag */
    ddt_len = ompi_datatype_pack_description_length(target_datatype);

    if (&ompi_mpi_op_no_op.op != op) {
        payload_len = origin_datatype->super.size * origin_count;
    } else {
        /* MPI_NO_OP sends no origin data */
        payload_len = 0;
    }

    OPAL_THREAD_LOCK(&module->lock);

    /* try progressively smaller fragments: header + datatype + payload, then
     * header + datatype, then header + datatype length only */
    frag_len = sizeof(*header) + ddt_len + payload_len;
    ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr);
    if (OMPI_SUCCESS != ret) {
        frag_len = sizeof(*header) + ddt_len;
        ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr);
        if (OMPI_SUCCESS != ret) {
            /* allocate space for the header plus space to store ddt_len */
            frag_len = sizeof(*header) + 8;
            ret = ompi_osc_rdma_frag_alloc(module, target_rank, frag_len, &frag, &ptr);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                /* NOTE(review): rdma_request is not handed back on this path; the
                 * release helper is not referenced by this function -- confirm. */
                OPAL_THREAD_UNLOCK(&module->lock);
                return OMPI_ERR_OUT_OF_RESOURCE;
            }

            is_long_datatype = true;
        }

        is_long_msg = true;
    }

    /* take the datatype reference only after a fragment has been obtained so
     * the early return above does not leak it */
    OBJ_RETAIN(origin_datatype);
    rdma_request->origin_dt = origin_datatype;

    tag = get_tag (module);

    /* If this is a long message then we need two completions before the
     * request is complete (1 for the send, 1 for the receive) */
    rdma_request->outstanding_requests = 1 + is_long_msg;

    /* increment the number of outgoing fragments */
    ompi_osc_signal_outgoing (module, target_rank, rdma_request->outstanding_requests);

    /* flush will be called at the end of this function. make sure the post message has
     * arrived. */
    if (!release_req && module->sc_group) {
        while (0 != module->num_post_msgs) {
            OPAL_OUTPUT_VERBOSE((50, ompi_osc_base_framework.framework_output,
                                 "waiting for post messages. num_post_msgs = %d", module->num_post_msgs));
            opal_condition_wait(&module->cond, &module->lock);
        }
    }

    OPAL_THREAD_UNLOCK(&module->lock);

    header = (ompi_osc_rdma_header_acc_t *) ptr;
    header->base.flags = 0;
    header->len = frag_len;
    header->count = target_count;
    header->displacement = target_disp;
    header->op = op->o_f_to_c_index;
    header->tag = tag;
    ptr = (char *)(header + 1);

    do {
        ret = ompi_datatype_get_pack_description(target_datatype, &packed_ddt);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (is_long_datatype) {
            /* the datatype does not fit in an eager message. send it separately */
            header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_LARGE_DATATYPE;

            OBJ_RETAIN(target_datatype);

            ret = ompi_osc_rdma_isend_w_cb ((void *) packed_ddt, ddt_len, MPI_BYTE, target_rank,
                                            tag, module->comm, ompi_osc_rdma_dt_send_complete,
                                            target_datatype);
            if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
                break;
            }

            /* only the description length travels in the fragment */
            *((uint64_t *) ptr) = ddt_len;
            ptr += 8;
        } else {
            memcpy((unsigned char*) ptr, packed_ddt, ddt_len);
            ptr += ddt_len;
        }

        ret = ompi_osc_rdma_irecv_w_cb (result_addr, result_count, result_datatype, target_rank, tag,
                                        module->comm, NULL, ompi_osc_rdma_req_comm_complete,
                                        rdma_request);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
            break;
        }

        if (!is_long_msg) {
            header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC;

            if (&ompi_mpi_op_no_op.op != op) {
                osc_rdma_copy_for_send (ptr, payload_len, origin_addr, proc, origin_count,
                                        origin_datatype);
            }
        } else {
            header->base.type = OMPI_OSC_RDMA_HDR_TYPE_GET_ACC_LONG;

            ret = ompi_osc_rdma_isend_w_cb (origin_addr, origin_count, origin_datatype, target_rank,
                                            tag, module->comm, ompi_osc_rdma_req_comm_complete,
                                            rdma_request);
        }
    } while (0);

    if (OMPI_SUCCESS == ret) {
        /* the header is marked valid only when everything above succeeded */
        header->base.flags |= OMPI_OSC_RDMA_HDR_FLAG_VALID;
        *request = (ompi_request_t *) rdma_request;
    }

    OPAL_THREAD_LOCK(&module->lock);
    finish_ret = ompi_osc_rdma_frag_finish(module, frag);
    if (OMPI_SUCCESS == ret) {
        /* report the frag_finish result only when nothing failed earlier. it used
         * to overwrite a setup error, so a failed operation could return success
         * with *request never written */
        ret = finish_ret;
    }

    if (!release_req) {
        /* need to flush now in case the caller decides to wait on the request */
        ompi_osc_rdma_frag_flush_target (module, target_rank);
    }
    OPAL_THREAD_UNLOCK(&module->lock);

    return ret;
}