static int ompi_osc_rdma_gacc_local (const void *source_buffer, int source_count, ompi_datatype_t *source_datatype, void *result_buffer, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_module_t *module, ompi_osc_rdma_request_t *request) { int ret = OMPI_SUCCESS; do { if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } if (NULL != result_buffer) { /* get accumulate */ ret = ompi_datatype_sndrcv ((void *) (intptr_t) target_address, target_count, target_datatype, result_buffer, result_count, result_datatype); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { break; } } if (&ompi_mpi_op_no_op.op != op) { if (&ompi_mpi_op_replace.op != op) { ret = ompi_osc_base_sndrcv_op (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address, target_count, target_datatype, op); } else { ret = ompi_datatype_sndrcv (source_buffer, source_count, source_datatype, (void *) (intptr_t) target_address, target_count, target_datatype); } } if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } } while (0); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "ompi_osc_rdma_gacc_self: failed performing accumulate operation. ret = %d", ret)); return ret; } if (request) { /* NTH: is it ok to use an ompi error code here? */ ompi_osc_rdma_request_complete (request, ret); } return ret; }
/**
 * Compare-and-swap on target memory that can be addressed directly.
 *
 * Under the peer's accumulate lock the current target value is copied into
 * result_buffer; if it is byte-wise equal to compare_buffer the target is
 * overwritten with source_buffer. The number of bytes handled is
 * datatype->super.size.
 *
 * @param source_buffer   value to store when the comparison matches
 * @param compare_buffer  value the target is compared against
 * @param result_buffer   receives the target value read before any swap
 * @param datatype        datatype describing the element
 * @param peer            peer that owns the target memory
 * @param target_address  address of the target, used as a local pointer
 * @param target_handle   unused by this function
 * @param module          osc rdma module
 *
 * @return always OMPI_SUCCESS
 */
static inline int ompi_osc_rdma_cas_local (const void *source_buffer, const void *compare_buffer, void *result_buffer,
                                           ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
                                           uint64_t target_address, mca_btl_base_registration_handle_t *target_handle,
                                           ompi_osc_rdma_module_t *module)
{
    void *target = (void *) (uintptr_t) target_address;

    ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));

    const size_t nbytes = datatype->super.size;

    /* always hand back the value that was found at the target */
    memcpy (result_buffer, target, nbytes);

    const int matches = (0 == memcmp (compare_buffer, result_buffer, nbytes));
    if (matches) {
        memcpy (target, source_buffer, nbytes);
    }

    ompi_osc_rdma_lock_release_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));

    return OMPI_SUCCESS;
}
int ompi_osc_rdma_attach (struct ompi_win_t *win, void *base, size_t len) { ompi_osc_rdma_module_t *module = GET_MODULE(win); const int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_peer_t *my_peer = ompi_osc_rdma_module_peer (module, my_rank); ompi_osc_rdma_region_t *region; osc_rdma_counter_t region_count; osc_rdma_counter_t region_id; void *bound; intptr_t page_size = getpagesize (); int region_index; int ret; if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) { return OMPI_ERR_RMA_FLAVOR; } if (0 == len) { /* shot-circuit 0-byte case */ return OMPI_SUCCESS; } OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach: %s, %p, %lu", win->w_name, base, (unsigned long) len); OPAL_THREAD_LOCK(&module->lock); region_count = module->state->region_count & 0xffffffffL; region_id = module->state->region_count >> 32; if (region_count == mca_osc_rdma_component.max_attach) { OPAL_THREAD_UNLOCK(&module->lock); return OMPI_ERR_RMA_ATTACH; } /* it is wasteful to register less than a page. this may allow the remote side to access more * memory but the MPI standard covers this with calling the calling behavior erroneous */ bound = (void *)OPAL_ALIGN((intptr_t) base + len, page_size, intptr_t); base = (void *)((intptr_t) base & ~(page_size - 1)); len = (size_t)((intptr_t) bound - (intptr_t) base); /* see if a matching region already exists */ region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base, (intptr_t) bound, module->region_size, ®ion_index); if (NULL != region) { ++module->dynamic_handles[region_index].refcnt; OPAL_THREAD_UNLOCK(&module->lock); /* no need to invalidate remote caches */ return OMPI_SUCCESS; } /* region is in flux */ module->state->region_count = -1; opal_atomic_wmb (); ompi_osc_rdma_lock_acquire_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock)); /* do a binary seach for where the region should be inserted */ if (region_count) { region = find_insertion_point 
((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base, module->region_size, ®ion_index); if (region_index < region_count) { memmove ((void *) ((intptr_t) region + module->region_size), region, (region_count - region_index) * module->region_size); if (module->selected_btl->btl_register_mem) { memmove (module->dynamic_handles + region_index + 1, module->dynamic_handles + region_index, (region_count - region_index) * sizeof (module->dynamic_handles[0])); } } } else { region_index = 0; region = (ompi_osc_rdma_region_t *) module->state->regions; } region->base = (intptr_t) base; region->len = len; OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "attaching dynamic memory region {%p, %p} at index %d", base, (void *)((intptr_t) base + len), region_index); if (module->selected_btl->btl_register_mem) { mca_btl_base_registration_handle_t *handle; ret = ompi_osc_rdma_register (module, MCA_BTL_ENDPOINT_ANY, (void *) region->base, region->len, MCA_BTL_REG_FLAG_ACCESS_ANY, &handle); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_THREAD_UNLOCK(&module->lock); return OMPI_ERR_RMA_ATTACH; } memcpy (region->btl_handle_data, handle, module->selected_btl->btl_registration_handle_size); module->dynamic_handles[region_index].btl_handle = handle; } else { module->dynamic_handles[region_index].btl_handle = NULL; } module->dynamic_handles[region_index].refcnt = 1; #if OPAL_ENABLE_DEBUG for (int i = 0 ; i < region_count + 1 ; ++i) { region = (ompi_osc_rdma_region_t *) ((intptr_t) module->state->regions + i * module->region_size); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, " dynamic region %d: {%p, %lu}", i, (void *) region->base, (unsigned long) region->len); } #endif opal_atomic_mb (); /* the region state has changed */ module->state->region_count = ((region_id + 1) << 32) | (region_count + 1); ompi_osc_rdma_lock_release_exclusive (module, my_peer, offsetof (ompi_osc_rdma_state_t, regions_lock)); OPAL_THREAD_UNLOCK(&module->lock); 
OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "attach complete"); return OMPI_SUCCESS; }
int ompi_osc_rdma_detach (struct ompi_win_t *win, const void *base) { ompi_osc_rdma_module_t *module = GET_MODULE(win); const int my_rank = ompi_comm_rank (module->comm); ompi_osc_rdma_peer_dynamic_t *my_peer = (ompi_osc_rdma_peer_dynamic_t *) ompi_osc_rdma_module_peer (module, my_rank); osc_rdma_counter_t region_count, region_id; ompi_osc_rdma_region_t *region; int region_index; if (module->flavor != MPI_WIN_FLAVOR_DYNAMIC) { return OMPI_ERR_WIN; } OPAL_THREAD_LOCK(&module->lock); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach: %s, %p", win->w_name, base); /* the upper 4 bytes of the region count are an instance counter */ region_count = module->state->region_count & 0xffffffffL; region_id = module->state->region_count >> 32; region = ompi_osc_rdma_find_region_containing ((ompi_osc_rdma_region_t *) module->state->regions, 0, region_count - 1, (intptr_t) base, (intptr_t) base + 1, module->region_size, ®ion_index); if (NULL == region) { OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_INFO, "could not find dynamic memory region starting at %p", base); OPAL_THREAD_UNLOCK(&module->lock); return OMPI_ERROR; } if (--module->dynamic_handles[region_index].refcnt > 0) { OPAL_THREAD_UNLOCK(&module->lock); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete"); return OMPI_SUCCESS; } /* lock the region so it can't change while a peer is reading it */ ompi_osc_rdma_lock_acquire_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock)); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_DEBUG, "detaching dynamic memory region {%p, %p} from index %d", base, (void *)((intptr_t) base + region->len), region_index); if (module->selected_btl->btl_register_mem) { ompi_osc_rdma_deregister (module, module->dynamic_handles[region_index].btl_handle); if (region_index < region_count - 1) { memmove (module->dynamic_handles + region_index, module->dynamic_handles + region_index + 1, (region_count - region_index - 1) * sizeof (void *)); } memset (module->dynamic_handles + region_count - 1, 
0, sizeof (module->dynamic_handles[0])); } if (region_index < region_count - 1) { memmove (region, (void *)((intptr_t) region + module->region_size), (region_count - region_index - 1) * module->region_size);; } module->state->region_count = ((region_id + 1) << 32) | (region_count - 1); ompi_osc_rdma_lock_release_exclusive (module, &my_peer->super, offsetof (ompi_osc_rdma_state_t, regions_lock)); OPAL_THREAD_UNLOCK(&module->lock); OSC_RDMA_VERBOSE(MCA_BASE_VERBOSE_TRACE, "detach complete"); return OMPI_SUCCESS; }
/**
 * Start a compare-and-swap on a remote peer using BTL get (and, from the get
 * completion callback, presumably a put -- the callback is not visible here).
 *
 * The function allocates an internal request, waits until no other
 * accumulate is in flight to the peer, marks the peer as accumulating,
 * allocates a bounce fragment, takes the accumulate lock if the peer is not
 * held exclusively and issues a get of the aligned range covering the target
 * element. ompi_osc_rdma_cas_get_complete is registered as completion
 * callback with the request as context.
 *
 * @param sync            synchronization object the operation belongs to
 * @param source_buffer   value to store if the comparison matches
 * @param compare_buffer  value to compare the target against
 * @param result_buffer   receives the value read from the target
 * @param datatype        datatype of the element (datatype->super.size bytes)
 * @param peer            target peer
 * @param target_address  remote address of the target element
 * @param target_handle   registration handle for the remote memory
 *
 * @return OMPI_SUCCESS if the get was started,
 *         OMPI_ERR_OUT_OF_RESOURCE if no fragment could be allocated,
 *         otherwise the error returned by btl_get.
 */
static inline int cas_rdma (ompi_osc_rdma_sync_t *sync, const void *source_buffer, const void *compare_buffer,
                            void *result_buffer, ompi_datatype_t *datatype, ompi_osc_rdma_peer_t *peer,
                            uint64_t target_address, mca_btl_base_registration_handle_t *target_handle)
{
    ompi_osc_rdma_module_t *module = sync->module;
    const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment);
    unsigned long offset, aligned_len, len = datatype->super.size;
    ompi_osc_rdma_frag_t *frag = NULL;
    ompi_osc_rdma_request_t *request;
    uint64_t aligned_address;
    char *ptr = NULL;
    int lock_taken = 0;
    int ret;

    OMPI_OSC_RDMA_REQUEST_ALLOC(module, peer, request);

    request->internal = true;
    request->type = OMPI_OSC_RDMA_TYPE_CSWAP;
    request->sync = sync;

    OPAL_THREAD_LOCK(&module->lock);
    /* to ensure order wait until the previous accumulate completes */
    while (ompi_osc_rdma_peer_is_accumulating (peer)) {
        OPAL_THREAD_UNLOCK(&module->lock);
        ompi_osc_rdma_progress (module);
        OPAL_THREAD_LOCK(&module->lock);
    }

    peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING;
    OPAL_THREAD_UNLOCK(&module->lock);

    /* the get has to start on an aligned address: offset is where the element
     * sits inside the aligned range and aligned_len covers offset + len */
    offset = target_address & btl_alignment_mask;
    aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask;
    aligned_address = target_address & ~(uint64_t) btl_alignment_mask;

    ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* the flag is set under the module lock, so clear it under the lock too */
        OPAL_THREAD_LOCK(&module->lock);
        peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
        OPAL_THREAD_UNLOCK(&module->lock);

        /* NOTE(review): the request allocated above is not returned on this path;
         * no release routine is visible from here -- confirm and fix. */
        OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output,
                             "Could not allocate an rdma fragment for get accumulate. Falling back on point-to-point"));
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    if (!ompi_osc_rdma_peer_is_exclusive (peer)) {
        (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock));
        lock_taken = 1;
    }

    /* set up the request */
    request->frag = frag;
    request->origin_addr = (void *) source_buffer;
    request->ctx = (void *) target_handle;
    request->result_addr = result_buffer;
    request->compare_addr = compare_buffer;
    request->result_dt = datatype;
    request->offset = (ptrdiff_t) offset;
    /* the request keeps the exact (unaligned) address of the element */
    request->target_address = target_address;
    request->len = len;

    OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get..."));
    ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, aligned_address,
                                         frag->handle, target_handle, aligned_len, 0, MCA_BTL_NO_ORDER,
                                         ompi_osc_rdma_cas_get_complete, request, NULL);
    if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) {
        /* the completion callback will never run, so everything acquired above has
         * to be undone here; otherwise later accumulates to this peer wait forever */
        ompi_osc_rdma_frag_complete (frag);

        if (lock_taken) {
            (void) ompi_osc_rdma_lock_release_exclusive (module, peer,
                                                         offsetof (ompi_osc_rdma_state_t, accumulate_lock));
        }

        OPAL_THREAD_LOCK(&module->lock);
        peer->flags &= ~OMPI_OSC_RDMA_PEER_ACCUMULATING;
        OPAL_THREAD_UNLOCK(&module->lock);

        /* NOTE(review): as above, the request is not returned on this path. */
        return ret;
    }

    ompi_osc_rdma_sync_rdma_inc (sync);

    return OMPI_SUCCESS;
}
static inline int ompi_osc_rdma_gacc_contig (ompi_osc_rdma_sync_t *sync, const void *source, int source_count, ompi_datatype_t *source_datatype, void *result, int result_count, ompi_datatype_t *result_datatype, ompi_osc_rdma_peer_t *peer, uint64_t target_address, mca_btl_base_registration_handle_t *target_handle, int target_count, ompi_datatype_t *target_datatype, ompi_op_t *op, ompi_osc_rdma_request_t *request) { ompi_osc_rdma_module_t *module = sync->module; const size_t btl_alignment_mask = ALIGNMENT_MASK(module->selected_btl->btl_get_alignment); unsigned long len = target_count * target_datatype->super.size; ompi_osc_rdma_frag_t *frag = NULL; unsigned long aligned_len, offset; char *ptr = NULL; int ret; offset = target_address & btl_alignment_mask;; aligned_len = (len + offset + btl_alignment_mask) & ~btl_alignment_mask; ret = ompi_osc_rdma_frag_alloc (module, aligned_len, &frag, &ptr); if (OPAL_UNLIKELY(OMPI_SUCCESS != ret)) { OPAL_OUTPUT_VERBOSE((10, ompi_osc_base_framework.framework_output, "Could not allocate an rdma fragment for get accumulate")); return OMPI_ERR_OUT_OF_RESOURCE; } OPAL_THREAD_LOCK(&module->lock); /* to ensure order wait until the previous accumulate completes */ while (ompi_osc_rdma_peer_is_accumulating (peer)) { OPAL_THREAD_UNLOCK(&module->lock); ompi_osc_rdma_progress (module); OPAL_THREAD_LOCK(&module->lock); } peer->flags |= OMPI_OSC_RDMA_PEER_ACCUMULATING; OPAL_THREAD_UNLOCK(&module->lock); if (!ompi_osc_rdma_peer_is_exclusive (peer)) { (void) ompi_osc_rdma_lock_acquire_exclusive (module, peer, offsetof (ompi_osc_rdma_state_t, accumulate_lock)); } /* set up the request */ request->frag = frag; request->origin_addr = (void *) source; request->origin_dt = source_datatype; request->origin_count = source_count; request->ctx = (void *) target_handle; request->result_addr = result; request->result_count = result_count; request->result_dt = result_datatype; request->offset = (ptrdiff_t) target_address & btl_alignment_mask; 
request->target_address = target_address; request->len = len; request->op = op; request->sync = sync; ompi_osc_rdma_sync_rdma_inc (sync); if (&ompi_mpi_op_replace.op != op || result) { /* align the target address */ target_address = target_address & ~btl_alignment_mask; OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl get local: {%p, %p}, remote: {0x%" PRIx64 ", %p}...", ptr, (void *) frag->handle, target_address, (void *) target_handle)); ret = module->selected_btl->btl_get (module->selected_btl, peer->data_endpoint, ptr, target_address, frag->handle, target_handle, aligned_len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_get_complete, request, NULL); } else { /* copy the put accumulate data */ memcpy (ptr, source, len); OPAL_OUTPUT_VERBOSE((60, ompi_osc_base_framework.framework_output, "initiating btl put...")); ret = module->selected_btl->btl_put (module->selected_btl, peer->data_endpoint, ptr, target_address, frag->handle, target_handle, len, 0, MCA_BTL_NO_ORDER, ompi_osc_rdma_acc_put_complete, request, NULL); } if (OPAL_UNLIKELY(OMPI_SUCCESS == ret)) { return OMPI_SUCCESS; } OPAL_OUTPUT_VERBOSE((20, ompi_osc_base_framework.framework_output, "btl operation failed with ret = %d", ret)); ompi_osc_rdma_cleanup_rdma (sync, frag, NULL, NULL); return ret; }