/* Non-blocking get of a contiguous region from the target process.
 * Small transfers targeting this rank itself are served by a local memcpy
 * instead of the network; everything else goes through the device layer. */
int OSP_NbGet(int target, void* src, void* dst, int bytes, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#   ifdef HAVE_ERROR_CHECKING
#   endif

#   ifdef OSP_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG (OSP_TAU_TAG_NBGET, target, bytes);
    }
#   endif

    /* Network path first: remote target, or a local transfer too large for
     * the 1-D bypass threshold. */
    if (target != my_rank || bytes >= ospu_settings.network_bypass_upper_limit_1d)
    {
        status = OSPD_NbGet(target, src, dst, bytes, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_NbGet returned an error\n");
    }
    else
    {
        status = OSPU_Get_memcpy(src, dst, bytes);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_Get_memcpy returned an error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking allreduce over a process group.
 * Neither the world-group nor the subgroup path is implemented yet; both
 * report an error. Fix: removed the unused local `OSPD_Handle_t *ospd_handle`
 * that was declared but never referenced. */
int OSPD_NbAllreduce_group(OSP_group_t* group, int count, OSP_reduce_op_t osp_op,
        OSP_datatype_t osp_type, void* in, void* out, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    if (group == OSP_GROUP_WORLD || group == NULL)
    {
        OSPU_ERR_POP(1, "OSPDI_NbAllreduce has not been implemented \n");
    }
    else
    {
        OSPU_ERR_POP(1, "OSPD_NbAllreduce_group not implemented for non-world groups!");
    }

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
int OSPDI_GlobalAllreduce_initialize() { int i,status = OSP_SUCCESS; OSPU_FUNC_ENTER(); barrier_conf.protocol = DCMF_GI_BARRIER_PROTOCOL; barrier_conf.cb_geometry = getGeometry; status = DCMF_Barrier_register(&OSPD_Barrier_protocol, &barrier_conf); barrier_conf.protocol = DCMF_LOCKBOX_BARRIER_PROTOCOL; barrier_conf.cb_geometry = getGeometry; status = DCMF_Barrier_register(&OSPD_Localbarrier_protocol, &barrier_conf); /*This has to eventually freed, not being done now*/ status = OSPDI_Malloc((void **) &allreduce_ranklist, OSPD_Process_info.num_ranks * sizeof(unsigned)); OSPU_ERR_POP(status != 0, "OSPDI_Malloc returned with error %d \n", status); for(i=0; i<OSPD_Process_info.num_ranks; i++) allreduce_ranklist[i] = i; barrier_ptr = &OSPD_Barrier_protocol; localbarrier_ptr = &OSPD_Localbarrier_protocol; status = DCMF_Geometry_initialize(&geometry, 0, allreduce_ranklist, OSPD_Process_info.num_ranks, &barrier_ptr, 1, &localbarrier_ptr, 1, &crequest, 0, 1); allreduce_conf.protocol = DCMF_TORUS_BINOMIAL_ALLREDUCE_PROTOCOL; allreduce_conf.cb_geometry = getGeometry; allreduce_conf.reuse_storage = 1; status = DCMF_Allreduce_register(&OSPD_GlobalAllreduce_protocol, &allreduce_conf); OSPU_ERR_POP(status != DCMF_SUCCESS, "DCMF_Allreduce_register returned with error %d \n", status); fn_exit: OSPU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/* Recursively walk an N-dimensional strided block; each leaf issues one
 * contiguous non-blocking accumulate of block_sizes[0] bytes. */
int OSPI_Recursive_PutAcc(int target, int stride_level, int *block_sizes,
        void* source_ptr, int *src_stride_ar,
        void* target_ptr, int *trg_stride_ar,
        OSP_datatype_t osp_type, void* scaling, OSP_handle_t osp_handle)
{
    int i, status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();

    /* Base case first: innermost dimension is a single contiguous chunk. */
    if (stride_level <= 0)
    {
        status = OSPD_NbPutAcc(target, source_ptr, target_ptr, block_sizes[0],
                               osp_type, scaling, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_NbPutAcc returned with an error \n");
        goto fn_exit;
    }

    /* Recurse over each slab of the current dimension, advancing both sides
     * by their respective stride for the level below. */
    for (i = 0; i < block_sizes[stride_level]; i++)
    {
        void *src_slab = (void *) ((size_t) source_ptr + i * src_stride_ar[stride_level - 1]);
        void *trg_slab = (void *) ((size_t) target_ptr + i * trg_stride_ar[stride_level - 1]);

        status = OSPI_Recursive_PutAcc(target, stride_level - 1, block_sizes,
                                       src_slab, src_stride_ar,
                                       trg_slab, trg_stride_ar,
                                       osp_type, scaling, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS,
                     "OSPI_Recursive_PutAcc returned error in OSPI_Recursive_PutAcc.\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Blocking allreduce over a group; only the world group is supported. */
int OSPD_Allreduce_group(OSP_group_t* group, int count, OSP_reduce_op_t osp_op,
        OSP_datatype_t osp_type, void* in, void* out)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    /* Reject non-world groups up front. */
    if (group != NULL && group != OSP_GROUP_WORLD)
    {
        OSPU_ERR_POP(1, "OSPD_Allreduce_group not implemented for non-world groups!");
        goto fn_fail;
    }

    status = OSPDI_GlobalAllreduce(count, osp_op, osp_type, in, out);
    OSPU_ERR_ABORT(status != OSP_SUCCESS, "OSPDI_GlobalAllreduce returned with an error");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Tear down the device layer: drain outstanding communication with a global
 * barrier, free the request/handle/buffer pools and shared structures, stop
 * the CHT helper thread, then finalize the DCMF messager. The ordering here
 * is deliberate — do not reorder. */
int OSPD_Finalize(void)
{
    int status = OSP_SUCCESS;
    int count = 0;

    OSPU_FUNC_ENTER();

    /* TODO: need to unset "OSP is alive" global variable */

    OSPDI_CRITICAL_ENTER();

    /* waiting for everyone */
    status = OSPDI_GlobalBarrier();
    OSPU_ERR_POP(status != OSP_SUCCESS, "OSPDI_GlobalBarrier returned with an error");

    /* Freeing request pool */
    OSPDI_Request_pool_finalize();

    /* Freeing handle pool */
    OSPDI_Handle_pool_finalize();

    /* Freeing buffer pool */
    OSPDI_Buffer_pool_finalize();

    /* Freeing memory region pointers and local memory region */
    OSPDI_Free(OSPD_Membase_global);
    OSPDI_Free(OSPD_Memregion_global);

    /* Freeing connection active counters */
    OSPDI_Free((void *) OSPD_Connection_send_active);
    OSPDI_Free((void *) OSPD_Connection_put_active);

    /* Freeing put flush local counters and pointers (own rank's counter
     * buffer first, then the table of pointers itself). */
    OSPDI_Free(OSPD_Put_Flushcounter_ptr[OSPD_Process_info.my_rank]);
    OSPDI_Free(OSPD_Put_Flushcounter_ptr);

    if (ospd_settings.enable_cht)
    {
        /* NOTE(review): pthread_cancel's return value is assigned to status
         * but never checked against OSP error codes; a failure here is
         * silently propagated to the caller as a raw errno-style value. */
        status = pthread_cancel(OSPDI_CHT_pthread);
    }

    OSPDI_CRITICAL_EXIT();
    /* NOTE: exit critical section before finalize since CS may not work after DCMF is terminated */

    count = DCMF_Messager_finalize();
    /* Do not issue this warning if using MPI since in that case we know DCMF
       will be initialized by MPI before OSP (assuming GA->ARMCI->OSP call path). */
    //if(!ospd_settings.mpi_active)
    //{
    //    OSPU_WARNING(count == 0,
    //            "DCMF_Messager_finalize has been called more than once.");
    //}

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Post a non-blocking global barrier; completion is signalled by
 * OSPDI_Request_done decrementing ospd_handle->active.
 * Fix: removed the unused local `volatile int active` — this function tracks
 * completion on the caller's handle, not on a stack flag. */
int OSPDI_NbGlobalBarrier(OSPD_Handle_t *ospd_handle)
{
    int status = OSP_SUCCESS;
    OSPD_Request_t *ospd_request;
    DCMF_Callback_t done_callback;

    OSPU_FUNC_ENTER();

    ospd_request = OSPDI_Get_request(1);
    OSPU_ERR_POP(status = (ospd_request == NULL), "OSPDI_Get_request returned error \n");
    OSPDI_Set_handle(ospd_request, ospd_handle);
    ospd_handle->active++;

    done_callback.function = OSPDI_Request_done;
    done_callback.clientdata = (void *) ospd_request;

    status = DCMF_GlobalBarrier(&OSPD_GlobalBarrier_protocol,
                                &(ospd_request->request),
                                done_callback);
    OSPU_ERR_ABORT(status != DCMF_SUCCESS, "DCMF_GlobalBarrier returned with an error");

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Blocking broadcast over a group; only the world group is supported. */
int OSPD_Bcast_group(OSP_group_t* group, int root, int count, void* buffer)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    /* Reject non-world groups up front. */
    if (group != NULL && group != OSP_GROUP_WORLD)
    {
        OSPU_ERR_POP(1, "OSPD_Bcast_group not implemented for non-world groups!");
        goto fn_fail;
    }

    status = OSPDI_GlobalBcast(root, count, buffer);
    OSPU_ERR_ABORT(status != OSP_SUCCESS, "OSPDI_GlobalBcast returned with an error");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Release shared segments. BG requires no deregistration, so this reduces to
 * a barrier that ensures every rank agrees the release has happened. */
int OSPD_Release_segments(OSP_group_t* group, void *ptr)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    if (group != NULL && group != OSP_GROUP_WORLD)
    {
        OSPU_ERR_POP(1, "OSPD_Release_segments not implemented for non-world groups!");
        goto fn_fail;
    }

    status = OSPDI_GlobalBarrier();
    OSPU_ERR_ABORT(status != OSP_SUCCESS, "DCMF_GlobalBarrier returned with an error");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Blocking barrier over a group; only the world group is supported. */
int OSPD_Barrier_group(OSP_group_t* group)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    if (group != NULL && group != OSP_GROUP_WORLD)
    {
        OSPU_ERR_POP(1, "OSPD_Barrier_group not implemented for non-world groups!");
        goto fn_fail;
    }

    status = OSPDI_GlobalBarrier();
    OSPU_ERR_ABORT(status != OSP_SUCCESS, "DCMF_GlobalBarrier returned with an error");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking vector put-accumulate: forwards the IOV list to the direct
 * putaccv path, tracking completion on the caller-supplied handle. */
int OSPD_NbPutAccV(int target, OSP_iov_t *iov_ar, int ar_len,
        OSP_datatype_t osp_type, void* scaling, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    OSPD_Handle_t *ospd_handle = (OSPD_Handle_t *) osp_handle;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    status = OSPDI_Direct_putaccv(target, iov_ar, ar_len,
                                  osp_type, scaling, ospd_handle);
    OSPU_ERR_POP(status, "Direct putaccv function returned with an error \n");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking contiguous get via DCMF_Get; completion is tracked on the
 * caller's handle through OSPDI_Request_done.
 * Fix: ospd_handle->active was incremented BEFORE the request was acquired,
 * so a failed OSPDI_Get_request left the handle's active count permanently
 * inflated (any later wait would hang). The increment now follows successful
 * request acquisition, matching OSPDI_NbGlobalBarrier/OSPDI_NbGlobalBcast. */
int OSPD_NbGet(int target, void* src, void* dst, int bytes, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    OSPD_Handle_t* ospd_handle = NULL;
    OSPD_Request_t* ospd_request = NULL;
    DCMF_Callback_t callback;
    unsigned src_disp, dst_disp;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    ospd_handle = (OSPD_Handle_t *) osp_handle;

    ospd_request = OSPDI_Get_request(1);
    OSPU_ERR_POP(status = (ospd_request == NULL), "OSPDI_Get_request returned error.");
    OSPDI_Set_handle(ospd_request, ospd_handle);
    ospd_handle->active++;

    callback.function = OSPDI_Request_done;
    callback.clientdata = (void *) ospd_request;

    /* Displacements are relative to each rank's registered memory base. */
    src_disp = (size_t) src - (size_t) OSPD_Membase_global[target];
    dst_disp = (size_t) dst - (size_t) OSPD_Membase_global[OSPD_Process_info.my_rank];

    status = DCMF_Get(&OSPD_Generic_get_protocol,
                      &(ospd_request->request),
                      callback,
                      DCMF_RELAXED_CONSISTENCY,
                      target,
                      bytes,
                      &OSPD_Memregion_global[target],
                      &OSPD_Memregion_global[OSPD_Process_info.my_rank],
                      src_disp,
                      dst_disp);
    OSPU_ERR_POP(status, "DCMF_Get returned with an error \n");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking vector put-accumulate.
 * Fix (profiling build only): the TAU byte-count loop referenced
 * iov_ar[i].ptr_array_len and iov_ar[i].bytes; the OSP_iov_t fields used
 * everywhere else in this file (OSPU_ModV_memcpy, OSPDI_Direct_putaccv) are
 * ptr_ar_len and size, so the block did not compile with OSP_TAU_PROFILING. */
int OSP_NbPutAccV(int target, OSP_iov_t *iov_ar, int ar_len,
        OSP_datatype_t osp_type, void* scaling, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#   ifdef HAVE_ERROR_CHECKING
#   endif

#   ifdef OSP_TAU_PROFILING
    {
        int i, total_bytes = 0;
        for (i = 0; i < ar_len; i++)
            total_bytes += iov_ar[i].ptr_ar_len * iov_ar[i].size;
        TAU_TRACE_SENDMSG (OSP_TAU_TAG_NBPUTACCV, target, total_bytes);
    }
#   endif

    /* Bypass is ALWAYS better for accumulate; we do not test against threshold. */
    if (target == my_rank && ospu_settings.network_bypass)
    {
        status = OSPU_AccV_memcpy(iov_ar, ar_len, osp_type, scaling);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_AccV_memcpy returned an error\n");
    }
    else
    {
        status = OSPD_NbPutAccV(target, iov_ar, ar_len, osp_type, scaling, osp_handle);
        OSPU_ERR_POP(status, "OSPD_NbPutAccV returned error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking put of a contiguous region to the target process.
 * Not sure what the right strategy for bypass is: OSPU_*_memcpy is blocking,
 * but the overhead of entering DCMF_Put likely outweighs the savings from it
 * being non-blocking — especially under heavy load, where DMA vs. memcpy has
 * been seen to turn over when the NIC is saturated. */
int OSP_NbPut(int target, void* src, void* dst, int bytes, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#   ifdef HAVE_ERROR_CHECKING
#   endif

#   ifdef OSP_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG (OSP_TAU_TAG_NBPUT, target, bytes);
    }
#   endif

    /* Network path: remote target, or local transfer above the bypass limit. */
    if (target != my_rank || bytes >= ospu_settings.network_bypass_upper_limit_1d)
    {
        status = OSPD_NbPut(target, src, dst, bytes, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_NbPut returned an error\n");
    }
    else
    {
        status = OSPU_Put_memcpy(src, dst, bytes);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_Put_memcpy returned an error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Post a non-blocking global broadcast from root; completion decrements the
 * active count on the supplied device handle. */
int OSPDI_NbGlobalBcast(int root, int count, void *buffer, OSPD_Handle_t *ospd_handle)
{
    int status = OSP_SUCCESS;
    OSPD_Request_t *request;
    DCMF_Callback_t cb;

    OSPU_FUNC_ENTER();

    request = OSPDI_Get_request(1);
    OSPU_ERR_POP(status = (request == NULL), "OSPDI_Get_request returned error \n");

    cb.function = OSPDI_Request_done;
    cb.clientdata = (void *) request;

    OSPDI_Set_handle(request, ospd_handle);
    ospd_handle->active++;

    status = DCMF_GlobalBcast(&OSPD_GlobalBcast_protocol,
                              &(request->request),
                              cb,
                              DCMF_SEQUENTIAL_CONSISTENCY,
                              root,
                              (char *) buffer,
                              count);
    OSPU_ERR_POP(status != DCMF_SUCCESS,
                 "DCMF_GlobalBcast returned with error %d \n", status);

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Blocking vector put-accumulate: acquires an internal handle, issues the
 * direct putaccv, then spins until all sends attached to the handle have
 * completed. */
int OSPD_PutAccV(int target, OSP_iov_t *iov_ar, int ar_len,
        OSP_datatype_t osp_type, void* scaling)
{
    int status = OSP_SUCCESS;
    OSPD_Handle_t *ospd_handle;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    /* NOTE(review): the error message below names OSPD_PutAccS although this
     * function is OSPD_PutAccV — likely a copy/paste slip; left unchanged. */
    ospd_handle = OSPDI_Get_handle();
    OSPU_ERR_POP(status = (ospd_handle == NULL),
                 "OSPDI_Get_handle returned NULL in OSPD_PutAccS.\n");

    status = OSPDI_Direct_putaccv(target, iov_ar, ar_len,
                                  osp_type, scaling, ospd_handle);
    OSPU_ERR_POP(status, "Direct putaccv function returned with an error \n");

    /* Block until every send posted on this handle has completed. */
    OSPDI_Conditional_advance(ospd_handle->active > 0);

  fn_exit:
    /* NOTE(review): reached with ospd_handle == NULL when OSPDI_Get_handle
     * fails, and with in-flight operations when Direct_putaccv errors out —
     * assumes OSPDI_Release_handle tolerates both; confirm. */
    OSPDI_Release_handle(ospd_handle);
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Register the generic DCMF get protocol on the torus network. */
int OSPDI_Get_initialize()
{
    int status = OSP_SUCCESS;
    DCMF_Get_Configuration_t get_conf;

    OSPU_FUNC_ENTER();

    get_conf.network = DCMF_TORUS_NETWORK;
    get_conf.protocol = DCMF_DEFAULT_GET_PROTOCOL;

    status = DCMF_Get_register(&OSPD_Generic_get_protocol, &get_conf);
    OSPU_ERR_POP(status != DCMF_SUCCESS, "DCMF_Get_register failed");

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
int OSPD_Get(int target, void* src, void* dst, int bytes) { int status = OSP_SUCCESS; DCMF_Request_t request; DCMF_Callback_t callback; volatile int active; unsigned src_disp, dst_disp; OSPU_FUNC_ENTER(); OSPDI_CRITICAL_ENTER(); callback.function = OSPDI_Generic_done; callback.clientdata = (void *) &active; src_disp = (size_t) src - (size_t) OSPD_Membase_global[target]; dst_disp = (size_t) dst - (size_t) OSPD_Membase_global[OSPD_Process_info.my_rank]; active = 1; status = DCMF_Get(&OSPD_Generic_get_protocol, &request, callback, DCMF_RELAXED_CONSISTENCY, target, bytes, &OSPD_Memregion_global[target], &OSPD_Memregion_global[OSPD_Process_info.my_rank], src_disp, dst_disp); OSPU_ERR_POP(status, "DCMF_Get returned with an error"); OSPDI_Conditional_advance(active > 0); fn_exit: OSPDI_CRITICAL_EXIT(); OSPU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/* Register the default DCMF global-barrier protocol. */
int OSPDI_GlobalBarrier_initialize()
{
    int status = OSP_SUCCESS;
    DCMF_GlobalBarrier_Configuration_t gb_conf;

    OSPU_FUNC_ENTER();

    gb_conf.protocol = DCMF_DEFAULT_GLOBALBARRIER_PROTOCOL;
    status = DCMF_GlobalBarrier_register(&OSPD_GlobalBarrier_protocol, &gb_conf);
    OSPU_ERR_POP(status != DCMF_SUCCESS,
                 "DCMF_GlobalBarrier_register returned with error %d \n", status);

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking sync over a group; only the world group is supported.
 * The flush-all is currently blocking and must eventually be replaced with a
 * non-blocking flush to make this truly non-blocking. */
int OSPD_NbSync_group(OSP_group_t* group, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    OSPD_Handle_t *ospd_handle;

    OSPU_FUNC_ENTER();
    OSPDI_CRITICAL_ENTER();

    if (group != NULL && group != OSP_GROUP_WORLD)
    {
        OSPU_ERR_POP(1, "OSPD_NbSync_group not implemented for non-world groups!");
        goto fn_fail;
    }

    ospd_handle = (OSPD_Handle_t *) osp_handle;

    status = OSPDI_Flush_all();
    OSPU_ERR_ABORT(status != OSP_SUCCESS, "OSPDI_Flush_all returned with an error");

    status = OSPDI_NbGlobalBarrier(ospd_handle);
    OSPU_ERR_ABORT(status != OSP_SUCCESS, "OSPDI_NbGlobalBarrier returned with an error");

  fn_exit:
    OSPDI_CRITICAL_EXIT();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Atomic read-modify-write on a remote location; thin wrapper that forwards
 * straight to the device layer. */
int OSP_Rmw(int target, void* source_ptr_in, void* source_ptr_out,
        void* target_ptr, int bytes, OSP_atomic_op_t op, OSP_datatype_t osp_type)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();

#   ifdef HAVE_ERROR_CHECKING
#   endif

#   ifdef OSP_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG (OSP_TAU_TAG_RMW, target, bytes);
    }
#   endif

    status = OSPD_Rmw(target, source_ptr_in, source_ptr_out,
                      target_ptr, bytes, op, osp_type);
    OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_Rmw returned an error\n");

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
int OSPDI_GlobalBcast(int root, int count, void *buffer) { int status = OSP_SUCCESS; DCMF_Request_t request; DCMF_Callback_t done_callback; volatile unsigned gb_active = 0; OSPU_FUNC_ENTER(); gb_active += 1; done_callback.function = OSPDI_Generic_done; done_callback.clientdata = (void *) &gb_active; status = DCMF_GlobalBcast(&OSPD_GlobalBcast_protocol, &request, done_callback, DCMF_SEQUENTIAL_CONSISTENCY, root, (char *) buffer, count); OSPU_ERR_POP(status != DCMF_SUCCESS, "DCMF_GlobalBcast returned with error %d \n", status); OSPDI_Conditional_advance(gb_active > 0); fn_exit: OSPU_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/* Local (self-to-self) strided copy: iterates the strided source/target
 * volumes and memcpy's one innermost block (block_sizes[0] bytes) per step,
 * using a working copy of the block counts as a multi-digit odometer.
 * Fixes:
 *  - block_sizes_w was leaked on every path; it is now freed at fn_exit
 *    (free(NULL) is a no-op, so the malloc-failure path is safe too).
 *  - the early `return status` inside the loop skipped OSPU_FUNC_EXIT and
 *    the free; it is now a goto fn_exit.
 *  - stride_level == 0 previously read block_sizes_w[1] out of bounds; it is
 *    now handled as a single contiguous memcpy. */
int OSPU_GetS_local(int stride_level, int *block_sizes, void* source_ptr,
        int *src_stride_ar, void* target_ptr, int *trg_stride_ar)
{
    int status = OSP_SUCCESS;
    int chunk_count = 1;
    int *block_sizes_w = NULL;
    int i, y;

    OSPU_FUNC_ENTER();

    if (stride_level == 0)
    {
        /* Degenerate case: one contiguous block. */
        memcpy(target_ptr, source_ptr, block_sizes[0]);
        goto fn_exit;
    }

    block_sizes_w = malloc(sizeof(int) * (stride_level + 1));
    OSPU_ERR_POP((status = (NULL == block_sizes_w)), "malloc failed in OSPU_GetS_local");
    memcpy(block_sizes_w, block_sizes, sizeof(int) * (stride_level + 1));

    for (i = 1; i <= stride_level; i++)
        chunk_count = block_sizes[i] * chunk_count;

    for (i = 0; i < chunk_count; i++)
    {
        memcpy(target_ptr, source_ptr, block_sizes[0]);
        block_sizes_w[1]--;
        if (block_sizes_w[1] == 0)
        {
            /* Carry: find the lowest dimension with blocks remaining. */
            y = 1;
            while (block_sizes_w[y] == 0)
            {
                if (y == stride_level)
                {
                    OSPU_ASSERT(i == chunk_count - 1, status);
                    goto fn_exit;
                }
                y++;
            }
            block_sizes_w[y]--;

            /* The strides done on lower dimensions should be subtracted as
               these are included in the stride along the current dimension. */
            source_ptr = (void *) ((size_t) source_ptr + src_stride_ar[y - 1]
                    - (block_sizes[y - 1] - 1) * src_stride_ar[y - 2]);
            target_ptr = (void *) ((size_t) target_ptr + trg_stride_ar[y - 1]
                    - (block_sizes[y - 1] - 1) * trg_stride_ar[y - 2]);

            /* Reset all lower digits of the odometer. */
            y--;
            while (y >= 1)
            {
                block_sizes_w[y] = block_sizes[y];
                y--;
            }
        }
        else
        {
            source_ptr = (void *) ((size_t) source_ptr + src_stride_ar[0]);
            target_ptr = (void *) ((size_t) target_ptr + trg_stride_ar[0]);
        }
    }

  fn_exit:
    free(block_sizes_w);
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Non-blocking strided put-accumulate.
 * Fix (profiling build only): the TAU block referenced undeclared identifiers
 * `stride_levels`, `count[]`, and `total_bytes` (it declared `bytes` but used
 * `total_bytes`), so it could not compile with OSP_TAU_PROFILING defined; it
 * now uses the actual parameters stride_level/block_sizes. */
int OSP_NbPutAccS(int target, int stride_level, int *block_sizes,
        void* source_ptr, int *src_stride_ar,
        void* target_ptr, int *trg_stride_ar,
        OSP_datatype_t osp_type, void* scaling, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#   ifdef HAVE_ERROR_CHECKING
#   endif

#   ifdef OSP_TAU_PROFILING
    {
        int i, total_bytes = 1;
        for (i = 0; i <= stride_level; i++)
            total_bytes *= block_sizes[i];
        TAU_TRACE_SENDMSG (OSP_TAU_TAG_NBPUTACCS, target, total_bytes);
    }
#   endif

    /* Bypass is ALWAYS better for accumulate; we do not test against threshold. */
    if (target == my_rank && ospu_settings.network_bypass)
    {
        status = OSPU_AccS_local(stride_level, block_sizes,
                                 source_ptr, src_stride_ar,
                                 target_ptr, trg_stride_ar,
                                 osp_type, scaling);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_AccS_local returned an error\n");
    }
    else
    {
        status = OSPI_Recursive_PutAcc(target, stride_level, block_sizes,
                                       source_ptr, src_stride_ar,
                                       target_ptr, trg_stride_ar,
                                       osp_type, scaling, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPI_Recursive_PutAcc returned error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Blocking global allreduce over the world geometry. MAXABS/MINABS are
 * mapped to MAX/MIN over |in|, using a temporary buffer for signed and
 * floating types (unsigned types need no conversion).
 * Fixes:
 *  - the default branches of both switches tested `status != DCMF_SUCCESS`,
 *    which is always false at that point, so an unsupported op/type silently
 *    proceeded with reduce_op/datatype uninitialized (undefined behavior);
 *    they now set OSP_ERROR and bail out.
 *  - the return value of DCMF_Allreduce was unchecked; on failure the spin
 *    on ga_active could hang forever. */
int OSPDI_GlobalAllreduce(int count, OSP_reduce_op_t osp_op,
        OSP_datatype_t osp_type, void *in, void *out)
{
    int status = OSP_SUCCESS;
    DCMF_CollectiveRequest_t ar_crequest;
    DCMF_Callback_t done_callback;
    DCMF_Op reduce_op;
    DCMF_Dt datatype;
    int bytes = 0;
    void *in_abs = NULL;
    volatile unsigned ga_active = 0;

    OSPU_FUNC_ENTER();

    switch (osp_op)
    {
        case OSP_SUM:
            reduce_op = DCMF_SUM;
            break;
        case OSP_PROD:
            reduce_op = DCMF_PROD;
            break;
        case OSP_MAX:
        case OSP_MAXABS:
            reduce_op = DCMF_MAX;
            break;
        case OSP_MIN:
        case OSP_MINABS:
            reduce_op = DCMF_MIN;
            break;
        case OSP_OR:
            reduce_op = DCMF_LOR;
            break;
        default:
            status = OSP_ERROR;
            OSPU_ERR_POP(status != OSP_SUCCESS, "Unsupported OSP_reduce_op \n");
            break;
    }

    if (osp_op == OSP_MAXABS || osp_op == OSP_MINABS)
    {
        switch (osp_type)
        {
            case OSP_DOUBLE:
                datatype = DCMF_DOUBLE;
                bytes = count * sizeof(double);
                status = OSPDI_Malloc(&in_abs, bytes);
                OSPU_ERR_POP(status != OSP_SUCCESS,
                             "OSPDI_Malloc returned error in OSPDI_GlobalAllreduce \n");
                OSPDI_ABS(double, in, in_abs, count);
                in = in_abs;
                break;
            case OSP_INT32:
                datatype = DCMF_SIGNED_INT;
                bytes = count * sizeof(int32_t);
                status = OSPDI_Malloc(&in_abs, bytes);
                OSPU_ERR_POP(status != OSP_SUCCESS,
                             "OSPDI_Malloc returned error in OSPDI_GlobalAllreduce \n");
                OSPDI_ABS(int32_t, in, in_abs, count);
                in = in_abs;
                break;
            case OSP_INT64:
                datatype = DCMF_SIGNED_LONG_LONG;
                bytes = count * sizeof(int64_t);
                status = OSPDI_Malloc(&in_abs, bytes);
                OSPU_ERR_POP(status != OSP_SUCCESS,
                             "OSPDI_Malloc returned error in OSPDI_GlobalAllreduce \n");
                OSPDI_ABS(int64_t, in, in_abs, count);
                in = in_abs;
                break;
            case OSP_UINT32:
                /* Unsigned: |x| == x, reduce in place. */
                datatype = DCMF_UNSIGNED_INT;
                break;
            case OSP_UINT64:
                datatype = DCMF_UNSIGNED_LONG_LONG;
                break;
            case OSP_FLOAT:
                datatype = DCMF_FLOAT;
                bytes = count * sizeof(float);
                status = OSPDI_Malloc(&in_abs, bytes);
                OSPU_ERR_POP(status != OSP_SUCCESS,
                             "OSPDI_Malloc returned error in OSPDI_GlobalAllreduce \n");
                OSPDI_ABS(float, in, in_abs, count);
                in = in_abs;
                break;
            default:
                status = OSP_ERROR;
                OSPU_ERR_POP(status != OSP_SUCCESS, "Unsupported OSP_datatype \n");
                break;
        }
    }
    else
    {
        switch (osp_type)
        {
            case OSP_DOUBLE:
                datatype = DCMF_DOUBLE;
                break;
            case OSP_INT32:
                datatype = DCMF_SIGNED_INT;
                break;
            case OSP_INT64:
                datatype = DCMF_SIGNED_LONG_LONG;
                break;
            case OSP_UINT32:
                datatype = DCMF_UNSIGNED_INT;
                break;
            case OSP_UINT64:
                datatype = DCMF_UNSIGNED_LONG_LONG;
                break;
            case OSP_FLOAT:
                datatype = DCMF_FLOAT;
                break;
            default:
                status = OSP_ERROR;
                OSPU_ERR_ABORT(status != OSP_SUCCESS, "Unsupported OSP_datatype \n");
                break;
        }
    }

    ga_active += 1;
    done_callback.function = OSPDI_Generic_done;
    done_callback.clientdata = (void *) &ga_active;

    status = DCMF_Allreduce(&OSPD_GlobalAllreduce_protocol,
                            &ar_crequest,
                            done_callback,
                            DCMF_SEQUENTIAL_CONSISTENCY,
                            &geometry,
                            (char *) in,
                            (char *) out,
                            count,
                            datatype,
                            reduce_op);
    OSPU_ERR_POP(status != DCMF_SUCCESS,
                 "DCMF_Allreduce returned with error %d \n", status);

    OSPDI_Conditional_advance(ga_active > 0);

  fn_exit:
    if (in_abs != NULL)
        OSPDI_Free(in_abs);
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Local vector bitwise-modify (currently only BXOR) over an IOV list, applied
 * element-wise under the global lock. `scaling` is unused for bitwise ops but
 * kept for signature parity with the other accumulate paths.
 * Fixes:
 *  - an invalid op/type previously bailed out through fn_fail WITHOUT
 *    releasing the global lock (deadlock for any later caller); the release
 *    now sits on the common fn_exit path.
 *  - error messages named OSPU_AccV_memcpy; they now name this function. */
int OSPU_ModV_memcpy(OSP_iov_t *iov_ar, int ar_len, OSP_reduce_op_t osp_op,
        OSP_datatype_t osp_type, void* scaling)
{
    int i, j, status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();

    OSPD_Global_lock_acquire();

    for (i = 0; i < ar_len; i++)
    {
        for (j = 0; j < iov_ar[i].ptr_ar_len; j++)
        {
            switch (osp_op)
            {
                case OSP_BXOR:
                    switch (osp_type)
                    {
                        case OSP_INT32:
                            OSPUI_MOD_BXOR(int32_t,
                                           iov_ar[i].source_ptr_ar[j],
                                           iov_ar[i].target_ptr_ar[j],
                                           (iov_ar[i].size) / sizeof(int32_t));
                            break;
                        case OSP_INT64:
                            OSPUI_MOD_BXOR(int64_t,
                                           iov_ar[i].source_ptr_ar[j],
                                           iov_ar[i].target_ptr_ar[j],
                                           (iov_ar[i].size) / sizeof(int64_t));
                            break;
                        case OSP_UINT32:
                            OSPUI_MOD_BXOR(uint32_t,
                                           iov_ar[i].source_ptr_ar[j],
                                           iov_ar[i].target_ptr_ar[j],
                                           (iov_ar[i].size) / sizeof(uint32_t));
                            break;
                        case OSP_UINT64:
                            OSPUI_MOD_BXOR(uint64_t,
                                           iov_ar[i].source_ptr_ar[j],
                                           iov_ar[i].target_ptr_ar[j],
                                           (iov_ar[i].size) / sizeof(uint64_t));
                            break;
                        default:
                            status = OSP_ERROR;
                            OSPU_ERR_POP((status != OSP_SUCCESS),
                                         "Invalid data type in OSPU_ModV_memcpy\n");
                            break;
                    }
                    break;
                default:
                    status = OSP_ERROR;
                    OSPU_ERR_POP((status != OSP_SUCCESS),
                                 "Invalid op type in OSPU_ModV_memcpy\n");
                    break;
            }
        }
    }

  fn_exit:
    /* Release on every path, including errors that jump through fn_fail. */
    OSPD_Global_lock_release();
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/* Issue one DCMF_Send per IOV segment for a vector put-accumulate. The
 * scaling value is packed into a 2-quad header together with the datatype
 * and the per-segment target pointer; completion of each send decrements
 * ospd_handle->active via OSPDI_Request_done. */
int OSPDI_Direct_putaccv(int target, OSP_iov_t *iov_ar, int ar_len,
        OSP_datatype_t osp_type, void *scaling, OSPD_Handle_t *ospd_handle)
{
    int i, j, status = OSP_SUCCESS;
    OSPD_Putacc_header_t header;
    OSPD_Request_t *ospd_request;
    DCMF_Callback_t done_callback;

    OSPU_FUNC_ENTER();

    /* Capture the scaling factor in the header with the matching union member. */
    header.datatype = osp_type;
    switch (osp_type)
    {
        case OSP_DOUBLE:
            (header.scaling).double_value = *((double *) scaling);
            break;
        case OSP_INT32:
            (header.scaling).int32_value = *((int32_t *) scaling);
            break;
        case OSP_INT64:
            (header.scaling).int64_value = *((int64_t *) scaling);
            break;
        case OSP_UINT32:
            (header.scaling).uint32_value = *((uint32_t *) scaling);
            break;
        case OSP_UINT64:
            (header.scaling).uint64_value = *((uint64_t *) scaling);
            break;
        case OSP_FLOAT:
            (header.scaling).float_value = *((float *) scaling);
            break;
        default:
            status = OSP_ERROR;
            OSPU_ERR_POP((status != OSP_SUCCESS), "Invalid data type in putacc \n");
            break;
    }

    /* One send per pointer pair in each IOV entry; the handle's active count
     * is incremented per send and decremented by the completion callback. */
    for (i = 0; i < ar_len; i++)
    {
        for (j = 0; j < iov_ar[i].ptr_ar_len; j++)
        {
            ospd_request = OSPDI_Get_request(1);
            OSPU_ERR_POP(status = (ospd_request == NULL),
                         "OSPDI_Get_request returned error.\n");
            OSPDI_Set_handle(ospd_request, ospd_handle);

            done_callback.function = OSPDI_Request_done;
            done_callback.clientdata = (void *) ospd_request;

            ospd_handle->active++;

            header.target_ptr = iov_ar[i].target_ptr_ar[j];

            /* Header occupies 2 quads on the wire. */
            status = DCMF_Send(&OSPD_Generic_putacc_protocol,
                               &(ospd_request->request),
                               done_callback,
                               DCMF_SEQUENTIAL_CONSISTENCY,
                               target,
                               iov_ar[i].size,
                               iov_ar[i].source_ptr_ar[j],
                               (DCQuad *) &header,
                               (unsigned) 2);
            OSPU_ERR_POP((status != DCMF_SUCCESS), "Putacc returned with an error \n");

            /* Track per-target in-flight sends for later flushing. */
            OSPD_Connection_send_active[target]++;
        }
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}