int A1_Flush(int proc) { int status = A1_SUCCESS; A1U_FUNC_ENTER(); # ifdef HAVE_ERROR_CHECKING # endif # ifdef A1_TAU_PROFILING { TAU_TRACE_SENDMSG (A1_TAU_TAG_FLUSH, proc, 8); } # endif status = A1D_Flush(proc); A1U_ERR_POP(status!=A1_SUCCESS, "A1D_Flush returned an error\n"); fn_exit: A1U_FUNC_EXIT(); return status; fn_fail: goto fn_exit; }
/*
 * OSP_NbGet - non-blocking get entry point.
 *
 * target:     rank of the process addressed by the operation.
 * src, dst:   buffers, forwarded unchanged to the selected implementation.
 * bytes:      transfer size, also used to decide which path is taken.
 * osp_handle: passed to OSPD_NbGet only; the local copy path does not use it.
 *
 * When `target` is the calling process and `bytes` is below
 * ospu_settings.network_bypass_upper_limit_1d, the request is served by
 * OSPU_Get_memcpy; every other request goes to OSPD_NbGet.
 *
 * Returns the status of whichever call was made.
 */
int OSP_NbGet(int target, void* src, void* dst, int bytes, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef OSP_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG(OSP_TAU_TAG_NBGET, target, bytes);
    }
#endif

    if (target != my_rank || bytes >= ospu_settings.network_bypass_upper_limit_1d)
    {
        /* Remote target, or too large for the local shortcut. */
        status = OSPD_NbGet(target, src, dst, bytes, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_NbGet returned an error\n");
    }
    else
    {
        /* Small self-targeted request: plain local copy. */
        status = OSPU_Get_memcpy(src, dst, bytes);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_Get_memcpy returned an error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * A1_NbPutAcc - non-blocking put-accumulate entry point.
 *
 * target:                 rank of the process addressed by the operation.
 * source_ptr, target_ptr: buffers, forwarded unchanged.
 * bytes:                  transfer size, forwarded unchanged.
 * a1_type, scaling:       element type and scaling argument, forwarded
 *                         unchanged.
 * a1_handle:              passed to A1D_NbPutAcc only; the local path does
 *                         not use it.
 *
 * A self-targeted request is served by A1U_Acc_memcpy whenever
 * a1u_settings.network_bypass is set; no size threshold is consulted.
 * Everything else goes to A1D_NbPutAcc.
 *
 * Returns the status of whichever call was made.
 */
int A1_NbPutAcc(int target,
                void* source_ptr,
                void* target_ptr,
                int bytes,
                A1_datatype_t a1_type,
                void* scaling,
                A1_handle_t a1_handle)
{
    int status = A1_SUCCESS;
    int my_rank = A1D_Process_id(A1_GROUP_WORLD);

    A1U_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef A1_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG(A1_TAU_TAG_NBPUTACC, target, bytes);
    }
#endif

    /* Bypass is ALWAYS better for accumulate, so no threshold is tested. */
    if (target != my_rank || !a1u_settings.network_bypass)
    {
        status = A1D_NbPutAcc(target,
                              source_ptr,
                              target_ptr,
                              bytes,
                              a1_type,
                              scaling,
                              a1_handle);
        A1U_ERR_POP((status!=A1_SUCCESS), "A1D_NbPutAcc returned error\n");
    }
    else
    {
        status = A1U_Acc_memcpy(source_ptr, target_ptr, bytes, a1_type, scaling);
        A1U_ERR_POP(status != A1_SUCCESS, "A1U_Acc_memcpy returned an error\n");
    }

  fn_exit:
    A1U_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * OSP_NbPutAccV - non-blocking vector (I/O-vector) put-accumulate entry point.
 *
 * target:            rank of the process addressed by the operation.
 * iov_ar, ar_len:    array of I/O-vector descriptors and its length,
 *                    forwarded unchanged.
 * osp_type, scaling: element type and scaling argument, forwarded unchanged.
 * osp_handle:        passed to OSPD_NbPutAccV only; the local path does not
 *                    use it.
 *
 * A self-targeted request is served by OSPU_AccV_memcpy whenever
 * ospu_settings.network_bypass is set; no size threshold is consulted.
 * Everything else goes to OSPD_NbPutAccV.
 *
 * Returns the status of whichever call was made.
 */
int OSP_NbPutAccV(int target,
                  OSP_iov_t *iov_ar,
                  int ar_len,
                  OSP_datatype_t osp_type,
                  void* scaling,
                  OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef OSP_TAU_PROFILING
    {
        /* Report the summed size of all vector entries to TAU. */
        int i, total_bytes = 0;
        for (i = 0; i < ar_len; i++)
            total_bytes += iov_ar[i].ptr_array_len * iov_ar[i].bytes;
        TAU_TRACE_SENDMSG(OSP_TAU_TAG_NBPUTACCV, target, total_bytes);
    }
#endif

    /* Bypass is ALWAYS better for accumulate; we do not test against threshold. */
    if (target == my_rank && ospu_settings.network_bypass)
    {
        status = OSPU_AccV_memcpy(iov_ar, ar_len, osp_type, scaling);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_AccV_memcpy returned an error\n");
    }
    else
    {
        status = OSPD_NbPutAccV(target, iov_ar, ar_len, osp_type, scaling, osp_handle);
        /* Compare against OSP_SUCCESS explicitly, like every other status
         * check in this API, rather than relying on its numeric value. */
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_NbPutAccV returned error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * OSP_NbPut - non-blocking put entry point.
 *
 * target:     rank of the process addressed by the operation.
 * src, dst:   buffers, forwarded unchanged to the selected implementation.
 * bytes:      transfer size, also used to decide which path is taken.
 * osp_handle: passed to OSPD_NbPut only; the local copy path does not use it.
 *
 * When `target` is the calling process and `bytes` is below
 * ospu_settings.network_bypass_upper_limit_1d, the request is served by
 * OSPU_Put_memcpy; every other request goes to OSPD_NbPut.
 *
 * Returns the status of whichever call was made.
 */
int OSP_NbPut(int target, void* src, void* dst, int bytes, OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef OSP_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG(OSP_TAU_TAG_NBPUT, target, bytes);
    }
#endif

    /* It is not clear what the right bypass strategy is here. The
     * OSPU_*_memcpy routines are blocking, but the overhead of going into
     * DCMF_Put is likely not worth the savings from that call being
     * non-blocking. This is especially true under heavy load, since we have
     * determined that DMA vs. memcpy turns over when the NIC is getting
     * hammered. */
    if (target != my_rank || bytes >= ospu_settings.network_bypass_upper_limit_1d)
    {
        /* Remote target, or too large for the local shortcut. */
        status = OSPD_NbPut(target, src, dst, bytes, osp_handle);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPD_NbPut returned an error\n");
    }
    else
    {
        /* Small self-targeted request: plain local copy. */
        status = OSPU_Put_memcpy(src, dst, bytes);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_Put_memcpy returned an error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * A1_PutModV - vector (I/O-vector) put-modify entry point.
 *
 * target:         rank of the process addressed by the operation.
 * iov_ar, ar_len: array of I/O-vector descriptors and its length,
 *                 forwarded unchanged.
 * a1_op, a1_type: reduce operation and element type, forwarded unchanged.
 *
 * A self-targeted request is served by A1U_ModV_memcpy whenever
 * a1u_settings.network_bypass is set; no size threshold is consulted.
 * Everything else goes to A1D_PutModV.
 *
 * Returns the status of whichever call was made.
 */
int A1_PutModV(int target,
               A1_iov_t *iov_ar,
               int ar_len,
               A1_reduce_op_t a1_op,
               A1_datatype_t a1_type)
{
    int status = A1_SUCCESS;
    int my_rank = A1D_Process_id(A1_GROUP_WORLD);

    A1U_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef A1_TAU_PROFILING
    {
        /* Report the summed size of all vector entries to TAU. */
        int i, total_bytes = 0;
        for (i = 0; i < ar_len; i++)
            total_bytes += iov_ar[i].ptr_array_len * iov_ar[i].bytes;
        TAU_TRACE_SENDMSG(A1_TAU_TAG_PUTMODV, target, total_bytes);
    }
#endif

    /* As for the accumulate operations, the local bypass is taken whenever
     * it is enabled; we do not test against a size threshold. */
    if (target == my_rank && a1u_settings.network_bypass)
    {
        status = A1U_ModV_memcpy(iov_ar, ar_len, a1_op, a1_type);
        A1U_ERR_POP(status != A1_SUCCESS, "A1U_ModV_memcpy returned an error\n");
    }
    else
    {
        status = A1D_PutModV(target, iov_ar, ar_len, a1_op, a1_type);
        /* Compare against A1_SUCCESS explicitly, like every other status
         * check in this API, rather than relying on its numeric value. */
        A1U_ERR_POP(status != A1_SUCCESS, "A1D_PutModV returned error\n");
    }

  fn_exit:
    A1U_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * A1_Rmw - read-modify-write entry point.
 *
 * Every argument is handed unchanged to A1D_Rmw; this layer adds only the
 * function enter/exit hooks, optional TAU tracing and the status check.
 *
 * target:         rank of the process addressed by the operation.
 * source_ptr_in,
 * source_ptr_out,
 * target_ptr:     buffers forwarded to the device layer.
 * bytes:          size forwarded to the device layer and reported to TAU.
 * op, a1_type:    atomic operation and element type, forwarded unchanged.
 *
 * Returns the status value obtained from A1D_Rmw.
 */
int A1_Rmw(int target,
           void* source_ptr_in,
           void* source_ptr_out,
           void* target_ptr,
           int bytes,
           A1_atomic_op_t op,
           A1_datatype_t a1_type)
{
    int status = A1_SUCCESS;

    A1U_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef A1_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG(A1_TAU_TAG_RMW, target, bytes);
    }
#endif

    status = A1D_Rmw(target,
                     source_ptr_in,
                     source_ptr_out,
                     target_ptr,
                     bytes,
                     op,
                     a1_type);
    A1U_ERR_POP(status!=A1_SUCCESS, "A1D_Rmw returned an error\n");

  fn_exit:
    A1U_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * OSP_Rmw - read-modify-write entry point.
 *
 * Every argument is handed unchanged to OSPD_Rmw; this layer adds only the
 * function enter/exit hooks, optional TAU tracing and the status check.
 *
 * target:         rank of the process addressed by the operation.
 * source_ptr_in,
 * source_ptr_out,
 * target_ptr:     buffers forwarded to the device layer.
 * bytes:          size forwarded to the device layer and reported to TAU.
 * op, osp_type:   atomic operation and element type, forwarded unchanged.
 *
 * Returns the status value obtained from OSPD_Rmw.
 */
int OSP_Rmw(int target,
            void* source_ptr_in,
            void* source_ptr_out,
            void* target_ptr,
            int bytes,
            OSP_atomic_op_t op,
            OSP_datatype_t osp_type)
{
    int status = OSP_SUCCESS;

    OSPU_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef OSP_TAU_PROFILING
    {
        TAU_TRACE_SENDMSG(OSP_TAU_TAG_RMW, target, bytes);
    }
#endif

    status = OSPD_Rmw(target,
                      source_ptr_in,
                      source_ptr_out,
                      target_ptr,
                      bytes,
                      op,
                      osp_type);
    OSPU_ERR_POP(status!=OSP_SUCCESS, "OSPD_Rmw returned an error\n");

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}
/*
 * OSP_NbPutAccS - non-blocking strided put-accumulate entry point.
 *
 * target:                     rank of the process addressed by the operation.
 * stride_level, block_sizes:  strided layout description, forwarded
 *                             unchanged.
 * source_ptr, src_stride_ar:  source buffer and its stride array.
 * target_ptr, trg_stride_ar:  target buffer and its stride array.
 * osp_type, scaling:          element type and scaling argument, forwarded
 *                             unchanged.
 * osp_handle:                 passed to OSPI_Recursive_PutAcc only; the
 *                             local path does not use it.
 *
 * A self-targeted request is served by OSPU_AccS_local whenever
 * ospu_settings.network_bypass is set; no size threshold is consulted.
 * Everything else goes to OSPI_Recursive_PutAcc.
 *
 * Returns the status of whichever call was made.
 */
int OSP_NbPutAccS(int target,
                  int stride_level,
                  int *block_sizes,
                  void* source_ptr,
                  int *src_stride_ar,
                  void* target_ptr,
                  int *trg_stride_ar,
                  OSP_datatype_t osp_type,
                  void* scaling,
                  OSP_handle_t osp_handle)
{
    int status = OSP_SUCCESS;
    int my_rank = OSPD_Process_id(OSP_GROUP_WORLD);

    OSPU_FUNC_ENTER();

#ifdef HAVE_ERROR_CHECKING
    /* No argument checks are implemented for this call. */
#endif

#ifdef OSP_TAU_PROFILING
    {
        /* Total size is the product of the block sizes over all levels.
         * The previous code referred to names that do not exist in this
         * function (stride_levels, count, total_bytes), so it could not
         * compile with profiling enabled.
         * NOTE(review): assumes block_sizes holds stride_level + 1 entries,
         * matching the inclusive loop bound - confirm against callers. */
        int i, total_bytes = 1;
        for (i = 0; i <= stride_level; i++)
            total_bytes *= block_sizes[i];
        TAU_TRACE_SENDMSG(OSP_TAU_TAG_NBPUTACCS, target, total_bytes);
    }
#endif

    /* Bypass is ALWAYS better for accumulate; we do not test against threshold. */
    if (target == my_rank && ospu_settings.network_bypass)
    {
        status = OSPU_AccS_local(stride_level,
                                 block_sizes,
                                 source_ptr,
                                 src_stride_ar,
                                 target_ptr,
                                 trg_stride_ar,
                                 osp_type,
                                 scaling);
        OSPU_ERR_POP(status != OSP_SUCCESS, "OSPU_AccS_local returned an error\n");
    }
    else
    {
        status = OSPI_Recursive_PutAcc(target,
                                       stride_level,
                                       block_sizes,
                                       source_ptr,
                                       src_stride_ar,
                                       target_ptr,
                                       trg_stride_ar,
                                       osp_type,
                                       scaling,
                                       osp_handle);
        OSPU_ERR_POP(status!=OSP_SUCCESS, "OSPI_Recursive_PutAcc returned error\n");
    }

  fn_exit:
    OSPU_FUNC_EXIT();
    return status;

  fn_fail:
    goto fn_exit;
}