/** Optimized implementation of the ARMCI IOV operation that issues all
  * transfers of the vector under a shared lock on the target's memory region.
  *
  * The memory region is looked up once, from the first shared-side pointer
  * (dst[0] for PUT and ACC, src[0] for GET), and that region is used for every
  * element of the vector.  By default a single lock/unlock pair surrounds all
  * transfers; when ARMCII_GLOBAL_STATE.iov_batched_limit is positive, the lock
  * is released and re-acquired after every iov_batched_limit transfers.
  *
  * @param[in] op         Operation to perform (ARMCII_OP_PUT, ARMCII_OP_GET or
  *                       ARMCII_OP_ACC)
  * @param[in] src        Array of count source pointers
  * @param[in] dst        Array of count destination pointers
  * @param[in] count      Number of entries in src and dst
  * @param[in] elem_count Size argument forwarded unchanged to gmr_put, gmr_get
  *                       or gmr_accumulate for every entry
  * @param[in] type       MPI datatype, used only for ARMCII_OP_ACC
  * @param[in] proc       Process id to target
  * @return               0 on success (including an empty vector), non-zero
  *                       if op is not a known operation
  */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
    MPI_Datatype type, int proc) {

  int i;
  gmr_t *mreg;
  void *shr_ptr;

  /* Reject unknown operations before touching the pointer arrays or taking
   * any lock. */
  switch(op) {
    case ARMCII_OP_PUT:
    case ARMCII_OP_GET:
    case ARMCII_OP_ACC:
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  /* Nothing to transfer: src[0]/dst[0] must not be read for an empty vector. */
  if (count <= 0)
    return 0;

  /* The shared-side pointer is the source for GET, the destination otherwise. */
  shr_ptr = (op == ARMCII_OP_GET) ? src[0] : dst[0];

  mreg = gmr_lookup(shr_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (i = 0; i < count; i++) {

    /* Cycle the lock at every batch boundary (never before the first
     * transfer) when a batch limit is configured. */
    if (   ARMCII_GLOBAL_STATE.iov_batched_limit > 0
        && i % ARMCII_GLOBAL_STATE.iov_batched_limit == 0
        && i > 0 )
    {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch(op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
        break;
      default:
        /* Not expected after the validation above; if it is ever reached,
         * do not return with the lock still held. */
        ARMCII_Error("unknown operation (%d)", op);
        gmr_unlock(mreg, proc);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
/** One-sided accumulate operation. * * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h) * @param[in] scale Pointer for a scalar of type datatype that will be used to * scale values in the source buffer * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] bytes Number of bytes to transfer * @param[in] proc Process id to target * @return 0 on success, non-zero on failure */ int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) { void *src_buf; int count, type_size, scaled; MPI_Datatype type; gmr_t *src_mreg, *dst_mreg; /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank); else src_mreg = NULL; dst_mreg = gmr_lookup(dst, proc); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if * needed. We hold the DLA lock if (src_buf == src && src_mreg != NULL). 
*/ scaled = ARMCII_Buf_acc_is_scaled(datatype, scale); if (scaled) { MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale); } else { src_buf = src; } /* Check if we need to copy: user requested it or same mem region */ if ( (src_buf == src) /* buf_prepare didn't make a copy */ && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) ) { MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); ARMCI_Copy(src, src_buf, bytes); } ARMCII_Acc_type_translate(datatype, &type, &type_size); count = bytes/type_size; ARMCII_Assert_msg(bytes % type_size == 0, "Transfer size is not a multiple of the datatype size"); /* TODO: Support a local accumulate operation more efficiently */ gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc); gmr_flush(dst_mreg, proc, 1); /* flush_local */ if (src_buf != src) MPI_Free_mem(src_buf); return 0; }