/** Optimized implementation of the ARMCI IOV operation that uses a single
  * lock/unlock pair around a batch of transfers, optionally re-acquiring the
  * lock every iov_batched_limit operations to flush the epoch.
  *
  * @param[in] op         Operation to perform (put, get, or accumulate)
  * @param[in] src        Array of source pointers, one per transfer
  * @param[in] dst        Array of destination pointers, one per transfer
  * @param[in] count      Number of transfers (length of src/dst arrays)
  * @param[in] elem_count Number of elements per transfer
  * @param[in] type       MPI datatype of the elements (used by accumulate)
  * @param[in] proc       Target process id
  * @return               Zero on success, non-zero on failure
  */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst,
    int count, int elem_count, MPI_Datatype type, int proc) {
  int i;
  gmr_t *mreg;
  void *shr_ptr;

  /* The remote-side pointer identifies which memory region we target */
  switch(op) {
    case ARMCII_OP_PUT:
      shr_ptr = dst[0];
      break;
    case ARMCII_OP_GET:
      shr_ptr = src[0];
      break;
    case ARMCII_OP_ACC:
      shr_ptr = dst[0];
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  mreg = gmr_lookup(shr_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (i = 0; i < count; i++) {
    /* When a batching limit is configured, drop and re-acquire the lock
     * every iov_batched_limit transfers (but not before the first one). */
    if (   ARMCII_GLOBAL_STATE.iov_batched_limit > 0
        && i % ARMCII_GLOBAL_STATE.iov_batched_limit == 0
        && i > 0 )
    {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch(op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
        break;
      default:
        /* Unreachable (op validated above).  Fix: release the window lock
         * before bailing out so the error path does not leak the lock. */
        gmr_unlock(mreg, proc);
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
/** One-sided put operation.
 *
 * @param[in] src    Source address (local to the calling process)
 * @param[in] dst    Destination address (on the target process)
 * @param[in] size   Number of bytes to transfer
 * @param[in] target Process id to target
 * @return           0 on success, non-zero on failure
 */
int ARMCI_Put(void *src, void *dst, int size, int target) {
  gmr_t *src_mreg, *dst_mreg;

  /* src is resolved against the local rank; dst against the target rank */
  src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  dst_mreg = gmr_lookup(dst, target);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Local operation: plain memory copy under DLA guard locks (unless the
   * user selected NOGUARD mode) */
  if (target == ARMCI_GROUP_WORLD.rank) {
    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
      gmr_dla_lock(dst_mreg);
      if (src_mreg) gmr_dla_lock(src_mreg);
    }

    ARMCI_Copy(src, dst, size);

    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
      gmr_dla_unlock(dst_mreg);
      if (src_mreg) gmr_dla_unlock(src_mreg);
    }
  }

  /* Origin buffer is private: it can be passed to MPI directly */
  else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) {
    gmr_lock(dst_mreg, target);
    gmr_put(dst_mreg, src, dst, size, target);
    gmr_unlock(dst_mreg, target);
  }

  /* COPY: Either origin and target buffers are in the same window and we can't
   * lock the same window twice (MPI semantics) or the user has requested
   * always-copy mode. */
  else {
    void *src_buf;

    MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);

    /* Snapshot the shared origin buffer into private memory under the DLA
     * lock, then do the put from the private copy */
    gmr_dla_lock(src_mreg);
    ARMCI_Copy(src, src_buf, size);
    gmr_dla_unlock(src_mreg);

    gmr_lock(dst_mreg, target);
    gmr_put(dst_mreg, src_buf, dst, size, target);
    gmr_unlock(dst_mreg, target);

    MPI_Free_mem(src_buf);
  }

  return 0;
}
/** Blocking operation that accumulates data from the local process into the
  * memory of the remote process.  The data transfer is strided and blocking.
  *
  * @param[in] datatype      Type of data to be transferred.
  * @param[in] scale         Pointer to the value that input data should be scaled by.
  * @param[in] src_ptr       Source starting address of the data block to put.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address to put data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension. count[0] should be the
  *                          number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] proc          Remote process ID (destination).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_AccS(int datatype, void *scale,
                void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {

  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    /* src_buf stays NULL until a private copy of the source is made; it is
     * the flag that decides between contiguous-copy and direct datatypes */
    void *src_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type, mpi_datatype;
    int scaled, mpi_datatype_size;

    ARMCII_Acc_type_translate(datatype, &mpi_datatype, &mpi_datatype_size);
    scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

    /* SCALE: copy and scale if requested */
    if (scaled) {
      armci_giov_t iov;
      int i, nelem;

      if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
        gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      /* Total element count: count[0] is in bytes, higher dims are counts */
      for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
        nelem *= count[i];

      MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);

      if (gmr_loc != NULL) gmr_dla_lock(gmr_loc);

      /* Shoehorn the strided information into an IOV */
      ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, src_ptr, src_stride_ar, count, stride_levels);

      /* Scale each strided segment into the packed contiguous buffer */
      for (i = 0; i < iov.ptr_array_len; i++)
        ARMCII_Buf_acc_scale(iov.src_ptr_array[i], ((uint8_t*)src_buf) + i*iov.bytes, iov.bytes, datatype, scale);

      free(iov.src_ptr_array);
      free(iov.dst_ptr_array);

      if (gmr_loc != NULL) gmr_dla_unlock(gmr_loc);

      MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
    }

    /* COPY: Guard shared buffers */
    else if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      /* Only copy when the source actually lives in a shared region */
      if (gmr_loc != NULL) {
        int i, nelem;

        for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
          nelem *= count[i];

        MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
        ARMCII_Assert(src_buf != NULL);

        /* Pack the strided source into the contiguous buffer under DLA lock */
        gmr_dla_lock(gmr_loc);
        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
        gmr_dla_unlock(gmr_loc);

        MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
      }
    }

    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (src_buf == NULL) {
      src_buf = src_ptr;
      ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, mpi_datatype, &src_type);
    }

    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, mpi_datatype, &dst_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    /* Sanity check: origin and target datatypes must move the same bytes */
    int src_size, dst_size;

    MPI_Type_size(src_type, &src_size);
    MPI_Type_size(dst_type, &dst_size);

    ARMCII_Assert(src_size == dst_size);

    mreg = gmr_lookup(dst_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    /* COPY/SCALE: Free temp buffer */
    if (src_buf != src_ptr)
      MPI_Free_mem(src_buf);

    err = 0;

  } else {
    /* Fall back to the vector (IOV) accumulate path */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_AccV(datatype, scale, &iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Blocking operation that transfers data from the remote process to the
  * memory of the calling process.  The data transfer is strided and blocking.
  *
  * @param[in] src_ptr       Source starting address of the data block to get.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address to put data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension. count[0] should be the
  *                          number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] proc          Remote process ID (source).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {

  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    /* dst_buf stays NULL unless a private landing buffer is allocated below */
    void *dst_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type;

    /* COPY: Guard shared buffers */
    if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank);

      /* Only copy when the destination lives in a shared region */
      if (gmr_loc != NULL) {
        int i, size;

        /* Total transfer size in bytes (count[0] is already bytes) */
        for (i = 1, size = count[0]; i < stride_levels+1; i++)
          size *= count[i];

        MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf);
        ARMCII_Assert(dst_buf != NULL);

        /* Land the data contiguously; it is unpacked to dst_ptr afterwards */
        MPI_Type_contiguous(size, MPI_BYTE, &dst_type);
      }
    }

    /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (dst_buf == NULL) {
      dst_buf = dst_ptr;
      ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type);
    }

    ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    mreg = gmr_lookup(src_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    /* COPY: Finish the transfer — unpack the contiguous landing buffer into
     * the strided destination under the DLA lock (gmr_loc is non-NULL here,
     * since dst_buf was only allocated when the lookup succeeded) */
    if (dst_buf != dst_ptr) {
      gmr_dla_lock(gmr_loc);
      armci_read_strided(dst_ptr, stride_levels, dst_stride_ar, count, dst_buf);
      gmr_dla_unlock(gmr_loc);
      MPI_Free_mem(dst_buf);
    }

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    err = 0;

  } else {
    /* Fall back to the vector (IOV) get path */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_GetV(&iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Optimized implementation of the ARMCI IOV operation that uses an MPI
  * datatype to achieve a one-sided gather/scatter.  Does not use MPI_BOTTOM
  * on the origin side: the first local pointer serves as the origin
  * datatype's base address.
  *
  * @param[in] op         Operation to perform (put, get, or accumulate)
  * @param[in] src        Array of source pointers, one per transfer
  * @param[in] dst        Array of destination pointers, one per transfer
  * @param[in] count      Number of transfers (length of src/dst arrays)
  * @param[in] elem_count Number of elements per transfer
  * @param[in] type       MPI datatype of the elements
  * @param[in] proc       Target process id
  * @return               Zero on success, non-zero on failure
  */
int ARMCII_Iov_op_datatype_no_bottom(enum ARMCII_Op_e op, void **src, void **dst,
    int count, int elem_count, MPI_Datatype type, int proc) {

    gmr_t *mreg;
    MPI_Datatype type_loc, type_rem;
    MPI_Aint disp_loc[count];
    int disp_rem[count];
    int block_len[count];
    void *dst_win_base;
    int dst_win_size, i, type_size;
    void **buf_rem, **buf_loc;
    MPI_Aint base_rem;
    MPI_Aint base_loc;
    void *base_loc_ptr;

    /* For put/acc the remote side is dst; for get it is src */
    switch(op) {
      case ARMCII_OP_ACC:
      case ARMCII_OP_PUT:
        buf_rem = dst;
        buf_loc = src;
        break;
      case ARMCII_OP_GET:
        buf_rem = src;
        buf_loc = dst;
        break;
      default:
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }

    MPI_Type_size(type, &type_size);

    mreg = gmr_lookup(buf_rem[0], proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

    dst_win_base = mreg->slices[proc].base;
    dst_win_size = mreg->slices[proc].size;

    MPI_Get_address(dst_win_base, &base_rem);

    /* Pick a base address for the start of the origin's datatype */
    base_loc_ptr = buf_loc[0];
    MPI_Get_address(base_loc_ptr, &base_loc);

    for (i = 0; i < count; i++) {
      MPI_Aint target_rem, target_loc;
      MPI_Get_address(buf_loc[i], &target_loc);
      MPI_Get_address(buf_rem[i], &target_rem);
      disp_loc[i]  =  target_loc - base_loc;              /* bytes    */
      disp_rem[i]  = (target_rem - base_rem)/type_size;   /* elements */
      block_len[i] = elem_count;                          /* elements */

      ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0,
          "Transfer size is not a multiple of type size");
      ARMCII_Assert_msg(disp_rem[i] >= 0 && disp_rem[i] < dst_win_size,
          "Invalid remote pointer");
      /* Bug fix: block_len[i] is an element count while the window size is in
       * bytes, so the end-of-window bound must be scaled by type_size.  The
       * old check silently passed out-of-bounds transfers when type_size > 1. */
      ARMCII_Assert_msg(((uint8_t*)buf_rem[i]) + block_len[i]*type_size <= ((uint8_t*)dst_win_base) + dst_win_size,
          "Transfer exceeds buffer length");
    }

    /* Origin: byte displacements relative to base_loc_ptr.
     * Target: uniform-length blocks at element displacements. */
    MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
    MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
    //MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem);

    MPI_Type_commit(&type_loc);
    MPI_Type_commit(&type_rem);

    gmr_lock(mreg, proc);

    switch(op) {
      case ARMCII_OP_ACC:
        gmr_accumulate_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
        break;
      case ARMCII_OP_PUT:
        gmr_put_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, base_loc_ptr, 1, type_loc, proc);
        break;
      default:
        /* Unreachable (op validated above); release the lock and free the
         * datatypes so the error path does not leak resources. */
        gmr_unlock(mreg, proc);
        MPI_Type_free(&type_loc);
        MPI_Type_free(&type_rem);
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }

    gmr_unlock(mreg, proc);

    MPI_Type_free(&type_loc);
    MPI_Type_free(&type_rem);

    return 0;
}
/** One-sided accumulate operation.
  *
  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
  *                     scale values in the source buffer
  * @param[in] src      Source address (local to the calling process)
  * @param[in] dst      Destination address (on the target process)
  * @param[in] bytes    Number of bytes to transfer
  * @param[in] proc     Process id to target
  * @return             0 on success, non-zero on failure
  */
int ARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void *src_buf;
  int count, type_size, scaled, src_is_locked = 0;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  /* src is resolved against the local rank; dst against the target rank */
  src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  dst_mreg = gmr_lookup(dst, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed. We hold the DLA lock if (src_buf == src && src_mreg != NULL). */
  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

  if (src_mreg && ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
    gmr_dla_lock(src_mreg);
    src_is_locked = 1;
  }

  if (scaled) {
    /* Scaling always produces a private copy of the source */
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region (MPI
   * forbids locking the same window twice) */
  if ( (src_buf == src) /* buf_prepare didn't make a copy */
      && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) ) {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  /* Unlock early if src_buf is a copy */
  if (src_buf != src && src_is_locked) {
    gmr_dla_unlock(src_mreg);
    src_is_locked = 0;
  }

  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0,
      "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */

  gmr_lock(dst_mreg, proc);
  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_unlock(dst_mreg, proc);

  /* Still holding the DLA lock here means src_buf aliases src — release now */
  if (src_is_locked) {
    gmr_dla_unlock(src_mreg);
    src_is_locked = 0;
  }

  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}