/** Prepare a set of buffers for use with an accumulate operation. The * returned set of buffers is guaranteed to be in private space and scaled. * Copies will be made if needed, the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @param[in] datatype The type of the buffer. * @param[in] scale Scaling constant to apply to each buffer. * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_acc_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size, int datatype, void *scale) { void **new_bufs; int i, scaled, num_moved = 0; new_bufs = malloc(count*sizeof(void*)); ARMCII_Assert(new_bufs != NULL); scaled = ARMCII_Buf_acc_is_scaled(datatype, scale); for (i = 0; i < count; i++) { gmr_t *mreg = NULL; // Check if the source buffer is within a shared region. if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (scaled) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); // Lock if needed so we can directly access the buffer if (mreg != NULL) gmr_dla_lock(mreg); ARMCII_Buf_acc_scale(orig_bufs[i], new_bufs[i], size, datatype, scale); if (mreg != NULL) gmr_dla_unlock(mreg); } else { new_bufs[i] = orig_bufs[i]; } if (mreg != NULL) { // If the buffer wasn't copied, we should copy it into a private buffer if (new_bufs[i] == orig_bufs[i]) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); gmr_dla_unlock(mreg); } } if (new_bufs[i] == orig_bufs[i]) num_moved++; } *new_bufs_ptr = new_bufs; return num_moved; }
/** One-sided put operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int ARMCI_Put(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank); dst_mreg = gmr_lookup(dst, target); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank) { if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_lock(dst_mreg); if (src_mreg) gmr_dla_lock(src_mreg); } ARMCI_Copy(src, dst, size); if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_unlock(dst_mreg); if (src_mreg) gmr_dla_unlock(src_mreg); } } /* Origin buffer is private */ else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) { gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src, dst, size, target); gmr_unlock(dst_mreg, target); } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *src_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); gmr_dla_lock(src_mreg); ARMCI_Copy(src, src_buf, size); gmr_dla_unlock(src_mreg); gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src_buf, dst, size, target); gmr_unlock(dst_mreg, target); MPI_Free_mem(src_buf); } return 0; }
/** Declare the end of a local access epoch.
 *
 * \note MPI-2 does not allow multiple locks at once, so you can have only one
 * access epoch open at a time and cannot do put/get/acc while in an access
 * region.
 *
 * @param[in] ptr Pointer to the allocation that was accessed directly
 */
void ARMCI_Access_end(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");

  // Release the direct-local-access lock taken by the matching access-begin.
  gmr_dla_unlock(region);
}
/** Finish a set of prepared buffers.  Will perform communication and copies
 * as needed to ensure results are in the original buffers.  Temporary space
 * will be freed.
 *
 * @param[in]  orig_bufs Original set of buffers.
 * @param[out] new_bufs  Set of private buffers.
 * @param[in]  count     Number of entries in the buffer list.
 * @param[in]  size      The size of the buffers (all are of the same size).
 */
void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, int size) {
  int idx;

  // In NOGUARD mode prepare handed the caller the original buffer array, so
  // there is nothing to copy back or release.
  if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD)
    return;

  for (idx = 0; idx < count; idx++) {
    gmr_t *mreg;

    if (orig_bufs[idx] == new_bufs[idx])
      continue;  // Buffer was used in place; nothing to restore.

    mreg = gmr_lookup(orig_bufs[idx], ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert(mreg != NULL);

    // Copy the private result back into the shared buffer under the DLA lock.
    gmr_dla_lock(mreg);
    ARMCI_Copy(new_bufs[idx], orig_bufs[idx], size);
    gmr_dla_unlock(mreg);

    MPI_Free_mem(new_bufs[idx]);
  }

  free(new_bufs);
}
/** Prepare a set of buffers for use with a put operation. The returned set of * buffers is guaranteed to be in private space. Copies will be made if needed, * the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_read_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) { int num_moved = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { void **new_bufs = malloc(count*sizeof(void*)); int i; for (i = 0; i < count; i++) new_bufs[i] = NULL; for (i = 0; i < count; i++) { // Check if the source buffer is within a shared region. If so, copy it // into a private buffer. gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (mreg != NULL) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); // gmr_get(mreg, orig_bufs[i], new_bufs[i], size, ARMCI_GROUP_WORLD.rank); gmr_dla_unlock(mreg); num_moved++; } else { new_bufs[i] = orig_bufs[i]; } } *new_bufs_ptr = new_bufs; } else { *new_bufs_ptr = orig_bufs; } return num_moved; }
/** Destroy/free a shared memory region.  Collective over the allocation
 * group: every member must call this, even members that pass NULL (which is
 * permitted when a process allocated 0 bytes).
 *
 * @param[in] mreg  Memory region to destroy, or NULL if this process passed
 *                  NULL to the free call.
 * @param[in] group Group on which to perform the free.
 */
void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
  int   search_proc_in, search_proc_out, search_proc_out_grp;
  void *search_base;
  int   alloc_me, alloc_nproc;
  int   world_me, world_nproc;

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
   * can support passing NULL into ARMCI_Free() which is permitted when a
   * process allocates 0 bytes.  Unfortunately, in this case we still need to
   * identify the mem region and free it. */
  if (mreg == NULL)
    search_proc_in = -1;
  else {
    search_proc_in = world_me;
    search_base    = mreg->slices[world_me].base;
  }

  /* Collectively decide on who will provide the base address.  MPI_MAX picks
   * the highest world rank that has a non-NULL region (all-NULL gives -1). */
  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);

  /* Everyone passed NULL.  Nothing to free. */
  if (search_proc_out < 0)
    return;

  /* Translate world rank to group rank so it can root the broadcast below. */
  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);

  /* Broadcast the base address; search_base is an output on non-root ranks. */
  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);

  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
  if (mreg == NULL)
    mreg = gmr_lookup(search_base, search_proc_out);

  /* If it's still not found, the user may have passed the wrong group */
  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");

  /* A region locked for direct local access can be recovered with a warning;
   * any other lock state is a hard error. */
  switch (mreg->lock_state) {
    case GMR_LOCK_UNLOCKED:
      break;
    case GMR_LOCK_DLA:
      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
      gmr_dla_unlock(mreg);
      break;
    default:
      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
  }

  /* Remove from the (doubly linked) list of mem regions. */
  if (mreg->prev == NULL) {
    ARMCII_Assert(gmr_list == mreg);
    mreg->next->prev = NULL;
    gmr_list = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = NULL;
  } else {
    mreg->prev->next = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = mreg->prev;
  }

  /* Destroy the window and free all buffers */
  MPI_Win_free(&mreg->window);

  if (mreg->slices[world_me].base != NULL)
    MPI_Free_mem(mreg->slices[world_me].base);

  free(mreg->slices);
  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
  free(mreg);
}
/** Blocking operation that accumulates data from the local process into the
 * memory of the remote process.  The data transfer is strided and blocking.
 *
 * @param[in] datatype      Type of data to be transferred.
 * @param[in] scale         Pointer to the value that input data should be scaled by.
 * @param[in] src_ptr       Source starting address of the data block to put.
 * @param[in] src_stride_ar Source array of stride distances in bytes.
 * @param[in] dst_ptr       Destination starting address to put data.
 * @param[in] dst_stride_ar Destination array of stride distances in bytes.
 * @param[in] count         Block size in each dimension.  count[0] should be the
 *                          number of bytes of contiguous data in leading dimension.
 * @param[in] stride_levels The level of strides.
 * @param[in] proc          Remote process ID (destination).
 *
 * @return                  Zero on success, error code otherwise.
 */
int PARMCI_AccS(int datatype, void *scale,
                void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {

  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    /* src_buf stays NULL until a staging copy is made; a NULL value after the
     * scale/copy stages below means the source is used directly (NOGUARD). */
    void *src_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type, mpi_datatype;
    int scaled, mpi_datatype_size;

    ARMCII_Acc_type_translate(datatype, &mpi_datatype, &mpi_datatype_size);
    scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

    /* SCALE: copy and scale if requested */
    if (scaled) {
      armci_giov_t iov;
      int i, nelem;

      if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
        gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      /* Total element count: count[0] is in bytes, higher dims are counts. */
      for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
        nelem *= count[i];

      MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);

      /* Hold the DLA lock if the source lies in a shared region. */
      if (gmr_loc != NULL) gmr_dla_lock(gmr_loc);

      /* Shoehorn the strided information into an IOV (src passed twice: we
       * only need the source-side pointer array here). */
      ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, src_ptr, src_stride_ar, count, stride_levels);

      /* Scale each contiguous segment into the packed staging buffer. */
      for (i = 0; i < iov.ptr_array_len; i++)
        ARMCII_Buf_acc_scale(iov.src_ptr_array[i], ((uint8_t*)src_buf) + i*iov.bytes, iov.bytes, datatype, scale);

      free(iov.src_ptr_array);
      free(iov.dst_ptr_array);

      if (gmr_loc != NULL) gmr_dla_unlock(gmr_loc);

      /* Staging buffer is packed, so its datatype is simply contiguous. */
      MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
    }

    /* COPY: Guard shared buffers */
    else if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      if (gmr_loc != NULL) {
        int i, nelem;

        for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
          nelem *= count[i];

        MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
        ARMCII_Assert(src_buf != NULL);

        /* Pack the strided source into the contiguous staging buffer. */
        gmr_dla_lock(gmr_loc);
        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
        gmr_dla_unlock(gmr_loc);

        MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
      }
    }

    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (src_buf == NULL) {
      src_buf = src_ptr;
      ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, mpi_datatype, &src_type);
    }

    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, mpi_datatype, &dst_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    /* Sanity check: source and destination describe the same amount of data. */
    int src_size, dst_size;

    MPI_Type_size(src_type, &src_size);
    MPI_Type_size(dst_type, &dst_size);

    ARMCII_Assert(src_size == dst_size);

    mreg = gmr_lookup(dst_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    /* COPY/SCALE: Free temp buffer */
    if (src_buf != src_ptr)
      MPI_Free_mem(src_buf);

    err = 0;

  } else {
    /* Fallback: express the strided transfer as an IOV operation. */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_AccV(datatype, scale, &iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Blocking operation that transfers data from the remote process to the
 * memory of the calling process.  The data transfer is strided and blocking.
 *
 * @param[in] src_ptr       Source starting address of the data block to put.
 * @param[in] src_stride_ar Source array of stride distances in bytes.
 * @param[in] dst_ptr       Destination starting address to put data.
 * @param[in] dst_stride_ar Destination array of stride distances in bytes.
 * @param[in] count         Block size in each dimension.  count[0] should be the
 *                          number of bytes of contiguous data in leading dimension.
 * @param[in] stride_levels The level of strides.
 * @param[in] proc          Remote process ID (destination).
 *
 * @return                  Zero on success, error code otherwise.
 */
int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {

  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    /* dst_buf stays NULL unless COPY mode stages through a private buffer. */
    void *dst_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type;

    /* COPY: Guard shared buffers */
    if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank);

      if (gmr_loc != NULL) {
        int i, size;

        /* Total transfer size in bytes (count[0] is already in bytes). */
        for (i = 1, size = count[0]; i < stride_levels+1; i++)
          size *= count[i];

        MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf);
        ARMCII_Assert(dst_buf != NULL);

        /* Staging buffer is contiguous; unpacking happens after the get. */
        MPI_Type_contiguous(size, MPI_BYTE, &dst_type);
      }
    }

    /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (dst_buf == NULL) {
      dst_buf = dst_ptr;
      ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type);
    }

    ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    mreg = gmr_lookup(src_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    /* COPY: Finish the transfer by unpacking the staging buffer into the
     * (shared) destination under the DLA lock.  gmr_loc is non-NULL here
     * because dst_buf != dst_ptr only happens in the COPY branch above. */
    if (dst_buf != dst_ptr) {
      gmr_dla_lock(gmr_loc);
      armci_read_strided(dst_ptr, stride_levels, dst_stride_ar, count, dst_buf);
      gmr_dla_unlock(gmr_loc);
      MPI_Free_mem(dst_buf);
    }

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    err = 0;

  } else {
    /* Fallback: express the strided transfer as an IOV operation. */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_GetV(&iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** One-sided accumulate operation.
 *
 * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
 * @param[in] scale    Pointer for a scalar of type datatype that will be used to
 *                     scale values in the source buffer
 * @param[in] src      Source address (remote)
 * @param[in] dst      Destination address (local)
 * @param[in] bytes    Number of bytes to transfer
 * @param[in] proc     Process id to target
 * @return             0 on success, non-zero on failure
 */
int ARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void *src_buf;
  int   count, type_size, scaled, src_is_locked = 0;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  dst_mreg = gmr_lookup(dst, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed.  We hold the DLA lock if (src_buf == src && src_mreg != NULL). */

  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

  /* Shared source buffers need DLA protection while we read them below. */
  if (src_mreg && ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
    gmr_dla_lock(src_mreg);
    src_is_locked = 1;
  }

  if (scaled) {
    /* Scaling implies a private copy: scale src into a fresh buffer. */
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region (we can't
   * lock the same window twice under MPI RMA semantics). */
  if (   (src_buf == src) /* scaling above didn't already make a copy */
      && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg)) {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  /* Unlock early if src_buf is a copy — the original shared buffer is no
   * longer touched past this point. */
  if (src_buf != src && src_is_locked) {
    gmr_dla_unlock(src_mreg);
    src_is_locked = 0;
  }

  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0,
      "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */

  gmr_lock(dst_mreg, proc);
  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_unlock(dst_mreg, proc);

  /* Still locked here only when src was used in place (src_buf == src). */
  if (src_is_locked) {
    gmr_dla_unlock(src_mreg);
    src_is_locked = 0;
  }

  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}