/** Check if a set of pointers all corresponds to the same allocation. * * @param[in] ptrs An array of count shared pointers valid on proc. * @param[in] count Size of the ptrs array. * @param[in] proc Process on which the pointers are valid. * @return Non-zero (true) on success, zero (false) otherwise. */ int ARMCII_Iov_check_same_allocation(void **ptrs, int count, int proc) { int i; gmr_t *mreg; void *base, *extent; if (ARMCII_GLOBAL_STATE.iov_checks_disabled) return 1; mreg = gmr_lookup(ptrs[0], proc); /* If local, all must be local */ if (mreg == NULL) { for (i = 1; i < count; i++) { mreg = gmr_lookup(ptrs[i], proc); if (mreg != NULL) return 0; } } /* If shared, all must fall in this region */ else { base = mreg->slices[proc].base; extent = ((uint8_t*) base) + mreg->slices[proc].size; for (i = 1; i < count; i++) if ( !(ptrs[i] >= base && ptrs[i] < extent) ) return 0; } return 1; }
/** One-sided accumulate operation.
  *
  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
  *                     scale values in the source buffer
  * @param[in] src      Source address (local)
  * @param[in] dst      Destination address (remote, on proc)
  * @param[in] bytes    Number of bytes to transfer
  * @param[in] proc     Process id to target
  * @return             0 on success, non-zero on failure
  */
int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void *src_buf;
  int count, type_size, scaled;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  /* The destination must live in a registered shared region on proc. */
  dst_mreg = gmr_lookup(dst, proc);
  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed.  We hold the DLA lock if (src_buf == src && src_mreg != NULL). */
  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

  if (scaled) {
    /* Scaling requires a private temporary: scale src into src_buf. */
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region.  Copying
   * avoids locking the same window twice (src and dst in one window). */
  if (   (src_buf == src) /* buf_prepare didn't make a copy */
      && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) )
  {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  /* Translate the ARMCI datatype to an MPI datatype and element size. */
  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0,
      "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */

  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_flush(dst_mreg, proc, 1); /* flush_local */

  /* Free the temporary only if one was allocated above. */
  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}
/** One-sided put operation.
  *
  * @param[in] src    Source address (local)
  * @param[in] dst    Destination address (remote, on target)
  * @param[in] size   Number of bytes to transfer
  * @param[in] target Process id to target
  * @return           0 on success, non-zero on failure
  */
int ARMCI_Put(void *src, void *dst, int size, int target) {
  gmr_t *src_mreg, *dst_mreg;

  src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  dst_mreg = gmr_lookup(dst, target);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Local operation: plain copy, guarded by DLA locks unless NOGUARD. */
  if (target == ARMCI_GROUP_WORLD.rank) {
    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
      gmr_dla_lock(dst_mreg);
      if (src_mreg) gmr_dla_lock(src_mreg);
    }

    ARMCI_Copy(src, dst, size);

    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
      gmr_dla_unlock(dst_mreg);
      if (src_mreg) gmr_dla_unlock(src_mreg);
    }
  }

  /* Origin buffer is private: safe to put directly from src. */
  else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) {
    gmr_lock(dst_mreg, target);
    gmr_put(dst_mreg, src, dst, size, target);
    gmr_unlock(dst_mreg, target);
  }

  /* COPY: Either origin and target buffers are in the same window and we can't
   * lock the same window twice (MPI semantics) or the user has requested
   * always-copy mode. */
  else {
    void *src_buf;

    MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);

    /* Snapshot src into a private buffer under the DLA lock. */
    gmr_dla_lock(src_mreg);
    ARMCI_Copy(src, src_buf, size);
    gmr_dla_unlock(src_mreg);

    gmr_lock(dst_mreg, target);
    gmr_put(dst_mreg, src_buf, dst, size, target);
    gmr_unlock(dst_mreg, target);

    MPI_Free_mem(src_buf);
  }

  return 0;
}
/** Prepare a set of buffers for use with a get operation. The returned set of * buffers is guaranteed to be in private space. Copies will be made if needed, * the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_write_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) { int num_moved = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { void **new_bufs = malloc(count*sizeof(void*)); int i; for (i = 0; i < count; i++) new_bufs[i] = NULL; for (i = 0; i < count; i++) { // Check if the destination buffer is within a shared region. If not, create // a temporary private buffer to hold the result. gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (mreg != NULL) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); num_moved++; } else { new_bufs[i] = orig_bufs[i]; } } *new_bufs_ptr = new_bufs; } else { *new_bufs_ptr = orig_bufs; } return num_moved; }
/** Declare the end of a local access epoch.
  *
  * \note MPI-2 does not allow multiple locks at once, so you can have only one
  * access epoch open at a time and cannot do put/get/acc while in an access
  * region.
  *
  * @param[in] ptr Pointer to the allocation that was accessed directly
  */
void PARMCI_Access_end(void *ptr) {
  gmr_t *mreg;

  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  /* Release the DLA lock taken by Access_begin.  (Previously this called
   * gmr_sync, which left the region in GMR_LOCK_DLA state and tripped the
   * lock-state checks in ARMCIX_Mode_set and gmr_destroy.) */
  gmr_dla_unlock(mreg);
}
/** Query the access mode for the given allocation.  Non-collective.
  *
  * @param[in] ptr Pointer within the allocation.
  * @return        Current access mode.
  */
int ARMCIX_Mode_get(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");
  return region->access_mode;
}
/** Optimized implementation of the ARMCI IOV operation that uses a single
  * lock/unlock pair.
  *
  * @param[in] op         Operation to perform (put/get/acc).
  * @param[in] src        Array of source pointers, one per chunk.
  * @param[in] dst        Array of destination pointers, one per chunk.
  * @param[in] count      Number of chunks in the IOV.
  * @param[in] elem_count Number of elements of `type` per chunk.
  * @param[in] type       MPI datatype of each element.
  * @param[in] proc       Target process.
  * @return               Zero on success, non-zero on error.
  */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst,
    int count, int elem_count, MPI_Datatype type, int proc) {

  int i;
  gmr_t *mreg;
  void *shr_ptr;

  /* Pick the pointer on the shared (remote) side to locate the region:
   * put/acc target dst, get targets src. */
  switch(op) {
    case ARMCII_OP_PUT:
      shr_ptr = dst[0];
      break;
    case ARMCII_OP_GET:
      shr_ptr = src[0];
      break;
    case ARMCII_OP_ACC:
      shr_ptr = dst[0];
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  mreg = gmr_lookup(shr_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (i = 0; i < count; i++) {

    /* Periodically release and re-acquire the lock so other processes can make
     * progress; iov_batched_limit == 0 means never cycle the lock.  The i > 0
     * check avoids an immediate unlock before any work is done. */
    if (   ARMCII_GLOBAL_STATE.iov_batched_limit > 0
        && i % ARMCII_GLOBAL_STATE.iov_batched_limit == 0
        && i > 0 )
    {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch(op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
        break;
      default:
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
/** Perform atomic read-modify-write on the given integer or long location and
  * return the location's original value.
  *
  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
  * but not with respect to other one-sided operations (get, put, acc, etc).
  * This implementation serializes RMW through the region's rmw_mutex and
  * builds the operation from a Get followed by a Put.
  *
  * @param[in]  op    Operation to be performed:
  *                     ARMCI_FETCH_AND_ADD (int)
  *                     ARMCI_FETCH_AND_ADD_LONG
  *                     ARMCI_SWAP (int)
  *                     ARMCI_SWAP_LONG
  * @param[out] ploc  Location to store the original value.
  * @param[in]  prem  Location on which to perform atomic operation.
  * @param[in]  value Value to add to remote location (ignored for swap).
  * @param[in]  proc  Process rank for the target buffer.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  int is_long;
  gmr_t *mreg;

  mreg = gmr_lookup(prem, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG)
    is_long = 1;
  else
    is_long = 0;

  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
    long swap_val_l;
    int  swap_val_i;

    /* Mutex makes the Get+Put pair atomic w.r.t. other RMWs on this region. */
    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);

    /* Fetch the remote value, then overwrite it with the caller's value. */
    PARMCI_Get(prem,
               is_long ? (void*) &swap_val_l : (void*) &swap_val_i,
               is_long ? sizeof(long) : sizeof(int),
               proc);
    PARMCI_Put(ploc, prem, is_long ? sizeof(long) : sizeof(int), proc);

    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    /* Only now clobber ploc with the fetched (original) remote value. */
    if (is_long)
      *(long*) ploc = swap_val_l;
    else
      *(int*) ploc = swap_val_i;
  }

  else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) {
    long fetch_val_l, new_val_l;
    int  fetch_val_i, new_val_i;

    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);

    PARMCI_Get(prem,
               is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i,
               is_long ? sizeof(long) : sizeof(int),
               proc);

    if (is_long)
      new_val_l = fetch_val_l + value;
    else
      new_val_i = fetch_val_i + value;

    PARMCI_Put(is_long ? (void*) &new_val_l : (void*) &new_val_i,
               prem,
               is_long ? sizeof(long) : sizeof(int),
               proc);

    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    /* Return the pre-add (fetched) value. */
    if (is_long)
      *(long*) ploc = fetch_val_l;
    else
      *(int*) ploc = fetch_val_i;
  }

  else {
    ARMCII_Error("invalid operation (%d)", op);
  }

  return 0;
}
/** One-sided get operation.
  *
  * @param[in] src    Source address (remote)
  * @param[in] dst    Destination address (local)
  * @param[in] size   Number of bytes to transfer
  * @param[in] target Process id to target
  * @return           0 on success, non-zero on failure
  */
int PARMCI_Get(void *src, void *dst, int size, int target) {
  gmr_t *src_mreg, *dst_mreg;

  src_mreg = gmr_lookup(src, target);

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    dst_mreg = gmr_lookup(dst, ARMCI_GROUP_WORLD.rank);
  else
    dst_mreg = NULL;

  ARMCII_Assert_msg(src_mreg != NULL, "Invalid remote pointer");

  /* Local operation: destination is private, so a plain copy suffices. */
  if (target == ARMCI_GROUP_WORLD.rank && dst_mreg == NULL) {
    ARMCI_Copy(src, dst, size);
  }

  /* Origin buffer is private */
  else if (dst_mreg == NULL) {
    gmr_get(src_mreg, src, dst, size, target);
    gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */
  }

  /* COPY: Either origin and target buffers are in the same window and we can't
   * lock the same window twice (MPI semantics) or the user has requested
   * always-copy mode. */
  else {
    void *dst_buf;

    MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf);
    ARMCII_Assert(dst_buf != NULL);

    /* Land the data in a private buffer, then copy it into dst. */
    gmr_get(src_mreg, src, dst_buf, size, target);
    gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */

    ARMCI_Copy(dst_buf, dst, size);

    MPI_Free_mem(dst_buf);
  }

  return 0;
}
/** Prepare a set of buffers for use with an accumulate operation. The * returned set of buffers is guaranteed to be in private space and scaled. * Copies will be made if needed, the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @param[in] datatype The type of the buffer. * @param[in] scale Scaling constant to apply to each buffer. * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_acc_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size, int datatype, void *scale) { void **new_bufs; int i, scaled, num_moved = 0; new_bufs = malloc(count*sizeof(void*)); ARMCII_Assert(new_bufs != NULL); scaled = ARMCII_Buf_acc_is_scaled(datatype, scale); for (i = 0; i < count; i++) { gmr_t *mreg = NULL; // Check if the source buffer is within a shared region. if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (scaled) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); // Lock if needed so we can directly access the buffer if (mreg != NULL) gmr_dla_lock(mreg); ARMCII_Buf_acc_scale(orig_bufs[i], new_bufs[i], size, datatype, scale); if (mreg != NULL) gmr_dla_unlock(mreg); } else { new_bufs[i] = orig_bufs[i]; } if (mreg != NULL) { // If the buffer wasn't copied, we should copy it into a private buffer if (new_bufs[i] == orig_bufs[i]) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); gmr_dla_unlock(mreg); } } if (new_bufs[i] == orig_bufs[i]) num_moved++; } *new_bufs_ptr = new_bufs; return num_moved; }
/** Declare the start of a local access epoch.  This allows direct access to
  * data in local memory.
  *
  * @param[in] ptr Pointer to the allocation that will be accessed directly
  */
void ARMCI_Access_begin(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");

  /* Load/store access must be allowed by the current access mode. */
  ARMCII_Assert_msg((region->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
      "Direct local access is not permitted in the current access mode");

  gmr_dla_lock(region);
}
/** Free a shared memory allocation.  Collective.
  *
  * @param[in] ptr   Pointer to the local patch of the allocation
  * @param[in] group Group on which the allocation was made
  * @return          Zero on success
  */
int ARMCI_Free_group(void *ptr, ARMCI_Group *group) {
  gmr_t *region = NULL;

  /* A NULL pointer is legal (zero-byte allocation); gmr_destroy resolves
   * the region collectively in that case. */
  if (ptr == NULL) {
    ARMCII_Dbg_print(DEBUG_CAT_ALLOC, "given NULL\n");
  } else {
    region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert_msg(region != NULL, "Invalid shared pointer");
  }

  gmr_destroy(region, group);

  return 0;
}
/** Finish a set of prepared buffers.  Will perform communication and copies as
  * needed to ensure results are in the original buffers.  Temporary space will
  * be freed.
  *
  * @param[in]  orig_bufs Original set of buffers.
  * @param[out] new_bufs  Set of private buffers.
  * @param[in]  count     Number of entries in the buffer list.
  * @param[in]  size      The size of the buffers (all are of the same size).
  */
void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, int size) {
  int i;

  /* NOGUARD: buffers were used in place, nothing to write back or free. */
  if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD)
    return;

  for (i = 0; i < count; i++) {
    gmr_t *region;

    if (orig_bufs[i] == new_bufs[i])
      continue;

    region = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert(region != NULL);

    /* Write the private result back into the shared buffer under DLA lock. */
    gmr_dla_lock(region);
    ARMCI_Copy(new_bufs[i], orig_bufs[i], size);
    gmr_dla_unlock(region);

    MPI_Free_mem(new_bufs[i]);
  }

  free(new_bufs);
}
/** Set the access mode for the given allocation.  Collective across the
  * allocation's group.  Waits for all processes, finishes all communication,
  * and then sets the new access mode.
  *
  * @param[in] new_mode The new access mode.
  * @param[in] ptr      Pointer within the allocation.
  * @param[in] group    Group associated with the allocation.
  * @return             Zero upon success, error code otherwise.
  */
int ARMCIX_Mode_set(int new_mode, void *ptr, ARMCI_Group *group) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");
  ARMCII_Assert(group->comm == region->group.comm);

  /* The window must not be held in any lock state during a mode switch. */
  ARMCII_Assert_msg(region->lock_state != GMR_LOCK_DLA,
      "Cannot change the access mode; window is locked for local access.");
  ARMCII_Assert_msg(region->lock_state == GMR_LOCK_UNLOCKED,
      "Cannot change the access mode on a window that is locked.");

  // Wait for all processes to complete any outstanding communication before we
  // do the mode switch
  MPI_Barrier(region->group.comm);

  region->access_mode = new_mode;

  return 0;
}
/** Prepare a set of buffers for use with a put operation. The returned set of * buffers is guaranteed to be in private space. Copies will be made if needed, * the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_read_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) { int num_moved = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { void **new_bufs = malloc(count*sizeof(void*)); int i; for (i = 0; i < count; i++) new_bufs[i] = NULL; for (i = 0; i < count; i++) { // Check if the source buffer is within a shared region. If so, copy it // into a private buffer. gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (mreg != NULL) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); // gmr_get(mreg, orig_bufs[i], new_bufs[i], size, ARMCI_GROUP_WORLD.rank); gmr_dla_unlock(mreg); num_moved++; } else { new_bufs[i] = orig_bufs[i]; } } *new_bufs_ptr = new_bufs; } else { *new_bufs_ptr = orig_bufs; } return num_moved; }
/** Destroy/free a shared memory region.  Collective over group.
  *
  * @param[in] mreg  Memory region to free, or NULL if this process passed NULL
  *                  to the free call (legal for zero-byte allocations); the
  *                  region is then resolved collectively via a <base, proc>
  *                  exchange below.
  * @param[in] group Group on which to perform the free.
  */
void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
  int   search_proc_in, search_proc_out, search_proc_out_grp;
  void *search_base;
  int   alloc_me, alloc_nproc;
  int   world_me, world_nproc;

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
   * can support passing NULL into ARMCI_Free() which is permitted when a
   * process allocates 0 bytes.  Unfortunately, in this case we still need to
   * identify the mem region and free it. */

  if (mreg == NULL)
    search_proc_in = -1;
  else {
    search_proc_in = world_me;
    search_base    = mreg->slices[world_me].base;
  }

  /* Collectively decide on who will provide the base address */
  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);

  /* Everyone passed NULL.  Nothing to free. */
  if (search_proc_out < 0)
    return;

  /* Translate world rank to group rank */
  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);

  /* Broadcast the base address */
  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);

  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
  if (mreg == NULL)
    mreg = gmr_lookup(search_base, search_proc_out);

  /* If it's still not found, the user may have passed the wrong group */
  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");

  /* A region locked for direct local access is released with a warning;
   * any other lock state is a fatal error. */
  switch (mreg->lock_state) {
    case GMR_LOCK_UNLOCKED:
      break;
    case GMR_LOCK_DLA:
      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
      gmr_dla_unlock(mreg);
      break;
    default:
      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
  }

  /* Remove from the list of mem regions */
  if (mreg->prev == NULL) {
    ARMCII_Assert(gmr_list == mreg);
    gmr_list = mreg->next;

    if (mreg->next != NULL)
      mreg->next->prev = NULL;

  } else {
    mreg->prev->next = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = mreg->prev;
  }

  /* Destroy the window and free all buffers */
  MPI_Win_free(&mreg->window);

  if (mreg->slices[world_me].base != NULL)
    MPI_Free_mem(mreg->slices[world_me].base);

  free(mreg->slices);
  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
  free(mreg);
}
/** Blocking operation that accumulates data from the local process into the
  * memory of the remote process.  The data transfer is strided and blocking.
  *
  * @param[in] datatype      Type of data to be transferred.
  * @param[in] scale         Pointer to the value that input data should be scaled by.
  * @param[in] src_ptr       Source starting address of the data block to put.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address to put data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension. count[0] should be the
  *                          number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] proc          Remote process ID (destination).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_AccS(int datatype, void *scale,
                void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {

  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    void *src_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type, mpi_datatype;
    int scaled, mpi_datatype_size;

    ARMCII_Acc_type_translate(datatype, &mpi_datatype, &mpi_datatype_size);
    scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

    /* SCALE: copy and scale if requested.  The scaled data is packed into a
     * contiguous temporary, so src_type becomes a contiguous type. */
    if (scaled) {
      armci_giov_t iov;
      int i, nelem;

      if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
        gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      /* Total element count: count[0] is bytes in the leading dimension. */
      for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
        nelem *= count[i];

      MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);

      if (gmr_loc != NULL) gmr_dla_lock(gmr_loc);

      /* Shoehorn the strided information into an IOV */
      ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, src_ptr, src_stride_ar, count, stride_levels);

      for (i = 0; i < iov.ptr_array_len; i++)
        ARMCII_Buf_acc_scale(iov.src_ptr_array[i], ((uint8_t*)src_buf) + i*iov.bytes, iov.bytes, datatype, scale);

      free(iov.src_ptr_array);
      free(iov.dst_ptr_array);

      if (gmr_loc != NULL) gmr_dla_unlock(gmr_loc);

      MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
    }

    /* COPY: Guard shared buffers.  Pack the strided source into a contiguous
     * temporary under the DLA lock. */
    else if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      if (gmr_loc != NULL) {
        int i, nelem;

        for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
          nelem *= count[i];

        MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
        ARMCII_Assert(src_buf != NULL);

        gmr_dla_lock(gmr_loc);
        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
        gmr_dla_unlock(gmr_loc);

        MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
      }
    }

    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (src_buf == NULL) {
      src_buf = src_ptr;
      ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, mpi_datatype, &src_type);
    }

    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, mpi_datatype, &dst_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    /* Sanity: source and destination types must describe equal byte counts. */
    int src_size, dst_size;

    MPI_Type_size(src_type, &src_size);
    MPI_Type_size(dst_type, &dst_size);

    ARMCII_Assert(src_size == dst_size);

    mreg = gmr_lookup(dst_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    /* COPY/SCALE: Free temp buffer */
    if (src_buf != src_ptr)
      MPI_Free_mem(src_buf);

    err = 0;

  } else {
    /* Fall back to the vector (IOV) implementation. */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_AccV(datatype, scale, &iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Blocking operation that transfers data from the remote process to the
  * memory of the calling process.  The data transfer is strided and blocking.
  *
  * @param[in] src_ptr       Source starting address of the data block to get.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address for the data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension. count[0] should be the
  *                          number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] proc          Remote process ID (source).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {

  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    void *dst_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type;

    /* COPY: Guard shared buffers.  If the local destination is shared, land
     * the data in a contiguous temporary first. */
    if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank);

      if (gmr_loc != NULL) {
        int i, size;

        /* Total transfer size in bytes. */
        for (i = 1, size = count[0]; i < stride_levels+1; i++)
          size *= count[i];

        MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf);
        ARMCII_Assert(dst_buf != NULL);

        MPI_Type_contiguous(size, MPI_BYTE, &dst_type);
      }
    }

    /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (dst_buf == NULL) {
      dst_buf = dst_ptr;
      ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type);
    }

    ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    mreg = gmr_lookup(src_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    /* COPY: Finish the transfer — unpack the contiguous temporary into the
     * strided shared destination under the DLA lock. */
    if (dst_buf != dst_ptr) {
      gmr_dla_lock(gmr_loc);
      armci_read_strided(dst_ptr, stride_levels, dst_stride_ar, count, dst_buf);
      gmr_dla_unlock(gmr_loc);
      MPI_Free_mem(dst_buf);
    }

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    err = 0;

  } else {
    /* Fall back to the vector (IOV) implementation. */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_GetV(&iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Perform atomic read-modify-write on the given integer or long location and
  * return the location's original value.
  *
  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
  * but not with respect to other one-sided operations (get, put, acc, etc).
  * This implementation maps the operation onto a single MPI fetch-and-op.
  *
  * @param[in]  op    Operation to be performed:
  *                     ARMCI_FETCH_AND_ADD (int)
  *                     ARMCI_FETCH_AND_ADD_LONG
  *                     ARMCI_SWAP (int)
  *                     ARMCI_SWAP_LONG
  * @param[out] ploc  Location to store the original value.
  * @param[in]  prem  Location on which to perform atomic operation.
  * @param[in]  value Value to add to remote location (ignored for swap).
  * @param[in]  proc  Process rank for the target buffer.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  int is_swap = 0, is_long = 0;
  MPI_Datatype type;
  MPI_Op rop;
  gmr_t *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(ploc, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  dst_mreg = gmr_lookup(prem, proc);
  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Select the MPI datatype and reduction op from the ARMCI op code. */
  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG) {
    is_long = 1;
    type = MPI_LONG;
  }
  else
    type = MPI_INT;

  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
    is_swap = 1;
    rop = MPI_REPLACE;
  }
  else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG)
    rop = MPI_SUM;
  else
    ARMCII_Error("invalid operation (%d)", op);

  /* We hold the DLA lock if (src_mreg != NULL). */

  if (is_swap) {
    /* NOTE(review): both initializers read ploc through long* and int*; for an
     * int-sized ploc the long-wide read may over-read — confirm callers always
     * pass at least sizeof(long) storage, or guard the reads on is_long. */
    long out_val_l, src_val_l = *((long*)ploc);
    int  out_val_i, src_val_i = *((int*)ploc);

    gmr_fetch_and_op(dst_mreg,
                     is_long ? (void*) &src_val_l : (void*) &src_val_i /* src */,
                     is_long ? (void*) &out_val_l : (void*) &out_val_i /* out */,
                     prem /* dst */, type, rop, proc);
    gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */

    /* Return the fetched (original) remote value. */
    if (is_long)
      *(long*) ploc = out_val_l;
    else
      *(int*) ploc = out_val_i;
  }
  else /* fetch-and-add */ {
    long fetch_val_l, add_val_l = value;
    int  fetch_val_i, add_val_i = value;

    gmr_fetch_and_op(dst_mreg,
                     is_long ? (void*) &add_val_l   : (void*) &add_val_i   /* src */,
                     is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i /* out */,
                     prem /* dst */, type, rop, proc);
    gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */

    /* Return the pre-add (fetched) value. */
    if (is_long)
      *(long*) ploc = fetch_val_l;
    else
      *(int*) ploc = fetch_val_i;
  }

  return 0;
}
/** Optimized implementation of the ARMCI IOV operation that uses an MPI
  * datatype to achieve a one-sided gather/scatter.  Does not use MPI_BOTTOM
  * on the origin side (local displacements are relative to buf_loc[0]).
  *
  * @param[in] op         Operation to perform (put/get/acc).
  * @param[in] src        Array of source pointers, one per chunk.
  * @param[in] dst        Array of destination pointers, one per chunk.
  * @param[in] count      Number of chunks in the IOV.
  * @param[in] elem_count Number of elements of `type` per chunk.
  * @param[in] type       MPI datatype of each element.
  * @param[in] proc       Target process.
  * @return               Zero on success, non-zero on error.
  */
int ARMCII_Iov_op_datatype_no_bottom(enum ARMCII_Op_e op, void **src, void **dst,
    int count, int elem_count, MPI_Datatype type, int proc) {

  gmr_t *mreg;
  MPI_Datatype type_loc, type_rem;
  MPI_Aint disp_loc[count];
  int disp_rem[count];
  int block_len[count];
  void *dst_win_base;
  int dst_win_size, i, type_size;
  void **buf_rem, **buf_loc;
  MPI_Aint base_rem;
  MPI_Aint base_loc;
  void *base_loc_ptr;

  /* Decide which side of the IOV is remote: put/acc target dst, get targets src. */
  switch(op) {
    case ARMCII_OP_ACC:
    case ARMCII_OP_PUT:
      buf_rem = dst;
      buf_loc = src;
      break;
    case ARMCII_OP_GET:
      buf_rem = src;
      buf_loc = dst;
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  MPI_Type_size(type, &type_size);

  mreg = gmr_lookup(buf_rem[0], proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  dst_win_base = mreg->slices[proc].base;
  dst_win_size = mreg->slices[proc].size;

  MPI_Get_address(dst_win_base, &base_rem);

  /* Pick a base address for the start of the origin's datatype */
  base_loc_ptr = buf_loc[0];
  MPI_Get_address(base_loc_ptr, &base_loc);

  for (i = 0; i < count; i++) {
    MPI_Aint target_rem, target_loc;
    MPI_Get_address(buf_loc[i], &target_loc);
    MPI_Get_address(buf_rem[i], &target_rem);
    disp_loc[i]  = target_loc - base_loc;          /* bytes (hindexed) */
    disp_rem[i]  = (target_rem - base_rem)/type_size; /* elements (indexed_block) */
    block_len[i] = elem_count;

    ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0,
        "Transfer size is not a multiple of type size");
    ARMCII_Assert_msg(disp_rem[i] >= 0 && disp_rem[i] < dst_win_size, "Invalid remote pointer");
    /* FIX: block_len[i] counts elements, but the bound is in bytes — scale by
     * type_size so the overrun check is not too lenient for type_size > 1. */
    ARMCII_Assert_msg(((uint8_t*)buf_rem[i]) + (MPI_Aint)block_len[i]*type_size <= ((uint8_t*)dst_win_base) + dst_win_size,
        "Transfer exceeds buffer length");
  }

  MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
  MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
  //MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem);

  MPI_Type_commit(&type_loc);
  MPI_Type_commit(&type_rem);

  gmr_lock(mreg, proc);

  switch(op) {
    case ARMCII_OP_ACC:
      gmr_accumulate_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_PUT:
      gmr_put_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_GET:
      gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, base_loc_ptr, 1, type_loc, proc);
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  gmr_unlock(mreg, proc);

  MPI_Type_free(&type_loc);
  MPI_Type_free(&type_rem);

  return 0;
}