/** One-sided accumulate operation with typed arguments. Source buffer must be private.
  *
  * @param[in] mreg      Memory region
  * @param[in] src       Address of source data
  * @param[in] src_count Number of elements of the given type at the source
  * @param[in] src_type  MPI datatype of the source elements
  * @param[in] dst       Address of destination buffer (may be MPI_BOTTOM when
  *                      dst_type carries absolute displacements)
  * @param[in] dst_count Number of elements of the given type at the destination
  * @param[in] dst_type  MPI datatype of the destination elements
  * @param[in] proc      Absolute process id of target process
  * @return              0 on success, non-zero on failure
  */
int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
    void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
  int grp_proc;
  gmr_size_t disp;
  MPI_Aint lb, extent;

  /* Translate the absolute (world) rank into a rank in the region's group */
  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  ARMCII_Assert(grp_proc >= 0);

  // Calculate displacement from beginning of the window
  if (dst == MPI_BOTTOM)
    disp = 0;
  else
    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);

  // Perform checks
  MPI_Type_get_true_extent(dst_type, &lb, &extent);
  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");

  /* MPI_SUM is the only reduction operation used by ARMCI accumulate */
  MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp, dst_count, dst_type, MPI_SUM, mreg->window);

  return 0;
}
/** One-sided accumulate operation.
  *
  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
  *                     scale values in the source buffer
  * @param[in] src      Source address (local)
  * @param[in] dst      Destination address (remote, on process proc)
  * @param[in] bytes    Number of bytes to transfer
  * @param[in] proc     Process id to target
  * @return             0 on success, non-zero on failure
  */
int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void *src_buf;
  int count, type_size, scaled;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  dst_mreg = gmr_lookup(dst, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed. We hold the DLA lock if (src_buf == src && src_mreg != NULL).
   */
  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);
  if (scaled) {
    /* Scaling always writes into a fresh private buffer */
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region */
  if ( (src_buf == src) /* buf_prepare didn't make a copy */
       && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) ) {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0,
      "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */
  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_flush(dst_mreg, proc, 1); /* flush_local */

  /* Release the temporary buffer if scaling or copying allocated one */
  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}
/** Declare the start of a local access epoch.  This allows direct access to
  * data in local memory.
  *
  * @param[in] ptr Pointer to the allocation that will be accessed directly
  */
void ARMCI_Access_begin(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");
  ARMCII_Assert_msg((region->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
                    "Direct local access is not permitted in the current access mode");

  gmr_dla_lock(region);
}
/** Perform an I/O vector operation. Local buffers must be private. * * @param[in] op Operation to be performed (ARMCII_OP_PUT, ...) * @param[in] src Array of source pointers * @param[in] dst Array of destination pointers * @param[in] count Length of pointer arrays * @param[in] size Size of each transfer * @param[in] datatype Data type for accumulate op (ignored for all others) * @param[in] overlapping Do remote regions overlap? * @param[in] same_alloc Do remote regions correspond to the same allocation? * @param[in] proc Target process * @return Zero on success, error code otherwise */ int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size, int datatype, int overlapping, int same_alloc, int proc) { MPI_Datatype type; int type_count, type_size; if (op == ARMCII_OP_ACC) { ARMCII_Acc_type_translate(datatype, &type, &type_size); type_count = size/type_size; ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); } else { type = MPI_BYTE; MPI_Type_size(type, &type_size); type_count = size/type_size; ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size"); } // CONSERVATIVE CASE: If remote pointers overlap or remote pointers correspond to // multiple allocations, use the safe implementation to avoid invalid MPI // use. if (overlapping || !same_alloc || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_CONSRV) { if (overlapping) ARMCII_Warning("IOV remote buffers overlap\n"); if (!same_alloc) ARMCII_Warning("IOV remote buffers are not within the same allocation\n"); return ARMCII_Iov_op_safe(op, src, dst, count, type_count, type, proc); } // OPTIMIZED CASE: It's safe for us to issue all the operations under a // single lock. 
else if ( ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_DIRECT || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_AUTO ) { if (ARMCII_GLOBAL_STATE.no_mpi_bottom == 1) { return ARMCII_Iov_op_datatype_no_bottom(op, src, dst, count, type_count, type, proc); } else { return ARMCII_Iov_op_datatype(op, src, dst, count, type_count, type, proc); } } else if (ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_BATCHED) { return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc); } else { ARMCII_Error("unknown iov method (%d)\n", ARMCII_GLOBAL_STATE.iov_method); return 1; } }
/** Query the access mode for the given allocation.  Non-collective.
  *
  * @param[in] ptr Pointer within the allocation.
  * @return        Current access mode.
  */
int ARMCIX_Mode_get(void *ptr) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");

  return region->access_mode;
}
/** Declare the end of a local access epoch.
  *
  * \note MPI-2 does not allow multiple locks at once, so you can have only one
  * access epoch open at a time and cannot do put/get/acc while in an access
  * region.
  *
  * NOTE(review): ARMCI_Access_begin acquires the DLA lock (gmr_dla_lock), but
  * this routine calls gmr_sync() rather than gmr_dla_unlock() -- confirm these
  * two entry points belong to the same code path / runtime version.
  *
  * @param[in] ptr Pointer to the allocation that was accessed directly
  */
void PARMCI_Access_end(void *ptr) {
  gmr_t *mreg;

  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_sync(mreg);
}
/** Optimized implementation of the ARMCI IOV operation that issues all
  * transfers within a single lock/unlock pair, periodically cycling the lock
  * when a batching limit is configured. */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count,
    int elem_count, MPI_Datatype type, int proc) {
  gmr_t *mreg;
  void *remote_ptr;
  int idx;

  /* The remote side of a PUT/ACC is dst; for a GET it is src. */
  switch (op) {
    case ARMCII_OP_PUT:
    case ARMCII_OP_ACC:
      remote_ptr = dst[0];
      break;
    case ARMCII_OP_GET:
      remote_ptr = src[0];
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  mreg = gmr_lookup(remote_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (idx = 0; idx < count; idx++) {
    const int limit = ARMCII_GLOBAL_STATE.iov_batched_limit;

    /* Cycle the lock every 'limit' operations to bound the epoch length */
    if (limit > 0 && idx > 0 && idx % limit == 0) {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch (op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[idx], dst[idx], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[idx], dst[idx], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[idx], dst[idx], elem_count, type, proc);
        break;
      default:
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
/** Perform atomic read-modify-write on the given integer or long location and
  * return the location's original value.
  *
  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
  * but not with respect to other one-sided operations (get, put, acc, etc).
  *
  * @param[in]  op    Operation to be performed:
  *                     ARMCI_FETCH_AND_ADD (int)
  *                     ARMCI_FETCH_AND_ADD_LONG
  *                     ARMCI_SWAP (int)
  *                     ARMCI_SWAP_LONG
  * @param[out] ploc  Location to store the original value.  For swap it also
  *                   supplies the value written to the remote location.
  * @param[in]  prem  Location on which to perform atomic operation.
  * @param[in]  value Value to add to remote location (ignored for swap).
  * @param[in]  proc  Process rank for the target buffer.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  int is_long;
  gmr_t *mreg;

  mreg = gmr_lookup(prem, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG)
    is_long = 1;
  else
    is_long = 0;

  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
    long swap_val_l;
    int swap_val_i;

    /* Atomicity is provided by the region's RMW mutex, not by MPI itself:
     * fetch the old remote value, then overwrite it with *ploc, all while
     * holding the mutex. */
    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
    PARMCI_Get(prem, is_long ? (void*) &swap_val_l : (void*) &swap_val_i, is_long ? sizeof(long) : sizeof(int), proc);
    PARMCI_Put(ploc, prem, is_long ? sizeof(long) : sizeof(int), proc);
    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    /* Only after the remote update completes is the old value stored in ploc */
    if (is_long)
      *(long*) ploc = swap_val_l;
    else
      *(int*) ploc = swap_val_i;
  }
  else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) {
    long fetch_val_l, new_val_l;
    int fetch_val_i, new_val_i;

    /* Classic mutex-protected fetch-and-add: read, add locally, write back */
    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
    PARMCI_Get(prem, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i, is_long ? sizeof(long) : sizeof(int), proc);

    if (is_long)
      new_val_l = fetch_val_l + value;
    else
      new_val_i = fetch_val_i + value;

    PARMCI_Put(is_long ? (void*) &new_val_l : (void*) &new_val_i, prem, is_long ? sizeof(long) : sizeof(int), proc);
    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    if (is_long)
      *(long*) ploc = fetch_val_l;
    else
      *(int*) ploc = fetch_val_i;
  }
  else {
    ARMCII_Error("invalid operation (%d)", op);
  }

  return 0;
}
/** Convert an ARMCI strided access description into an MPI subarray datatype.
  *
  * @param[in]  stride_array  Array of strides (in bytes)
  * @param[in]  count         Array of transfer counts; count[0] is in bytes
  * @param[in]  stride_levels Number of levels of striding
  * @param[in]  old_type      Type of the data element described by count and stride_array
  * @param[out] new_type      New MPI type for the given strided access
  */
void ARMCII_Strided_to_dtype(int stride_array[/*stride_levels*/], int count[/*stride_levels+1*/],
    int stride_levels, MPI_Datatype old_type, MPI_Datatype *new_type)
{
  int sizes   [stride_levels+1];
  int subsizes[stride_levels+1];
  int starts  [stride_levels+1];
  int i, old_type_size;

  MPI_Type_size(old_type, &old_type_size);

  /* Eliminate counts that don't count (all 1 counts at the end) */
  for (i = stride_levels+1; i > 0 && stride_levels > 0 && count[i-1] == 1; i--)
    stride_levels--;

  /* A correct strided spec should be monotonic increasing and stride_array[i+1]
     should be a multiple of stride_array[i]. */
  if (stride_levels > 0) {
    for (i = 1; i < stride_levels; i++)
      ARMCII_Assert(stride_array[i] >= stride_array[i-1] && stride_array[i] % stride_array[i-1] == 0);
  }

  /* Test for a contiguous transfer */
  if (stride_levels == 0) {
    int elem_count = count[0]/old_type_size;

    ARMCII_Assert(count[0] % old_type_size == 0);
    MPI_Type_contiguous(elem_count, old_type, new_type);
  }

  /* Transfer is non-contiguous */
  else {
    for (i = 0; i < stride_levels+1; i++)
      starts[i] = 0;

    /* Innermost (fastest-varying) dimension is expressed in elements */
    sizes   [stride_levels] = stride_array[0]/old_type_size;
    subsizes[stride_levels] = count[0]/old_type_size;

    ARMCII_Assert(stride_array[0] % old_type_size == 0 && count[0] % old_type_size == 0);

    for (i = 1; i < stride_levels; i++) {
      /* Convert strides into dimensions by dividing out contributions from lower dims */
      sizes   [stride_levels-i] = stride_array[i]/stride_array[i-1];
      subsizes[stride_levels-i] = count[i];

      ARMCII_Assert_msg(stride_array[i] % stride_array[i-1] == 0, "Invalid striding");
    }

    /* Outermost dimension: full extent equals the transfer count */
    sizes   [0] = count[stride_levels];
    subsizes[0] = count[stride_levels];

    MPI_Type_create_subarray(stride_levels+1, sizes, subsizes, starts, MPI_ORDER_C, old_type, new_type);
  }
}
/** Set the access mode for the given allocation.  Collective across the
  * allocation's group.  Waits for all processes, finishes all communication,
  * and then sets the new access mode.
  *
  * @param[in] new_mode The new access mode.
  * @param[in] ptr      Pointer within the allocation.
  * @param[in] group    Group of the allocation; must match the region's group.
  * @return             Zero upon success, error code otherwise.
  */
int ARMCIX_Mode_set(int new_mode, void *ptr, ARMCI_Group *group) {
  gmr_t *mreg;

  mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  ARMCII_Assert(group->comm == mreg->group.comm);

  /* Two asserts so the diagnostic identifies the DLA case specifically; the
   * second (fully unlocked) condition subsumes the first. */
  ARMCII_Assert_msg(mreg->lock_state != GMR_LOCK_DLA,
      "Cannot change the access mode; window is locked for local access.");
  ARMCII_Assert_msg(mreg->lock_state == GMR_LOCK_UNLOCKED,
      "Cannot change the access mode on a window that is locked.");

  // Wait for all processes to complete any outstanding communication before we
  // do the mode switch
  MPI_Barrier(mreg->group.comm);

  mreg->access_mode = new_mode;

  return 0;
}
/** One-sided put operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int ARMCI_Put(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank); dst_mreg = gmr_lookup(dst, target); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank) { if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_lock(dst_mreg); if (src_mreg) gmr_dla_lock(src_mreg); } ARMCI_Copy(src, dst, size); if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_unlock(dst_mreg); if (src_mreg) gmr_dla_unlock(src_mreg); } } /* Origin buffer is private */ else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) { gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src, dst, size, target); gmr_unlock(dst_mreg, target); } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *src_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); gmr_dla_lock(src_mreg); ARMCI_Copy(src, src_buf, size); gmr_dla_unlock(src_mreg); gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src_buf, dst, size, target); gmr_unlock(dst_mreg, target); MPI_Free_mem(src_buf); } return 0; }
/** Free a shared memory allocation.  Collective.
  *
  * @param[in] ptr   Pointer to the local patch of the allocation (may be NULL
  *                  when this process contributed zero bytes)
  * @param[in] group Group on which to perform the free
  */
int ARMCI_Free_group(void *ptr, ARMCI_Group *group) {
  gmr_t *mreg = NULL;

  if (ptr == NULL) {
    ARMCII_Dbg_print(DEBUG_CAT_ALLOC, "given NULL\n");
  } else {
    mreg = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");
  }

  /* gmr_destroy accepts a NULL region and resolves it collectively */
  gmr_destroy(mreg, group);

  return 0;
}
/** Unlock a memory region that was locked for direct local access.
  *
  * @param[in] mreg Memory region
  */
void gmr_dla_unlock(gmr_t *mreg) {
  const int grp_proc =
      ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert(grp_proc >= 0);
  ARMCII_Assert(mreg->lock_state == GMR_LOCK_DLA);
  ARMCII_Assert_msg((mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
                    "Direct local access is not allowed in the current access mode");

  /* DLA locks nest; only the outermost release drops the MPI lock */
  if (--mreg->dla_lock_count == 0) {
    MPI_Win_unlock(grp_proc, mreg->window);
    mreg->lock_state = GMR_LOCK_UNLOCKED;
  }
}
/** Create a group of ARMCI mutexes.  Collective on the ARMCI group.
  *
  * @param[in] my_count Number of mutexes on the local process.
  * @param[in] pgroup   ARMCI group on which to create mutexes
  * @return             Handle to the mutex group.
  */
armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int my_count, ARMCI_Group *pgroup) {
  int rank, nproc, max_count, i;
  armcix_mutex_hdl_t hdl;

  hdl = malloc(sizeof(struct armcix_mutex_hdl_s));
  ARMCII_Assert(hdl != NULL);

  ARMCIX_Group_dup(pgroup, &hdl->grp);
  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  hdl->my_count = my_count;

  /* Find the max. count to determine how many windows we need. */
  MPI_Allreduce(&my_count, &max_count, 1, MPI_INT, MPI_MAX, hdl->grp.comm);
  ARMCII_Assert_msg(max_count > 0, "Invalid number of mutexes");
  hdl->max_count = max_count;

  hdl->windows = malloc(sizeof(MPI_Win)*max_count);
  ARMCII_Assert(hdl->windows != NULL);  /* was previously unchecked */

  if (my_count > 0) {
    hdl->bases = malloc(sizeof(uint8_t*)*my_count);
    ARMCII_Assert(hdl->bases != NULL);  /* was previously unchecked */
  } else {
    hdl->bases = NULL;
  }

  /* We need multiple windows here: one for each mutex.  Otherwise performance
     will suffer due to exclusive access epochs. */
  for (i = 0; i < max_count; i++) {
    int   size = 0;
    void *base = NULL;

    /* Processes with fewer than max_count mutexes attach a zero-byte window */
    if (i < my_count) {
      MPI_Alloc_mem(nproc, MPI_INFO_NULL, &hdl->bases[i]);
      ARMCII_Assert(hdl->bases[i] != NULL);
      ARMCII_Bzero(hdl->bases[i], nproc);

      base = hdl->bases[i];
      size = nproc;
    }

    MPI_Win_create(base, size, sizeof(uint8_t), MPI_INFO_NULL, hdl->grp.comm, &hdl->windows[i]);
  }

  return hdl;
}
/** One-sided get operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int PARMCI_Get(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, target); /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) dst_mreg = gmr_lookup(dst, ARMCI_GROUP_WORLD.rank); else dst_mreg = NULL; ARMCII_Assert_msg(src_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank && dst_mreg == NULL) { ARMCI_Copy(src, dst, size); } /* Origin buffer is private */ else if (dst_mreg == NULL) { gmr_get(src_mreg, src, dst, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *dst_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); gmr_get(src_mreg, src, dst_buf, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ ARMCI_Copy(dst_buf, dst, size); MPI_Free_mem(dst_buf); } return 0; }
/** One-sided accumulate operation.  Source buffer must be private.
  *
  * @param[in] mreg  Memory region
  * @param[in] src   Source address (local)
  * @param[in] dst   Destination address (remote)
  * @param[in] count Number of elements of the given type to transfer
  * @param[in] type  MPI type of the given buffers
  * @param[in] proc  Absolute process id of the target
  * @return          0 on success, non-zero on failure
  */
int gmr_accumulate(gmr_t *mreg, void *src, void *dst, int count, MPI_Datatype type, int proc) {
  ARMCII_Assert_msg(src != NULL, "Invalid local address");

  /* Delegate to the typed variant with identical count/type on both sides */
  return gmr_accumulate_typed(mreg, src, count, type, dst, count, type, proc);
}
/** One-sided get operation.  Destination buffer must be private.
  *
  * @param[in] mreg Memory region
  * @param[in] src  Source address (remote)
  * @param[in] dst  Destination address (local)
  * @param[in] size Number of bytes to transfer
  * @param[in] proc Absolute process id of target process
  * @return         0 on success, non-zero on failure
  */
int gmr_get(gmr_t *mreg, void *src, void *dst, int size, int proc) {
  ARMCII_Assert_msg(dst != NULL, "Invalid local address");

  /* Delegate to the typed variant as a raw byte transfer */
  return gmr_get_typed(mreg, src, size, MPI_BYTE, dst, size, MPI_BYTE, proc);
}
/** Destroy/free a shared memory region.  Collective on the allocation's group.
  *
  * @param[in] mreg  Memory region to destroy.  May be NULL when this process
  *                  allocated zero bytes; the region is then located
  *                  collectively via a <base, proc> exchange.
  * @param[in] group Group on which to perform the free.
  */
void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
  int search_proc_in, search_proc_out, search_proc_out_grp;
  void *search_base;
  int alloc_me, alloc_nproc;
  int world_me, world_nproc;

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
   * can support passing NULL into ARMCI_Free() which is permitted when a
   * process allocates 0 bytes.  Unfortunately, in this case we still need to
   * identify the mem region and free it.
   */

  if (mreg == NULL)
    search_proc_in = -1;
  else {
    search_proc_in = world_me;
    search_base = mreg->slices[world_me].base;
  }

  /* Collectively decide on who will provide the base address */
  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);

  /* Everyone passed NULL.  Nothing to free. */
  if (search_proc_out < 0)
    return;

  /* Translate world rank to group rank */
  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);

  /* Broadcast the base address (search_base is overwritten on non-root ranks) */
  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);

  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
  if (mreg == NULL)
    mreg = gmr_lookup(search_base, search_proc_out);

  /* If it's still not found, the user may have passed the wrong group */
  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");

  switch (mreg->lock_state) {
    case GMR_LOCK_UNLOCKED:
      break;
    case GMR_LOCK_DLA:
      /* Best-effort recovery: drop the DLA lock so the window can be freed */
      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
      gmr_dla_unlock(mreg);
      break;
    default:
      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
  }

  /* Remove from the (doubly-linked) list of mem regions */
  if (mreg->prev == NULL) {
    ARMCII_Assert(gmr_list == mreg);
    gmr_list = mreg->next;

    if (mreg->next != NULL)
      mreg->next->prev = NULL;
  } else {
    mreg->prev->next = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = mreg->prev;
  }

  /* Destroy the window and free all buffers */
  MPI_Win_free(&mreg->window);

  if (mreg->slices[world_me].base != NULL)
    MPI_Free_mem(mreg->slices[world_me].base);

  free(mreg->slices);
  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
  free(mreg);
}
/** Prepare a set of buffers for use with an accumulate operation. The * returned set of buffers is guaranteed to be in private space and scaled. * Copies will be made if needed, the result should be completed by finish. * * @param[in] buf Original set of buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @param[in] datatype The type of the buffer. * @param[in] scale Scaling constant to apply to each buffer. * @return Pointer to the new buffer or buf */ void ARMCII_Buf_acc_scale(void *buf_in, void *buf_out, int size, int datatype, void *scale) { int j, nelem; int type_size = -1; MPI_Datatype type; switch (datatype) { case ARMCI_ACC_INT: MPI_Type_size(MPI_INT, &type_size); type = MPI_INT; nelem= size/type_size; { int *src_i = (int*) buf_in; int *scl_i = (int*) buf_out; const int s = *((int*) scale); for (j = 0; j < nelem; j++) scl_i[j] = src_i[j]*s; } break; case ARMCI_ACC_LNG: MPI_Type_size(MPI_LONG, &type_size); type = MPI_LONG; nelem= size/type_size; { long *src_l = (long*) buf_in; long *scl_l = (long*) buf_out; const long s = *((long*) scale); for (j = 0; j < nelem; j++) scl_l[j] = src_l[j]*s; } break; case ARMCI_ACC_FLT: MPI_Type_size(MPI_FLOAT, &type_size); type = MPI_FLOAT; nelem= size/type_size; { float *src_f = (float*) buf_in; float *scl_f = (float*) buf_out; const float s = *((float*) scale); for (j = 0; j < nelem; j++) scl_f[j] = src_f[j]*s; } break; case ARMCI_ACC_DBL: MPI_Type_size(MPI_DOUBLE, &type_size); type = MPI_DOUBLE; nelem= size/type_size; { double *src_d = (double*) buf_in; double *scl_d = (double*) buf_out; const double s = *((double*) scale); for (j = 0; j < nelem; j++) scl_d[j] = src_d[j]*s; } break; case ARMCI_ACC_CPL: MPI_Type_size(MPI_FLOAT, &type_size); type = MPI_FLOAT; nelem= size/type_size; { float *src_fc = (float*) buf_in; float *scl_fc = (float*) buf_out; const float s_r = ((float*)scale)[0]; const float s_c = ((float*)scale)[1]; for (j = 0; j < 
nelem; j += 2) { // Complex multiplication: (a + bi)*(c + di) const float src_fc_j = src_fc[j]; const float src_fc_j_1 = src_fc[j+1]; /* scl_fc[j] = src_fc[j]*s_r - src_fc[j+1]*s_c; scl_fc[j+1] = src_fc[j+1]*s_r + src_fc[j]*s_c; */ scl_fc[j] = src_fc_j*s_r - src_fc_j_1*s_c; scl_fc[j+1] = src_fc_j_1*s_r + src_fc_j*s_c; } } break; case ARMCI_ACC_DCP: MPI_Type_size(MPI_DOUBLE, &type_size); type = MPI_DOUBLE; nelem= size/type_size; { double *src_dc = (double*) buf_in; double *scl_dc = (double*) buf_out; const double s_r = ((double*)scale)[0]; const double s_c = ((double*)scale)[1]; for (j = 0; j < nelem; j += 2) { // Complex multiplication: (a + bi)*(c + di) const double src_dc_j = src_dc[j]; const double src_dc_j_1 = src_dc[j+1]; /* scl_dc[j] = src_dc[j]*s_r - src_dc[j+1]*s_c; scl_dc[j+1] = src_dc[j+1]*s_r + src_dc[j]*s_c; */ scl_dc[j] = src_dc_j*s_r - src_dc_j_1*s_c; scl_dc[j+1] = src_dc_j_1*s_r + src_dc_j*s_c; } } break; default: ARMCII_Error("unknown data type (%d)", datatype); } ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of the datatype size"); }
/** Blocking operation that accumulates data from the local process into the
  * memory of the remote process.  The data transfer is strided and blocking.
  *
  * @param[in] datatype      Type of data to be transferred.
  * @param[in] scale         Pointer to the value that input data should be scaled by.
  * @param[in] src_ptr       Source starting address of the data block to put.
  * @param[in] src_stride_ar Source array of stride distances in bytes.
  * @param[in] dst_ptr       Destination starting address to put data.
  * @param[in] dst_stride_ar Destination array of stride distances in bytes.
  * @param[in] count         Block size in each dimension. count[0] should be the
  *                          number of bytes of contiguous data in leading dimension.
  * @param[in] stride_levels The level of strides.
  * @param[in] proc          Remote process ID (destination).
  *
  * @return                  Zero on success, error code otherwise.
  */
int PARMCI_AccS(int datatype, void *scale,
                void *src_ptr, int src_stride_ar[/*stride_levels*/],
                void *dst_ptr, int dst_stride_ar[/*stride_levels*/],
                int count[/*stride_levels+1*/], int stride_levels, int proc) {
  int err;

  if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) {
    /* DIRECT method: express both sides as MPI datatypes and issue a single
     * typed accumulate. */
    void *src_buf = NULL;
    gmr_t *mreg, *gmr_loc = NULL;
    MPI_Datatype src_type, dst_type, mpi_datatype;
    int scaled, mpi_datatype_size;

    ARMCII_Acc_type_translate(datatype, &mpi_datatype, &mpi_datatype_size);
    scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

    /* SCALE: copy and scale if requested */
    if (scaled) {
      armci_giov_t iov;
      int i, nelem;

      if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
        gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      /* Total element count across all dimensions */
      for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
        nelem *= count[i];

      MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);

      /* Guard the source while scaling it into the contiguous copy */
      if (gmr_loc != NULL) gmr_dla_lock(gmr_loc);

      /* Shoehorn the strided information into an IOV */
      ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, src_ptr, src_stride_ar, count, stride_levels);

      for (i = 0; i < iov.ptr_array_len; i++)
        ARMCII_Buf_acc_scale(iov.src_ptr_array[i], ((uint8_t*)src_buf) + i*iov.bytes, iov.bytes, datatype, scale);

      free(iov.src_ptr_array);
      free(iov.dst_ptr_array);

      if (gmr_loc != NULL) gmr_dla_unlock(gmr_loc);

      /* The staged copy is contiguous */
      MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
    }

    /* COPY: Guard shared buffers */
    else if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) {
      gmr_loc = gmr_lookup(src_ptr, ARMCI_GROUP_WORLD.rank);

      if (gmr_loc != NULL) {
        int i, nelem;

        for (i = 1, nelem = count[0]/mpi_datatype_size; i < stride_levels+1; i++)
          nelem *= count[i];

        MPI_Alloc_mem(nelem*mpi_datatype_size, MPI_INFO_NULL, &src_buf);
        ARMCII_Assert(src_buf != NULL);

        gmr_dla_lock(gmr_loc);
        armci_write_strided(src_ptr, stride_levels, src_stride_ar, count, src_buf);
        gmr_dla_unlock(gmr_loc);

        MPI_Type_contiguous(nelem, mpi_datatype, &src_type);
      }
    }

    /* NOGUARD: If src_buf hasn't been assigned to a copy, the strided source
     * buffer is going to be used directly. */
    if (src_buf == NULL) {
      src_buf = src_ptr;
      ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, mpi_datatype, &src_type);
    }

    ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, mpi_datatype, &dst_type);

    MPI_Type_commit(&src_type);
    MPI_Type_commit(&dst_type);

    /* Sanity check: both sides must describe the same amount of data */
    int src_size, dst_size;

    MPI_Type_size(src_type, &src_size);
    MPI_Type_size(dst_type, &dst_size);

    ARMCII_Assert(src_size == dst_size);

    mreg = gmr_lookup(dst_ptr, proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer");

    gmr_lock(mreg, proc);
    gmr_accumulate_typed(mreg, src_buf, 1, src_type, dst_ptr, 1, dst_type, proc);
    gmr_unlock(mreg, proc);

    MPI_Type_free(&src_type);
    MPI_Type_free(&dst_type);

    /* COPY/SCALE: Free temp buffer */
    if (src_buf != src_ptr)
      MPI_Free_mem(src_buf);

    err = 0;

  } else {
    /* IOV fallback: decompose the strided transfer into a vector accumulate */
    armci_giov_t iov;

    ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels);
    err = PARMCI_AccV(datatype, scale, &iov, 1, proc);

    free(iov.src_ptr_array);
    free(iov.dst_ptr_array);
  }

  return err;
}
/** Optimized implementation of the ARMCI IOV operation that uses an MPI
  * datatype to achieve a one-sided gather/scatter.  Does not use MPI_BOTTOM
  * on the origin side: local displacements are taken relative to the first
  * local buffer, remote displacements relative to the window base.
  *
  * @param[in] op         Operation (ARMCII_OP_PUT / _GET / _ACC)
  * @param[in] src        Array of source pointers
  * @param[in] dst        Array of destination pointers
  * @param[in] count      Length of the pointer arrays
  * @param[in] elem_count Number of elements of 'type' per transfer
  * @param[in] type       MPI datatype of each element
  * @param[in] proc       Target process
  * @return               Zero on success, error code otherwise
  */
int ARMCII_Iov_op_datatype_no_bottom(enum ARMCII_Op_e op, void **src, void **dst,
    int count, int elem_count, MPI_Datatype type, int proc) {

  gmr_t *mreg;
  MPI_Datatype type_loc, type_rem;
  MPI_Aint disp_loc[count];
  int disp_rem[count];
  int block_len[count];
  void *dst_win_base;
  int dst_win_size, i, type_size;
  void **buf_rem, **buf_loc;
  MPI_Aint base_rem;
  MPI_Aint base_loc;
  void *base_loc_ptr;

  switch(op) {
    case ARMCII_OP_ACC:
    case ARMCII_OP_PUT:
      buf_rem = dst;
      buf_loc = src;
      break;
    case ARMCII_OP_GET:
      buf_rem = src;
      buf_loc = dst;
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  MPI_Type_size(type, &type_size);

  mreg = gmr_lookup(buf_rem[0], proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  dst_win_base = mreg->slices[proc].base;
  dst_win_size = mreg->slices[proc].size;

  MPI_Get_address(dst_win_base, &base_rem);

  /* Pick a base address for the start of the origin's datatype */
  base_loc_ptr = buf_loc[0];
  MPI_Get_address(base_loc_ptr, &base_loc);

  for (i = 0; i < count; i++) {
    MPI_Aint target_rem, target_loc;
    MPI_Get_address(buf_loc[i], &target_loc);
    MPI_Get_address(buf_rem[i], &target_rem);
    disp_loc[i]  = target_loc - base_loc;              /* bytes */
    disp_rem[i]  = (target_rem - base_rem)/type_size;  /* elements */
    block_len[i] = elem_count;                         /* elements */

    ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0, "Transfer size is not a multiple of type size");
    /* BUGFIX: disp_rem is in elements while dst_win_size is in bytes; scale by
     * type_size so the range check is exact rather than merely conservative. */
    ARMCII_Assert_msg(disp_rem[i] >= 0 && (MPI_Aint)disp_rem[i]*type_size < dst_win_size, "Invalid remote pointer");
    /* BUGFIX: block_len is an element count; convert to bytes before doing
     * byte-pointer arithmetic in the bounds check. */
    ARMCII_Assert_msg(((uint8_t*)buf_rem[i]) + (size_t)block_len[i]*type_size <= ((uint8_t*)dst_win_base) + dst_win_size, "Transfer exceeds buffer length");
  }

  /* Origin side: byte displacements; target side: element-indexed blocks */
  MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
  MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
  //MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem);

  MPI_Type_commit(&type_loc);
  MPI_Type_commit(&type_rem);

  gmr_lock(mreg, proc);

  switch(op) {
    case ARMCII_OP_ACC:
      gmr_accumulate_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_PUT:
      gmr_put_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_GET:
      gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, base_loc_ptr, 1, type_loc, proc);
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  gmr_unlock(mreg, proc);

  MPI_Type_free(&type_loc);
  MPI_Type_free(&type_rem);

  return 0;
}
/** Blocking operation that transfers data from the remote process to the * memory of the calling process. The data transfer is strided and blocking. * * @param[in] src_ptr Source starting address of the data block to put. * @param[in] src_stride_arr Source array of stride distances in bytes. * @param[in] dst_ptr Destination starting address to put data. * @param[in] dst_stride_ar Destination array of stride distances in bytes. * @param[in] count Block size in each dimension. count[0] should be the * number of bytes of contiguous data in leading dimension. * @param[in] stride_levels The level of strides. * @param[in] proc Remote process ID (destination). * * @return Zero on success, error code otherwise. */ int PARMCI_GetS(void *src_ptr, int src_stride_ar[/*stride_levels*/], void *dst_ptr, int dst_stride_ar[/*stride_levels*/], int count[/*stride_levels+1*/], int stride_levels, int proc) { int err; if (ARMCII_GLOBAL_STATE.strided_method == ARMCII_STRIDED_DIRECT) { void *dst_buf = NULL; gmr_t *mreg, *gmr_loc = NULL; MPI_Datatype src_type, dst_type; /* COPY: Guard shared buffers */ if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY) { gmr_loc = gmr_lookup(dst_ptr, ARMCI_GROUP_WORLD.rank); if (gmr_loc != NULL) { int i, size; for (i = 1, size = count[0]; i < stride_levels+1; i++) size *= count[i]; MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); MPI_Type_contiguous(size, MPI_BYTE, &dst_type); } } /* NOGUARD: If dst_buf hasn't been assigned to a copy, the strided source * buffer is going to be used directly. 
*/ if (dst_buf == NULL) { dst_buf = dst_ptr; ARMCII_Strided_to_dtype(dst_stride_ar, count, stride_levels, MPI_BYTE, &dst_type); } ARMCII_Strided_to_dtype(src_stride_ar, count, stride_levels, MPI_BYTE, &src_type); MPI_Type_commit(&src_type); MPI_Type_commit(&dst_type); mreg = gmr_lookup(src_ptr, proc); ARMCII_Assert_msg(mreg != NULL, "Invalid shared pointer"); gmr_lock(mreg, proc); gmr_get_typed(mreg, src_ptr, 1, src_type, dst_buf, 1, dst_type, proc); gmr_unlock(mreg, proc); /* COPY: Finish the transfer */ if (dst_buf != dst_ptr) { gmr_dla_lock(gmr_loc); armci_read_strided(dst_ptr, stride_levels, dst_stride_ar, count, dst_buf); gmr_dla_unlock(gmr_loc); MPI_Free_mem(dst_buf); } MPI_Type_free(&src_type); MPI_Type_free(&dst_type); err = 0; } else { armci_giov_t iov; ARMCII_Strided_to_iov(&iov, src_ptr, src_stride_ar, dst_ptr, dst_stride_ar, count, stride_levels); err = PARMCI_GetV(&iov, 1, proc); free(iov.src_ptr_array); free(iov.dst_ptr_array); } return err; }
/** Perform atomic read-modify-write on the given integer or long location and * return the location's original value. * * \note ARMCI RMW operations are atomic with respect to other RMW operations, * but not with respect to other one-sided operations (get, put, acc, etc). * * @param[in] op Operation to be performed: * ARMCI_FETCH_AND_ADD (int) * ARMCI_FETCH_AND_ADD_LONG * ARMCI_SWAP (int) * ARMCI_SWAP_LONG * @param[out] ploc Location to store the original value. * @param[in] prem Location on which to perform atomic operation. * @param[in] value Value to add to remote location (ignored for swap). * @param[in] proc Process rank for the target buffer. */ int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) { int is_swap = 0, is_long = 0; MPI_Datatype type; MPI_Op rop; gmr_t *src_mreg, *dst_mreg; /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) src_mreg = gmr_lookup(ploc, ARMCI_GROUP_WORLD.rank); else src_mreg = NULL; dst_mreg = gmr_lookup(prem, proc); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG) { is_long = 1; type = MPI_LONG; } else type = MPI_INT; if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) { is_swap = 1; rop = MPI_REPLACE; } else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) rop = MPI_SUM; else ARMCII_Error("invalid operation (%d)", op); /* We hold the DLA lock if (src_mreg != NULL). */ if (is_swap) { long out_val_l, src_val_l = *((long*)ploc); int out_val_i, src_val_i = *((int*)ploc); gmr_fetch_and_op(dst_mreg, is_long ? (void*) &src_val_l : (void*) &src_val_i /* src */, is_long ? (void*) &out_val_l : (void*) &out_val_i /* out */, prem /* dst */, type, rop, proc); gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. 
flush, local=remote */ if (is_long) *(long*) ploc = out_val_l; else *(int*) ploc = out_val_i; } else /* fetch-and-add */ { long fetch_val_l, add_val_l = value; int fetch_val_i, add_val_i = value; gmr_fetch_and_op(dst_mreg, is_long ? (void*) &add_val_l : (void*) &add_val_i /* src */, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i /* out */, prem /* dst */, type, rop, proc); gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */ if (is_long) *(long*) ploc = fetch_val_l; else *(int*) ploc = fetch_val_i; } return 0; }