/** One-sided put operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int ARMCI_Put(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank); dst_mreg = gmr_lookup(dst, target); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank) { if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_lock(dst_mreg); if (src_mreg) gmr_dla_lock(src_mreg); } ARMCI_Copy(src, dst, size); if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { gmr_dla_unlock(dst_mreg); if (src_mreg) gmr_dla_unlock(src_mreg); } } /* Origin buffer is private */ else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) { gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src, dst, size, target); gmr_unlock(dst_mreg, target); } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *src_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); gmr_dla_lock(src_mreg); ARMCI_Copy(src, src_buf, size); gmr_dla_unlock(src_mreg); gmr_lock(dst_mreg, target); gmr_put(dst_mreg, src_buf, dst, size, target); gmr_unlock(dst_mreg, target); MPI_Free_mem(src_buf); } return 0; }
/** Collective index selection reduce operation (scoped). */ void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) { MPI_Comm sel_comm; sel_data_t *data_in, *data_out; void **x_buf; /* printf("[%d] armci_msg_sel_scope(scope=%d, x=%p, n=%d, op=%s, type=%d, contribute=%d)\n", ARMCI_GROUP_WORLD.rank, scope, x, n, op, type, contribute); */ /* Determine the scope of the collective operation */ if (scope == SCOPE_ALL || scope == SCOPE_MASTERS) sel_comm = ARMCI_GROUP_WORLD.comm; else sel_comm = MPI_COMM_SELF; data_in = malloc(sizeof(sel_data_t)+n-1); data_out = malloc(sizeof(sel_data_t)+n-1); ARMCII_Assert(data_in != NULL && data_out != NULL); ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n); data_in->contribute = contribute; data_in->type = type; if (contribute) ARMCI_Copy(x, data_in->data, n); if (strncmp(op, "min", 3) == 0) { MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, ARMCI_MPI_SELMIN_OP, sel_comm); } else if (strncmp(op, "max", 3) == 0) { MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, ARMCI_MPI_SELMAX_OP, sel_comm); } else { ARMCII_Error("Invalid operation (%s)", op); } ARMCI_Copy(data_out->data, x, n); ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n); free(data_in); free(data_out); }
/** One-sided get operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int PARMCI_Get(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, target); /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) dst_mreg = gmr_lookup(dst, ARMCI_GROUP_WORLD.rank); else dst_mreg = NULL; ARMCII_Assert_msg(src_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank && dst_mreg == NULL) { ARMCI_Copy(src, dst, size); } /* Origin buffer is private */ else if (dst_mreg == NULL) { gmr_get(src_mreg, src, dst, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *dst_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); gmr_get(src_mreg, src, dst_buf, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ ARMCI_Copy(dst_buf, dst, size); MPI_Free_mem(dst_buf); } return 0; }
/** Prepare a set of buffers for use with an accumulate operation. The * returned set of buffers is guaranteed to be in private space and scaled. * Copies will be made if needed, the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @param[in] datatype The type of the buffer. * @param[in] scale Scaling constant to apply to each buffer. * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_acc_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size, int datatype, void *scale) { void **new_bufs; int i, scaled, num_moved = 0; new_bufs = malloc(count*sizeof(void*)); ARMCII_Assert(new_bufs != NULL); scaled = ARMCII_Buf_acc_is_scaled(datatype, scale); for (i = 0; i < count; i++) { gmr_t *mreg = NULL; // Check if the source buffer is within a shared region. if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (scaled) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); // Lock if needed so we can directly access the buffer if (mreg != NULL) gmr_dla_lock(mreg); ARMCII_Buf_acc_scale(orig_bufs[i], new_bufs[i], size, datatype, scale); if (mreg != NULL) gmr_dla_unlock(mreg); } else { new_bufs[i] = orig_bufs[i]; } if (mreg != NULL) { // If the buffer wasn't copied, we should copy it into a private buffer if (new_bufs[i] == orig_bufs[i]) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); gmr_dla_unlock(mreg); } } if (new_bufs[i] == orig_bufs[i]) num_moved++; } *new_bufs_ptr = new_bufs; return num_moved; }
/** One-sided accumulate operation. * * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h) * @param[in] scale Pointer for a scalar of type datatype that will be used to * scale values in the source buffer * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] bytes Number of bytes to transfer * @param[in] proc Process id to target * @return 0 on success, non-zero on failure */ int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) { void *src_buf; int count, type_size, scaled; MPI_Datatype type; gmr_t *src_mreg, *dst_mreg; /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank); else src_mreg = NULL; dst_mreg = gmr_lookup(dst, proc); ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer"); /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if * needed. We hold the DLA lock if (src_buf == src && src_mreg != NULL). 
*/ scaled = ARMCII_Buf_acc_is_scaled(datatype, scale); if (scaled) { MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale); } else { src_buf = src; } /* Check if we need to copy: user requested it or same mem region */ if ( (src_buf == src) /* buf_prepare didn't make a copy */ && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) ) { MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf); ARMCII_Assert(src_buf != NULL); ARMCI_Copy(src, src_buf, bytes); } ARMCII_Acc_type_translate(datatype, &type, &type_size); count = bytes/type_size; ARMCII_Assert_msg(bytes % type_size == 0, "Transfer size is not a multiple of the datatype size"); /* TODO: Support a local accumulate operation more efficiently */ gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc); gmr_flush(dst_mreg, proc, 1); /* flush_local */ if (src_buf != src) MPI_Free_mem(src_buf); return 0; }
/* Unpack strided data from a contiguous source buffer. This is a local operation. * * @param[in] src Pointer to the contiguous buffer * @param[in] stride_levels Number of levels of striding * @param[in] dst_stride_arr Array of length stride_levels of stride lengths * @param[in] count Array of length stride_levels+1 of the number of * units at each stride level (lowest is contiguous) * @param[in] dst Destination strided buffer */ void armci_read_strided(void *dst, int stride_levels, int dst_stride_arr[], int count[], char *src) { armci_giov_t iov; int i; // Shoehorn the strided information into an IOV ARMCII_Strided_to_iov(&iov, dst, dst_stride_arr, dst, dst_stride_arr, count, stride_levels); for (i = 0; i < iov.ptr_array_len; i++) ARMCI_Copy(src + i*count[0], iov.dst_ptr_array[i], iov.bytes); free(iov.src_ptr_array); free(iov.dst_ptr_array); }
/** Max operator for armci_msg_sel.
  *
  * Combines two sel_data_t records.  data_inout is overwritten with data_in
  * when only data_in contributes, or when both contribute and the leading
  * value of data_in (interpreted according to its type tag) is strictly
  * greater than that of data_inout.  In all other cases data_inout is left
  * untouched.
  *
  * @param[in]    data_in    First record (sel_data_t).
  * @param[inout] data_inout Second record (sel_data_t); receives the result.
  * @param[in]    len        Number of bytes to copy when data_in is selected.
  * @param[in]    datatype   Unused.
  */
void ARMCII_Msg_sel_max_op(void *data_in, void *data_inout, int *len, MPI_Datatype *datatype) {
  sel_data_t *sd_1, *sd_2;

  sd_1 = (sel_data_t*) data_in;
  sd_2 = (sel_data_t*) data_inout;

  if (sd_1->contribute && !sd_2->contribute) {
    /* Only the first operand carries data: it wins by default */
    ARMCI_Copy(data_in, data_inout, *len);
  }

  else if (sd_1->contribute && sd_2->contribute) {

    /* Copy the whole record X over Y when X's value is strictly larger */
#define MSG_SEL_MAX_OP(X,Y,LEN,TYPE)                                                    \
  do {                                                                                  \
    if (*(TYPE*)((sel_data_t*)X)->data > *(TYPE*)((sel_data_t*)Y)->data)                \
      ARMCI_Copy(X, Y, LEN);                                                            \
  } while (0)

    switch (sd_1->type) {
      case ARMCI_INT:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, int);
        break;
      case ARMCI_LONG:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, long);
        break;
      case ARMCI_LONG_LONG:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, long long);
        break;
      case ARMCI_FLOAT:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, float);
        break;
      case ARMCI_DOUBLE:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, double);
        break;
      default:
        ARMCII_Error("Invalid data type (%d)", sd_1->type);
    }

    /* Undefine the helper defined above so it does not leak past this
     * function. */
#undef MSG_SEL_MAX_OP
  }

  /* else: no need to copy, data_inout already contains what we want to return */
}
/** Finish a set of prepared buffers.  Will perform copies as needed to ensure
  * results are in the original buffers.  Temporary space will be freed.
  *
  * When the shared buffer method is NOGUARD this is a no-op: nothing is
  * copied and nothing is freed.
  *
  * @param[out] orig_bufs Original set of buffers.
  * @param[in]  new_bufs  Set of private buffers; the array and every private
  *                       copy in it are released.
  * @param[in]  count     Number of entries in the buffer list.
  * @param[in]  size      The size of the buffers (all are of the same size).
  */
void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, int size) {
  int idx;

  if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD)
    return;

  for (idx = 0; idx < count; idx++) {
    gmr_t *mreg;

    // Entries that were never replaced need no write-back
    if (orig_bufs[idx] == new_bufs[idx])
      continue;

    // A replaced entry must belong to a registered region of this process
    mreg = gmr_lookup(orig_bufs[idx], ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert(mreg != NULL);

    // Write the private copy back under direct local access, then drop it
    gmr_dla_lock(mreg);
    ARMCI_Copy(new_bufs[idx], orig_bufs[idx], size);
    gmr_dla_unlock(mreg);

    MPI_Free_mem(new_bufs[idx]);
  }

  free(new_bufs);
}
/** Prepare a set of buffers for use with a put operation. The returned set of * buffers is guaranteed to be in private space. Copies will be made if needed, * the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_read_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) { int num_moved = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { void **new_bufs = malloc(count*sizeof(void*)); int i; for (i = 0; i < count; i++) new_bufs[i] = NULL; for (i = 0; i < count; i++) { // Check if the source buffer is within a shared region. If so, copy it // into a private buffer. gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (mreg != NULL) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); // gmr_get(mreg, orig_bufs[i], new_bufs[i], size, ARMCI_GROUP_WORLD.rank); gmr_dla_unlock(mreg); num_moved++; } else { new_bufs[i] = orig_bufs[i]; } } *new_bufs_ptr = new_bufs; } else { *new_bufs_ptr = orig_bufs; } return num_moved; }
/** General ARMCI global operation (reduction).  Collective on group.
  *
  * @param[in]    scope Scope in which to perform the GOP.  SCOPE_ALL and
  *                     SCOPE_MASTERS reduce over the group's communicator;
  *                     any other value reduces over MPI_COMM_SELF.
  * @param[inout] x     Vector of n elements of the given type, contains input
  *                     and will contain output.
  * @param[in]    n     Length of x
  * @param[in]    op    One of '+', '*', 'max', 'min', 'or', 'absmax', 'absmin'
  * @param[in]    type  Data type of x (e.g. ARMCI_INT, ...)
  * @param[in]    group Group on which to perform the GOP
  */
void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group) {
  void        *out;
  MPI_Op       mpi_op;
  MPI_Datatype mpi_type;
  MPI_Comm     comm;
  int          mpi_type_size;

  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    comm = group->comm;
  else
    comm = MPI_COMM_SELF;

  if (op[0] == '+') {
    mpi_op = MPI_SUM;
  } else if (op[0] == '*') {
    mpi_op = MPI_PROD;
  } else if (strncmp(op, "max", 3) == 0) {
    mpi_op = MPI_MAX;
  } else if (strncmp(op, "min", 3) == 0) {
    mpi_op = MPI_MIN;
  } else if (strncmp(op, "or", 2) == 0) {
    mpi_op = MPI_BOR;
  } else if (strncmp(op, "absmax", 6) == 0) {
    mpi_op = MPI_ABSMAX_OP;
  } else if (strncmp(op, "absmin", 6) == 0) {
    mpi_op = MPI_ABSMIN_OP;
  } else {
    ARMCII_Error("unknown operation \'%s\'", op);
    return;
  }

  switch(type) {
    case ARMCI_INT:
      mpi_type = MPI_INT;
      break;
    case ARMCI_LONG:
      mpi_type = MPI_LONG;
      break;
    case ARMCI_LONG_LONG:
      mpi_type = MPI_LONG_LONG;
      break;
    case ARMCI_FLOAT:
      mpi_type = MPI_FLOAT;
      break;
    case ARMCI_DOUBLE:
      mpi_type = MPI_DOUBLE;
      break;
    default:
      ARMCII_Error("unknown type (%d)", type);
      return;
  }

  // ABS MAX/MIN are unary as well as binary.  We need to also apply abs in the
  // single processor case when reduce would normally just be a no-op.  That
  // covers both a group of size one and a reduction scoped to MPI_COMM_SELF.
  if ((group->size == 1 || comm == MPI_COMM_SELF) &&
      (mpi_op == MPI_ABSMAX_OP || mpi_op == MPI_ABSMIN_OP)) {
    ARMCII_Absv_op(x, x, &n, &mpi_type);
    return;
  }

  MPI_Type_size(mpi_type, &mpi_type_size);

  out = malloc(n*mpi_type_size);
  ARMCII_Assert(out != NULL);

  // Reduce over the communicator selected by scope, not unconditionally over
  // the group's communicator.
  MPI_Allreduce(x, out, n, mpi_type, mpi_op, comm);

  ARMCI_Copy(out, x, n*mpi_type_size);
  free(out);
}
/** General ARMCI global operation (reduction).  Collective on group.
  *
  * @param[in]    scope Scope in which to perform the GOP.  SCOPE_ALL and
  *                     SCOPE_MASTERS reduce over the group's communicator;
  *                     any other value reduces over MPI_COMM_SELF.
  * @param[inout] x     Vector of n data elements, contains input and will
  *                     contain output.
  * @param[in]    n     Length of x
  * @param[in]    op    One of '+', '*', 'max', 'min', 'or', 'absmax', 'absmin',
  *                     'land'/'&&', 'lor'/'||', 'band'/'&', 'bor'/'|'
  * @param[in]    type  Data type of x (e.g. ARMCI_INT, ...)
  * @param[in]    group Group on which to perform the GOP
  */
void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group) {
  void        *out, **x_buf;
  MPI_Op       mpi_op;
  MPI_Datatype mpi_type;
  MPI_Comm     comm;
  int          mpi_type_size;

  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    comm = group->comm;
  else
    comm = MPI_COMM_SELF;

  if (op[0] == '+') {
    mpi_op = MPI_SUM;
  } else if (op[0] == '*') {
    mpi_op = MPI_PROD;
  } else if (strncmp(op, "max", 3) == 0) {
    mpi_op = MPI_MAX;
  } else if (strncmp(op, "min", 3) == 0) {
    mpi_op = MPI_MIN;
  } else if (strncmp(op, "or", 2) == 0) {
    mpi_op = MPI_BOR;
  } else if (strncmp(op, "absmax", 6) == 0) {
    mpi_op = ARMCI_MPI_ABSMAX_OP;
  } else if (strncmp(op, "absmin", 6) == 0) {
    mpi_op = ARMCI_MPI_ABSMIN_OP;
  /* The following were added ComEx/ARMCI in 2017. */
  /* https://github.com/GlobalArrays/ga/commit/14ef3cfa4ea3ffa7ee721c2a98685669359f7044 */
  /* && and || need to be tested before & and | to avoid the latter matching the former. */
  } else if ((strncmp(op, "land", 4) == 0) || (strncmp(op, "&&", 2) == 0)) {
    mpi_op = MPI_LAND;
  } else if ((strncmp(op, "lor", 3) == 0) || (strncmp(op, "||", 2) == 0)) {
    mpi_op = MPI_LOR;
  } else if ((strncmp(op, "band", 4) == 0) || (strncmp(op, "&", 1) == 0)) {
    mpi_op = MPI_BAND;
  } else if ((strncmp(op, "bor", 3) == 0) || (strncmp(op, "|", 1) == 0)) {
    mpi_op = MPI_BOR;
  } else {
    ARMCII_Error("unknown operation \'%s\'", op);
    return;
  }

  switch(type) {
    case ARMCI_INT:
      mpi_type = MPI_INT;
      break;
    case ARMCI_LONG:
      mpi_type = MPI_LONG;
      break;
    case ARMCI_LONG_LONG:
      mpi_type = MPI_LONG_LONG;
      break;
    case ARMCI_FLOAT:
      mpi_type = MPI_FLOAT;
      break;
    case ARMCI_DOUBLE:
      mpi_type = MPI_DOUBLE;
      break;
    default:
      ARMCII_Error("unknown type (%d)", type);
      return;
  }

  MPI_Type_size(mpi_type, &mpi_type_size);

  // x may live in a shared region; work on x_buf[0], which is either x itself
  // or a private copy that finish_write_vec copies back at the end.
  ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n*mpi_type_size);

  // ABS MAX/MIN are unary as well as binary.  We need to also apply abs in the
  // single processor case when reduce would normally just be a no-op.  That
  // covers both a group of size one and a reduction scoped to MPI_COMM_SELF.
  if ((group->size == 1 || comm == MPI_COMM_SELF) &&
      (mpi_op == ARMCI_MPI_ABSMAX_OP || mpi_op == ARMCI_MPI_ABSMIN_OP)) {
    ARMCII_Absv_op(x_buf[0], x_buf[0], &n, &mpi_type);
  } else {
    out = malloc(n*mpi_type_size);
    ARMCII_Assert(out != NULL);

    MPI_Allreduce(x_buf[0], out, n, mpi_type, mpi_op, comm);

    ARMCI_Copy(out, x_buf[0], n*mpi_type_size);
    free(out);
  }

  ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n*mpi_type_size);
}