/** Optimized implementation of the ARMCI IOV operation that uses a single
  * lock/unlock pair.
  */
int ARMCII_Iov_op_batched(enum ARMCII_Op_e op, void **src, void **dst, int count, int elem_count,
    MPI_Datatype type, int proc) {

  int i;
  gmr_t *mreg;
  void *shr_ptr;

  switch(op) {
    case ARMCII_OP_PUT:
      shr_ptr = dst[0];
      break;
    case ARMCII_OP_GET:
      shr_ptr = src[0];
      break;
    case ARMCII_OP_ACC:
      shr_ptr = dst[0];
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  mreg = gmr_lookup(shr_ptr, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  gmr_lock(mreg, proc);

  for (i = 0; i < count; i++) {

    /* If a batch limit is set, periodically release and re-acquire the lock
     * so other processes can make progress on this region. */
    if (   ARMCII_GLOBAL_STATE.iov_batched_limit > 0
        && i % ARMCII_GLOBAL_STATE.iov_batched_limit == 0
        && i > 0 )
    {
      gmr_unlock(mreg, proc);
      gmr_lock(mreg, proc);
    }

    switch(op) {
      case ARMCII_OP_PUT:
        gmr_put(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_GET:
        gmr_get(mreg, src[i], dst[i], elem_count, proc);
        break;
      case ARMCII_OP_ACC:
        gmr_accumulate(mreg, src[i], dst[i], elem_count, type, proc);
        break;
      default:
        ARMCII_Error("unknown operation (%d)", op);
        return 1;
    }
  }

  gmr_unlock(mreg, proc);

  return 0;
}
/** Perform atomic read-modify-write on the given integer or long location and
  * return the location's original value.
  *
  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
  * but not with respect to other one-sided operations (get, put, acc, etc).
  *
  * @param[in]  op    Operation to be performed:
  *                     ARMCI_FETCH_AND_ADD (int)
  *                     ARMCI_FETCH_AND_ADD_LONG
  *                     ARMCI_SWAP (int)
  *                     ARMCI_SWAP_LONG
  * @param[out] ploc  Location to store the original value.
  * @param[in]  prem  Location on which to perform atomic operation.
  * @param[in]  value Value to add to remote location (ignored for swap).
  * @param[in]  proc  Process rank for the target buffer.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  int is_long;
  gmr_t *mreg;

  mreg = gmr_lookup(prem, proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG)
    is_long = 1;
  else
    is_long = 0;

  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
    long swap_val_l;
    int  swap_val_i;

    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
    PARMCI_Get(prem, is_long ? (void*) &swap_val_l : (void*) &swap_val_i,
               is_long ? sizeof(long) : sizeof(int), proc);
    PARMCI_Put(ploc, prem, is_long ? sizeof(long) : sizeof(int), proc);
    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    if (is_long)
      *(long*) ploc = swap_val_l;
    else
      *(int*) ploc = swap_val_i;

  } else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG) {
    long fetch_val_l, new_val_l;
    int  fetch_val_i, new_val_i;

    ARMCIX_Lock_hdl(mreg->rmw_mutex, 0, proc);
    PARMCI_Get(prem, is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i,
               is_long ? sizeof(long) : sizeof(int), proc);

    if (is_long)
      new_val_l = fetch_val_l + value;
    else
      new_val_i = fetch_val_i + value;

    PARMCI_Put(is_long ? (void*) &new_val_l : (void*) &new_val_i, prem,
               is_long ? sizeof(long) : sizeof(int), proc);
    ARMCIX_Unlock_hdl(mreg->rmw_mutex, 0, proc);

    if (is_long)
      *(long*) ploc = fetch_val_l;
    else
      *(int*) ploc = fetch_val_i;

  } else {
    ARMCII_Error("invalid operation (%d)", op);
  }

  return 0;
}
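/* Illustrative usage sketch (not part of the original source): a shared ticket
 * counter incremented with the fetch-and-add flavor of RMW.  PARMCI_Rmw is the
 * profiling-interface name; applications normally call ARMCI_Rmw.  Assumes
 * <armci.h>, <stdio.h>, and <stdlib.h>, and that ARMCI_Init() has been called. */
void example_shared_counter(int me, int nproc) {
  void **base = malloc(nproc * sizeof(void*));
  int old, zero = 0;

  /* Collective: every process contributes one int; the counter lives on rank 0 */
  ARMCI_Malloc(base, sizeof(int));
  if (me == 0)
    ARMCI_Put(&zero, base[0], sizeof(int), 0);   /* initialize the counter */
  ARMCI_Barrier();

  /* Atomically fetch the counter on rank 0 and add 1 to it */
  ARMCI_Rmw(ARMCI_FETCH_AND_ADD, &old, base[0], 1, 0);
  printf("rank %d got ticket %d\n", me, old);

  ARMCI_Barrier();
  ARMCI_Free(base[me]);
  free(base);
}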
/** Create ARMCI mutexes.  Collective.
  *
  * @param[in] count Number of mutexes to create on the calling process
  */
int ARMCI_Create_mutexes(int count) {
  if (armci_mutex_hdl != NULL)
    ARMCII_Error("attempted to create ARMCI mutexes multiple times");

  armci_mutex_hdl = ARMCIX_Create_mutexes_hdl(count, &ARMCI_GROUP_WORLD);

  if (armci_mutex_hdl != NULL)
    return 0;
  else
    return 1;
}
/** Destroy/free ARMCI mutexes.  Collective.
  */
int ARMCI_Destroy_mutexes(void) {
  int err;

  if (armci_mutex_hdl == NULL)
    ARMCII_Error("attempted to free unallocated ARMCI mutexes");

  err = ARMCIX_Destroy_mutexes_hdl(armci_mutex_hdl);
  armci_mutex_hdl = NULL;

  return err;
}
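/* Illustrative usage sketch (not part of the original source): the typical
 * mutex life cycle around a critical section.  ARMCI_Lock/ARMCI_Unlock follow
 * the standard ARMCI API; mutex 0 is assumed to be hosted on process 0. */
void example_mutex_critical_section(void) {
  ARMCI_Create_mutexes(1);        /* collective: one mutex per process       */

  ARMCI_Lock(0, 0);               /* acquire mutex 0 on process 0            */
  /* ... access data protected by the mutex (e.g. ARMCI_Put / ARMCI_Get) ... */
  ARMCI_Unlock(0, 0);             /* release it                              */

  ARMCI_Destroy_mutexes();        /* collective: free all mutexes            */
}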
/** Perform an I/O vector operation.  Local buffers must be private.
  *
  * @param[in] op          Operation to be performed (ARMCII_OP_PUT, ...)
  * @param[in] src         Array of source pointers
  * @param[in] dst         Array of destination pointers
  * @param[in] count       Length of pointer arrays
  * @param[in] size        Size of each transfer
  * @param[in] datatype    Data type for accumulate op (ignored for all others)
  * @param[in] overlapping Do remote regions overlap?
  * @param[in] same_alloc  Do remote regions correspond to the same allocation?
  * @param[in] proc        Target process
  * @return                Zero on success, error code otherwise
  */
int ARMCII_Iov_op_dispatch(enum ARMCII_Op_e op, void **src, void **dst, int count, int size,
    int datatype, int overlapping, int same_alloc, int proc) {

  MPI_Datatype type;
  int type_count, type_size;

  if (op == ARMCII_OP_ACC) {
    ARMCII_Acc_type_translate(datatype, &type, &type_size);
    type_count = size/type_size;
    ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size");
  } else {
    type = MPI_BYTE;
    MPI_Type_size(type, &type_size);
    type_count = size/type_size;
    ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of type size");
  }

  // CONSERVATIVE CASE: If remote pointers overlap or remote pointers correspond to
  // multiple allocations, use the safe implementation to avoid invalid MPI use.

  if (overlapping || !same_alloc || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_CONSRV) {
    if (overlapping) ARMCII_Warning("IOV remote buffers overlap\n");
    if (!same_alloc) ARMCII_Warning("IOV remote buffers are not within the same allocation\n");
    return ARMCII_Iov_op_safe(op, src, dst, count, type_count, type, proc);
  }

  // OPTIMIZED CASE: It's safe for us to issue all the operations under a
  // single lock.

  else if (   ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_DIRECT
           || ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_AUTO ) {
    if (ARMCII_GLOBAL_STATE.no_mpi_bottom == 1) {
      return ARMCII_Iov_op_datatype_no_bottom(op, src, dst, count, type_count, type, proc);
    } else {
      return ARMCII_Iov_op_datatype(op, src, dst, count, type_count, type, proc);
    }
  }

  else if (ARMCII_GLOBAL_STATE.iov_method == ARMCII_IOV_BATCHED) {
    return ARMCII_Iov_op_batched(op, src, dst, count, type_count, type, proc);
  }

  else {
    ARMCII_Error("unknown iov method (%d)\n", ARMCII_GLOBAL_STATE.iov_method);
    return 1;
  }
}
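/* Illustrative usage sketch (not part of the original source): issuing a
 * three-element vector put through the dispatcher.  The local buffers,
 * 'remote_base' allocation, and strides are hypothetical; datatype 0 is passed
 * because the datatype argument is ignored for non-accumulate operations. */
void example_iov_put(void *remote_base, int proc) {
  double local[3][8];
  void  *src[3], *dst[3];
  int    i;

  for (i = 0; i < 3; i++) {
    src[i] = local[i];                       /* private local buffers        */
    dst[i] = (double*) remote_base + i*16;   /* strided, non-overlapping targets */
  }

  /* Non-overlapping targets within one allocation: the dispatcher may take
   * the datatype- or batch-based path depending on the ARMCII_IOV_* setting. */
  ARMCII_Iov_op_dispatch(ARMCII_OP_PUT, src, dst, 3, 8*sizeof(double),
                         0 /* datatype ignored for put */, 0 /* overlapping */,
                         1 /* same_alloc */, proc);
}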
/** MPI reduction operator that computes the maximum absolute value.
  */
void ARMCII_Absmax_op(void *invec, void *inoutvec, int *len, MPI_Datatype *datatype) {
  const int    count = *len;
  MPI_Datatype dt    = *datatype;

  if (dt == MPI_INT) {
    ABSMAX(invec, inoutvec, count, int, IABS);
  } else if (dt == MPI_LONG) {
    ABSMAX(invec, inoutvec, count, long, IABS);
  } else if (dt == MPI_LONG_LONG) {
    ABSMAX(invec, inoutvec, count, long long, IABS);
  } else if (dt == MPI_FLOAT) {
    ABSMAX(invec, inoutvec, count, float, FABS);
  } else if (dt == MPI_DOUBLE) {
    ABSMAX(invec, inoutvec, count, double, FABS);
  } else {
    ARMCII_Error("unknown type (%d)", *datatype);
  }
}
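/* The ABSMAX, IABS, and FABS helpers are not shown in this listing.  Below is
 * a minimal sketch (an assumption, not the library's actual definition) of an
 * element-wise "maximum of absolute values" macro, together with the
 * MPI_Op_create call that a user-defined reduction like ARMCII_Absmax_op
 * requires before it can be passed to MPI_Allreduce. */
#define EXAMPLE_IABS(x)  ((x) < 0   ? -(x) : (x))
#define EXAMPLE_FABS(x)  ((x) < 0.0 ? -(x) : (x))
#define EXAMPLE_ABSMAX(IN, INOUT, COUNT, TYPE, ABS)                        \
  do {                                                                     \
    int _i;                                                                \
    TYPE *_in = (TYPE*)(IN), *_io = (TYPE*)(INOUT);                        \
    for (_i = 0; _i < (COUNT); _i++)                                       \
      _io[_i] = ABS(_in[_i]) > ABS(_io[_i]) ? ABS(_in[_i]) : ABS(_io[_i]); \
  } while (0)

void example_register_absmax(MPI_Op *absmax_op) {
  /* commute = 1: max of absolute values is commutative */
  MPI_Op_create(ARMCII_Absmax_op, 1, absmax_op);
}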
/** Collective index selection reduce operation (scoped).
  */
void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) {
  MPI_Comm    sel_comm;
  sel_data_t *data_in, *data_out;
  void      **x_buf;

  /* printf("[%d] armci_msg_sel_scope(scope=%d, x=%p, n=%d, op=%s, type=%d, contribute=%d)\n",
            ARMCI_GROUP_WORLD.rank, scope, x, n, op, type, contribute); */

  /* Determine the scope of the collective operation */
  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    sel_comm = ARMCI_GROUP_WORLD.comm;
  else
    sel_comm = MPI_COMM_SELF;

  data_in  = malloc(sizeof(sel_data_t)+n-1);
  data_out = malloc(sizeof(sel_data_t)+n-1);

  ARMCII_Assert(data_in != NULL && data_out != NULL);

  ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n);

  data_in->contribute = contribute;
  data_in->type       = type;

  if (contribute)
    ARMCI_Copy(x, data_in->data, n);

  if (strncmp(op, "min", 3) == 0) {
    MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, ARMCI_MPI_SELMIN_OP, sel_comm);
  } else if (strncmp(op, "max", 3) == 0) {
    MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, ARMCI_MPI_SELMAX_OP, sel_comm);
  } else {
    ARMCII_Error("Invalid operation (%s)", op);
  }

  ARMCI_Copy(data_out->data, x, n);

  ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n);

  free(data_in);
  free(data_out);
}
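/* Illustrative usage sketch (not part of the original source): every rank
 * proposes a candidate value and the group keeps the maximum.  That n is a
 * byte count is an assumption based on the ARMCI_Copy calls above. */
void example_msg_sel(int me) {
  double candidate = (double) me;   /* hypothetical per-rank score */

  /* All ranks contribute; on return 'candidate' holds the selected maximum */
  armci_msg_sel_scope(SCOPE_ALL, &candidate, sizeof(double), "max", ARMCI_DOUBLE, 1);
}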
/** Max operator for armci_msg_sel
  */
void ARMCII_Msg_sel_max_op(void *data_in, void *data_inout, int *len, MPI_Datatype *datatype) {
  sel_data_t *sd_1, *sd_2;

  sd_1 = (sel_data_t*) data_in;
  sd_2 = (sel_data_t*) data_inout;

  if (sd_1->contribute && !sd_2->contribute) {
    ARMCI_Copy(data_in, data_inout, *len);
  }

  else if (sd_1->contribute && sd_2->contribute) {

#define MSG_SEL_MAX_OP(X,Y,LEN,TYPE)                                       \
  do {                                                                     \
    if (*(TYPE*)((sel_data_t*)X)->data > *(TYPE*)((sel_data_t*)Y)->data)   \
      ARMCI_Copy(X, Y, LEN);                                               \
  } while (0)

    switch (sd_1->type) {
      case ARMCI_INT:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, int);
        break;
      case ARMCI_LONG:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, long);
        break;
      case ARMCI_LONG_LONG:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, long long);
        break;
      case ARMCI_FLOAT:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, float);
        break;
      case ARMCI_DOUBLE:
        MSG_SEL_MAX_OP(data_in, data_inout, *len, double);
        break;
      default:
        ARMCII_Error("Invalid data type (%d)", sd_1->type);
    }

#undef MSG_SEL_MAX_OP

  }

  /* else: no need to copy, data_inout already contains what we want to return */
}
/** Check if an operation with the given parameters requires scaling.
  *
  * @param[in] datatype Type of the data involved in the operation
  * @param[in] scale    Pointer to the scale factor, interpreted according to datatype
  * @return             Nonzero if scale is not the identity (1, or 1+0i for the complex types)
  */
int ARMCII_Buf_acc_is_scaled(int datatype, void *scale) {
  switch (datatype) {
    case ARMCI_ACC_INT:
      if (*((int*)scale) == 1)
        return 0;
      break;
    case ARMCI_ACC_LNG:
      if (*((long*)scale) == 1)
        return 0;
      break;
    case ARMCI_ACC_FLT:
      if (*((float*)scale) == 1.0)
        return 0;
      break;
    case ARMCI_ACC_DBL:
      if (*((double*)scale) == 1.0)
        return 0;
      break;
    case ARMCI_ACC_CPL:
      if (((float*)scale)[0] == 1.0 && ((float*)scale)[1] == 0.0)
        return 0;
      break;
    case ARMCI_ACC_DCP:
      if (((double*)scale)[0] == 1.0 && ((double*)scale)[1] == 0.0)
        return 0;
      break;
    default:
      ARMCII_Error("unknown data type (%d)", datatype);
  }

  return 1;
}
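/* Illustrative usage sketch (not part of the original source): how a caller
 * might combine the check above with ARMCII_Buf_acc_scale (defined later in
 * this listing) to avoid a copy when the scale factor is the identity.  The
 * 'scratch' buffer and its management are hypothetical. */
void example_scale_if_needed(double *buf, int size_bytes, void *scale,
                             void **buf_for_acc, double *scratch) {
  if (ARMCII_Buf_acc_is_scaled(ARMCI_ACC_DBL, scale)) {
    /* Non-identity scale: produce a scaled private copy */
    ARMCII_Buf_acc_scale(buf, scratch, size_bytes, ARMCI_ACC_DBL, scale);
    *buf_for_acc = scratch;
  } else {
    /* Identity scale: the original buffer can be used directly */
    *buf_for_acc = buf;
  }
}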
/** General ARMCI global operation (reduction).  Collective on group.
  *
  * @param[in]    scope Scope in which to perform the GOP (only SCOPE_ALL is supported)
  * @param[inout] x     Vector of n data elements, contains input and will contain output.
  * @param[in]    n     Length of x
  * @param[in]    op    One of '+', '*', 'max', 'min', 'absmax', 'absmin'
  * @param[in]    type  Data type of x
  * @param[in]    group Group on which to perform the GOP
  */
void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group) {
  void        *out;
  MPI_Op       mpi_op;
  MPI_Datatype mpi_type;
  MPI_Comm     comm;
  int          mpi_type_size;

  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    comm = group->comm;
  else
    comm = MPI_COMM_SELF;

  if (op[0] == '+') {
    mpi_op = MPI_SUM;
  } else if (op[0] == '*') {
    mpi_op = MPI_PROD;
  } else if (strncmp(op, "max", 3) == 0) {
    mpi_op = MPI_MAX;
  } else if (strncmp(op, "min", 3) == 0) {
    mpi_op = MPI_MIN;
  } else if (strncmp(op, "or", 2) == 0) {
    mpi_op = MPI_BOR;
  } else if (strncmp(op, "absmax", 6) == 0) {
    mpi_op = MPI_ABSMAX_OP;
  } else if (strncmp(op, "absmin", 6) == 0) {
    mpi_op = MPI_ABSMIN_OP;
  } else {
    ARMCII_Error("unknown operation \'%s\'", op);
    return;
  }

  switch(type) {
    case ARMCI_INT:
      mpi_type = MPI_INT;
      break;
    case ARMCI_LONG:
      mpi_type = MPI_LONG;
      break;
    case ARMCI_LONG_LONG:
      mpi_type = MPI_LONG_LONG;
      break;
    case ARMCI_FLOAT:
      mpi_type = MPI_FLOAT;
      break;
    case ARMCI_DOUBLE:
      mpi_type = MPI_DOUBLE;
      break;
    default:
      ARMCII_Error("unknown type (%d)", type);
      return;
  }

  // ABS MAX/MIN are unary as well as binary.  We need to also apply abs in the
  // single processor case when reduce would normally just be a no-op.
  if (group->size == 1 && (mpi_op == MPI_ABSMAX_OP || mpi_op == MPI_ABSMIN_OP)) {
    ARMCII_Absv_op(x, x, &n, &mpi_type);
    return;
  }

  MPI_Type_size(mpi_type, &mpi_type_size);

  out = malloc(n*mpi_type_size);
  ARMCII_Assert(out != NULL);

  /* Use the scoped communicator selected above */
  MPI_Allreduce(x, out, n, mpi_type, mpi_op, comm);

  ARMCI_Copy(out, x, n*mpi_type_size);
  free(out);
}
/** Destroy/free a shared memory region.
  *
  * @param[in] mreg  Memory region to destroy.  May be NULL on processes that
  *                  passed NULL to ARMCI_Free (permitted after a zero-byte allocation).
  * @param[in] group Group on which to perform the free.
  */
void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
  int   search_proc_in, search_proc_out, search_proc_out_grp;
  void *search_base;
  int   alloc_me, alloc_nproc;
  int   world_me, world_nproc;

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
   * can support passing NULL into ARMCI_Free() which is permitted when a
   * process allocates 0 bytes.  Unfortunately, in this case we still need to
   * identify the mem region and free it.
   */

  if (mreg == NULL)
    search_proc_in = -1;
  else {
    search_proc_in = world_me;
    search_base    = mreg->slices[world_me].base;
  }

  /* Collectively decide on who will provide the base address */
  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);

  /* Everyone passed NULL.  Nothing to free. */
  if (search_proc_out < 0)
    return;

  /* Translate world rank to group rank */
  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);

  /* Broadcast the base address */
  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);

  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
  if (mreg == NULL)
    mreg = gmr_lookup(search_base, search_proc_out);

  /* If it's still not found, the user may have passed the wrong group */
  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");

  switch (mreg->lock_state) {
    case GMR_LOCK_UNLOCKED:
      break;
    case GMR_LOCK_DLA:
      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
      gmr_dla_unlock(mreg);
      break;
    default:
      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
  }

  /* Remove from the list of mem regions */
  if (mreg->prev == NULL) {
    ARMCII_Assert(gmr_list == mreg);
    gmr_list = mreg->next;

    if (mreg->next != NULL)
      mreg->next->prev = NULL;

  } else {
    mreg->prev->next = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = mreg->prev;
  }

  /* Destroy the window and free all buffers */
  MPI_Win_free(&mreg->window);

  if (mreg->slices[world_me].base != NULL)
    MPI_Free_mem(mreg->slices[world_me].base);

  free(mreg->slices);
  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
  free(mreg);
}
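/* Illustrative usage sketch (not part of the original source): the public
 * allocate/free path that ultimately reaches gmr_destroy.  ARMCI_Malloc and
 * ARMCI_Free follow the standard ARMCI API; assumes <armci.h> and <stdlib.h>. */
void example_alloc_free(int me, int nproc) {
  void **bases = malloc(nproc * sizeof(void*));

  /* Collective: every process contributes one 1024-byte slice */
  ARMCI_Malloc(bases, 1024);

  /* ... one-sided communication against bases[proc] ... */

  /* Collective: frees the whole region.  A process that allocated 0 bytes may
   * pass NULL, which gmr_destroy resolves via the Allreduce/Bcast above. */
  ARMCI_Free(bases[me]);
  free(bases);
}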
/** Unlock a mutex.
  *
  * @param[in] mutex Number of the mutex to unlock
  * @param[in] proc  Target process for the unlock operation
  */
void ARMCI_Unlock(int mutex, int proc) {
  if (armci_mutex_hdl == NULL)
    ARMCII_Error("attempted to unlock on unallocated ARMCI mutexes");

  ARMCIX_Unlock_hdl(armci_mutex_hdl, mutex, proc);
}
/** Optimized implementation of the ARMCI IOV operation that uses an MPI
  * datatype to achieve a one-sided gather/scatter.  Does not use MPI_BOTTOM.
  */
int ARMCII_Iov_op_datatype_no_bottom(enum ARMCII_Op_e op, void **src, void **dst, int count,
    int elem_count, MPI_Datatype type, int proc) {

  gmr_t *mreg;
  MPI_Datatype  type_loc, type_rem;
  MPI_Aint      disp_loc[count];
  int           disp_rem[count];
  int           block_len[count];
  void         *dst_win_base;
  int           dst_win_size, i, type_size;
  void        **buf_rem, **buf_loc;
  MPI_Aint      base_rem;
  MPI_Aint      base_loc;
  void         *base_loc_ptr;

  switch(op) {
    case ARMCII_OP_ACC:
    case ARMCII_OP_PUT:
      buf_rem = dst;
      buf_loc = src;
      break;
    case ARMCII_OP_GET:
      buf_rem = src;
      buf_loc = dst;
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  MPI_Type_size(type, &type_size);

  mreg = gmr_lookup(buf_rem[0], proc);
  ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

  dst_win_base = mreg->slices[proc].base;
  dst_win_size = mreg->slices[proc].size;

  MPI_Get_address(dst_win_base, &base_rem);

  /* Pick a base address for the start of the origin's datatype */
  base_loc_ptr = buf_loc[0];
  MPI_Get_address(base_loc_ptr, &base_loc);

  for (i = 0; i < count; i++) {
    MPI_Aint target_rem, target_loc;
    MPI_Get_address(buf_loc[i], &target_loc);
    MPI_Get_address(buf_rem[i], &target_rem);
    disp_loc[i]  =  target_loc - base_loc;
    disp_rem[i]  = (target_rem - base_rem)/type_size;
    block_len[i] = elem_count;

    ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0, "Transfer size is not a multiple of type size");
    ARMCII_Assert_msg(disp_rem[i] >= 0 && disp_rem[i] < dst_win_size, "Invalid remote pointer");
    ARMCII_Assert_msg(((uint8_t*)buf_rem[i]) + block_len[i] <= ((uint8_t*)dst_win_base) + dst_win_size, "Transfer exceeds buffer length");
  }

  MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
  MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
  //MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem);

  MPI_Type_commit(&type_loc);
  MPI_Type_commit(&type_rem);

  gmr_lock(mreg, proc);

  switch(op) {
    case ARMCII_OP_ACC:
      gmr_accumulate_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_PUT:
      gmr_put_typed(mreg, base_loc_ptr, 1, type_loc, MPI_BOTTOM, 1, type_rem, proc);
      break;
    case ARMCII_OP_GET:
      gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem, base_loc_ptr, 1, type_loc, proc);
      break;
    default:
      ARMCII_Error("unknown operation (%d)", op);
      return 1;
  }

  gmr_unlock(mreg, proc);

  MPI_Type_free(&type_loc);
  MPI_Type_free(&type_rem);

  return 0;
}
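/* Standalone sketch (not part of the original source) of the datatype trick
 * used above: local displacements are byte offsets from buf_loc[0] (hindexed),
 * while remote displacements are element offsets from the window base
 * (indexed_block).  The three local/remote buffers here are hypothetical;
 * assumes <mpi.h>. */
void example_build_iov_types(double *local[3], double *win_base, double *remote[3],
                             int elem_count, MPI_Datatype *type_loc, MPI_Datatype *type_rem) {
  MPI_Aint base_loc, disp_loc[3];
  int      disp_rem[3], block_len[3], i;

  MPI_Get_address(local[0], &base_loc);

  for (i = 0; i < 3; i++) {
    MPI_Aint a;
    MPI_Get_address(local[i], &a);
    disp_loc[i]  = a - base_loc;                 /* bytes from the local base     */
    disp_rem[i]  = (int)(remote[i] - win_base);  /* elements from the window base */
    block_len[i] = elem_count;
  }

  MPI_Type_create_hindexed(3, block_len, disp_loc, MPI_DOUBLE, type_loc);
  MPI_Type_create_indexed_block(3, elem_count, disp_rem, MPI_DOUBLE, type_rem);
  MPI_Type_commit(type_loc);
  MPI_Type_commit(type_rem);
}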
/** General ARMCI global operation (reduction).  Collective on group.
  *
  * @param[in]    scope Scope in which to perform the GOP (only SCOPE_ALL is supported)
  * @param[inout] x     Vector of n data elements, contains input and will contain output.
  * @param[in]    n     Length of x
  * @param[in]    op    One of '+', '*', 'max', 'min', 'absmax', 'absmin', 'or',
  *                     'land'/'&&', 'lor'/'||', 'band'/'&', 'bor'/'|'
  * @param[in]    type  Data type of x (e.g. ARMCI_INT, ...)
  * @param[in]    group Group on which to perform the GOP
  */
void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group) {
  void        *out, **x_buf;
  MPI_Op       mpi_op;
  MPI_Datatype mpi_type;
  MPI_Comm     comm;
  int          mpi_type_size;

  /* FIXME: scope argument presently ignored */
  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    comm = group->comm;
  else
    comm = MPI_COMM_SELF;

  if (op[0] == '+') {
    mpi_op = MPI_SUM;
  } else if (op[0] == '*') {
    mpi_op = MPI_PROD;
  } else if (strncmp(op, "max", 3) == 0) {
    mpi_op = MPI_MAX;
  } else if (strncmp(op, "min", 3) == 0) {
    mpi_op = MPI_MIN;
  } else if (strncmp(op, "or", 2) == 0) {
    mpi_op = MPI_BOR;
  } else if (strncmp(op, "absmax", 6) == 0) {
    mpi_op = ARMCI_MPI_ABSMAX_OP;
  } else if (strncmp(op, "absmin", 6) == 0) {
    mpi_op = ARMCI_MPI_ABSMIN_OP;
  /* The following were added to ComEx/ARMCI in 2017. */
  /* https://github.com/GlobalArrays/ga/commit/14ef3cfa4ea3ffa7ee721c2a98685669359f7044 */
  /* && and || need to be tested before & and | to avoid the latter matching the former. */
  } else if ((strncmp(op, "land", 4) == 0) || (strncmp(op, "&&", 2) == 0)) {
    mpi_op = MPI_LAND;
  } else if ((strncmp(op, "lor", 3) == 0) || (strncmp(op, "||", 2) == 0)) {
    mpi_op = MPI_LOR;
  } else if ((strncmp(op, "band", 4) == 0) || (strncmp(op, "&", 1) == 0)) {
    mpi_op = MPI_BAND;
  } else if ((strncmp(op, "bor", 3) == 0) || (strncmp(op, "|", 1) == 0)) {
    mpi_op = MPI_BOR;
  } else {
    ARMCII_Error("unknown operation \'%s\'", op);
    return;
  }

  switch(type) {
    case ARMCI_INT:
      mpi_type = MPI_INT;
      break;
    case ARMCI_LONG:
      mpi_type = MPI_LONG;
      break;
    case ARMCI_LONG_LONG:
      mpi_type = MPI_LONG_LONG;
      break;
    case ARMCI_FLOAT:
      mpi_type = MPI_FLOAT;
      break;
    case ARMCI_DOUBLE:
      mpi_type = MPI_DOUBLE;
      break;
    default:
      ARMCII_Error("unknown type (%d)", type);
      return;
  }

  MPI_Type_size(mpi_type, &mpi_type_size);

  ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n*mpi_type_size);

  // ABS MAX/MIN are unary as well as binary.  We need to also apply abs in the
  // single processor case when reduce would normally just be a no-op.
  if (group->size == 1 && (mpi_op == ARMCI_MPI_ABSMAX_OP || mpi_op == ARMCI_MPI_ABSMIN_OP)) {
    ARMCII_Absv_op(x_buf[0], x_buf[0], &n, &mpi_type);
  }

  else {
    out = malloc(n*mpi_type_size);
    ARMCII_Assert(out != NULL);

    MPI_Allreduce(x_buf[0], out, n, mpi_type, mpi_op, comm);

    ARMCI_Copy(out, x_buf[0], n*mpi_type_size);
    free(out);
  }

  ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n*mpi_type_size);
}
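/* Illustrative usage sketch (not part of the original source): summing a small
 * vector of doubles across the world group.  ARMCI_GROUP_WORLD is the group
 * object used elsewhere in this listing; SCOPE_ALL selects the full group. */
void example_global_sum(void) {
  double partial[4] = {1.0, 2.0, 3.0, 4.0};   /* hypothetical per-rank data */

  /* On return, partial[] holds the element-wise sum over all processes */
  armci_msg_group_gop_scope(SCOPE_ALL, partial, 4, "+", ARMCI_DOUBLE, &ARMCI_GROUP_WORLD);
}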
/** Perform atomic read-modify-write on the given integer or long location and
  * return the location's original value.
  *
  * \note ARMCI RMW operations are atomic with respect to other RMW operations,
  * but not with respect to other one-sided operations (get, put, acc, etc).
  *
  * @param[in]  op    Operation to be performed:
  *                     ARMCI_FETCH_AND_ADD (int)
  *                     ARMCI_FETCH_AND_ADD_LONG
  *                     ARMCI_SWAP (int)
  *                     ARMCI_SWAP_LONG
  * @param[out] ploc  Location to store the original value.
  * @param[in]  prem  Location on which to perform atomic operation.
  * @param[in]  value Value to add to remote location (ignored for swap).
  * @param[in]  proc  Process rank for the target buffer.
  */
int PARMCI_Rmw(int op, void *ploc, void *prem, int value, int proc) {
  int           is_swap = 0, is_long = 0;
  MPI_Datatype  type;
  MPI_Op        rop;
  gmr_t        *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(ploc, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  dst_mreg = gmr_lookup(prem, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  if (op == ARMCI_SWAP_LONG || op == ARMCI_FETCH_AND_ADD_LONG) {
    is_long = 1;
    type = MPI_LONG;
  } else
    type = MPI_INT;

  if (op == ARMCI_SWAP || op == ARMCI_SWAP_LONG) {
    is_swap = 1;
    rop = MPI_REPLACE;
  } else if (op == ARMCI_FETCH_AND_ADD || op == ARMCI_FETCH_AND_ADD_LONG)
    rop = MPI_SUM;
  else
    ARMCII_Error("invalid operation (%d)", op);

  /* We hold the DLA lock if (src_mreg != NULL). */

  if (is_swap) {
    long out_val_l, src_val_l = *((long*)ploc);
    int  out_val_i, src_val_i = *((int*)ploc);

    gmr_fetch_and_op(dst_mreg,
                     is_long ? (void*) &src_val_l : (void*) &src_val_i /* src */,
                     is_long ? (void*) &out_val_l : (void*) &out_val_i /* out */,
                     prem /* dst */, type, rop, proc);
    gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */

    if (is_long)
      *(long*) ploc = out_val_l;
    else
      *(int*) ploc = out_val_i;
  }
  else /* fetch-and-add */ {
    long fetch_val_l, add_val_l = value;
    int  fetch_val_i, add_val_i = value;

    gmr_fetch_and_op(dst_mreg,
                     is_long ? (void*) &add_val_l   : (void*) &add_val_i   /* src */,
                     is_long ? (void*) &fetch_val_l : (void*) &fetch_val_i /* out */,
                     prem /* dst */, type, rop, proc);
    gmr_flush(dst_mreg, proc, 0); /* it's a round trip so w.r.t. flush, local=remote */

    if (is_long)
      *(long*) ploc = fetch_val_l;
    else
      *(int*) ploc = fetch_val_i;
  }

  return 0;
}
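/* Illustrative usage sketch (not part of the original source): using the swap
 * flavor of ARMCI_Rmw as a simple test-and-set spin lock.  'flag' is a
 * hypothetical int inside an ARMCI allocation on process 'proc', initialized
 * to 0; recall from the note above that RMW is atomic only with respect to
 * other RMW operations. */
void example_rmw_spinlock(int *flag, int proc) {
  int mine;

  /* Atomically exchange 1 into the remote flag; the previous value comes back
   * in 'mine'.  Loop until we observe that the flag was previously 0. */
  do {
    mine = 1;
    ARMCI_Rmw(ARMCI_SWAP, &mine, flag, 0 /* value ignored for swap */, proc);
  } while (mine != 0);

  /* ... critical section ... */

  mine = 0;
  ARMCI_Rmw(ARMCI_SWAP, &mine, flag, 0, proc);   /* release: swap 0 back in */
}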
void armci_msg_reduce_scope(int scope, void *x, int n, char *op, int type) {
  ARMCII_Error("unimplemented"); // TODO
}
/** Scale a buffer of the given ARMCI accumulate type by a scale factor,
  * writing the result to a private output buffer.  Both buffers hold size
  * bytes; for the complex types, scaling is a complex multiplication.
  *
  * @param[in]  buf_in   Original buffer.
  * @param[out] buf_out  Buffer that receives the scaled data.
  * @param[in]  size     Size of the buffers in bytes.
  * @param[in]  datatype ARMCI accumulate type of the data (ARMCI_ACC_INT, ...).
  * @param[in]  scale    Scaling constant to apply (a single value, or a
  *                      real/imaginary pair for the complex types).
  */
void ARMCII_Buf_acc_scale(void *buf_in, void *buf_out, int size, int datatype, void *scale) {
  int   j, nelem;
  int   type_size = -1;
  MPI_Datatype type;

  switch (datatype) {
    case ARMCI_ACC_INT:
      MPI_Type_size(MPI_INT, &type_size);
      type = MPI_INT;
      nelem= size/type_size;

      {
        int *src_i = (int*) buf_in;
        int *scl_i = (int*) buf_out;
        const int s = *((int*) scale);
        for (j = 0; j < nelem; j++)
          scl_i[j] = src_i[j]*s;
      }
      break;

    case ARMCI_ACC_LNG:
      MPI_Type_size(MPI_LONG, &type_size);
      type = MPI_LONG;
      nelem= size/type_size;

      {
        long *src_l = (long*) buf_in;
        long *scl_l = (long*) buf_out;
        const long s = *((long*) scale);
        for (j = 0; j < nelem; j++)
          scl_l[j] = src_l[j]*s;
      }
      break;

    case ARMCI_ACC_FLT:
      MPI_Type_size(MPI_FLOAT, &type_size);
      type = MPI_FLOAT;
      nelem= size/type_size;

      {
        float *src_f = (float*) buf_in;
        float *scl_f = (float*) buf_out;
        const float s = *((float*) scale);
        for (j = 0; j < nelem; j++)
          scl_f[j] = src_f[j]*s;
      }
      break;

    case ARMCI_ACC_DBL:
      MPI_Type_size(MPI_DOUBLE, &type_size);
      type = MPI_DOUBLE;
      nelem= size/type_size;

      {
        double *src_d = (double*) buf_in;
        double *scl_d = (double*) buf_out;
        const double s = *((double*) scale);
        for (j = 0; j < nelem; j++)
          scl_d[j] = src_d[j]*s;
      }
      break;

    case ARMCI_ACC_CPL:
      MPI_Type_size(MPI_FLOAT, &type_size);
      type = MPI_FLOAT;
      nelem= size/type_size;

      {
        float *src_fc = (float*) buf_in;
        float *scl_fc = (float*) buf_out;
        const float s_r = ((float*)scale)[0];
        const float s_c = ((float*)scale)[1];

        for (j = 0; j < nelem; j += 2) {
          // Complex multiplication: (a + bi)*(c + di)
          const float src_fc_j   = src_fc[j];
          const float src_fc_j_1 = src_fc[j+1];
          /*
          scl_fc[j]   = src_fc[j]*s_r   - src_fc[j+1]*s_c;
          scl_fc[j+1] = src_fc[j+1]*s_r + src_fc[j]*s_c;
          */
          scl_fc[j]   = src_fc_j*s_r   - src_fc_j_1*s_c;
          scl_fc[j+1] = src_fc_j_1*s_r + src_fc_j*s_c;
        }
      }
      break;

    case ARMCI_ACC_DCP:
      MPI_Type_size(MPI_DOUBLE, &type_size);
      type = MPI_DOUBLE;
      nelem= size/type_size;

      {
        double *src_dc = (double*) buf_in;
        double *scl_dc = (double*) buf_out;
        const double s_r = ((double*)scale)[0];
        const double s_c = ((double*)scale)[1];

        for (j = 0; j < nelem; j += 2) {
          // Complex multiplication: (a + bi)*(c + di)
          const double src_dc_j   = src_dc[j];
          const double src_dc_j_1 = src_dc[j+1];
          /*
          scl_dc[j]   = src_dc[j]*s_r   - src_dc[j+1]*s_c;
          scl_dc[j+1] = src_dc[j+1]*s_r + src_dc[j]*s_c;
          */
          scl_dc[j]   = src_dc_j*s_r   - src_dc_j_1*s_c;
          scl_dc[j+1] = src_dc_j_1*s_r + src_dc_j*s_c;
        }
      }
      break;

    default:
      ARMCII_Error("unknown data type (%d)", datatype);
  }

  ARMCII_Assert_msg(size % type_size == 0, "Transfer size is not a multiple of the datatype size");
}
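/* Illustrative usage sketch (not part of the original source): scaling one
 * double-complex value (1 + 2i) by (3 + 4i).  The expected result is
 * (3 - 8) + (4 + 6)i = -5 + 10i, matching the complex-multiplication rule
 * (a + bi)(c + di) = (ac - bd) + (ad + bc)i implemented above. */
void example_scale_double_complex(void) {
  double in[2]    = {1.0, 2.0};   /* 1 + 2i          */
  double scale[2] = {3.0, 4.0};   /* scale by 3 + 4i */
  double out[2];

  ARMCII_Buf_acc_scale(in, out, sizeof(in), ARMCI_ACC_DCP, scale);
  /* out[0] == -5.0, out[1] == 10.0 */
}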