/** Translate a strided operation into a more general IO Vector iterator. * * @param[in] src_ptr Source starting address of the data block to put. * @param[in] src_stride_arr Source array of stride distances in bytes. * @param[in] dst_ptr Destination starting address to put data. * @param[in] dst_stride_ar Destination array of stride distances in bytes. * @param[in] count Block size in each dimension. count[0] should be the * number of bytes of contiguous data in leading dimension. * @param[in] stride_levels The level of strides. * * @return ARMCI IOV iterator corresponding to the strided parameters. */ armcii_iov_iter_t *ARMCII_Strided_to_iov_iter( void *src_ptr, int src_stride_ar[/*stride_levels*/], void *dst_ptr, int dst_stride_ar[/*stride_levels*/], int count[/*stride_levels+1*/], int stride_levels) { int i; armcii_iov_iter_t *it = malloc(sizeof(armcii_iov_iter_t)); ARMCII_Assert(it != NULL); it->src = src_ptr; it->dst = dst_ptr; it->stride_levels = stride_levels; it->base_ptr = malloc(sizeof(int)*(4*stride_levels+1)); it->was_contiguous= 0; ARMCII_Assert( it->base_ptr != NULL ); it->src_stride_ar = &it->base_ptr[0*stride_levels]; it->dst_stride_ar = &it->base_ptr[1*stride_levels]; it->count = &it->base_ptr[2*stride_levels]; it->idx = &it->base_ptr[3*stride_levels+1]; for (i = 0; i < stride_levels; i++) { it->src_stride_ar[i] = src_stride_ar[i]; it->dst_stride_ar[i] = dst_stride_ar[i]; it->count[i] = count[i]; it->idx[i] = 0; } return it; }
/** Create a mutex group.  Collective.
  *
  * @param[in] count  Number of mutexes to create on the calling process
  * @param[in] pgroup ARMCI group on which the mutex group is collective
  * @return           Handle to the mutex group
  */
armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int count, ARMCI_Group *pgroup) {
  armcix_mutex_hdl_t hdl;
  int                i, ierr;

  hdl = malloc(sizeof(struct armcix_mutex_hdl_s));
  ARMCII_Assert(hdl != NULL);

  /* Private communicator keeps mutex traffic out of user communication */
  MPI_Comm_dup(pgroup->comm, &hdl->comm);

  hdl->base = NULL;
  if (count > 0) {
    MPI_Alloc_mem(count*sizeof(long), MPI_INFO_NULL, &hdl->base);
    ARMCII_Assert(hdl->base != NULL);
  }

  hdl->count = count;

  /* All mutexes start out unlocked (value 0) */
  for (i = 0; i < count; i++)
    hdl->base[i] = 0;

  ierr = MPI_Win_create(hdl->base, count*sizeof(long),
                        sizeof(long) /* displacement size */,
                        MPI_INFO_NULL, hdl->comm, &hdl->window);
  ARMCII_Assert(ierr == MPI_SUCCESS);

  return hdl;
}
/** One-sided accumulate operation with typed arguments.  Source buffer must be private.
  *
  * @param[in] mreg      Memory region
  * @param[in] src       Address of source data
  * @param[in] src_count Number of elements of the given type at the source
  * @param[in] src_type  MPI datatype of the source elements
  * @param[in] dst       Address of destination buffer (or MPI_BOTTOM for
  *                      displacement 0 into the window)
  * @param[in] dst_count Number of elements of the given type at the destination
  * @param[in] dst_type  MPI datatype of the destination elements
  * @param[in] proc      Absolute process id of target process
  * @return              0 on success, non-zero on failure
  */
int gmr_accumulate_typed(gmr_t *mreg, void *src, int src_count, MPI_Datatype src_type,
                         void *dst, int dst_count, MPI_Datatype dst_type, int proc) {
  int        grp_proc;
  gmr_size_t disp;
  MPI_Aint   lb, extent;

  /* Window operations address the target by its rank in the region's group */
  grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  ARMCII_Assert(grp_proc >= 0);

  // Calculate displacement from beginning of the window
  if (dst == MPI_BOTTOM)
    disp = 0;
  else
    disp = (gmr_size_t) ((uint8_t*)dst - (uint8_t*)mreg->slices[proc].base);

  // Perform checks
  MPI_Type_get_true_extent(dst_type, &lb, &extent);
  ARMCII_Assert(mreg->lock_state != GMR_LOCK_UNLOCKED);
  ARMCII_Assert_msg(disp >= 0 && disp < mreg->slices[proc].size, "Invalid remote address");
  /* Bounds check uses the true extent of one element times dst_count;
   * NOTE(review): assumes dst_count describes contiguous elements -- a
   * strided dst_type would occupy more than dst_count*extent. Verify callers. */
  ARMCII_Assert_msg(disp + dst_count*extent <= mreg->slices[proc].size, "Transfer is out of range");

  MPI_Accumulate(src, src_count, src_type, grp_proc, (MPI_Aint) disp,
                 dst_count, dst_type, MPI_SUM, mreg->window);

  return 0;
}
/** Convert an ARMCI strided access description into an MPI subarray datatype.
  *
  * @param[in]  stride_array  Array of strides (bytes, monotonically increasing)
  * @param[in]  count         Array of transfer counts; count[0] is in bytes,
  *                           higher entries are element counts per dimension
  * @param[in]  stride_levels Number of levels of striding
  * @param[in]  old_type      Type of the data element described by count and stride_array
  * @param[out] new_type      New MPI type for the given strided access
  *                           (caller must commit and later free it)
  */
void ARMCII_Strided_to_dtype(int stride_array[/*stride_levels*/], int count[/*stride_levels+1*/],
                             int stride_levels, MPI_Datatype old_type, MPI_Datatype *new_type)
{
  int sizes   [stride_levels+1];
  int subsizes[stride_levels+1];
  int starts  [stride_levels+1];
  int i, old_type_size;

  MPI_Type_size(old_type, &old_type_size);

  /* Eliminate counts that don't count (all 1 counts at the end) */
  for (i = stride_levels+1; i > 0 && stride_levels > 0 && count[i-1] == 1; i--)
    stride_levels--;

  /* A correct strided spec should be monotonic increasing and stride_array[i+1] should
     be a multiple of stride_array[i]. */
  if (stride_levels > 0) {
    for (i = 1; i < stride_levels; i++)
      ARMCII_Assert(stride_array[i] >= stride_array[i-1] && stride_array[i] % stride_array[i-1] == 0);
  }

  /* Test for a contiguous transfer */
  if (stride_levels == 0) {
    int elem_count = count[0]/old_type_size;

    ARMCII_Assert(count[0] % old_type_size == 0);
    MPI_Type_contiguous(elem_count, old_type, new_type);
  }

  /* Transfer is non-contiguous */
  else {
    /* Subarray dims are listed outermost-first (MPI_ORDER_C), so dimension
     * index stride_levels-i reverses the ARMCI innermost-first ordering. */
    for (i = 0; i < stride_levels+1; i++)
      starts[i] = 0;

    /* Innermost dimension: full row is stride_array[0] bytes, of which
     * count[0] bytes are transferred -- both converted to element counts. */
    sizes   [stride_levels] = stride_array[0]/old_type_size;
    subsizes[stride_levels] = count[0]/old_type_size;

    ARMCII_Assert(stride_array[0] % old_type_size == 0 && count[0] % old_type_size == 0);

    for (i = 1; i < stride_levels; i++) {
      /* Convert strides into dimensions by dividing out contributions from lower dims */
      sizes   [stride_levels-i] = stride_array[i]/stride_array[i-1];
      subsizes[stride_levels-i] = count[i];

      ARMCII_Assert_msg(stride_array[i] % stride_array[i-1] == 0, "Invalid striding");
    }

    /* Outermost dimension has no enclosing stride: size == subsize */
    sizes   [0] = count[stride_levels];
    subsizes[0] = count[stride_levels];

    MPI_Type_create_subarray(stride_levels+1, sizes, subsizes, starts, MPI_ORDER_C, old_type, new_type);
  }
}
/** Prepare a set of buffers for use with an accumulate operation. The * returned set of buffers is guaranteed to be in private space and scaled. * Copies will be made if needed, the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @param[in] datatype The type of the buffer. * @param[in] scale Scaling constant to apply to each buffer. * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_acc_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size, int datatype, void *scale) { void **new_bufs; int i, scaled, num_moved = 0; new_bufs = malloc(count*sizeof(void*)); ARMCII_Assert(new_bufs != NULL); scaled = ARMCII_Buf_acc_is_scaled(datatype, scale); for (i = 0; i < count; i++) { gmr_t *mreg = NULL; // Check if the source buffer is within a shared region. if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (scaled) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); // Lock if needed so we can directly access the buffer if (mreg != NULL) gmr_dla_lock(mreg); ARMCII_Buf_acc_scale(orig_bufs[i], new_bufs[i], size, datatype, scale); if (mreg != NULL) gmr_dla_unlock(mreg); } else { new_bufs[i] = orig_bufs[i]; } if (mreg != NULL) { // If the buffer wasn't copied, we should copy it into a private buffer if (new_bufs[i] == orig_bufs[i]) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); gmr_dla_unlock(mreg); } } if (new_bufs[i] == orig_bufs[i]) num_moved++; } *new_bufs_ptr = new_bufs; return num_moved; }
/** One-sided accumulate operation.
  *
  * @param[in] datatype ARMCI data type for the accumulate operation (see armci.h)
  * @param[in] scale    Pointer for a scalar of type datatype that will be used to
  *                     scale values in the source buffer
  * @param[in] src      Source address (local buffer on the calling process)
  * @param[in] dst      Destination address (remote; must lie within a registered
  *                     memory region on proc)
  * @param[in] bytes    Number of bytes to transfer
  * @param[in] proc     Process id to target
  * @return             0 on success, non-zero on failure
  */
int PARMCI_Acc(int datatype, void *scale, void *src, void *dst, int bytes, int proc) {
  void  *src_buf;
  int    count, type_size, scaled;
  MPI_Datatype type;
  gmr_t *src_mreg, *dst_mreg;

  /* If NOGUARD is set, assume the buffer is not shared */
  if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD)
    src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  else
    src_mreg = NULL;

  dst_mreg = gmr_lookup(dst, proc);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Prepare the input data: Apply scaling if needed and acquire the DLA lock if
   * needed.  We hold the DLA lock if (src_buf == src && src_mreg != NULL). */

  scaled = ARMCII_Buf_acc_is_scaled(datatype, scale);

  if (scaled) {
      /* Scaling writes into a fresh private buffer */
      MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
      ARMCII_Assert(src_buf != NULL);
      ARMCII_Buf_acc_scale(src, src_buf, bytes, datatype, scale);
  } else {
    src_buf = src;
  }

  /* Check if we need to copy: user requested it or same mem region */
  if (   (src_buf == src) /* buf_prepare didn't make a copy */
      && (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_COPY || src_mreg == dst_mreg) )
  {
    MPI_Alloc_mem(bytes, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);
    ARMCI_Copy(src, src_buf, bytes);
  }

  ARMCII_Acc_type_translate(datatype, &type, &type_size);
  count = bytes/type_size;

  ARMCII_Assert_msg(bytes % type_size == 0, "Transfer size is not a multiple of the datatype size");

  /* TODO: Support a local accumulate operation more efficiently */

  gmr_accumulate(dst_mreg, src_buf, dst, count, type, proc);
  gmr_flush(dst_mreg, proc, 1); /* flush_local */

  /* Free the staging buffer if either branch above allocated one */
  if (src_buf != src)
    MPI_Free_mem(src_buf);

  return 0;
}
/** Lock a mutex.
  *
  * Algorithm (as implemented below): each process owns one byte in the
  * mutex's window.  In a single exclusive epoch we set our byte to 1 and
  * fetch everyone else's bytes; if any other byte is set, another process
  * holds or is waiting for the mutex and we block on a zero-byte message
  * that the holder sends on release (see the matching unlock routine).
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int       rank, nproc, already_locked, i, proc;
  uint8_t  *buf;

  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);

  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
  proc = ARMCII_Translate_absolute_to_group(&hdl->grp, world_proc);
  ARMCII_Assert(proc >= 0);

  buf = malloc(nproc*sizeof(uint8_t));
  ARMCII_Assert(buf != NULL);

  buf[rank] = 1;

  /* Get all data from the lock_buf, except the byte belonging to
   * me.  Set the byte belonging to me to 1. */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);

  MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);

  /* Get data to the left of rank */
  if (rank > 0) {
    MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
  }

  /* Get data to the right of rank */
  if (rank < nproc - 1) {
    MPI_Get(&buf[rank+1], nproc-1-rank, MPI_BYTE, proc, rank + 1, nproc-1-rank, MPI_BYTE, hdl->windows[mutex]);
  }

  MPI_Win_unlock(proc, hdl->windows[mutex]);

  ARMCII_Assert(buf[rank] == 1);

  /* Any other nonzero byte means someone holds or awaits the mutex */
  for (i = already_locked = 0; i < nproc; i++)
    if (buf[i] && i != rank)
      already_locked = 1;

  /* Wait for notification */
  if (already_locked) {
    MPI_Status status;
    ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "waiting for notification [proc = %d, mutex = %d]\n", proc, mutex);
    /* The releasing process picks exactly one waiter to notify, so
     * MPI_ANY_SOURCE cannot deliver a stale wakeup for this tag */
    MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, ARMCI_MUTEX_TAG+mutex, hdl->grp.comm, &status);
  }

  ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "lock acquired [proc = %d, mutex = %d]\n", proc, mutex);

  free(buf);
}
/** Lock a mutex.
  *
  * Algorithm (as implemented below): the mutex is a long counter.  We add
  * our id (rank+1) to it; if the readback equals exactly rank+1 we are the
  * sole requester and hold the mutex.  Otherwise we subtract our id, back
  * off, and retry.
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Lock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int       rank, nproc, proc;
  long      lock_val, unlock_val, lock_out;
  int       timeout = 1;

  MPI_Comm_rank(hdl->comm, &rank);
  MPI_Comm_size(hdl->comm, &nproc);

  /* User gives us the absolute ID.  Translate to the rank in the mutex's group. */
  proc = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
  ARMCII_Assert(proc >= 0);

  lock_val   = rank+1; // Map into range 1..nproc so rank 0 is distinguishable from "free"
  unlock_val = -1 * (rank+1);

  /* mutex <- mutex + rank */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
  MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(proc, hdl->window);

  for (;;) {
    /* read mutex value */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
    MPI_Get(&lock_out, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, hdl->window);
    MPI_Win_unlock(proc, hdl->window);

    ARMCII_Assert(lock_out > 0);
    ARMCII_Assert(lock_out <= nproc*(nproc+1)/2); // At most the sum 1+2+...+nproc (all requesters at once)

    /* We are holding the mutex */
    if (lock_out == rank+1)
      break;

    /* Contended: withdraw our request before backing off.
     * mutex <- mutex - rank */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
    MPI_Accumulate(&unlock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
    MPI_Win_unlock(proc, hdl->window);

    /* Exponential backoff */
    usleep(timeout + rand()%timeout);
    timeout = MIN(timeout*TIMEOUT_MUL, MAX_TIMEOUT);
    if (rand() % nproc == 0) // Chance to reset timeout
      timeout = 1;

    /* Re-announce our request.
     * mutex <- mutex + rank */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->window);
    MPI_Accumulate(&lock_val, 1, MPI_LONG, proc, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
    MPI_Win_unlock(proc, hdl->window);
  }
}
/** Unlock a mutex.
  *
  * Clears our byte in the mutex's window, reads everyone else's bytes in the
  * same exclusive epoch, and notifies exactly one waiting process (scanning
  * to our right for fairness) with a zero-byte message.
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int       rank, nproc, i, proc;
  uint8_t  *buf;

  ARMCII_Assert(mutex >= 0 && mutex < hdl->max_count);

  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  proc = ARMCII_Translate_absolute_to_group(&hdl->grp, world_proc);
  ARMCII_Assert(proc >= 0);

  buf = malloc(nproc*sizeof(uint8_t));
  /* FIX: the allocation was previously used unchecked; the matching lock
   * routine asserts the same allocation. */
  ARMCII_Assert(buf != NULL);

  buf[rank] = 0;

  /* Get all data from the lock_buf, except the byte belonging to
   * me.  Set the byte belonging to me to 0. */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, proc, 0, hdl->windows[mutex]);

  MPI_Put(&buf[rank], 1, MPI_BYTE, proc, rank, 1, MPI_BYTE, hdl->windows[mutex]);

  /* Get data to the left of rank */
  if (rank > 0) {
    MPI_Get(buf, rank, MPI_BYTE, proc, 0, rank, MPI_BYTE, hdl->windows[mutex]);
  }

  /* Get data to the right of rank */
  if (rank < nproc - 1) {
    MPI_Get(&buf[rank+1], nproc-1-rank, MPI_BYTE, proc, rank + 1, nproc-1-rank, MPI_BYTE, hdl->windows[mutex]);
  }

  MPI_Win_unlock(proc, hdl->windows[mutex]);

  ARMCII_Assert(buf[rank] == 0);

  /* Notify the next waiting process, starting to my right for fairness */
  for (i = 1; i < nproc; i++) {
    int p = (rank + i) % nproc;

    if (buf[p] == 1) {
      ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "notifying %d [proc = %d, mutex = %d]\n", p, proc, mutex);
      MPI_Send(NULL, 0, MPI_BYTE, p, ARMCI_MUTEX_TAG+mutex, hdl->grp.comm);
      break;
    }
  }

  ARMCII_Dbg_print(DEBUG_CAT_MUTEX, "lock released [proc = %d, mutex = %d]\n", proc, mutex);

  free(buf);
}
/** Unlock a memory region that was locked for direct local access.
  *
  * @param[in] mreg Memory region
  */
void gmr_dla_unlock(gmr_t *mreg) {
  int me_grp = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert(me_grp >= 0);
  ARMCII_Assert(mreg->lock_state == GMR_LOCK_DLA);
  ARMCII_Assert_msg((mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE) == 0,
                    "Direct local access is not allowed in the current access mode");

  /* DLA locks nest: the window is released only when the last holder exits */
  if (--mreg->dla_lock_count == 0) {
    MPI_Win_unlock(me_grp, mreg->window);
    mreg->lock_state = GMR_LOCK_UNLOCKED;
  }
}
/** Prepare a set of buffers for use with a get operation. The returned set of * buffers is guaranteed to be in private space. Copies will be made if needed, * the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_write_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) { int num_moved = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { void **new_bufs = malloc(count*sizeof(void*)); int i; for (i = 0; i < count; i++) new_bufs[i] = NULL; for (i = 0; i < count; i++) { // Check if the destination buffer is within a shared region. If not, create // a temporary private buffer to hold the result. gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (mreg != NULL) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); num_moved++; } else { new_bufs[i] = orig_bufs[i]; } } *new_bufs_ptr = new_bufs; } else { *new_bufs_ptr = orig_bufs; } return num_moved; }
/** Translate a group process rank to the corresponding process rank in the
  * ARMCI world group.
  *
  * @param[in] group      Group to translate from.
  * @param[in] group_rank Rank of the process in group.
  * @return               World rank, or -1 if the translation is undefined.
  */
int ARMCI_Absolute_id(ARMCI_Group *group, int group_rank) {
  int world_rank;

  ARMCII_Assert(group_rank >= 0 && group_rank < group->size);

  if (group->comm == ARMCI_GROUP_WORLD.comm) {
    /* World group: ranks are already absolute */
    world_rank = group_rank;
  }
  else if (group->grp_to_abs != NULL) {
    /* Fast path: cached translation table */
    world_rank = group->grp_to_abs[group_rank];
  }
  else {
    /* Slow path: ask MPI to translate the single rank */
    MPI_Group world_group, sub_group;

    MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
    MPI_Comm_group(group->comm, &sub_group);

    MPI_Group_translate_ranks(sub_group, 1, &group_rank, world_group, &world_rank);

    MPI_Group_free(&world_group);
    MPI_Group_free(&sub_group);
  }

  /* MPI reports an impossible translation as MPI_UNDEFINED */
  return (world_rank == MPI_UNDEFINED) ? -1 : world_rank;
}
/** Broadcast on a group.  Collective.
  *
  * @param[in]    scope    ARMCI scope (SCOPE_ALL, SCOPE_MASTERS, or SCOPE_NODE)
  * @param[inout] buf_in   Input on the root, output on all other processes
  * @param[in]    len      Number of bytes in the message
  * @param[in]    abs_root Absolute rank of the process at the root of the broadcast
  * @param[in]    group    ARMCI group on which to perform communication
  */
void armci_msg_group_bcast_scope(int scope, void *buf_in, int len, int abs_root, ARMCI_Group *group) {
  int    grp_root;
  void **buf;

  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS) {
    /* Is the buffer an input or an output?  The root reads from it, every
     * other process writes into it, so the prepare/finish pairing differs. */
    if (ARMCI_GROUP_WORLD.rank == abs_root)
      ARMCII_Buf_prepare_read_vec(&buf_in, &buf, 1, len);
    else
      ARMCII_Buf_prepare_write_vec(&buf_in, &buf, 1, len);

    grp_root = ARMCII_Translate_absolute_to_group(group, abs_root);
    ARMCII_Assert(grp_root >= 0 && grp_root < group->size);

    MPI_Bcast(buf[0], len, MPI_BYTE, grp_root, group->comm);

    if (ARMCI_GROUP_WORLD.rank == abs_root)
      ARMCII_Buf_finish_read_vec(&buf_in, buf, 1, len);
    else
      ARMCII_Buf_finish_write_vec(&buf_in, buf, 1, len);
  } else /* SCOPE_NODE */ {
    grp_root = 0;
    /* This is a self-broadcast, which is a no-op. */
  }
}
/** Allocate a shared memory segment.  Collective.
  *
  * @param[out] base_ptrs Array that will contain pointers to the base address of
  *                       each process' patch of the segment.  Array is of length
  *                       equal to the number of processes in the group.
  * @param[in]  size      Number of bytes to allocate on the local process.
  * @param[in]  group     Group on which the allocation is collective.
  * @return               Zero (the current implementation does not report failure).
  */
int ARMCI_Malloc_group(void **base_ptrs, armci_size_t size, ARMCI_Group *group) {
  gmr_t *mreg;

  ARMCII_Assert(PARMCI_Initialized());

  mreg = gmr_create(size, base_ptrs, group);

  /* Debug-only: render the per-process base pointers into one string */
  if (DEBUG_CAT_ENABLED(DEBUG_CAT_ALLOC)) {
#define BUF_LEN 1000
    char ptr_string[BUF_LEN];

    if (mreg == NULL) {
      strncpy(ptr_string, "NULL", 5);
    }
    else {
      int i, count = 0;

      for (i = 0; i < mreg->nslices && count < BUF_LEN; i++)
        count += snprintf(ptr_string+count, BUF_LEN-count,
                          (i == mreg->nslices-1) ? "%p" : "%p ", base_ptrs[i]);
    }

    ARMCII_Dbg_print(DEBUG_CAT_ALLOC, "base ptrs [%s]\n", ptr_string);
#undef BUF_LEN
  }

  return 0;
}
/** Create a group of ARMCI mutexes.  Collective on the ARMCI group.
  *
  * @param[in] my_count Number of mutexes on the local process.
  * @param[in] pgroup   ARMCI group on which to create mutexes
  * @return             Handle to the mutex group.
  */
armcix_mutex_hdl_t ARMCIX_Create_mutexes_hdl(int my_count, ARMCI_Group *pgroup) {
  int rank, nproc, max_count, i;
  armcix_mutex_hdl_t hdl;

  hdl = malloc(sizeof(struct armcix_mutex_hdl_s));
  ARMCII_Assert(hdl != NULL);

  ARMCIX_Group_dup(pgroup, &hdl->grp);
  MPI_Comm_rank(hdl->grp.comm, &rank);
  MPI_Comm_size(hdl->grp.comm, &nproc);

  hdl->my_count = my_count;

  /* Find the max. count to determine how many windows we need. */
  MPI_Allreduce(&my_count, &max_count, 1, MPI_INT, MPI_MAX, hdl->grp.comm);
  ARMCII_Assert_msg(max_count > 0, "Invalid number of mutexes");

  hdl->max_count = max_count;
  hdl->windows = malloc(sizeof(MPI_Win)*max_count);
  /* FIX: this allocation was previously used unchecked */
  ARMCII_Assert(hdl->windows != NULL);

  if (my_count > 0) {
    hdl->bases = malloc(sizeof(uint8_t*)*my_count);
    /* FIX: this allocation was previously used unchecked */
    ARMCII_Assert(hdl->bases != NULL);
  } else {
    hdl->bases = NULL;
  }

  /* We need multiple windows here: one for each mutex.  Otherwise
     performance will suffer due to exclusive access epochs. */
  for (i = 0; i < max_count; i++) {
    int   size = 0;
    void *base = NULL;

    /* Processes with fewer than max_count mutexes still participate in
       every window creation, contributing an empty window. */
    if (i < my_count) {
      MPI_Alloc_mem(nproc, MPI_INFO_NULL, &hdl->bases[i]);
      ARMCII_Assert(hdl->bases[i] != NULL);
      ARMCII_Bzero(hdl->bases[i], nproc);

      base = hdl->bases[i];
      size = nproc;
    }

    MPI_Win_create(base, size, sizeof(uint8_t), MPI_INFO_NULL, hdl->grp.comm, &hdl->windows[i]);
  }

  return hdl;
}
/** Lock a memory region so that one-sided operations can be performed.
  * The lock mode (shared vs. exclusive) and assertion are derived from the
  * region's current access mode; any active direct-local-access lock is
  * suspended first.
  *
  * @param[in] mreg Memory region
  * @param[in] proc Absolute process id of the target
  */
void gmr_lock(gmr_t *mreg, int proc) {
  int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  int grp_me   = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);
  int lock_assert, lock_mode;

  ARMCII_Assert(grp_proc >= 0 && grp_me >= 0);
  ARMCII_Assert(mreg->lock_state == GMR_LOCK_UNLOCKED || mreg->lock_state == GMR_LOCK_DLA);

  /* Check for active DLA and suspend if needed.  The DLA epoch targets
   * ourselves; it is released here and resumed by gmr_unlock. */
  if (mreg->lock_state == GMR_LOCK_DLA) {
    ARMCII_Assert(grp_me == mreg->lock_target);
    MPI_Win_unlock(mreg->lock_target, mreg->window);
    mreg->lock_state = GMR_LOCK_DLA_SUSP;
  }

  if (    mreg->access_mode & ARMCIX_MODE_CONFLICT_FREE
       && mreg->access_mode & ARMCIX_MODE_NO_LOAD_STORE ) {
    /* Only non-conflicting RMA accesses allowed.  Shared and exclusive locks. */
    lock_assert = MPI_MODE_NOCHECK;
    lock_mode   = MPI_LOCK_SHARED;
  }
  else if (mreg->access_mode & ARMCIX_MODE_CONFLICT_FREE) {
    /* Non-conflicting RMA and local accesses allowed.  Shared and exclusive locks. */
    lock_assert = 0;
    lock_mode   = MPI_LOCK_SHARED;
  }
  else {
    /* Conflicting RMA and local accesses allowed.  Exclusive locks. */
    lock_assert = 0;
    lock_mode   = MPI_LOCK_EXCLUSIVE;
  }

  MPI_Win_lock(lock_mode, grp_proc, lock_assert, mreg->window);

  if (lock_mode == MPI_LOCK_EXCLUSIVE)
    mreg->lock_state = GMR_LOCK_EXCLUSIVE;
  else
    mreg->lock_state = GMR_LOCK_SHARED;

  mreg->lock_target = grp_proc;
}
/** Initialize an ARMCI group's remaining fields using the communicator field.
  * Fills in size/rank, the optional noncollective-group communicator, and the
  * optional rank-translation caches.
  *
  * @param[inout] group Group whose comm field is already set.
  */
void ARMCII_Group_init_from_comm(ARMCI_Group *group) {
  if (group->comm != MPI_COMM_NULL) {
    MPI_Comm_size(group->comm, &group->size);
    MPI_Comm_rank(group->comm, &group->rank);
  }
  else {
    /* Process is not a member of this group */
    group->rank = -1;
    group->size = 0;
  }

  /* If noncollective groups are in use, create a separate communicator that
     can be used for noncollective group creation with this group as the
     parent.  This ensures that calls to MPI_Intercomm_create can't clash with
     any user communication. */
  if (ARMCII_GLOBAL_STATE.noncollective_groups && group->comm != MPI_COMM_NULL)
    MPI_Comm_dup(group->comm, &group->noncoll_pgroup_comm);
  else
    group->noncoll_pgroup_comm = MPI_COMM_NULL;

  /* Check if translation caching is enabled */
  if (ARMCII_GLOBAL_STATE.cache_rank_translation) {
    /* NOTE(review): when caching is enabled but comm == MPI_COMM_NULL,
     * abs_to_grp/grp_to_abs are left uninitialized rather than set to NULL
     * as in the else-branch below -- verify no caller reads them in that case. */
    if (group->comm != MPI_COMM_NULL) {
      int      *ranks, i;
      MPI_Group world_group, sub_group;

      group->abs_to_grp = malloc(sizeof(int)*ARMCI_GROUP_WORLD.size);
      group->grp_to_abs = malloc(sizeof(int)*group->size);
      ranks = malloc(sizeof(int)*ARMCI_GROUP_WORLD.size);

      ARMCII_Assert(group->abs_to_grp != NULL && group->grp_to_abs != NULL && ranks != NULL);

      for (i = 0; i < ARMCI_GROUP_WORLD.size; i++)
        ranks[i] = i;

      MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
      MPI_Comm_group(group->comm, &sub_group);

      /* Build both direction tables in one shot using the identity rank list */
      MPI_Group_translate_ranks(sub_group, group->size, ranks, world_group, group->grp_to_abs);
      MPI_Group_translate_ranks(world_group, ARMCI_GROUP_WORLD.size, ranks, sub_group, group->abs_to_grp);

      MPI_Group_free(&world_group);
      MPI_Group_free(&sub_group);

      free(ranks);
    }
  }
  /* Translation caching is disabled */
  else {
    group->abs_to_grp = NULL;
    group->grp_to_abs = NULL;
  }
}
/** Unlock a mutex.
  *
  * Withdraws this process' contribution (rank+1) from the mutex counter,
  * releasing it for the next requester.
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  */
void ARMCIX_Unlock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int  me, npes, tgt;
  long withdraw;

  ARMCII_Assert(mutex >= 0);

  MPI_Comm_rank(hdl->comm, &me);
  MPI_Comm_size(hdl->comm, &npes);

  /* Caller passes an absolute rank; convert it to a rank in the mutex group */
  tgt = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
  ARMCII_Assert(tgt >= 0);

  withdraw = -1 * (me+1);

  /* mutex <- mutex - rank */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tgt, 0, hdl->window);
  MPI_Accumulate(&withdraw, 1, MPI_LONG, tgt, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(tgt, hdl->window);
}
/** Attempt to lock a mutex (non-blocking).
  *
  * Adds our id (rank+1) to the mutex counter and reads it back; if the value
  * is exactly our id, nobody else contended and we hold the mutex.  Otherwise
  * we withdraw the request and report failure.
  *
  * @param[in] hdl        Mutex group that the mutex belongs to.
  * @param[in] mutex      Desired mutex number [0..count-1]
  * @param[in] world_proc Absolute ID of process where the mutex lives
  * @return               0 on success, non-zero on failure
  */
int ARMCIX_Trylock_hdl(armcix_mutex_hdl_t hdl, int mutex, int world_proc) {
  int  me, npes, tgt;
  long request, withdraw, observed;

  ARMCII_Assert(mutex >= 0);

  MPI_Comm_rank(hdl->comm, &me);
  MPI_Comm_size(hdl->comm, &npes);

  /* Caller passes an absolute rank; convert it to a rank in the mutex group */
  tgt = ARMCII_Translate_absolute_to_group(hdl->comm, world_proc);
  ARMCII_Assert(tgt >= 0);

  request  = me+1;            /* ranks are mapped into 1..npes */
  withdraw = -1 * (me+1);

  /* Announce our request: mutex <- mutex + (me+1) */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tgt, 0, hdl->window);
  MPI_Accumulate(&request, 1, MPI_LONG, tgt, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(tgt, hdl->window);

  /* Read the mutex value back in a separate epoch */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tgt, 0, hdl->window);
  MPI_Get(&observed, 1, MPI_LONG, tgt, mutex, 1, MPI_LONG, hdl->window);
  MPI_Win_unlock(tgt, hdl->window);

  ARMCII_Assert(observed > 0);
  ARMCII_Assert(observed <= npes*(npes+1)/2); /* bounded by the sum 1+2+...+npes */

  if (observed == me+1)
    return 0;   /* no contention: we hold the mutex */

  /* Contended: withdraw our request, mutex <- mutex - (me+1) */
  MPI_Win_lock(MPI_LOCK_EXCLUSIVE, tgt, 0, hdl->window);
  MPI_Accumulate(&withdraw, 1, MPI_LONG, tgt, mutex, 1, MPI_LONG, MPI_SUM, hdl->window);
  MPI_Win_unlock(tgt, hdl->window);

  return 1;
}
/** One-sided put operation.
  *
  * @param[in] src    Source address (local buffer on the calling process)
  * @param[in] dst    Destination address (remote; must lie within a registered
  *                   memory region on target)
  * @param[in] size   Number of bytes to transfer
  * @param[in] target Process id to target
  * @return           0 on success, non-zero on failure
  */
int ARMCI_Put(void *src, void *dst, int size, int target) {
  gmr_t *src_mreg, *dst_mreg;

  src_mreg = gmr_lookup(src, ARMCI_GROUP_WORLD.rank);
  dst_mreg = gmr_lookup(dst, target);

  ARMCII_Assert_msg(dst_mreg != NULL, "Invalid remote pointer");

  /* Local operation */
  if (target == ARMCI_GROUP_WORLD.rank) {
    /* Guard shared buffers with DLA locks unless NOGUARD disables guarding */
    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
      gmr_dla_lock(dst_mreg);
      if (src_mreg) gmr_dla_lock(src_mreg);
    }

    ARMCI_Copy(src, dst, size);

    if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) {
      gmr_dla_unlock(dst_mreg);
      if (src_mreg) gmr_dla_unlock(src_mreg);
    }
  }

  /* Origin buffer is private */
  else if (src_mreg == NULL || ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD) {
    gmr_lock(dst_mreg, target);
    gmr_put(dst_mreg, src, dst, size, target);
    gmr_unlock(dst_mreg, target);
  }

  /* COPY: Either origin and target buffers are in the same window and we can't
   * lock the same window twice (MPI semantics) or the user has requested
   * always-copy mode. */
  else {
    void *src_buf;

    MPI_Alloc_mem(size, MPI_INFO_NULL, &src_buf);
    ARMCII_Assert(src_buf != NULL);

    /* Stage the source through a private buffer under the DLA lock */
    gmr_dla_lock(src_mreg);
    ARMCI_Copy(src, src_buf, size);
    gmr_dla_unlock(src_mreg);

    gmr_lock(dst_mreg, target);
    gmr_put(dst_mreg, src_buf, dst, size, target);
    gmr_unlock(dst_mreg, target);

    MPI_Free_mem(src_buf);
  }

  return 0;
}
/** Unlock a memory region.  If a direct-local-access (DLA) lock was suspended
  * by gmr_lock, it is re-established here before returning.
  *
  * @param[in] mreg Memory region
  * @param[in] proc Absolute process id of the target
  */
void gmr_unlock(gmr_t *mreg, int proc) {
  int grp_proc = ARMCII_Translate_absolute_to_group(&mreg->group, proc);
  int grp_me   = ARMCII_Translate_absolute_to_group(&mreg->group, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert(grp_proc >= 0 && grp_me >= 0);
  ARMCII_Assert(mreg->lock_state == GMR_LOCK_EXCLUSIVE || mreg->lock_state == GMR_LOCK_SHARED);
  ARMCII_Assert(mreg->lock_target == grp_proc);

  /* Check if DLA is suspended and needs to be resumed */
  if (mreg->dla_lock_count > 0) {
    /* An exclusive lock on ourselves can be reused as the DLA epoch;
     * anything else must be released and replaced by a fresh exclusive
     * self-lock. */
    if (mreg->lock_state != GMR_LOCK_EXCLUSIVE || mreg->lock_target != grp_me) {
      MPI_Win_unlock(grp_proc, mreg->window);
      MPI_Win_lock(MPI_LOCK_EXCLUSIVE, grp_me, 0, mreg->window); // FIXME: NOCHECK here?
    }

    mreg->lock_state = GMR_LOCK_DLA;
    mreg->lock_target= grp_me;
  }
  else {
    MPI_Win_unlock(grp_proc, mreg->window);
    mreg->lock_state = GMR_LOCK_UNLOCKED;
  }
}
/** Collective index selection reduce operation (scoped).
  *
  * @param[in]    scope      ARMCI scope (SCOPE_ALL/SCOPE_MASTERS or node-local)
  * @param[inout] x          Data to reduce; receives the winning value
  * @param[in]    n          Number of bytes in x
  * @param[in]    op         "min" or "max"
  * @param[in]    type       ARMCI datatype tag carried alongside the data
  * @param[in]    contribute Nonzero if this process contributes a value
  */
void armci_msg_sel_scope(int scope, void *x, int n, char* op, int type, int contribute) {
  MPI_Comm    sel_comm;
  sel_data_t *data_in, *data_out;
  void      **x_buf;

  /* printf("[%d] armci_msg_sel_scope(scope=%d, x=%p, n=%d, op=%s, type=%d, contribute=%d)\n", ARMCI_GROUP_WORLD.rank, scope, x, n, op, type, contribute); */

  /* Determine the scope of the collective operation */
  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    sel_comm = ARMCI_GROUP_WORLD.comm;
  else
    sel_comm = MPI_COMM_SELF;

  /* sel_data_t carries an inline payload; +n-1 assumes a 1-byte trailing
   * data member declared in the struct -- the n bytes overlay data[] */
  data_in  = malloc(sizeof(sel_data_t)+n-1);
  data_out = malloc(sizeof(sel_data_t)+n-1);

  ARMCII_Assert(data_in != NULL && data_out != NULL);

  /* NOTE(review): x is prepared as a read buffer but finished as a write
   * buffer below, since it is both the contribution and the result --
   * verify this pairing against the Buf_prepare/finish contracts. */
  ARMCII_Buf_prepare_read_vec(&x, &x_buf, 1, n);

  data_in->contribute = contribute;
  data_in->type       = type;

  if (contribute)
    ARMCI_Copy(x, data_in->data, n);

  /* The custom reduction ops compare (contribute, type, data) tuples */
  if (strncmp(op, "min", 3) == 0) {
    MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, ARMCI_MPI_SELMIN_OP, sel_comm);
  } else if (strncmp(op, "max", 3) == 0) {
    MPI_Allreduce(data_in, data_out, sizeof(sel_data_t)+n-1, MPI_BYTE, ARMCI_MPI_SELMAX_OP, sel_comm);
  } else {
    ARMCII_Error("Invalid operation (%s)", op);
  }

  ARMCI_Copy(data_out->data, x, n);

  ARMCII_Buf_finish_write_vec(&x, x_buf, 1, n);

  free(data_in);
  free(data_out);
}
/** One-sided get operation. * * @param[in] src Source address (remote) * @param[in] dst Destination address (local) * @param[in] size Number of bytes to transfer * @param[in] target Process id to target * @return 0 on success, non-zero on failure */ int PARMCI_Get(void *src, void *dst, int size, int target) { gmr_t *src_mreg, *dst_mreg; src_mreg = gmr_lookup(src, target); /* If NOGUARD is set, assume the buffer is not shared */ if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) dst_mreg = gmr_lookup(dst, ARMCI_GROUP_WORLD.rank); else dst_mreg = NULL; ARMCII_Assert_msg(src_mreg != NULL, "Invalid remote pointer"); /* Local operation */ if (target == ARMCI_GROUP_WORLD.rank && dst_mreg == NULL) { ARMCI_Copy(src, dst, size); } /* Origin buffer is private */ else if (dst_mreg == NULL) { gmr_get(src_mreg, src, dst, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ } /* COPY: Either origin and target buffers are in the same window and we can't * lock the same window twice (MPI semantics) or the user has requested * always-copy mode. */ else { void *dst_buf; MPI_Alloc_mem(size, MPI_INFO_NULL, &dst_buf); ARMCII_Assert(dst_buf != NULL); gmr_get(src_mreg, src, dst_buf, size, target); gmr_flush(src_mreg, target, 0); /* it's a round trip so w.r.t. flush, local=remote */ ARMCI_Copy(dst_buf, dst, size); MPI_Free_mem(dst_buf); } return 0; }
/** Lookup a shared memory region using an address and process id. * * @param[in] ptr Pointer within range of the segment (e.g. base pointer). * @param[in] proc Process on which the data lives. * @return Pointer to the mem region object. */ gmr_t *gmr_lookup(void *ptr, int proc) { gmr_t *mreg; mreg = gmr_list; while (mreg != NULL) { ARMCII_Assert(proc < mreg->nslices); if (proc < mreg->nslices) { const uint8_t *base = mreg->slices[proc].base; const gmr_size_t size = mreg->slices[proc].size; if ((uint8_t*) ptr >= base && (uint8_t*) ptr < base + size) break; } mreg = mreg->next; } return mreg; }
/** Set the access mode for the given allocation.  Collective across the
 * allocation's group.  Waits for all processes to finish outstanding
 * communication, then installs the new access mode.
 *
 * @param[in] new_mode The new access mode.
 * @param[in] ptr      Pointer within the allocation.
 * @param[in] group    Group associated with the allocation.
 * @return             Zero upon success, error code otherwise.
 */
int ARMCIX_Mode_set(int new_mode, void *ptr, ARMCI_Group *group) {
  gmr_t *region = gmr_lookup(ptr, ARMCI_GROUP_WORLD.rank);

  ARMCII_Assert_msg(region != NULL, "Invalid remote pointer");
  ARMCII_Assert(group->comm == region->group.comm);

  /* The window must be completely unlocked before the mode can change */
  ARMCII_Assert_msg(region->lock_state != GMR_LOCK_DLA,
      "Cannot change the access mode; window is locked for local access.");
  ARMCII_Assert_msg(region->lock_state == GMR_LOCK_UNLOCKED,
      "Cannot change the access mode on a window that is locked.");

  /* Wait for all processes to complete any outstanding communication before
   * we do the mode switch */
  MPI_Barrier(region->group.comm);

  region->access_mode = new_mode;

  return 0;
}
/** Finish a set of prepared buffers.  Will perform communication and copies as
 * needed to ensure results are in the original buffers.  Temporary space will
 * be freed.
 *
 * @param[in]  orig_bufs Original set of buffers.
 * @param[out] new_bufs  Set of private buffers.
 * @param[in]  count     Number of entries in the buffer list.
 * @param[in]  size      The size of the buffers (all are of the same size).
 */
void ARMCII_Buf_finish_write_vec(void **orig_bufs, void **new_bufs, int count, int size) {
  int i;

  /* In NOGUARD mode the originals were used in place; nothing to do */
  if (ARMCII_GLOBAL_STATE.shr_buf_method == ARMCII_SHR_BUF_NOGUARD)
    return;

  for (i = 0; i < count; i++) {
    gmr_t *mreg;

    /* Buffer was never relocated to private space */
    if (orig_bufs[i] == new_bufs[i])
      continue;

    mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank);
    ARMCII_Assert(mreg != NULL);

    /* Copy the result back into the shared buffer under the DLA lock */
    gmr_dla_lock(mreg);
    ARMCI_Copy(new_bufs[i], orig_bufs[i], size);
    gmr_dla_unlock(mreg);

    MPI_Free_mem(new_bufs[i]);
  }

  free(new_bufs);
}
/** Prepare a set of buffers for use with a put operation. The returned set of * buffers is guaranteed to be in private space. Copies will be made if needed, * the result should be completed by finish. * * @param[in] orig_bufs Original set of buffers. * @param[out] new_bufs Pointer to the set of private buffers. * @param[in] count Number of entries in the buffer list. * @param[in] size The size of the buffers (all are of the same size). * @return Number of buffers that were moved. */ int ARMCII_Buf_prepare_read_vec(void **orig_bufs, void ***new_bufs_ptr, int count, int size) { int num_moved = 0; if (ARMCII_GLOBAL_STATE.shr_buf_method != ARMCII_SHR_BUF_NOGUARD) { void **new_bufs = malloc(count*sizeof(void*)); int i; for (i = 0; i < count; i++) new_bufs[i] = NULL; for (i = 0; i < count; i++) { // Check if the source buffer is within a shared region. If so, copy it // into a private buffer. gmr_t *mreg = gmr_lookup(orig_bufs[i], ARMCI_GROUP_WORLD.rank); if (mreg != NULL) { MPI_Alloc_mem(size, MPI_INFO_NULL, &new_bufs[i]); ARMCII_Assert(new_bufs[i] != NULL); gmr_dla_lock(mreg); ARMCI_Copy(orig_bufs[i], new_bufs[i], size); // gmr_get(mreg, orig_bufs[i], new_bufs[i], size, ARMCI_GROUP_WORLD.rank); gmr_dla_unlock(mreg); num_moved++; } else { new_bufs[i] = orig_bufs[i]; } } *new_bufs_ptr = new_bufs; } else { *new_bufs_ptr = orig_bufs; } return num_moved; }
/** General ARMCI global operation (reduction).  Collective on group.
 *
 * @param[in]    scope Scope in which to perform the GOP: SCOPE_ALL and
 *                     SCOPE_MASTERS reduce over the group's communicator;
 *                     any other scope reduces over MPI_COMM_SELF only.
 * @param[inout] x     Vector of n elements, contains input and will contain output.
 * @param[in]    n     Length of x
 * @param[in]    op    One of '+', '*', 'max', 'min', 'or', 'absmax', 'absmin'
 * @param[in]    type  ARMCI data type of x (ARMCI_INT, ARMCI_LONG, ...)
 * @param[in]    group Group on which to perform the GOP
 */
void armci_msg_group_gop_scope(int scope, void *x, int n, char *op, int type, ARMCI_Group *group) {
  void        *out;
  MPI_Op       mpi_op;
  MPI_Datatype mpi_type;
  MPI_Comm     comm;
  int          mpi_type_size;

  /* Determine the scope of the collective operation */
  if (scope == SCOPE_ALL || scope == SCOPE_MASTERS)
    comm = group->comm;
  else
    comm = MPI_COMM_SELF;

  /* Map the ARMCI operation string onto an MPI reduction op */
  if (op[0] == '+') {
    mpi_op = MPI_SUM;
  } else if (op[0] == '*') {
    mpi_op = MPI_PROD;
  } else if (strncmp(op, "max", 3) == 0) {
    mpi_op = MPI_MAX;
  } else if (strncmp(op, "min", 3) == 0) {
    mpi_op = MPI_MIN;
  } else if (strncmp(op, "or", 2) == 0) {
    mpi_op = MPI_BOR;
  } else if (strncmp(op, "absmax", 6) == 0) {
    mpi_op = MPI_ABSMAX_OP;
  } else if (strncmp(op, "absmin", 6) == 0) {
    mpi_op = MPI_ABSMIN_OP;
  } else {
    ARMCII_Error("unknown operation \'%s\'", op);
    return;
  }

  switch(type) {
    case ARMCI_INT:       mpi_type = MPI_INT;       break;
    case ARMCI_LONG:      mpi_type = MPI_LONG;      break;
    case ARMCI_LONG_LONG: mpi_type = MPI_LONG_LONG; break;
    case ARMCI_FLOAT:     mpi_type = MPI_FLOAT;     break;
    case ARMCI_DOUBLE:    mpi_type = MPI_DOUBLE;    break;
    default:
      ARMCII_Error("unknown type (%d)", type);
      return;
  }

  // ABS MAX/MIN are unary as well as binary.  We need to also apply abs in the
  // single processor case when reduce would normally just be a no-op.
  if (group->size == 1 && (mpi_op == MPI_ABSMAX_OP || mpi_op == MPI_ABSMIN_OP)) {
    ARMCII_Absv_op(x, x, &n, &mpi_type);
    return;
  }

  MPI_Type_size(mpi_type, &mpi_type_size);

  out = malloc(n*mpi_type_size);
  ARMCII_Assert(out != NULL);

  /* BUG FIX: reduce over the scoped communicator (comm), not group->comm.
   * Using group->comm ignored the scope computed above, so non-global scopes
   * incorrectly reduced across the entire group. */
  MPI_Allreduce(x, out, n, mpi_type, mpi_op, comm);

  ARMCI_Copy(out, x, n*mpi_type_size);
  free(out);
}
/** Destroy/free a shared memory region.  Collective on the allocation group.
 *
 * @param[in] mreg  Memory region to destroy, or NULL if this process passed
 *                  NULL to ARMCI_Free (permitted for 0-byte allocations).
 * @param[in] group Group on which to perform the free.
 */
void gmr_destroy(gmr_t *mreg, ARMCI_Group *group) {
  int   search_proc_in, search_proc_out, search_proc_out_grp;
  void *search_base;
  int   alloc_me, alloc_nproc;
  int   world_me, world_nproc;

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  /* All-to-all exchange of a <base address, proc> pair.  This is so that we
   * can support passing NULL into ARMCI_Free() which is permitted when a
   * process allocates 0 bytes.  Unfortunately, in this case we still need to
   * identify the mem region and free it.
   */

  if (mreg == NULL)
    search_proc_in = -1;
  else {
    /* search_base is only set on ranks that hold the region; other ranks
     * receive it via the Bcast below, so it is intentionally left
     * uninitialized when mreg == NULL. */
    search_proc_in = world_me;
    search_base    = mreg->slices[world_me].base;
  }

  /* Collectively decide on who will provide the base address (the highest
   * world rank that holds the region wins) */
  MPI_Allreduce(&search_proc_in, &search_proc_out, 1, MPI_INT, MPI_MAX, group->comm);

  /* Everyone passed NULL.  Nothing to free. */
  if (search_proc_out < 0)
    return;

  /* Translate world rank to group rank */
  search_proc_out_grp = ARMCII_Translate_absolute_to_group(group, search_proc_out);

  /* Broadcast the base address */
  MPI_Bcast(&search_base, sizeof(void*), MPI_BYTE, search_proc_out_grp, group->comm);

  /* If we were passed NULL, look up the mem region using the <base, proc> pair */
  if (mreg == NULL)
    mreg = gmr_lookup(search_base, search_proc_out);

  /* If it's still not found, the user may have passed the wrong group */
  ARMCII_Assert_msg(mreg != NULL, "Could not locate the desired allocation");

  /* A region locked for direct local access can be released here; any other
   * lock state is a usage error. */
  switch (mreg->lock_state) {
    case GMR_LOCK_UNLOCKED:
      break;
    case GMR_LOCK_DLA:
      ARMCII_Warning("Releasing direct local access before freeing shared allocation\n");
      gmr_dla_unlock(mreg);
      break;
    default:
      ARMCII_Error("Unable to free locked memory region (%d)\n", mreg->lock_state);
  }

  /* Remove from the (doubly linked) list of mem regions */
  if (mreg->prev == NULL) {
    /* Removing the list head */
    ARMCII_Assert(gmr_list == mreg);
    gmr_list = mreg->next;

    if (mreg->next != NULL)
      mreg->next->prev = NULL;

  } else {
    mreg->prev->next = mreg->next;
    if (mreg->next != NULL)
      mreg->next->prev = mreg->prev;
  }

  /* Destroy the window and free all buffers */
  MPI_Win_free(&mreg->window);

  /* base is NULL when this process allocated 0 bytes */
  if (mreg->slices[world_me].base != NULL)
    MPI_Free_mem(mreg->slices[world_me].base);

  free(mreg->slices);
  ARMCIX_Destroy_mutexes_hdl(mreg->rmw_mutex);
  free(mreg);
}
/** Create a distributed shared memory region.  Collective on ARMCI group.
 *
 * @param[in]  local_size Size of the local slice of the memory region (bytes).
 * @param[out] base_ptrs  Array of base pointers for each process in group.
 * @param[in]  group      Group on which to perform allocation.
 * @return                Pointer to the memory region object, or NULL if every
 *                        process in the group requested 0 bytes.
 */
gmr_t *gmr_create(gmr_size_t local_size, void **base_ptrs, ARMCI_Group *group) {
  int           i;
  gmr_size_t    aggregate_size;
  int           alloc_me, alloc_nproc;
  int           world_me, world_nproc;
  MPI_Group     world_group, alloc_group;
  gmr_t        *mreg;
  gmr_slice_t  *alloc_slices, gmr_slice;

  ARMCII_Assert(local_size >= 0);
  ARMCII_Assert(group != NULL);

  MPI_Comm_rank(group->comm, &alloc_me);
  MPI_Comm_size(group->comm, &alloc_nproc);
  MPI_Comm_rank(ARMCI_GROUP_WORLD.comm, &world_me);
  MPI_Comm_size(ARMCI_GROUP_WORLD.comm, &world_nproc);

  mreg = malloc(sizeof(gmr_t));
  ARMCII_Assert(mreg != NULL);

  /* mreg->slices is indexed by *world* rank (see the translation loop at the
   * bottom); alloc_slices is indexed by group rank during setup. */
  mreg->slices = malloc(sizeof(gmr_slice_t)*world_nproc);
  ARMCII_Assert(mreg->slices != NULL);
  alloc_slices = malloc(sizeof(gmr_slice_t)*alloc_nproc);
  ARMCII_Assert(alloc_slices != NULL);

  mreg->group = *group; /* NOTE: I think it is invalid in GA/ARMCI to free a group
                           before its allocations.  If this is not the case, then
                           assignment here is incorrect and this should really
                           duplicate the group (communicator). */
  mreg->nslices = world_nproc;
  mreg->access_mode = ARMCIX_MODE_ALL;
  mreg->lock_state  = GMR_LOCK_UNLOCKED;
  mreg->dla_lock_count = 0;
  mreg->prev = NULL;
  mreg->next = NULL;

  /* Allocate my slice of the GMR */
  alloc_slices[alloc_me].size = local_size;

  if (local_size == 0) {
    /* A 0-byte request contributes a NULL base to the window */
    alloc_slices[alloc_me].base = NULL;
  } else {
    MPI_Alloc_mem(local_size, MPI_INFO_NULL, &(alloc_slices[alloc_me].base));
    ARMCII_Assert(alloc_slices[alloc_me].base != NULL);
  }

  /* Debugging: Zero out shared memory if enabled */
  if (ARMCII_GLOBAL_STATE.debug_alloc && local_size > 0) {
    ARMCII_Assert(alloc_slices[alloc_me].base != NULL);
    ARMCII_Bzero(alloc_slices[alloc_me].base, local_size);
  }

  /* All-to-all on <base, size> to build up the slices vector */
  gmr_slice = alloc_slices[alloc_me];
  MPI_Allgather(  &gmr_slice, sizeof(gmr_slice_t), MPI_BYTE,
                 alloc_slices, sizeof(gmr_slice_t), MPI_BYTE, group->comm);

  /* Check for a global size 0 allocation */
  for (i = aggregate_size = 0; i < alloc_nproc; i++) {
    aggregate_size += alloc_slices[i].size;
  }

  /* Everyone asked for 0 bytes, return a NULL vector */
  if (aggregate_size == 0) {
    free(alloc_slices);
    free(mreg->slices);
    free(mreg);

    for (i = 0; i < alloc_nproc; i++)
      base_ptrs[i] = NULL;

    return NULL;
  }

  /* Displacement unit of 1: all offsets are expressed in bytes */
  MPI_Win_create(alloc_slices[alloc_me].base, (MPI_Aint) local_size, 1, MPI_INFO_NULL,
                 group->comm, &mreg->window);

  /* Populate the base pointers array */
  for (i = 0; i < alloc_nproc; i++)
    base_ptrs[i] = alloc_slices[i].base;

  /* We have to do lookup on global ranks, so shovel the contents of
     alloc_slices into the mreg->slices array which is indexed by global rank. */
  memset(mreg->slices, 0, sizeof(gmr_slice_t)*world_nproc);

  MPI_Comm_group(ARMCI_GROUP_WORLD.comm, &world_group);
  MPI_Comm_group(group->comm, &alloc_group);

  for (i = 0; i < alloc_nproc; i++) {
    int world_rank;
    MPI_Group_translate_ranks(alloc_group, 1, &i, world_group, &world_rank);
    mreg->slices[world_rank] = alloc_slices[i];
  }

  free(alloc_slices);
  MPI_Group_free(&world_group);
  MPI_Group_free(&alloc_group);

  /* Create the RMW mutex: Keeps RMW operations atomic wrt each other */
  mreg->rmw_mutex = ARMCIX_Create_mutexes_hdl(1, group);

  /* Append the new region onto the global region list */
  if (gmr_list == NULL) {
    gmr_list = mreg;

  } else {
    gmr_t *parent = gmr_list;

    while (parent->next != NULL)
      parent = parent->next;

    parent->next = mreg;
    mreg->prev   = parent;
  }

  return mreg;
}