int MPIDI_VCRT_Create(int size, struct MPIDI_VCRT **vcrt_ptr) { MPIDI_VCRT_t * vcrt; int mpi_errno = MPI_SUCCESS; MPIR_CHKPMEM_DECL(1); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_VCRT_CREATE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_VCRT_CREATE); MPIR_CHKPMEM_MALLOC(vcrt, MPIDI_VCRT_t *, sizeof(MPIDI_VCRT_t) + (size - 1) * sizeof(MPIDI_VC_t *), mpi_errno, "**nomem"); vcrt->handle = HANDLE_SET_KIND(0, HANDLE_KIND_INVALID); MPIR_Object_set_ref(vcrt, 1); vcrt->size = size; *vcrt_ptr = vcrt; fn_exit: MPIR_CHKPMEM_COMMIT(); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_VCRT_CREATE); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ MPIR_CHKPMEM_REAP(); goto fn_exit; /* --END ERROR HANDLING-- */ }
int MPIDI_RMA_init(void) { int mpi_errno = MPI_SUCCESS; int i; MPIR_CHKPMEM_DECL(3); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_RMA_INIT); MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_RMA_INIT); MPIR_CHKPMEM_MALLOC(global_rma_op_pool_start, MPIDI_RMA_Op_t *, sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_GLOBAL_POOL_SIZE, mpi_errno, "RMA op pool", MPL_MEM_RMA); for (i = 0; i < MPIR_CVAR_CH3_RMA_OP_GLOBAL_POOL_SIZE; i++) { global_rma_op_pool_start[i].pool_type = MPIDI_RMA_POOL_GLOBAL; DL_APPEND(global_rma_op_pool_head, &(global_rma_op_pool_start[i])); } MPIR_CHKPMEM_MALLOC(global_rma_target_pool_start, MPIDI_RMA_Target_t *, sizeof(MPIDI_RMA_Target_t) * MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE, mpi_errno, "RMA target pool", MPL_MEM_RMA); for (i = 0; i < MPIR_CVAR_CH3_RMA_TARGET_GLOBAL_POOL_SIZE; i++) { global_rma_target_pool_start[i].pool_type = MPIDI_RMA_POOL_GLOBAL; DL_APPEND(global_rma_target_pool_head, &(global_rma_target_pool_start[i])); } fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_RMA_INIT); return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); goto fn_fail; }
/*
 * MPIDI_CH3I_Win_detect_shm - try to back *win_ptr with an already existing
 * shared-memory window.
 *
 * If the window's communicator has no node_comm the function returns without
 * doing anything.  Otherwise MPIDI_CH3I_SHM_Wins_match() is asked for a
 * matching shared window; when one is found, *win_ptr is marked as
 * shm_allocated, gets a freshly allocated shm_base_addrs[] array (one entry
 * per node-local process) and shares the matched window's mutex.
 *
 * Returns MPI_SUCCESS or the error reported by the match/allocation step.
 */
static int MPIDI_CH3I_Win_detect_shm(MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rank, nlocal;
    MPI_Aint *offsets;
    MPIR_Win *matched = NULL;
    MPIR_Win *win;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);

    /* Nothing to detect without a node-local communicator. */
    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        goto fn_exit;
    }

    nlocal = (*win_ptr)->comm_ptr->node_comm->local_size;

    /* Scratch array receiving the per-process offsets from the matcher. */
    MPIR_CHKLMEM_MALLOC(offsets, MPI_Aint *, nlocal * sizeof(MPI_Aint),
                        mpi_errno, "base_shm_offs");

    /* Take the first matching shared window.  Shared windows covering all
     * local processes are kept in the same order on every local process, so
     * the first match is expected to be the same window everywhere. */
    mpi_errno = MPIDI_CH3I_SHM_Wins_match(win_ptr, &matched, &offsets);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }

    if (matched == NULL) {
        goto fn_exit;
    }

    win = *win_ptr;
    win->shm_allocated = TRUE;

    MPIR_CHKPMEM_MALLOC(win->shm_base_addrs, void **, nlocal * sizeof(void *),
                        mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* shm_base_addrs[r] = base address of the matched window + offset[r] */
    for (rank = 0; rank < nlocal; rank++) {
        win->shm_base_addrs[rank] =
            (void *) ((MPI_Aint) matched->shm_base_addr + offsets[rank]);
    }

    /* TODO: reuse the matched window's mutex or create a new one?  Sharing it
     * causes unnecessary synchronization. */
    win->shm_mutex = matched->shm_mutex;

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_DETECT_SHM);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
static int handle_mprobe(const ptl_event_t *e) { int mpi_errno = MPI_SUCCESS; MPIR_Request *const req = e->user_ptr; MPIR_CHKPMEM_DECL(1); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_HANDLE_PROBE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_HANDLE_PROBE); if (e->ni_fail_type == PTL_NI_NO_MATCH) { REQ_PTL(req)->found = FALSE; goto finish_mprobe; } REQ_PTL(req)->found = TRUE; req->status.MPI_SOURCE = NPTL_MATCH_GET_RANK(e->match_bits); req->status.MPI_TAG = NPTL_MATCH_GET_TAG(e->match_bits); MPIR_STATUS_SET_COUNT(req->status, NPTL_HEADER_GET_LENGTH(e->hdr_data)); MPIDI_Request_set_sync_send_flag(req, e->hdr_data & NPTL_SSEND); MPIR_CHKPMEM_MALLOC(req->dev.tmpbuf, void *, e->mlength, mpi_errno, "tmpbuf", MPL_MEM_BUFFER); MPIR_Memcpy((char *)req->dev.tmpbuf, e->start, e->mlength); req->dev.recv_data_sz = e->mlength; if (!(e->hdr_data & NPTL_LARGE)) { MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_EAGER_MSG); } else { MPIR_Assert (e->mlength == PTL_LARGE_THRESHOLD); req->dev.match.parts.tag = req->status.MPI_TAG; req->dev.match.parts.context_id = NPTL_MATCH_GET_CTX(e->match_bits); req->dev.match.parts.rank = req->status.MPI_SOURCE; MPIDI_Request_set_msg_type(req, MPIDI_REQUEST_RNDV_MSG); } /* At this point we know the ME is unlinked. Invalidate the handle to prevent further accesses, e.g. an attempted cancel. */ REQ_PTL(req)->put_me = PTL_INVALID_HANDLE; req->dev.recv_pending_count = 1; finish_mprobe: mpi_errno = MPID_Request_complete(req); if (mpi_errno != MPI_SUCCESS) { MPIR_ERR_POP(mpi_errno); } fn_exit: MPIR_CHKPMEM_COMMIT(); MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_HANDLE_PROBE); return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); goto fn_exit; }
/*
 * sched_get_cid_nonblock - set up the state for a nonblocking context-id
 * allocation and enqueue its first callback on schedule s.
 *
 * comm_ptr     - communicator the allocation is performed over; for the
 *                non-intracomm kind its local_comm is used and comm_ptr is
 *                kept as the "inter" communicator
 * newcomm      - stored in the state as new_comm
 * ctx0, ctx1   - output locations recorded in the state; *ctx0 is zeroed here
 * s            - schedule receiving the callback and a barrier
 * gcn_cid_kind - selects the intracomm / non-intracomm setup
 *
 * Returns MPI_SUCCESS or the error from allocation / MPIR_Sched_cb.
 */
static int sched_get_cid_nonblock(MPIR_Comm * comm_ptr, MPIR_Comm * newcomm,
                                  MPIR_Context_id_t * ctx0, MPIR_Context_id_t * ctx1,
                                  MPIR_Sched_t s, MPIR_Comm_kind_t gcn_cid_kind)
{
    int mpi_errno = MPI_SUCCESS;
    struct gcn_state *state = NULL;
    const int is_intra = (gcn_cid_kind == MPIR_COMM_KIND__INTRACOMM);
    MPIR_CHKPMEM_DECL(1);

    if (initialize_context_mask) {
        context_id_init();
    }

    MPIR_CHKPMEM_MALLOC(state, struct gcn_state *, sizeof(struct gcn_state),
                        mpi_errno, "gcn_state");

    /* Where the results go. */
    state->ctx0 = ctx0;
    state->ctx1 = ctx1;

    /* Which communicator(s) take part. */
    state->comm_ptr = is_intra ? comm_ptr : comm_ptr->local_comm;
    state->comm_ptr_inter = is_intra ? NULL : comm_ptr;

    state->s = s;
    state->gcn_cid_kind = gcn_cid_kind;
    *ctx0 = 0;
    state->own_eager_mask = 0;
    state->first_iter = 1;
    state->new_comm = newcomm;
    state->own_mask = 0;

    if (eager_nelem < 0) {
        /* At least one word of deadlock-free context IDs must always remain
         * reserved for the base protocol. */
        MPIR_Assert(MPIR_CVAR_CTXID_EAGER_SIZE >= 0 &&
                    MPIR_CVAR_CTXID_EAGER_SIZE < MPIR_MAX_CONTEXT_MASK - 1);
        eager_nelem = MPIR_CVAR_CTXID_EAGER_SIZE;
    }

    mpi_errno = MPIR_Sched_cb(&sched_cb_gcn_copy_mask, state, s);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_SCHED_BARRIER(s);

    MPIR_CHKPMEM_COMMIT();

  fn_exit:
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
static int handler_recv_dequeue_unpack_large(const ptl_event_t *e) { int mpi_errno = MPI_SUCCESS; MPIR_Request *const rreq = e->user_ptr; MPIDI_VC_t *vc; MPI_Aint last; void *buf; MPIR_CHKPMEM_DECL(1); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE); MPIR_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW); MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc); dequeue_req(e); if (!(e->hdr_data & NPTL_LARGE)) { /* all data has already been received; we're done */ mpi_errno = handler_recv_unpack_complete(e); if (mpi_errno) MPIR_ERR_POP(mpi_errno); goto fn_exit; } if (e->type == PTL_EVENT_PUT_OVERFLOW) buf = e->start; else buf = REQ_PTL(rreq)->chunk_buffer[0]; MPIR_Assert(e->mlength == PTL_LARGE_THRESHOLD); last = PTL_LARGE_THRESHOLD; MPIDU_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf); MPIR_Assert(last == PTL_LARGE_THRESHOLD); rreq->dev.segment_first += PTL_LARGE_THRESHOLD; MPL_free(REQ_PTL(rreq)->chunk_buffer[0]); MPIR_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, rreq->dev.segment_size - rreq->dev.segment_first, mpi_errno, "chunk_buffer"); big_get(REQ_PTL(rreq)->chunk_buffer[0], rreq->dev.segment_size - rreq->dev.segment_first, vc, e->match_bits, rreq); fn_exit: MPIR_CHKPMEM_COMMIT(); fn_exit2: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE); return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); goto fn_exit2; }
int MPID_nem_mpich_init(void) { int mpi_errno = MPI_SUCCESS; int i; MPIR_CHKPMEM_DECL (2); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPID_NEM_MPICH_INIT); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPID_NEM_MPICH_INIT); MPID_nem_prefetched_cell = NULL; MPIR_CHKPMEM_MALLOC (MPID_nem_recv_seqno, unsigned short *, sizeof(*MPID_nem_recv_seqno) * MPID_nem_mem_region.num_procs, mpi_errno, "recv seqno"); for (i = 0; i < MPID_nem_mem_region.num_procs; ++i) { MPID_nem_recv_seqno[i] = 0; } /* set up fbox queue */ MPIR_CHKPMEM_MALLOC (MPID_nem_fboxq_elem_list, MPID_nem_fboxq_elem_t *, MPID_nem_mem_region.num_local * sizeof(MPID_nem_fboxq_elem_t), mpi_errno, "fastbox element list"); for (i = 0; i < MPID_nem_mem_region.num_local; ++i) { MPID_nem_fboxq_elem_list[i].usage = 0; MPID_nem_fboxq_elem_list[i].prev = NULL; MPID_nem_fboxq_elem_list[i].next = NULL; MPID_nem_fboxq_elem_list[i].grank = MPID_nem_mem_region.local_procs[i]; MPID_nem_fboxq_elem_list[i].fbox = &MPID_nem_mem_region.mailboxes.in[i]->mpich; } MPID_nem_fboxq_head = NULL; MPID_nem_fboxq_tail = NULL; MPID_nem_curr_fboxq_elem = NULL; MPID_nem_curr_fbox_all_poll = &MPID_nem_fboxq_elem_list[0]; MPID_nem_fboxq_elem_list_last = &MPID_nem_fboxq_elem_list[MPID_nem_mem_region.num_local - 1]; MPIR_CHKPMEM_COMMIT(); fn_exit: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPID_NEM_MPICH_INIT); return mpi_errno; fn_fail: /* --BEGIN ERROR HANDLING-- */ MPIR_CHKPMEM_REAP(); goto fn_exit; /* --END ERROR HANDLING-- */ }
/*
 * MPIDI_CH3_Connect_to_root - create a temporary VC and connect it to the
 * root process identified by port_name.
 *
 * port_name - port to connect to, forwarded to MPID_nem_connect_to_root()
 * new_vc    - receives the new VC on success and is NULL on failure
 *
 * The VC is allocated here, initialized with MPIDI_VC_Init(), switched to the
 * ACTIVE state and then handed to the nemesis layer.  Per the original note,
 * the VC is later released by FreeNewVC in ch3u_port.c.
 *
 * Returns MPI_SUCCESS, or an error if allocation fails, nemesis is not
 * initialized, or the lower-level connect fails.
 */
int MPIDI_CH3_Connect_to_root (const char *port_name, MPIDI_VC_t **new_vc)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_VC_t * vc;
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);

    *new_vc = NULL; /* so that the err handling knows to cleanup */

    MPIR_CHKPMEM_MALLOC (vc, MPIDI_VC_t *, sizeof(MPIDI_VC_t), mpi_errno, "vc", MPL_MEM_ADDRESS);
    /* FIXME - where does this vc get freed?
       ANSWER (goodell@) - ch3u_port.c FreeNewVC
                           (but the VC_Destroy is in this file) */

    /* init ch3 portion of vc */
    MPIDI_VC_Init (vc, NULL, 0);

    /* init channel portion of vc */
    MPIR_ERR_CHKINTERNAL(!nemesis_initialized, mpi_errno, "Nemesis not initialized");
    vc->ch.recv_active = NULL;
    MPIDI_CHANGE_VC_STATE(vc, ACTIVE);

    *new_vc = vc; /* we now have a valid, disconnected, temp VC */

    mpi_errno = MPID_nem_connect_to_root (port_name, vc);
    if (mpi_errno) MPIR_ERR_POP (mpi_errno);

    MPIR_CHKPMEM_COMMIT();
 fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_CH3_CONNECT_TO_ROOT);
    return mpi_errno;
 fn_fail:
    /* freeing without giving the lower layer a chance to cleanup can lead to
       leaks on error */
    if (*new_vc)
        MPIDI_CH3_VC_Destroy(*new_vc);
    MPIR_CHKPMEM_REAP();
    /* The VC storage has just been released; never hand the caller a
       pointer to it. */
    *new_vc = NULL;
    goto fn_exit;
}
/*
 * MPIR_T_pvar_session_create_impl - allocate a new performance-variable
 * session object.
 *
 * session - set to MPI_T_PVAR_SESSION_NULL first, then to the newly
 *           allocated session whose handle list starts out empty
 *
 * Returns MPI_SUCCESS, or the error set by the allocation macro.
 */
int MPIR_T_pvar_session_create_impl(MPI_T_pvar_session *session)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_CHKPMEM_DECL(1);

    *session = MPI_T_PVAR_SESSION_NULL;

    MPIR_CHKPMEM_MALLOC(*session, MPI_T_pvar_session, sizeof(**session),
                        mpi_errno, "performance var session");

    /* utlist requires an explicitly NULL list head */
    (*session)->hlist = NULL;

#ifdef HAVE_ERROR_CHECKING
    /* object kind tag, only present in error-checking builds */
    (*session)->kind = MPIR_T_PVAR_SESSION;
#endif

    MPIR_CHKPMEM_COMMIT();

  fn_exit:
    return mpi_errno;

  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/*
 * MPIDI_CH3I_Win_allocate_shm - allocate the memory of a window in a
 * shared-memory segment that is mapped by all processes of the node.
 *
 * size      - number of bytes this process contributes to the window
 * disp_unit - forwarded to the no-shm fallback and to
 *             MPIDI_CH3I_Win_gather_info()
 * info      - forwarded likewise
 * comm_ptr  - forwarded likewise
 * base_ptr  - treated as a void **; receives this process's base address
 * win_ptr   - window being set up; its shm_* fields and base are filled in
 *
 * Outline of what the code does:
 *   1. If the window's communicator has no node_comm, delegate to
 *      MPIDI_CH3U_Win_allocate_no_shm() and return its result.
 *   2. Allgather every node-local process's size and add them up (each
 *      rounded up to a page when alloc_shared_noncontig is set).
 *   3. If the total is non-zero: node rank 0 creates the data segment and
 *      broadcasts its serialized handle, the other ranks attach to it, and
 *      a barrier precedes rank 0's removal (unlink) of the segment.  The
 *      same sequence is repeated for the interprocess mutex segment.
 *      Per-process base addresses are then derived from the sizes.
 *      If the total is zero, base is NULL and neither segment is created.
 *   4. Gather window information and append the window to shm_wins_list.
 *
 * Returns MPI_SUCCESS or the first error encountered; on error the
 * persistent shm_base_addrs allocation is released via MPIR_CHKPMEM_REAP().
 */
static int MPIDI_CH3I_Win_allocate_shm(MPI_Aint size, int disp_unit, MPIR_Info * info,
                                       MPIR_Comm * comm_ptr, void *base_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    void **base_pp = (void **) base_ptr;
    int i, node_size, node_rank;
    MPIR_Comm *node_comm_ptr;
    MPI_Aint *node_sizes;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    int noncontig = FALSE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);

    /* Without a node communicator there is nobody to share with: use the
     * non-shared allocation path and return whatever it reports. */
    if ((*win_ptr)->comm_ptr->node_comm == NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
        goto fn_exit;
    }

    /* see if we can allocate all windows contiguously */
    noncontig = (*win_ptr)->info_args.alloc_shared_noncontig;

    (*win_ptr)->shm_allocated = TRUE;

    /* When allocating shared memory region segment, we need comm of processes
     * that are on the same node as this process (node_comm).
     * The node_comm == NULL case has already been handled above by falling
     * back to the non-shared allocation, so node_comm is non-NULL here. */
    node_comm_ptr = (*win_ptr)->comm_ptr->node_comm;
    MPIR_Assert(node_comm_ptr != NULL);
    node_size = node_comm_ptr->local_size;
    node_rank = node_comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);
    /* allocate memory for the base addresses of all node-local processes
     * (persistent: kept on the window unless an error occurs) */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        node_size * sizeof(void *), mpi_errno, "(*win_ptr)->shm_base_addrs");

    /* get the sizes of the windows and window objects of
     * all processes.  allocate temp. buffer for communication */
    MPIR_CHKLMEM_MALLOC(node_sizes, MPI_Aint *, node_size * sizeof(MPI_Aint), mpi_errno,
                        "node_sizes");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    node_sizes[node_rank] = (MPI_Aint) size;

    /* In-place allgather of the raw MPI_Aint bytes over the node comm. */
    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    node_sizes, sizeof(MPI_Aint), MPI_BYTE,
                                    node_comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Total segment length = sum of all (optionally page-rounded) sizes. */
    (*win_ptr)->shm_segment_len = 0;

    for (i = 0; i < node_size; i++) {
        if (noncontig)
            /* Round up to next page size */
            (*win_ptr)->shm_segment_len += MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[i]);
        else
            (*win_ptr)->shm_segment_len += node_sizes[i];
    }

    if ((*win_ptr)->shm_segment_len == 0) {
        /* Nobody on this node asked for memory: no segment, no mutex. */
        (*win_ptr)->base = NULL;
    }

    else {
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_segment_handle,
                                              (*win_ptr)->shm_segment_len,
                                              (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_segment_handle,
                                                  &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_segment_handle, serialized_hnd,
                                        strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_segment_handle, (*win_ptr)->shm_segment_len,
                                   (char **) &(*win_ptr)->shm_base_addr, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* matches rank 0's "wait for other processes to attach" barrier */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* Allocate the interprocess mutex segment, following the same
         * create/broadcast/attach/barrier/remove sequence as above. */
        mpi_errno = MPL_shm_hnd_init(&(*win_ptr)->shm_mutex_segment_handle);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (node_rank == 0) {
            char *serialized_hnd_ptr = NULL;

            /* create shared memory region for all processes in win and map */
            mpi_errno =
                MPL_shm_seg_create_and_attach((*win_ptr)->shm_mutex_segment_handle,
                                              sizeof(MPIDI_CH3I_SHM_MUTEX),
                                              (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* only the creating rank initializes the mutex */
            MPIDI_CH3I_SHM_MUTEX_INIT(*win_ptr);

            /* serialize handle and broadcast it to the other processes in win */
            mpi_errno =
                MPL_shm_hnd_get_serialized_by_ref((*win_ptr)->shm_mutex_segment_handle,
                                                  &serialized_hnd_ptr);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd_ptr, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* wait for other processes to attach to win */
            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            /* unlink shared memory region so it gets deleted when all processes exit */
            mpi_errno = MPL_shm_seg_remove((*win_ptr)->shm_mutex_segment_handle);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        else {
            char serialized_hnd[MPL_SHM_GHND_SZ] = { 0 };

            /* get serialized handle from rank 0 and deserialize it */
            mpi_errno =
                MPIR_Bcast_impl(serialized_hnd, MPL_SHM_GHND_SZ, MPI_CHAR, 0, node_comm_ptr,
                                &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

            mpi_errno =
                MPL_shm_hnd_deserialize((*win_ptr)->shm_mutex_segment_handle, serialized_hnd,
                                        strlen(serialized_hnd));
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            /* attach to shared memory region created by rank 0 */
            mpi_errno =
                MPL_shm_seg_attach((*win_ptr)->shm_mutex_segment_handle,
                                   sizeof(MPIDI_CH3I_SHM_MUTEX),
                                   (char **) &(*win_ptr)->shm_mutex, 0);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);

            mpi_errno = MPIR_Barrier_impl(node_comm_ptr, &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
            MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
        }

        /* compute the base addresses of each process within the shared memory segment */
        {
            char *cur_base;
            int cur_rank;

            /* rank 0 always gets the start of the segment, even if its own
             * contribution is zero bytes */
            cur_base = (*win_ptr)->shm_base_addr;
            cur_rank = 0;
            ((*win_ptr)->shm_base_addrs)[0] = (*win_ptr)->shm_base_addr;
            for (i = 1; i < node_size; ++i) {
                if (node_sizes[i]) {
                    /* For the base addresses, we track the previous
                     * process that has allocated non-zero bytes of shared
                     * memory.  We can not simply use "i-1" for the
                     * previous process because rank "i-1" might not have
                     * allocated any memory. */
                    if (noncontig) {
                        ((*win_ptr)->shm_base_addrs)[i] =
                            cur_base + MPIDI_CH3_ROUND_UP_PAGESIZE(node_sizes[cur_rank]);
                    }
                    else {
                        ((*win_ptr)->shm_base_addrs)[i] = cur_base + node_sizes[cur_rank];
                    }
                    cur_base = ((*win_ptr)->shm_base_addrs)[i];
                    cur_rank = i;
                }
                else {
                    /* zero-sized contributions get no address */
                    ((*win_ptr)->shm_base_addrs)[i] = NULL;
                }
            }
        }

        (*win_ptr)->base = (*win_ptr)->shm_base_addrs[node_rank];
    }

    /* report this process's base address to the caller */
    *base_pp = (*win_ptr)->base;

    /* gather window information among processes via shared memory region. */
    mpi_errno = MPIDI_CH3I_Win_gather_info((*base_pp), size, disp_unit, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* Cache SHM windows */
    MPIDI_CH3I_SHM_Wins_append(&shm_wins_list, (*win_ptr));

  fn_exit:
    /* release the temporary node_sizes buffer on every path */
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3I_WIN_ALLOCATE_SHM);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_PG_Create - allocate a process group with vct_sz virtual connections
 * and append it to the global process-group list.
 *
 * vct_sz - number of VC entries to allocate and initialize
 * pg_id  - stored as the group's id
 * pg_ptr - receives the new process group on success
 *
 * The first group ever created also becomes pg_world.
 *
 * Returns MPI_SUCCESS, or the error set by the allocation macro.
 */
int MPIDI_PG_Create(int vct_sz, void * pg_id, MPIDI_PG_t ** pg_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_PG_t *pg = NULL;
    MPIDI_PG_t **link;
    MPIR_CHKPMEM_DECL(2);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_PG_CREATE);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_MPIDI_PG_CREATE);

    MPIR_CHKPMEM_MALLOC(pg, MPIDI_PG_t *, sizeof(MPIDI_PG_t), mpi_errno, "pg");
    MPIR_CHKPMEM_MALLOC(pg->vct, MPIDI_VC_t *, sizeof(MPIDI_VC_t) * vct_sz,
                        mpi_errno, "pg->vct");

    if (verbose) {
        fprintf( stdout, "Creating a process group of size %d\n", vct_sz );
        fflush(stdout);
    }

    pg->handle = 0;
    /* The reference count is the number of VCs that are, or have been, in
     * use and not disconnected.  It starts at zero, except for
     * MPI_COMM_WORLD. */
    MPIR_Object_set_ref(pg, 0);
    pg->size = vct_sz;
    pg->id = pg_id;
    pg->finalize = 0;

    /* Connection information starts out empty; the appropriate
     * MPIDI_PG_InitConnXXX routine fills these fields in later. */
    pg->connData = 0;
    pg->getConnInfo = 0;
    pg->connInfoToString = 0;
    pg->connInfoFromString = 0;
    pg->freeConnInfo = 0;

    /* Device-level initialization of every VC; the channel-level VC fields
     * are now set up inside MPIDI_VC_Init as well. */
    for (rank = 0; rank < vct_sz; rank++) {
        MPIDI_VC_Init(&pg->vct[rank], pg, rank);
    }

    /* The channel may need to initialize itself for this group; this can be
     * a no-op (see ch3_init.c of each channel). */
    MPIDI_CH3_PG_Init(pg);

    /* The first process group is always the world group */
    if (!pg_world) {
        pg_world = pg;
    }

    /* Append at the tail so that comm world stays the first entry: walk the
     * chain of next-links until the terminating null link is found. */
    pg->next = 0;
    for (link = &MPIDI_PG_list; *link; link = &(*link)->next) {
        /* nothing */
    }
    *link = pg;

    *pg_ptr = pg;

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_MPIDI_PG_CREATE);
    return mpi_errno;

  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/*
 * MPIR_T_pvar_handle_alloc_impl - allocate a handle for the performance
 * variable at pvar_index and link it into the given session.
 *
 * session    - session whose handle list receives the new handle
 * pvar_index - index into pvar_table
 * obj_handle - object the variable is bound to; passed to the variable's
 *              get_count / get_value callbacks and stored in the handle
 * handle     - receives the new handle
 * count      - receives the number of elements of the variable
 *
 * For COUNTER, AGGREGATE and TIMER variables three extra buffers of
 * bytes * count each (accum, offset, current) are placed directly behind the
 * handle structure.  HIGHWATERMARK and LOWWATERMARK variables are linked to
 * their watermark object instead.
 *
 * Returns MPI_SUCCESS, or the error set by the allocation macro.
 */
int MPIR_T_pvar_handle_alloc_impl(MPI_T_pvar_session session, int pvar_index,
                                  void *obj_handle, MPI_T_pvar_handle *handle, int *count)
{
    int mpi_errno = MPI_SUCCESS;
    int cnt, extra, bytes;
    int is_sum, is_watermark;
    const pvar_table_entry_t *info;
    MPIR_T_pvar_handle_t *hnd;
    MPIR_CHKPMEM_DECL(1);

    info = (pvar_table_entry_t *) utarray_eltptr(pvar_table, pvar_index);

    /* Element count: fixed, or computed by the variable's callback. */
    if (info->get_count == NULL) {
        cnt = info->count;
    } else {
        info->get_count(info->addr, obj_handle, &cnt);
    }

    bytes = MPID_Datatype_get_basic_size(info->datatype);
    is_sum = FALSE;
    is_watermark = FALSE;
    extra = 0;

    if (info->varclass == MPI_T_PVAR_CLASS_COUNTER ||
        info->varclass == MPI_T_PVAR_CLASS_AGGREGATE ||
        info->varclass == MPI_T_PVAR_CLASS_TIMER)
    {
        /* Extra memory for accum, offset, current */
        is_sum = TRUE;
        extra = bytes * cnt * 3;
    } else if (info->varclass == MPI_T_PVAR_CLASS_HIGHWATERMARK ||
               info->varclass == MPI_T_PVAR_CLASS_LOWWATERMARK)
    {
        is_watermark = TRUE;
    }

    /* Allocate memory and bzero it */
    MPIR_CHKPMEM_CALLOC(hnd, MPIR_T_pvar_handle_t*, sizeof(*hnd) + extra,
                        mpi_errno, "performance variable handle");
#ifdef HAVE_ERROR_CHECKING
    hnd->kind = MPIR_T_PVAR_HANDLE;
#endif

    /* Setup the common fields.  Start from the variable's own flags and
     * only then add the handle-level SUM / WATERMARK bit; doing it the other
     * way round would overwrite the bit that the is_sum / is_watermark
     * checks further down presumably rely on. */
    hnd->flags = info->flags;
    if (is_sum)
        hnd->flags |= MPIR_T_PVAR_FLAG_SUM;
    else if (is_watermark)
        hnd->flags |= MPIR_T_PVAR_FLAG_WATERMARK;

    hnd->addr = info->addr;
    hnd->datatype = info->datatype;
    hnd->count = cnt;
    hnd->varclass = info->varclass;
    hnd->session = session;
    hnd->info = info;
    hnd->obj_handle = obj_handle;
    hnd->get_value = info->get_value;
    hnd->bytes = bytes;

    /* Init pointers to cache buffers for a SUM */
    if (MPIR_T_pvar_is_sum(hnd)) {
        hnd->accum = (char*)(hnd) + sizeof(*hnd);
        hnd->offset = (char*)(hnd) + sizeof(*hnd) + bytes*cnt;
        hnd->current = (char*)(hnd) + sizeof(*hnd) + bytes*cnt*2;
    }

    if (MPIR_T_pvar_is_continuous(hnd))
        MPIR_T_pvar_set_started(hnd);

    /* Set starting value of a continuous SUM */
    if (MPIR_T_pvar_is_continuous(hnd) && MPIR_T_pvar_is_sum(hnd)) {
        /* Cache current value of a SUM in offset.
         * accum is zero since we called CALLOC before.
         */
        if (hnd->get_value == NULL)
            MPIR_Memcpy(hnd->offset, hnd->addr, bytes*cnt);
        else
            hnd->get_value(hnd->addr, hnd->obj_handle, hnd->count, hnd->offset);
    }

    /* Link a WATERMARK handle to its pvar & set starting value if continuous */
    if (MPIR_T_pvar_is_watermark(hnd)) {
        MPIR_T_pvar_watermark_t *mark = (MPIR_T_pvar_watermark_t *)hnd->addr;
        if (!mark->first_used) {
            /* Use the special handle slot for optimization if available */
            mark->first_used = TRUE;
            MPIR_T_pvar_set_first(hnd);

            /* Set starting value */
            if (MPIR_T_pvar_is_continuous(hnd)) {
                mark->first_started = TRUE;
                mark->watermark = mark->current;
            } else {
                mark->first_started = FALSE;
            }
        } else {
            /* If the special handle slot is unavailable, link it to hlist */
            if (mark->hlist == NULL) {
                hnd->prev2 = hnd;
                mark->hlist = hnd;
            } else {
                /* push in front of the current head */
                hnd->prev2 = hnd;
                hnd->next2 = mark->hlist;
                mark->hlist->prev2 = hnd;
                mark->hlist = hnd;
            }

            /* Set starting value */
            if (MPIR_T_pvar_is_continuous(hnd))
                hnd->watermark = mark->current;
        }
    }

    /* Link the handle in its session and return it */
    MPL_DL_APPEND(session->hlist, hnd);
    *handle = hnd;
    *count = cnt;

    MPIR_CHKPMEM_COMMIT();
fn_exit:
    return mpi_errno;
fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/* MPIR_Find_local -- from the list of processes in comm,
 * builds a list of local processes, i.e., processes on this same node.
 *
 * Note that this will not work correctly for spawned or attached
 * processes.
 *
 * OUT:
 *   local_size_p      - number of processes on this node.
 *   local_rank_p      - rank of this process among local processes.
 *   local_ranks_p     - (*local_ranks_p)[i] = the rank in comm
 *                       of the process with local rank i.
 *                       This is of size (*local_size_p).
 *   intranode_table_p - (*intranode_table_p)[i] = the rank in
 *   (optional)          *local_ranks_p of rank i in comm or -1 if not
 *                       applicable.  It is of size comm->remote_size.
 *                       No return if NULL is specified.
 *
 * Returns MPI_SUCCESS or an MPI error code.  The arrays handed back through
 * local_ranks_p / intranode_table_p are heap allocations owned by the caller.
 * On error no allocation made here is left behind.
 */
int MPIR_Find_local(MPIR_Comm * comm, int *local_size_p, int *local_rank_p,
                    int **local_ranks_p, int **intranode_table_p)
{
    int mpi_errno = MPI_SUCCESS;
    int i, local_size, local_rank;
    int *local_ranks = NULL, *intranode_table = NULL;
    int node_id = -1, my_node_id = -1;

    MPIR_CHKPMEM_DECL(2);

    /* local_ranks will be realloc'ed later to the appropriate size (currently unknown) */
    /* FIXME: realloc doesn't guarantee that the allocated area will be
     * shrunk - so using realloc is not an appropriate strategy. */
    MPIR_CHKPMEM_MALLOC(local_ranks, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "local_ranks", MPL_MEM_COMM);
    MPIR_CHKPMEM_MALLOC(intranode_table, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "intranode_table", MPL_MEM_COMM);

    for (i = 0; i < comm->remote_size; ++i)
        intranode_table[i] = -1;

    mpi_errno = MPID_Get_node_id(comm, comm->rank, &my_node_id);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(my_node_id >= 0);

    local_size = 0;
    local_rank = -1;

    /* Scan through the list of processes in comm. */
    for (i = 0; i < comm->remote_size; ++i) {
        mpi_errno = MPID_Get_node_id(comm, i, &node_id);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* The upper level can catch this non-fatal error and should be
         * able to recover gracefully. */
        MPIR_ERR_CHKANDJUMP(node_id < 0, mpi_errno, MPI_ERR_OTHER, "**dynamic_node_ids");

        /* build list of local processes */
        if (node_id == my_node_id) {
            if (i == comm->rank)
                local_rank = local_size;

            intranode_table[i] = local_size;
            local_ranks[local_size] = i;
            ++local_size;
        }
    }

#ifdef ENABLE_DEBUG
    printf("------------------------------------------------------------------------\n");
    printf("[%d]comm = %p\n", comm->rank, comm);
    printf("[%d]comm->size = %d\n", comm->rank, comm->remote_size);
    printf("[%d]comm->rank = %d\n", comm->rank, comm->rank);
    printf("[%d]local_size = %d\n", comm->rank, local_size);
    printf("[%d]local_rank = %d\n", comm->rank, local_rank);
    printf("[%d]local_ranks = %p\n", comm->rank, local_ranks);
    for (i = 0; i < local_size; ++i)
        printf("[%d] local_ranks[%d] = %d\n", comm->rank, i, local_ranks[i]);
    printf("[%d]intranode_table = %p\n", comm->rank, intranode_table);
    for (i = 0; i < comm->remote_size; ++i)
        printf("[%d] intranode_table[%d] = %d\n", comm->rank, i, intranode_table[i]);
#endif

    MPIR_CHKPMEM_COMMIT();

    *local_size_p = local_size;
    *local_rank_p = local_rank;

    *local_ranks_p = MPL_realloc(local_ranks, sizeof(int) * local_size, MPL_MEM_COMM);
    if (*local_ranks_p == NULL) {
        /* The allocations were committed above, so MPIR_CHKPMEM_REAP() in
         * fn_fail no longer releases them; free them here to avoid a leak.
         * A failed realloc of a non-zero size leaves the original block
         * valid.  For a zero size the block may already have been released
         * by realloc, so it is left alone in that case. */
        if (local_size > 0)
            MPL_free(local_ranks);
        MPL_free(intranode_table);
    }
    MPIR_ERR_CHKANDJUMP(*local_ranks_p == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem2");

    if (intranode_table_p)
        *intranode_table_p = intranode_table;   /* no need to realloc */
    else
        MPL_free(intranode_table);      /* free internally if caller passes NULL */

  fn_exit:
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/* MPIR_Find_external -- from the list of processes in comm,
 * builds a list of external processes, i.e., one process from each node.
 * You can think of this as the root or master process for each node.
 *
 * Note that this will not work correctly for spawned or attached
 * processes.
 *
 * OUT:
 *   external_size_p   - number of external processes
 *   external_rank_p   - rank of this process among the external
 *                       processes, or -1 if this process is not external
 *   external_ranks_p  - (*external_ranks_p)[i] = the rank in comm
 *                       of the process with external rank i.
 *                       This is of size (*external_size_p)
 *   internode_table_p - (*internode_table_p)[i] = the rank in
 *   (optional)          *external_ranks_p of the root of the node
 *                       containing rank i in comm.  It is of size
 *                       comm->remote_size.  No return if NULL is specified.
 *
 * Returns MPI_SUCCESS or an MPI error code.  The arrays handed back through
 * external_ranks_p / internode_table_p are heap allocations owned by the
 * caller.  On error no allocation made here is left behind.
 */
int MPIR_Find_external(MPIR_Comm * comm, int *external_size_p, int *external_rank_p,
                       int **external_ranks_p, int **internode_table_p)
{
    int mpi_errno = MPI_SUCCESS;
    int *nodes;
    int i, external_size, external_rank;
    int *external_ranks, *internode_table;
    int max_node_id, node_id;

    MPIR_CHKLMEM_DECL(1);
    MPIR_CHKPMEM_DECL(2);

    /* Scan through the list of processes in comm and add one
     * process from each node to the list of "external" processes.  We
     * add the first process we find from each node.  nodes[] is an
     * array where we keep track of whether we have already added that
     * node to the list. */

    /* external_ranks will be realloc'ed later to the appropriate size (currently unknown) */
    /* FIXME: realloc doesn't guarantee that the allocated area will be
     * shrunk - so using realloc is not an appropriate strategy. */
    MPIR_CHKPMEM_MALLOC(external_ranks, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "external_ranks", MPL_MEM_COMM);
    MPIR_CHKPMEM_MALLOC(internode_table, int *, sizeof(int) * comm->remote_size, mpi_errno,
                        "internode_table", MPL_MEM_COMM);

    mpi_errno = MPID_Get_max_node_id(comm, &max_node_id);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(max_node_id >= 0);
    MPIR_CHKLMEM_MALLOC(nodes, int *, sizeof(int) * (max_node_id + 1), mpi_errno, "nodes",
                        MPL_MEM_COMM);

    /* nodes maps node_id to rank in external_ranks of leader for that node */
    for (i = 0; i < (max_node_id + 1); ++i)
        nodes[i] = -1;

    external_size = 0;
    external_rank = -1;

    for (i = 0; i < comm->remote_size; ++i) {
        mpi_errno = MPID_Get_node_id(comm, i, &node_id);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* The upper level can catch this non-fatal error and should be
         * able to recover gracefully. */
        MPIR_ERR_CHKANDJUMP(node_id < 0, mpi_errno, MPI_ERR_OTHER, "**dynamic_node_ids");

        MPIR_Assert(node_id <= max_node_id);

        /* build list of external processes */
        if (nodes[node_id] == -1) {
            if (i == comm->rank)
                external_rank = external_size;
            nodes[node_id] = external_size;
            external_ranks[external_size] = i;
            ++external_size;
        }

        /* build the map from rank in comm to rank in external_ranks */
        internode_table[i] = nodes[node_id];
    }

#ifdef ENABLE_DEBUG
    printf("------------------------------------------------------------------------\n");
    printf("[%d]comm = %p\n", comm->rank, comm);
    printf("[%d]comm->size = %d\n", comm->rank, comm->remote_size);
    printf("[%d]comm->rank = %d\n", comm->rank, comm->rank);
    printf("[%d]external_size = %d\n", comm->rank, external_size);
    printf("[%d]external_rank = %d\n", comm->rank, external_rank);
    printf("[%d]external_ranks = %p\n", comm->rank, external_ranks);
    for (i = 0; i < external_size; ++i)
        printf("[%d] external_ranks[%d] = %d\n", comm->rank, i, external_ranks[i]);
    printf("[%d]internode_table = %p\n", comm->rank, internode_table);
    for (i = 0; i < comm->remote_size; ++i)
        printf("[%d] internode_table[%d] = %d\n", comm->rank, i, internode_table[i]);
    printf("[%d]nodes = %p\n", comm->rank, nodes);
    for (i = 0; i < (max_node_id + 1); ++i)
        printf("[%d] nodes[%d] = %d\n", comm->rank, i, nodes[i]);
#endif

    MPIR_CHKPMEM_COMMIT();

    *external_size_p = external_size;
    *external_rank_p = external_rank;

    *external_ranks_p = MPL_realloc(external_ranks, sizeof(int) * external_size, MPL_MEM_COMM);
    if (*external_ranks_p == NULL) {
        /* The allocations were committed above, so MPIR_CHKPMEM_REAP() in
         * fn_fail no longer releases them; free them here to avoid a leak.
         * A failed realloc of a non-zero size leaves the original block
         * valid.  For a zero size the block may already have been released
         * by realloc, so it is left alone in that case. */
        if (external_size > 0)
            MPL_free(external_ranks);
        MPL_free(internode_table);
    }
    MPIR_ERR_CHKANDJUMP(*external_ranks_p == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem2");

    if (internode_table_p)
        *internode_table_p = internode_table;   /* no need to realloc */
    else
        MPL_free(internode_table);      /* free internally if caller passes NULL */

  fn_exit:
    /* the temporary nodes[] array is released on every path */
    MPIR_CHKLMEM_FREEALL();
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/*@
MPI_Dist_graph_create - MPI_DIST_GRAPH_CREATE returns a handle to a new
communicator to which the distributed graph topology information is
attached.

Input Parameters:
+ comm_old - input communicator (handle)
. n - number of source nodes for which this process specifies edges
  (non-negative integer)
. sources - array containing the n source nodes for which this process
  specifies edges (array of non-negative integers)
. degrees - array specifying the number of destinations for each source node
  in the source node array (array of non-negative integers)
. destinations - destination nodes for the source nodes in the source node
  array (array of non-negative integers)
. weights - weights for source to destination edges (array of non-negative
  integers or MPI_UNWEIGHTED)
. info - hints on optimization and interpretation of weights (handle)
- reorder - the process may be reordered (true) or not (false) (logical)

Output Parameters:
. comm_dist_graph - communicator with distributed graph topology added (handle)

.N ThreadSafe

.N Fortran

.N Errors
.N MPI_SUCCESS
.N MPI_ERR_ARG
.N MPI_ERR_OTHER
@*/
int MPI_Dist_graph_create(MPI_Comm comm_old, int n, const int sources[],
                          const int degrees[], const int destinations[],
                          const int weights[], MPI_Info info, int reorder,
                          MPI_Comm * comm_dist_graph)
{
    int mpi_errno = MPI_SUCCESS;
    MPIR_Comm *comm_ptr = NULL;
    MPIR_Comm *comm_dist_graph_ptr = NULL;
    MPIR_Request **reqs = NULL;
    MPIR_Topology *topo_ptr = NULL;
    MPII_Dist_graph_topology *dist_graph_ptr = NULL;
    int i;
    int j;
    int idx;
    int comm_size = 0;
    int in_capacity;
    int out_capacity;
    int **rout = NULL;
    int **rin = NULL;
    int *rin_sizes;
    int *rout_sizes;
    int *rin_idx;
    int *rout_idx;
    int *rs;
    /* Buffer for the edge list currently being received.  It lives at function
     * scope (rather than inside the receive loops) so that fn_exit can release
     * it when an error jump leaves a loop early. */
    int *recv_buf = NULL;
    int in_out_peers[2] = { -1, -1 };
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_CHKLMEM_DECL(9);
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPI_DIST_GRAPH_CREATE);

    MPIR_ERRTEST_INITIALIZED_ORDIE();

    MPID_THREAD_CS_ENTER(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPI_DIST_GRAPH_CREATE);

    /* Validate parameters, especially handles needing to be converted */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            MPIR_ERRTEST_COMM(comm_old, mpi_errno);
            MPIR_ERRTEST_INFO_OR_NULL(info, mpi_errno);
            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#endif

    /* Convert MPI object handles to object pointers */
    MPIR_Comm_get_ptr(comm_old, comm_ptr);

    /* Validate parameters and objects (post conversion) */
#ifdef HAVE_ERROR_CHECKING
    {
        MPID_BEGIN_ERROR_CHECKS;
        {
            /* Validate comm_ptr */
            MPIR_Comm_valid_ptr(comm_ptr, mpi_errno, FALSE);
            /* If comm_ptr is not valid, it will be reset to null */
            if (comm_ptr) {
                MPIR_ERRTEST_COMM_INTRA(comm_ptr, mpi_errno);
            }

            MPIR_ERRTEST_ARGNEG(n, "n", mpi_errno);
            if (n > 0) {
                int have_degrees = 0;
                MPIR_ERRTEST_ARGNULL(sources, "sources", mpi_errno);
                MPIR_ERRTEST_ARGNULL(degrees, "degrees", mpi_errno);
                for (i = 0; i < n; ++i) {
                    if (degrees[i]) {
                        have_degrees = 1;
                        break;
                    }
                }
                /* destinations/weights are only dereferenced when at least
                 * one source has a non-zero degree */
                if (have_degrees) {
                    MPIR_ERRTEST_ARGNULL(destinations, "destinations", mpi_errno);
                    if (weights != MPI_UNWEIGHTED)
                        MPIR_ERRTEST_ARGNULL(weights, "weights", mpi_errno);
                }
            }

            if (mpi_errno != MPI_SUCCESS)
                goto fn_fail;
        }
        MPID_END_ERROR_CHECKS;
    }
#endif /* HAVE_ERROR_CHECKING */

    /* ... body of routine ...  */
    /* Implementation based on Torsten Hoefler's reference implementation
     * attached to MPI-2.2 ticket #33. */
    *comm_dist_graph = MPI_COMM_NULL;

    comm_size = comm_ptr->local_size;

    /* following the spirit of the old topo interface, attributes do not
     * propagate to the new communicator (see MPI-2.1 pp. 243 line 11) */
    mpi_errno = MPII_Comm_copy(comm_ptr, comm_size, &comm_dist_graph_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_Assert(comm_dist_graph_ptr != NULL);

    /* rin is an array of size comm_size containing pointers to arrays of
     * rin_sizes[x].  rin[x] is locally known number of edges into this process
     * from rank x.
     *
     * rout is an array of comm_size containing pointers to arrays of
     * rout_sizes[x].  rout[x] is the locally known number of edges out of this
     * process to rank x. */
    /* rout and rin are zeroed immediately after allocation: fn_exit walks
     * them and frees every element, so they must never hold indeterminate
     * pointers if one of the following allocations jumps to fn_fail. */
    MPIR_CHKLMEM_MALLOC(rout, int **, comm_size * sizeof(int *), mpi_errno, "rout", MPL_MEM_COMM);
    memset(rout, 0, comm_size * sizeof(int *));
    MPIR_CHKLMEM_MALLOC(rin, int **, comm_size * sizeof(int *), mpi_errno, "rin", MPL_MEM_COMM);
    memset(rin, 0, comm_size * sizeof(int *));
    MPIR_CHKLMEM_MALLOC(rin_sizes, int *, comm_size * sizeof(int), mpi_errno, "rin_sizes",
                        MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rout_sizes, int *, comm_size * sizeof(int), mpi_errno, "rout_sizes",
                        MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rin_idx, int *, comm_size * sizeof(int), mpi_errno, "rin_idx",
                        MPL_MEM_COMM);
    MPIR_CHKLMEM_MALLOC(rout_idx, int *, comm_size * sizeof(int), mpi_errno, "rout_idx",
                        MPL_MEM_COMM);

    memset(rin_sizes, 0, comm_size * sizeof(int));
    memset(rout_sizes, 0, comm_size * sizeof(int));
    memset(rin_idx, 0, comm_size * sizeof(int));
    memset(rout_idx, 0, comm_size * sizeof(int));

    /* compute array sizes */
    idx = 0;
    for (i = 0; i < n; ++i) {
        /* TODO add these asserts as proper error checking above.  They are
         * checked here because this is the first place the values are used
         * as array indices. */
        MPIR_Assert(sources[i] >= 0);
        MPIR_Assert(sources[i] < comm_size);
        for (j = 0; j < degrees[i]; ++j) {
            MPIR_Assert(destinations[idx] >= 0);
            MPIR_Assert(destinations[idx] < comm_size);
            /* rout_sizes[i] is twice as long as the number of edges to be
             * sent to rank i by this process */
            rout_sizes[sources[i]] += 2;
            rin_sizes[destinations[idx]] += 2;
            ++idx;
        }
    }

    /* allocate arrays */
    for (i = 0; i < comm_size; ++i) {
        /* can't use CHKLMEM macros b/c we are in a loop */
        if (rin_sizes[i]) {
            rin[i] = MPL_malloc(rin_sizes[i] * sizeof(int), MPL_MEM_COMM);
            MPIR_ERR_CHKANDJUMP(rin[i] == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
        if (rout_sizes[i]) {
            rout[i] = MPL_malloc(rout_sizes[i] * sizeof(int), MPL_MEM_COMM);
            MPIR_ERR_CHKANDJUMP(rout[i] == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
        }
    }

    /* populate arrays */
    idx = 0;
    for (i = 0; i < n; ++i) {
        /* TODO add this assert as proper error checking above */
        int s_rank = sources[i];
        MPIR_Assert(s_rank < comm_size);
        MPIR_Assert(s_rank >= 0);

        for (j = 0; j < degrees[i]; ++j) {
            int d_rank = destinations[idx];
            int weight = (weights == MPI_UNWEIGHTED ? 0 : weights[idx]);
            /* TODO add this assert as proper error checking above */
            MPIR_Assert(d_rank < comm_size);
            MPIR_Assert(d_rank >= 0);

            /* XXX DJG what about self-edges? do we need to drop one of these
             * cases when there is a self-edge to avoid double-counting? */

            /* rout[s][2*x] is the value of d for the j'th edge between (s,d)
             * with weight rout[s][2*x+1], where x is the current end of the
             * outgoing edge list for s.  x==(rout_idx[s]/2) */
            rout[s_rank][rout_idx[s_rank]++] = d_rank;
            rout[s_rank][rout_idx[s_rank]++] = weight;

            /* rin[d][2*x] is the value of s for the j'th edge between (s,d)
             * with weight rout[d][2*x+1], where x is the current end of the
             * incoming edge list for d.  x==(rin_idx[d]/2) */
            rin[d_rank][rin_idx[d_rank]++] = s_rank;
            rin[d_rank][rin_idx[d_rank]++] = weight;

            ++idx;
        }
    }

    for (i = 0; i < comm_size; ++i) {
        /* sanity check that all arrays are fully populated */
        MPIR_Assert(rin_idx[i] == rin_sizes[i]);
        MPIR_Assert(rout_idx[i] == rout_sizes[i]);
    }

    MPIR_CHKLMEM_MALLOC(rs, int *, 2 * comm_size * sizeof(int), mpi_errno,
                        "red-scat source buffer", MPL_MEM_COMM);
    for (i = 0; i < comm_size; ++i) {
        rs[2 * i] = (rin_sizes[i] ? 1 : 0);
        rs[2 * i + 1] = (rout_sizes[i] ? 1 : 0);
    }

    /* compute the number of peers I will recv from */
    mpi_errno =
        MPIR_Reduce_scatter_block(rs, in_out_peers, 2, MPI_INT, MPI_SUM, comm_ptr, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    MPIR_Assert(in_out_peers[0] <= comm_size && in_out_peers[0] >= 0);
    MPIR_Assert(in_out_peers[1] <= comm_size && in_out_peers[1] >= 0);

    idx = 0;
    /* must be 2*comm_size requests because we will possibly send inbound and
     * outbound edges to everyone in our communicator */
    MPIR_CHKLMEM_MALLOC(reqs, MPIR_Request **, 2 * comm_size * sizeof(MPIR_Request *), mpi_errno,
                        "temp request array", MPL_MEM_COMM);
    for (i = 0; i < comm_size; ++i) {
        if (rin_sizes[i]) {
            /* send edges where i is a destination to process i */
            mpi_errno = MPIC_Isend(&rin[i][0], rin_sizes[i], MPI_INT, i, MPIR_TOPO_A_TAG,
                                   comm_ptr, &reqs[idx++], &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
        if (rout_sizes[i]) {
            /* send edges where i is a source to process i */
            mpi_errno = MPIC_Isend(&rout[i][0], rout_sizes[i], MPI_INT, i, MPIR_TOPO_B_TAG,
                                   comm_ptr, &reqs[idx++], &errflag);
            if (mpi_errno)
                MPIR_ERR_POP(mpi_errno);
        }
    }
    MPIR_Assert(idx <= (2 * comm_size));

    /* Create the topology structure */
    MPIR_CHKPMEM_MALLOC(topo_ptr, MPIR_Topology *, sizeof(MPIR_Topology), mpi_errno, "topo_ptr",
                        MPL_MEM_COMM);
    topo_ptr->kind = MPI_DIST_GRAPH;
    dist_graph_ptr = &topo_ptr->topo.dist_graph;
    dist_graph_ptr->indegree = 0;
    dist_graph_ptr->in = NULL;
    dist_graph_ptr->in_weights = NULL;
    dist_graph_ptr->outdegree = 0;
    dist_graph_ptr->out = NULL;
    dist_graph_ptr->out_weights = NULL;
    dist_graph_ptr->is_weighted = (weights != MPI_UNWEIGHTED);

    /* can't use CHKPMEM macros for this b/c we need to realloc.  Allocation
     * failures jump to fn_fail, which frees whichever of the four arrays
     * have been allocated so far. */
    in_capacity = 10;   /* arbitrary */
    dist_graph_ptr->in = MPL_malloc(in_capacity * sizeof(int), MPL_MEM_COMM);
    MPIR_ERR_CHKANDJUMP(dist_graph_ptr->in == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
    if (dist_graph_ptr->is_weighted) {
        dist_graph_ptr->in_weights = MPL_malloc(in_capacity * sizeof(int), MPL_MEM_COMM);
        MPIR_ERR_CHKANDJUMP(dist_graph_ptr->in_weights == NULL, mpi_errno, MPI_ERR_OTHER,
                            "**nomem");
    }
    out_capacity = 10;  /* arbitrary */
    dist_graph_ptr->out = MPL_malloc(out_capacity * sizeof(int), MPL_MEM_COMM);
    MPIR_ERR_CHKANDJUMP(dist_graph_ptr->out == NULL, mpi_errno, MPI_ERR_OTHER, "**nomem");
    if (dist_graph_ptr->is_weighted) {
        dist_graph_ptr->out_weights = MPL_malloc(out_capacity * sizeof(int), MPL_MEM_COMM);
        MPIR_ERR_CHKANDJUMP(dist_graph_ptr->out_weights == NULL, mpi_errno, MPI_ERR_OTHER,
                            "**nomem");
    }

    for (i = 0; i < in_out_peers[0]; ++i) {
        MPI_Status status;
        MPI_Aint count;

        /* receive inbound edges */
        mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_A_TAG, comm_old, &status);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_Get_count_impl(&status, MPI_INT, &count);
        /* can't use CHKLMEM macros b/c we are in a loop; recv_buf is released
         * at fn_exit if any of the jumps below is taken */
        recv_buf = MPL_malloc(count * sizeof(int), MPL_MEM_COMM);
        MPIR_ERR_CHKANDJUMP(!recv_buf, mpi_errno, MPI_ERR_OTHER, "**nomem");

        mpi_errno = MPIC_Recv(recv_buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_A_TAG,
                              comm_ptr, MPI_STATUS_IGNORE, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* the message is a sequence of (peer, weight) pairs */
        for (j = 0; j < count / 2; ++j) {
            int deg = dist_graph_ptr->indegree++;
            if (deg >= in_capacity) {
                in_capacity *= 2;
                MPIR_REALLOC_ORJUMP(dist_graph_ptr->in, in_capacity * sizeof(int), MPL_MEM_COMM,
                                    mpi_errno);
                if (dist_graph_ptr->is_weighted)
                    MPIR_REALLOC_ORJUMP(dist_graph_ptr->in_weights, in_capacity * sizeof(int),
                                        MPL_MEM_COMM, mpi_errno);
            }
            dist_graph_ptr->in[deg] = recv_buf[2 * j];
            if (dist_graph_ptr->is_weighted)
                dist_graph_ptr->in_weights[deg] = recv_buf[2 * j + 1];
        }
        MPL_free(recv_buf);
        recv_buf = NULL;
    }

    for (i = 0; i < in_out_peers[1]; ++i) {
        MPI_Status status;
        MPI_Aint count;

        /* receive outbound edges */
        mpi_errno = MPIC_Probe(MPI_ANY_SOURCE, MPIR_TOPO_B_TAG, comm_old, &status);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        MPIR_Get_count_impl(&status, MPI_INT, &count);
        /* can't use CHKLMEM macros b/c we are in a loop; recv_buf is released
         * at fn_exit if any of the jumps below is taken */
        recv_buf = MPL_malloc(count * sizeof(int), MPL_MEM_COMM);
        MPIR_ERR_CHKANDJUMP(!recv_buf, mpi_errno, MPI_ERR_OTHER, "**nomem");

        mpi_errno = MPIC_Recv(recv_buf, count, MPI_INT, MPI_ANY_SOURCE, MPIR_TOPO_B_TAG,
                              comm_ptr, MPI_STATUS_IGNORE, &errflag);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        /* the message is a sequence of (peer, weight) pairs */
        for (j = 0; j < count / 2; ++j) {
            int deg = dist_graph_ptr->outdegree++;
            if (deg >= out_capacity) {
                out_capacity *= 2;
                MPIR_REALLOC_ORJUMP(dist_graph_ptr->out, out_capacity * sizeof(int), MPL_MEM_COMM,
                                    mpi_errno);
                if (dist_graph_ptr->is_weighted)
                    MPIR_REALLOC_ORJUMP(dist_graph_ptr->out_weights, out_capacity * sizeof(int),
                                        MPL_MEM_COMM, mpi_errno);
            }
            dist_graph_ptr->out[deg] = recv_buf[2 * j];
            if (dist_graph_ptr->is_weighted)
                dist_graph_ptr->out_weights[deg] = recv_buf[2 * j + 1];
        }
        MPL_free(recv_buf);
        recv_buf = NULL;
    }

    mpi_errno = MPIC_Waitall(idx, reqs, MPI_STATUSES_IGNORE, &errflag);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    /* remove any excess memory allocation */
    MPIR_REALLOC_ORJUMP(dist_graph_ptr->in, dist_graph_ptr->indegree * sizeof(int), MPL_MEM_COMM,
                        mpi_errno);
    MPIR_REALLOC_ORJUMP(dist_graph_ptr->out, dist_graph_ptr->outdegree * sizeof(int),
                        MPL_MEM_COMM, mpi_errno);
    if (dist_graph_ptr->is_weighted) {
        MPIR_REALLOC_ORJUMP(dist_graph_ptr->in_weights, dist_graph_ptr->indegree * sizeof(int),
                            MPL_MEM_COMM, mpi_errno);
        MPIR_REALLOC_ORJUMP(dist_graph_ptr->out_weights, dist_graph_ptr->outdegree * sizeof(int),
                            MPL_MEM_COMM, mpi_errno);
    }

    mpi_errno = MPIR_Topology_put(comm_dist_graph_ptr, topo_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_CHKPMEM_COMMIT();

    MPIR_OBJ_PUBLISH_HANDLE(*comm_dist_graph, comm_dist_graph_ptr->handle);

    /* ... end of body of routine ... */

  fn_exit:
    /* NULL on the success path; non-NULL only when an error interrupted a
     * receive loop */
    MPL_free(recv_buf);
    /* rin/rout are still NULL if we failed before (or while) allocating them,
     * so each table is guarded separately before its elements are freed */
    if (rin != NULL) {
        for (i = 0; i < comm_size; ++i)
            MPL_free(rin[i]);
    }
    if (rout != NULL) {
        for (i = 0; i < comm_size; ++i)
            MPL_free(rout[i]);
    }

    MPIR_CHKLMEM_FREEALL();

    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPI_DIST_GRAPH_CREATE);
    MPID_THREAD_CS_EXIT(GLOBAL, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    /* NOTE(review): comm_dist_graph_ptr and any outstanding send requests are
     * not released on this path - confirm whether they should be. */
    if (dist_graph_ptr) {
        MPL_free(dist_graph_ptr->in);
        MPL_free(dist_graph_ptr->in_weights);
        MPL_free(dist_graph_ptr->out);
        MPL_free(dist_graph_ptr->out_weights);
    }
    MPIR_CHKPMEM_REAP();
#ifdef HAVE_ERROR_CHECKING
    mpi_errno =
        MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE, __func__, __LINE__, MPI_ERR_OTHER,
                             "**mpi_dist_graph_create",
                             "**mpi_dist_graph_create %C %d %p %p %p %p %I %d %p", comm_old, n,
                             sources, degrees, destinations, weights, info, reorder,
                             comm_dist_graph);
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, __func__, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Exchange basic window information among all processes of the window's
 * communicator and store it in (*win_ptr)->basic_info_table.
 *
 * Every process contributes four MPI_Aint values (base address, size,
 * displacement unit, window handle) through an in-place allgather; the
 * gathered values are then unpacked into one table entry per process.
 *
 * The table is allocated here and released again (MPIR_CHKPMEM_REAP) if the
 * function fails.  The 'info' and 'comm_ptr' arguments are not read by this
 * function; the communicator is taken from (*win_ptr)->comm_ptr.
 *
 * Returns MPI_SUCCESS or an MPI error code. */
int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPIR_Info * info,
                               MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int peer;
    int nprocs;
    int my_rank;
    MPI_Aint *gathered;
    MPI_Aint *mine;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIR_CHKPMEM_DECL(1);
    MPIR_CHKLMEM_DECL(1);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    MPIR_FUNC_VERBOSE_RMA_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    nprocs = (*win_ptr)->comm_ptr->local_size;
    my_rank = (*win_ptr)->comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* one table entry per process in the window's communicator */
    MPIR_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                        nprocs * sizeof(MPIDI_Win_basic_info_t),
                        mpi_errno, "(*win_ptr)->basic_info_table");

    /* temporary exchange buffer: four MPI_Aint slots per process */
    MPIR_CHKLMEM_MALLOC(gathered, MPI_Aint *, 4 * nprocs * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
     * origin, we'd also need the window size. */
    /* fill in this process' own record before the in-place allgather */
    mine = &gathered[4 * my_rank];
    mine[0] = MPIR_Ptr_to_aint(base);
    mine[1] = size;
    mine[2] = (MPI_Aint) disp_unit;
    mine[3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    gathered, 4, MPI_AINT, (*win_ptr)->comm_ptr, &errflag);
    /* the timer is stopped before the error checks so it ends on every path */
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* unpack the four values of every process into its table entry */
    for (peer = 0; peer < nprocs; peer++) {
        const MPI_Aint *record = &gathered[4 * peer];
        MPIDI_Win_basic_info_t *entry = &(*win_ptr)->basic_info_table[peer];

        entry->base_addr = MPIR_Aint_to_ptr(record[0]);
        entry->size = record[1];
        entry->disp_unit = (int) record[2];
        entry->win_handle = (MPI_Win) record[3];
    }

  fn_exit:
    MPIR_CHKLMEM_FREEALL();
    MPIR_FUNC_VERBOSE_RMA_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Compute, for every member of group_ptr, the rank that member has in a
 * communicator, and hand the result back to the caller.
 *
 * On success:
 *   *mapping_out  - newly allocated array of group_ptr->size ints; entry i is
 *                   the rank (in *mapping_comm) of the i-th group member.
 *   *mapping_comm - the communicator the ranks refer to: comm world when the
 *                   group was found to be a subset of it (intracommunicator
 *                   case only), otherwise comm_ptr.
 *
 * On failure *mapping_out stays NULL, the array is released, and an MPI error
 * code is returned (MPI_ERR_GROUP when a group member is not part of
 * comm_ptr). */
int MPII_Comm_create_calculate_mapping(MPIR_Group *group_ptr,
                                       MPIR_Comm *comm_ptr,
                                       int **mapping_out,
                                       MPIR_Comm **mapping_comm)
{
    int mpi_errno = MPI_SUCCESS;
    int in_world = 0;
    int member, crank;
    int group_size;
    int *ranks = 0;
    MPIR_CHKPMEM_DECL(1);
    MPIR_FUNC_TERSE_STATE_DECL(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);

    MPIR_FUNC_TERSE_ENTER(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);

    *mapping_out = NULL;
    *mapping_comm = comm_ptr;

    group_size = group_ptr->size;
    MPIR_CHKPMEM_MALLOC(ranks, int *, group_size * sizeof(int), mpi_errno, "mapping",
                        MPL_MEM_ADDRESS);

    /* The processes of the group must all be contained in the input
     * communicator.  The mapping is found by matching the lpids of the group
     * members against the lpids of the communicator members; a group lpid
     * that does not occur in the communicator is an error.
     *
     * Groups (and communicators) that are subsets of MPI_COMM_WORLD are an
     * important special case: there the lpids are exactly the ranks in comm
     * world. */

    /* the group's lpids are examined in both the intracomm and the
     * non-comm_world cases */
    MPII_Group_setup_lpid_list(group_ptr);

    /* Optimize for groups contained within MPI_COMM_WORLD. */
    if (comm_ptr->comm_kind == MPIR_COMM_KIND__INTRACOMM) {
        const int world_size = MPIR_Process.comm_world->local_size;

        in_world = 1;
        /* stop at the first lpid that lies outside comm world */
        for (member = 0; member < group_size && in_world; member++) {
            int g_lpid = group_ptr->lrank_to_lpid[member].lpid;

            /* This mapping is relative to comm world */
            MPL_DBG_MSG_FMT(MPIR_DBG_COMM, VERBOSE,
                            (MPL_DBG_FDEST,
                             "comm-create - mapping into world[%d] = %d", member, g_lpid));
            if (g_lpid >= world_size) {
                in_world = 0;
            } else {
                ranks[member] = g_lpid;
            }
        }
    }
    MPL_DBG_MSG_D(MPIR_DBG_COMM, VERBOSE, "subsetOfWorld=%d", in_world);

    if (!in_world) {
        /* General case: search comm_ptr for each group member.  Any entries
         * written by the comm-world attempt above are overwritten here. */
        for (member = 0; member < group_size; member++) {
            /* ranks[member] is the rank in the communicator of the process
             * that is the member-th element of the group */
            /* FIXME : BUBBLE SORT */
            ranks[member] = -1;
            for (crank = 0; crank < comm_ptr->local_size; crank++) {
                int comm_lpid;

                MPID_Comm_get_lpid(comm_ptr, crank, &comm_lpid, FALSE);
                if (comm_lpid == group_ptr->lrank_to_lpid[member].lpid) {
                    ranks[member] = crank;
                    break;
                }
            }
            MPIR_ERR_CHKANDJUMP1(ranks[member] == -1, mpi_errno, MPI_ERR_GROUP,
                                 "**groupnotincomm", "**groupnotincomm %d", member);
        }
    } else {
#ifdef HAVE_ERROR_CHECKING
        {
            MPID_BEGIN_ERROR_CHECKS;
            {
                mpi_errno = MPIR_Group_check_subset(group_ptr, comm_ptr);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
            }
            MPID_END_ERROR_CHECKS;
        }
#endif
        /* Override the comm to be used with the mapping array. */
        *mapping_comm = MPIR_Process.comm_world;
    }

    MPIR_Assert(ranks != NULL);
    *mapping_out = ranks;
    MPL_VG_CHECK_MEM_IS_DEFINED(*mapping_out, group_size * sizeof(**mapping_out));

    /* ownership of the array passes to the caller */
    MPIR_CHKPMEM_COMMIT();
  fn_exit:
    MPIR_FUNC_TERSE_EXIT(MPID_STATE_MPIR_COMM_CREATE_CALCULATE_MAPPING);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/* Allocate a window object and bring it into its initial state.
 *
 * Steps, in order:
 *   - one-time initialisation of the RMA performance variables;
 *   - allocation of the window object and duplication of comm_ptr;
 *   - reset of all bookkeeping fields and default info arguments, followed by
 *     MPID_Win_set_info() with the user-supplied info;
 *   - allocation of the per-window op, target, slot and lock-entry pools;
 *   - registration of the RMA progress hook when this is the first window;
 *   - insertion into the inactive-window list and invocation of the optional
 *     channel win_init hook.
 *
 * The new window is returned through *win_ptr.  On failure the pools
 * allocated here are released and an MPI error code is returned.
 * NOTE(review): the window object and the duplicated communicator are not
 * released on the failure path - confirm whether callers handle this. */
static int win_init(MPI_Aint size, int disp_unit, int create_flavor, int model, MPIR_Info * info,
                    MPIR_Comm * comm_ptr, MPIR_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int k;
    MPIR_Comm *dup_comm;
    MPIR_Win *win;
    int target_pool_len;
    MPIR_CHKPMEM_DECL(5);
    MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_WIN_INIT);

    MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_WIN_INIT);

    /* the RMA pvars are initialised only once, under the critical section */
    MPID_THREAD_CS_ENTER(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);
    if (initRMAoptions) {
        MPIDI_CH3_RMA_Init_sync_pvars();
        MPIDI_CH3_RMA_Init_pkthandler_pvars();

        initRMAoptions = 0;
    }
    MPID_THREAD_CS_EXIT(POBJ, MPIR_THREAD_GLOBAL_ALLFUNC_MUTEX);

    *win_ptr = (MPIR_Win *) MPIR_Handle_obj_alloc(&MPIR_Win_mem);
    MPIR_ERR_CHKANDJUMP1(!(*win_ptr), mpi_errno, MPI_ERR_OTHER, "**nomem",
                         "**nomem %s", "MPIR_Win_mem");
    /* shorthand for the freshly allocated object; *win_ptr is not reassigned
     * anywhere below in this function */
    win = *win_ptr;

    mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &dup_comm);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    MPIR_Object_set_ref(win, 1);

    /* win->errhandler is set by upper level; */
    /* win->base is set by caller; */
    win->size = size;
    win->disp_unit = disp_unit;
    win->create_flavor = create_flavor;
    win->model = model;
    win->attributes = NULL;
    win->comm_ptr = dup_comm;

    win->at_completion_counter = 0;
    win->shm_base_addrs = NULL;
    /* win->basic_info_table[] is set by caller; */
    win->current_lock_type = MPID_LOCK_NONE;
    win->shared_lock_ref_cnt = 0;
    win->target_lock_queue_head = NULL;
    win->shm_allocated = FALSE;
    win->states.access_state = MPIDI_RMA_NONE;
    win->states.exposure_state = MPIDI_RMA_NONE;
    win->num_targets_with_pending_net_ops = 0;
    win->start_ranks_in_win_grp = NULL;
    win->start_grp_size = 0;
    win->lock_all_assert = 0;
    win->lock_epoch_count = 0;
    win->outstanding_locks = 0;
    win->current_target_lock_data_bytes = 0;
    win->sync_request_cnt = 0;
    win->active = FALSE;
    win->next = NULL;
    win->prev = NULL;
    win->outstanding_acks = 0;

    /* Initialize the info flags */
    win->info_args.no_locks = 0;
    win->info_args.accumulate_ordering = MPIDI_ACC_ORDER_RAR | MPIDI_ACC_ORDER_RAW |
        MPIDI_ACC_ORDER_WAR | MPIDI_ACC_ORDER_WAW;
    win->info_args.accumulate_ops = MPIDI_ACC_OPS_SAME_OP_NO_OP;
    win->info_args.same_size = 0;
    win->info_args.same_disp_unit = FALSE;
    win->info_args.alloc_shared_noncontig = 0;
    /* alloc_shm defaults to TRUE only for the ALLOCATE and SHARED flavors */
    if (win->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
        win->create_flavor == MPI_WIN_FLAVOR_SHARED) {
        win->info_args.alloc_shm = TRUE;
    } else {
        win->info_args.alloc_shm = FALSE;
    }

    /* Set info_args on window based on info provided by user */
    mpi_errno = MPID_Win_set_info(win, info);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* per-window pool of RMA operation elements, chained into a list */
    MPIR_CHKPMEM_MALLOC(win->op_pool_start, MPIDI_RMA_Op_t *,
                        sizeof(MPIDI_RMA_Op_t) * MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE, mpi_errno,
                        "RMA op pool", MPL_MEM_RMA);
    win->op_pool_head = NULL;
    for (k = 0; k < MPIR_CVAR_CH3_RMA_OP_WIN_POOL_SIZE; k++) {
        MPIDI_RMA_Op_t *op = &(win->op_pool_start[k]);

        op->pool_type = MPIDI_RMA_POOL_WIN;
        DL_APPEND(win->op_pool_head, op);
    }

    /* per-window pool of target elements, capped at the communicator size */
    target_pool_len =
        MPL_MIN(MPIR_CVAR_CH3_RMA_TARGET_WIN_POOL_SIZE, MPIR_Comm_size(dup_comm));
    MPIR_CHKPMEM_MALLOC(win->target_pool_start, MPIDI_RMA_Target_t *,
                        sizeof(MPIDI_RMA_Target_t) * target_pool_len, mpi_errno,
                        "RMA target pool", MPL_MEM_RMA);
    win->target_pool_head = NULL;
    for (k = 0; k < target_pool_len; k++) {
        MPIDI_RMA_Target_t *target = &(win->target_pool_start[k]);

        target->pool_type = MPIDI_RMA_POOL_WIN;
        DL_APPEND(win->target_pool_head, target);
    }

    /* slots, likewise capped at the communicator size; each starts empty */
    win->num_slots = MPL_MIN(MPIR_CVAR_CH3_RMA_SLOTS_SIZE, MPIR_Comm_size(dup_comm));
    MPIR_CHKPMEM_MALLOC(win->slots, MPIDI_RMA_Slot_t *,
                        sizeof(MPIDI_RMA_Slot_t) * win->num_slots, mpi_errno, "RMA slots",
                        MPL_MEM_RMA);
    for (k = 0; k < win->num_slots; k++) {
        win->slots[k].target_list_head = NULL;
    }

    /* per-window pool of target lock entries */
    MPIR_CHKPMEM_MALLOC(win->target_lock_entry_pool_start,
                        MPIDI_RMA_Target_lock_entry_t *,
                        sizeof(MPIDI_RMA_Target_lock_entry_t) *
                        MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE, mpi_errno,
                        "RMA lock entry pool", MPL_MEM_RMA);
    win->target_lock_entry_pool_head = NULL;
    for (k = 0; k < MPIR_CVAR_CH3_RMA_TARGET_LOCK_ENTRY_WIN_POOL_SIZE; k++) {
        MPIDI_RMA_Target_lock_entry_t *lock_entry = &(win->target_lock_entry_pool_start[k]);

        DL_APPEND(win->target_lock_entry_pool_head, lock_entry);
    }

    if (MPIDI_RMA_Win_inactive_list_head == NULL && MPIDI_RMA_Win_active_list_head == NULL) {
        /* this is the first window, register RMA progress hook */
        mpi_errno = MPID_Progress_register_hook(MPIDI_CH3I_RMA_Make_progress_global,
                                                &MPIDI_CH3I_RMA_Progress_hook_id);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    DL_APPEND(MPIDI_RMA_Win_inactive_list_head, win);

    /* give the channel a chance to finish the initialisation */
    if (MPIDI_CH3U_Win_hooks.win_init != NULL) {
        mpi_errno =
            MPIDI_CH3U_Win_hooks.win_init(size, disp_unit, create_flavor, model, info, comm_ptr,
                                          win_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_WIN_INIT);
    return mpi_errno;
  fn_fail:
    MPIR_CHKPMEM_REAP();
    goto fn_exit;
}
/* This is the main function which generates a tree in shared memory. The tree is parameterized
 * over the different data-structures:
 * k_val                  : the tree K-value
 * shared_region          : the shared memory region where the tree will be generated
 * max_entries_per_level  : the maximum number of ranks per level
 * ranks_per_package      : the different ranks at each level
 * max_ranks_per_package  : the maximum ranks in any package
 * package_ctr            : number of ranks in each package
 * package_level          : the topology level where we cutoff the tree
 * num_ranks              : the number of ranks
 * package_leaders_first  : forwarded unchanged to MPIDI_SHM_gen_tree_sharedmemory
 * right_skewed           : forwarded unchanged to MPIDI_SHM_create_template_tree
 * errflag                : set to MPIR_ERR_PROC_FAILED or MPIR_ERR_OTHER when one of the
 *                          tree constructors fails
 *
 * Returns MPI_SUCCESS, or the error code of the first step that failed.  A failing step
 * aborts the generation: the remaining steps read the trees built by the earlier ones, so
 * continuing after a failed allocation would dereference a tree that was never set up.
 * All temporary trees and the package-leader array are released on every path.
 * */
int MPIDI_SHM_gen_tree(int k_val, int *shared_region, int *max_entries_per_level,
                       int **ranks_per_package, int max_ranks_per_package, int *package_ctr,
                       int package_level, int num_ranks, bool package_leaders_first,
                       bool right_skewed, MPIR_Errflag_t * errflag)
{
    int mpi_errno = MPI_SUCCESS;
    int i, j, p, r, rank, idx;
    int num_packages = max_entries_per_level[package_level];
    int package_count = 0;
    MPIDI_SHM_topotree_t package_tree, tree, template_tree;
    /* Track which trees were successfully constructed so that fn_exit frees
     * exactly those, whatever state a failed constructor leaves behind. */
    bool have_tree = false;
    bool have_package_tree = false;
    bool have_template_tree = false;
    const int package_tree_sz = num_packages > num_ranks ? num_packages : num_ranks;
    int *package_leaders = NULL;
    MPIR_CHKPMEM_DECL(1);

    mpi_errno = MPIDI_SHM_topotree_allocate(&tree, num_ranks, k_val);
    if (mpi_errno) {
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        goto fn_fail;
    }
    have_tree = true;

    mpi_errno = MPIDI_SHM_topotree_allocate(&package_tree, package_tree_sz, k_val);
    if (mpi_errno) {
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        goto fn_fail;
    }
    have_package_tree = true;

    MPIR_CHKPMEM_CALLOC(package_leaders, int *, num_packages * sizeof(int), mpi_errno,
                        "intra_node_package_leaders", MPL_MEM_OTHER);

    /* We pick package leaders as the first rank in each package.  Leaders are
     * compacted to the front of the array (package_count <= p always holds),
     * and the unused tail is left at -1. */
    for (p = 0; p < max_entries_per_level[package_level]; ++p) {
        package_leaders[p] = -1;
        if (package_ctr[p] > 0) {
            package_leaders[package_count++] = ranks_per_package[p][0];
        }
    }
    /* from here on num_packages counts only the non-empty packages */
    num_packages = package_count;

    /* STEP 4. Now use the template tree to generate the top level tree */
    MPIDI_SHM_gen_package_tree(num_packages, k_val, &package_tree, package_leaders);

    /* STEP 5. Create a template tree for the ranks */
    mpi_errno = MPIDI_SHM_create_template_tree(&template_tree, k_val, right_skewed,
                                               max_ranks_per_package, errflag);
    if (mpi_errno) {
        *errflag =
            MPIX_ERR_PROC_FAILED ==
            MPIR_ERR_GET_CLASS(mpi_errno) ? MPIR_ERR_PROC_FAILED : MPIR_ERR_OTHER;
        MPIR_ERR_SET(mpi_errno, *errflag, "**fail");
        /* NOTE(review): assumes a failed create_template_tree leaves nothing
         * behind that this function must free - confirm against its
         * implementation. */
        goto fn_fail;
    }
    have_template_tree = true;

    if (MPIDI_SHM_TOPOTREE_DEBUG) {
        /* NOTE(review): this walks 'tree' (sized for num_ranks) with an index
         * bounded by the package count - confirm the two cannot diverge. */
        for (i = 0; i < max_entries_per_level[package_level]; ++i) {
            fprintf(stderr, "pre-Rank %d, parent %d, children=%d [", i,
                    MPIDI_SHM_TOPOTREE_PARENT(&tree, i), MPIDI_SHM_TOPOTREE_NUM_CHILD(&tree, i));
            for (j = 0; j < MPIDI_SHM_TOPOTREE_NUM_CHILD(&tree, i); ++j) {
                fprintf(stderr, "%d, ", MPIDI_SHM_TOPOTREE_CHILD(&tree, i, j));
            }
            fprintf(stderr, "]\n");
        }
    }

    /* use the template tree to generate the tree for each rank */
    for (p = 0; p < max_entries_per_level[package_level]; ++p) {
        for (r = 0; r < package_ctr[p]; ++r) {
            /* position of r's parent inside the package, or -1 for the root */
            const int template_parent = MPIDI_SHM_TOPOTREE_PARENT(&template_tree, r);

            rank = ranks_per_package[p][r];
            if (MPIDI_SHM_TOPOTREE_DEBUG)
                /* the root has no parent rank to look up; print -1 instead of
                 * reading ranks_per_package[p][-1] */
                fprintf(stderr, "Rank=%d, p=%d, r=%d, opt1=%d, opt2=%d\n", rank, p, r,
                        template_parent,
                        template_parent == -1 ? -1 : ranks_per_package[p][template_parent]);
            if (template_parent == -1) {
                MPIDI_SHM_TOPOTREE_PARENT(&tree, rank) = -1;
            } else {
                MPIDI_SHM_TOPOTREE_PARENT(&tree, rank) = ranks_per_package[p][template_parent];
            }
            for (j = 0; j < MPIDI_SHM_TOPOTREE_NUM_CHILD(&template_tree, r); ++j) {
                idx = MPIDI_SHM_TOPOTREE_NUM_CHILD(&tree, rank);
                /* skip template children that do not exist in this package */
                if (MPIDI_SHM_TOPOTREE_CHILD(&template_tree, r, j) < package_ctr[p]) {
                    MPIDI_SHM_TOPOTREE_NUM_CHILD(&tree, rank)++;
                    MPIDI_SHM_TOPOTREE_CHILD(&tree, rank, idx) =
                        ranks_per_package[p][MPIDI_SHM_TOPOTREE_CHILD(&template_tree, r, j)];
                }
            }
        }
    }

    if (MPIDI_SHM_TOPOTREE_DEBUG) {
        char str[1024];
        for (i = 0; i < num_ranks; ++i) {
            size_t used;
            int wrote = snprintf(str, sizeof(str), "*BaseTreeRank %d, parent %d, children=%d [",
                                 i, MPIDI_SHM_TOPOTREE_PARENT(&tree, i),
                                 MPIDI_SHM_TOPOTREE_NUM_CHILD(&tree, i));
            if (wrote < 0) {
                str[0] = '\0';
                used = 0;
            } else {
                used = (size_t) wrote;
            }
            /* append children while there is room; the line is truncated
             * rather than overrunning the buffer */
            for (j = 0; j < MPIDI_SHM_TOPOTREE_NUM_CHILD(&tree, i) && used < sizeof(str) - 1;
                 ++j) {
                wrote = snprintf(str + used, sizeof(str) - used, "%d, ",
                                 MPIDI_SHM_TOPOTREE_CHILD(&tree, i, j));
                if (wrote < 0)
                    break;
                used += (size_t) wrote;
            }
            fprintf(stderr, "%s]\n", str);
        }
    }

    /* Assemble the per package tree package leaders tree and copy it to shared memory region */
    MPIDI_SHM_gen_tree_sharedmemory(shared_region, &tree, &package_tree, package_leaders,
                                    num_packages, num_ranks, k_val, package_leaders_first);

  fn_exit:
    /* the three trees and the leader array are scratch data: release them on
     * both the success and the failure path */
    if (have_tree)
        MPL_free(tree.base);
    if (have_package_tree)
        MPL_free(package_tree.base);
    if (have_template_tree)
        MPL_free(template_tree.base);
    MPIR_CHKPMEM_REAP();
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
static int handler_recv_dequeue_large(const ptl_event_t *e) { int mpi_errno = MPI_SUCCESS; MPIR_Request *const rreq = e->user_ptr; MPIDI_VC_t *vc; MPID_nem_ptl_vc_area *vc_ptl; int ret; int dt_contig; intptr_t data_sz; MPIDU_Datatype*dt_ptr; MPI_Aint dt_true_lb; MPI_Aint last; MPIR_CHKPMEM_DECL(1); MPIR_FUNC_VERBOSE_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE); MPIR_FUNC_VERBOSE_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE); MPIR_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW); MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc); vc_ptl = VC_PTL(vc); dequeue_req(e); MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig, data_sz, dt_ptr, dt_true_lb); /* unpack data from unexpected buffer first */ if (e->type == PTL_EVENT_PUT_OVERFLOW) { if (dt_contig) { MPIR_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength); } else { last = e->mlength; MPIDU_Segment_unpack(rreq->dev.segment_ptr, 0, &last, e->start); MPIR_Assert(last == e->mlength); rreq->dev.segment_first = e->mlength; } } if (!(e->hdr_data & NPTL_LARGE)) { /* all data has already been received; we're done */ mpi_errno = handler_recv_complete(e); if (mpi_errno) MPIR_ERR_POP(mpi_errno); goto fn_exit; } MPIR_Assert (e->mlength == PTL_LARGE_THRESHOLD); /* we need to GET the rest of the data from the sender's buffer */ if (dt_contig) { big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD, data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq); goto fn_exit; } /* noncontig recv buffer */ last = rreq->dev.segment_size; rreq->dev.iov_count = MPL_IOV_LIMIT; MPIDU_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, rreq->dev.iov, &rreq->dev.iov_count); if (last == rreq->dev.segment_size && rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) { /* Rest of message fits in one IOV */ ptl_md_t md; md.start = rreq->dev.iov; md.length = rreq->dev.iov_count; md.options = 
PTL_IOVEC; md.eq_handle = MPIDI_nem_ptl_origin_eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind", "**ptlmdbind %s", MPID_nem_ptl_strerror(ret)); REQ_PTL(rreq)->event_handler = handler_recv_complete; ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0, rreq->dev.segment_size - rreq->dev.segment_first, vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, rreq); MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget", "**ptlget %s", MPID_nem_ptl_strerror(ret)); goto fn_exit; } /* message won't fit in a single IOV, allocate buffer and unpack when received */ /* FIXME: For now, allocate a single large buffer to hold entire message */ MPIR_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *, data_sz - PTL_LARGE_THRESHOLD, mpi_errno, "chunk_buffer"); big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq); fn_exit: MPIR_CHKPMEM_COMMIT(); fn_exit2: MPIR_FUNC_VERBOSE_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE); return mpi_errno; fn_fail: MPIR_CHKPMEM_REAP(); goto fn_exit2; }
/*
 * MPIR_Graph_create - create a communicator and attach a graph topology
 * to it.
 *
 * comm_ptr   - input communicator.
 * nnodes     - number of nodes in the graph.
 * indx       - array of nnodes entries, copied into the topology; its last
 *              entry, indx[nnodes-1], is used as the number of edges.
 * edges      - array of edge entries, copied into the topology.
 * reorder    - nonzero: obtain a rank from MPIR_Graph_map_impl() and build
 *              the communicator with MPIR_Comm_split_impl();
 *              zero: build it with MPII_Comm_copy() using nnodes.
 * comm_graph - out: handle of the new communicator.  Left as MPI_COMM_NULL
 *              on error, or when no communicator is produced for this
 *              process.
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIR_Graph_create(MPIR_Comm *comm_ptr, int nnodes,
                      const int indx[], const int edges[], int reorder,
                      MPI_Comm *comm_graph)
{
    int mpi_errno = MPI_SUCCESS;
    int i, edge_count;
    MPIR_Comm *graph_comm = NULL;
    MPIR_Topology *graph_ptr = NULL;
    MPIR_CHKPMEM_DECL(3);

    /* Start from a null handle so every error path returns one. */
    *comm_graph = MPI_COMM_NULL;

    /* Step 1: create the new communicator. */
    if (!reorder) {
        /* No reordering requested: take the first nnodes processes. */
        mpi_errno = MPII_Comm_copy(comm_ptr, nnodes, &graph_comm);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    } else {
        int nrank;
        int color = 1;

        /* Let the graph map routine choose this process's new rank. */
        mpi_errno = MPIR_Graph_map_impl(comm_ptr, nnodes, indx, edges, &nrank);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }

        /* Build the communicator with a split, because the ranks (and the
         * related internals, such as the connection tables) have to be
         * reordered. */
        if (nrank == MPI_UNDEFINED) {
            color = MPI_UNDEFINED;
        }
        mpi_errno = MPIR_Comm_split_impl(comm_ptr, color, nrank, &graph_comm);
        if (mpi_errno) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

    /* No communicator for this process: return a null communicator. */
    if (graph_comm == NULL) {
        *comm_graph = MPI_COMM_NULL;
        goto fn_exit;
    }

    /* Step 2: build the topology record with private copies of the arrays. */
    edge_count = indx[nnodes - 1];

    MPIR_CHKPMEM_MALLOC(graph_ptr, MPIR_Topology *, sizeof(MPIR_Topology),
                        mpi_errno, "graph_ptr");

    graph_ptr->kind = MPI_GRAPH;
    graph_ptr->topo.graph.nnodes = nnodes;
    graph_ptr->topo.graph.nedges = edge_count;

    MPIR_CHKPMEM_MALLOC(graph_ptr->topo.graph.index, int *,
                        nnodes * sizeof(int), mpi_errno, "graph.index");
    MPIR_CHKPMEM_MALLOC(graph_ptr->topo.graph.edges, int *,
                        edge_count * sizeof(int), mpi_errno, "graph.edges");

    for (i = 0; i < nnodes; ++i) {
        graph_ptr->topo.graph.index[i] = indx[i];
    }
    for (i = 0; i < edge_count; ++i) {
        graph_ptr->topo.graph.edges[i] = edges[i];
    }

    /* Step 3: attach the topology and publish the communicator handle. */
    mpi_errno = MPIR_Topology_put(graph_comm, graph_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        goto fn_fail;
    }

    MPIR_OBJ_PUBLISH_HANDLE(*comm_graph, graph_comm->handle);

    /* ... end of body of routine ... */

  fn_exit:
    return mpi_errno;

  fn_fail:
    /* --BEGIN ERROR HANDLING-- */
    /* Free whatever was obtained through MPIR_CHKPMEM_MALLOC above. */
    MPIR_CHKPMEM_REAP();
#ifdef HAVE_ERROR_CHECKING
    {
        mpi_errno = MPIR_Err_create_code(mpi_errno, MPIR_ERR_RECOVERABLE,
                                         FCNAME, __LINE__, MPI_ERR_OTHER,
                                         "**mpi_graph_create",
                                         "**mpi_graph_create %C %d %p %p %d %p",
                                         comm_ptr->handle, nnodes, indx,
                                         edges, reorder, comm_graph);
    }
#endif
    mpi_errno = MPIR_Err_return_comm(comm_ptr, FCNAME, mpi_errno);
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}