/* returns TRUE iff the request was sent on the vc */ static inline int req_uses_vc(const MPID_Request* req, const MPIDI_VC_t *vc) { MPIDI_VC_t *vc1; MPIDI_Comm_get_vc(req->comm, req->dev.match.parts.rank, &vc1); return vc == vc1; }
/* Returns TRUE iff the given vc is used by some rank of the
 * communicator (linear scan over all remote ranks, pointer-identity
 * comparison of the looked-up vc). */
static inline int is_vc_in_comm(const MPIDI_VC_t *vc, const MPID_Comm *comm)
{
    int rank;
    int size = comm->remote_size;

    for (rank = 0; rank < size; ++rank) {
        MPIDI_VC_t *comm_vc;

        MPIDI_Comm_get_vc(comm, rank, &comm_vc);
        if (comm_vc == vc)
            return TRUE;
    }

    return FALSE;
}
/* Event handler invoked when a large message that must be unpacked
 * (non-contiguous receive buffer) has matched: unpacks the first
 * PTL_LARGE_THRESHOLD bytes delivered by the PUT, then issues a GET
 * for the remainder of the data into a freshly allocated chunk buffer.
 *
 * e         - Portals event; must be PTL_EVENT_PUT or PTL_EVENT_PUT_OVERFLOW
 *             (asserted below); e->user_ptr carries the receive request.
 * returns   - MPI_SUCCESS or an MPI error code.
 *
 * NOTE(review): on the failure path the CHKPMEM macros jump to fn_fail,
 * which REAPs the pending allocation; the success path COMMITs it. */
static int handler_recv_dequeue_unpack_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;
    MPIDI_VC_t *vc;
    MPI_Aint last;
    void *buf;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);

    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    /* vc of the sender, recovered from the rank encoded in the match bits */
    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);

    dequeue_req(e);

    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_unpack_complete(e);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    /* For an OVERFLOW event the payload sits in the unexpected buffer
     * (e->start); otherwise it was landed in our own chunk buffer. */
    if (e->type == PTL_EVENT_PUT_OVERFLOW)
        buf = e->start;
    else
        buf = REQ_PTL(rreq)->chunk_buffer[0];

    /* The PUT part of a large message carries exactly PTL_LARGE_THRESHOLD
     * bytes; unpack it into the user buffer via the segment. */
    MPIU_Assert(e->mlength == PTL_LARGE_THRESHOLD);
    last = PTL_LARGE_THRESHOLD;
    MPID_Segment_unpack(rreq->dev.segment_ptr, rreq->dev.segment_first, &last, buf);
    MPIU_Assert(last == PTL_LARGE_THRESHOLD);
    rreq->dev.segment_first += PTL_LARGE_THRESHOLD;

    /* done with the first chunk buffer; reallocate one big enough for the
     * rest of the message and fetch it from the sender */
    MPIU_Free(REQ_PTL(rreq)->chunk_buffer[0]);
    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *,
                        rreq->dev.segment_size - rreq->dev.segment_first,
                        mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0],
            rreq->dev.segment_size - rreq->dev.segment_first,
            vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_UNPACK_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit2;
}
/* Sets *flag to TRUE iff some process of 'group' is also a member of
 * 'comm' (compared by vc identity).  comm_world / icomm_world trivially
 * intersect every group, so they are answered without searching. */
static int nonempty_intersection(MPID_Comm *comm, MPID_Group *group, int *flag)
{
    int mpi_errno = MPI_SUCCESS;
    int gi, ci;
    MPIDI_VC_t *group_vc, *comm_vc;
    MPIDI_STATE_DECL(MPID_STATE_NONEMPTY_INTERSECTION);

    MPIDI_FUNC_ENTER(MPID_STATE_NONEMPTY_INTERSECTION);

    /* fast path: the world communicators contain every process */
    if (comm == MPIR_Process.comm_world || comm == MPIR_Process.icomm_world) {
        *flag = TRUE;
        MPIU_DBG_MSG(CH3_OTHER, VERBOSE, "comm is comm_world or icomm_world");
        goto fn_exit;
    }

    *flag = FALSE;

    /* FIXME: assumes group->size is tiny (typically 1), so a linear scan
     * of comm per group member beats sorting both sides for a binary
     * search. */
    for (gi = 0; gi < group->size; ++gi) {
        /* FIXME: lpid-based lookup won't work for dynamic processes */
        MPIDI_PG_Get_vc(MPIDI_Process.my_pg, group->lrank_to_lpid[gi].lpid, &group_vc);

        for (ci = 0; ci < comm->remote_size; ++ci) {
            MPIDI_Comm_get_vc(comm, ci, &comm_vc);
            if (comm_vc == group_vc) {
                *flag = TRUE;
                goto fn_exit;
            }
        }
    }

 fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_NONEMPTY_INTERSECTION);
    return mpi_errno;
 fn_fail:
    goto fn_exit;
}
/* Aggressively free up a target slot: pick a non-empty slot, force a
 * FLUSH on its first target, drive it to remote completion, dequeue and
 * free it, then retry allocating a target until one is obtained.
 *
 * win_ptr - window whose target pool is exhausted.
 * target  - out: a freshly allocated target, never NULL on success.
 * returns MPI_SUCCESS or an MPI error code.
 *
 * If the window is still in LOCK_ALL_CALLED state we first switch to the
 * window-wide LOCK_ALL_ISSUED protocol by sending shared-lock messages
 * to every off-node rank that has no target structure yet.
 *
 * FIX: the slot-search loop previously had no check for the case where
 * every slot's target_list_head is NULL; it would then read
 * win_ptr->slots[win_ptr->num_slots] (out of bounds) and dereference a
 * garbage curr_target.  Callers must only invoke this routine while some
 * target exists, so assert that invariant after the search (the sibling
 * implementation guards the same case with MPIU_ERR_CHKANDJUMP on
 * win_ptr->non_empty_slots). */
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPIR_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed ATTRIBUTE((unused)) = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    (*target) = NULL;

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;

        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;
            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            /* only off-node peers need an explicit lock message */
            if (orig_vc->node_id != target_vc->node_id) {
                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
                if (mpi_errno)
                    MPIR_ERR_POP(mpi_errno);
                if (curr_target == NULL) {
                    win_ptr->outstanding_locks++;
                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIR_ERR_POP(mpi_errno);
                }
            }
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        for (i = 0; i < win_ptr->num_slots; i++)
            if (win_ptr->slots[i].target_list_head != NULL)
                break;
        /* FIX: guard against running off the end of the slots array when
         * every slot is empty (would be an out-of-bounds read followed by
         * a bogus curr_target dereference). */
        MPIU_Assert(i < win_ptr->num_slots);
        curr_target = win_ptr->slots[i].target_list_head;
        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
        }

        /* Issue out all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        do {
            MPIDI_CH3I_RMA_ops_completion(win_ptr, curr_target, local_completed,
                                          remote_completed);
            if (!remote_completed) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        } while (!remote_completed);

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_Win_target_dequeue_and_free(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* check if we got a target */
        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while ((*target) == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/* MPI_Compare_and_swap device implementation: atomically compares the
 * target word with compare_addr and, on equality, replaces it with
 * origin_addr; the prior target value is returned in result_addr.
 *
 * Local / shared-memory targets are handled immediately via
 * MPIDI_CH3I_Shm_cas_op; remote targets are queued as an immediate CAS
 * packet on the window's RMA op queue.
 *
 * Returns MPI_SUCCESS or an MPI error code (e.g. MPI_ERR_RMA_SYNC when
 * no RMA epoch is open).  A target_rank of MPI_PROC_NULL is a no-op. */
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rank;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    /* an access epoch must be open on this window */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        /* local or same-node shared-memory target: perform the CAS directly */
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
        MPI_Aint type_size;
        void *src = NULL, *dest = NULL;

        /* Append this operation to the RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = 1;
        op_ptr->origin_datatype = datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_datatype = datatype;
        op_ptr->compare_addr = (void *) compare_addr;
        op_ptr->compare_datatype = datatype;
        op_ptr->target_rank = target_rank;
        op_ptr->piggyback_lock_candidate = 1;   /* CAS is always able to piggyback LOCK */

        /************** Setting packet struct areas in operation ****************/

        cas_pkt = &(op_ptr->pkt.cas);
        MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS_IMMED);
        /* absolute target address = base + disp_unit * displacement */
        cas_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        cas_pkt->datatype = datatype;
        cas_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        cas_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        /* REQUIRE: All datatype arguments must be of the same, builtin
         * type and counts must be 1. */
        MPID_Datatype_get_size_macro(datatype, type_size);
        MPIU_Assert(type_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

        /* copy the origin and compare values directly into the packet */
        src = (void *) origin_addr, dest = (void *) (&(cas_pkt->origin_data));
        mpi_errno = immed_copy(src, dest, type_size);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        src = (void *) compare_addr, dest = (void *) (&(cas_pkt->compare_data));
        mpi_errno = immed_copy(src, dest, type_size);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* throttle: if too many active requests are outstanding, poke the
         * progress engine until we drop below the threshold */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Get_accumulate device implementation: fetches the target data into
 * result_addr and combines origin data into the target with op.
 *
 * Local / same-node shared-memory targets are executed immediately via
 * MPIDI_CH3I_Shm_get_acc_op; remote targets are queued as a GET_ACCUM
 * (possibly IMMED) packet.  Large accumulates are streamed in units of
 * MPIDI_CH3U_Acc_stream_size bytes; one datatype reference is taken per
 * stream unit so each unit's completion can release one.
 *
 * ureq, if non-NULL, is the user-visible request to complete (completed
 * here on the local path, remembered in the op otherwise).
 * Returns MPI_SUCCESS or an MPI error code.  target_rank ==
 * MPI_PROC_NULL and zero-size target data are no-ops. */
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count,
                              MPI_Datatype origin_datatype, void *result_addr,
                              int result_count, MPI_Datatype result_datatype,
                              int target_rank, MPI_Aint target_disp, int target_count,
                              MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr,
                              MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    /* an access epoch must be open on this window */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp,
                            dt_true_lb);

    /* nothing to transfer */
    if (target_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do =! rank first (most likely branch?) */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        /* local or same-node shared-memory target: perform directly */
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype,
                                              result_addr, result_count, result_datatype,
                                              target_rank, target_disp, target_count,
                                              target_datatype, op, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        MPI_Aint target_type_size;
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty (MPI_NO_OP reads the origin
         * buffer not at all) */
        if (op == MPI_NO_OP)
            is_empty_origin = TRUE;

        /* Append the operation to the window's RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPID_Datatype_get_size_macro(target_datatype, target_type_size);

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPID_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
        }
        else {
            MPIU_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPID_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 &&
                    predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        /* one datatype reference per stream unit; each unit's completion
         * releases one reference */
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
            if (result_dtp != NULL) {
                MPID_Datatype_add_ref(result_dtp);
            }
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
        MPID_Datatype_is_contig(result_datatype, &is_result_contig);

        /* Judge if we can use IMMED data packet: all datatypes predefined
         * and contiguous, and the target data fits in the immediate area */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig && is_result_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) &&
            MPIR_DATATYPE_IS_PREDEFINED(result_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_accum_pkt = &(op_ptr->pkt.get_accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }

        /* absolute target address = base + disp_unit * displacement */
        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* throttle: poke the progress engine until the number of active
         * requests drops below the configured threshold */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Put device implementation: writes origin data to the target window.
 *
 * Local / same-node shared-memory targets are executed immediately via
 * MPIDI_CH3I_Shm_put_op (and ureq, if given, is completed here);
 * remote targets are queued as a PUT (possibly IMMED) packet on the
 * window's RMA op queue.
 *
 * Returns MPI_SUCCESS or an MPI error code (MPI_ERR_RMA_SYNC when no
 * RMA epoch is open).  target_rank == MPI_PROC_NULL and zero-size
 * origin data are no-ops. */
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
                   int target_rank, MPI_Aint target_disp, int target_count,
                   MPI_Datatype target_datatype, MPID_Win * win_ptr, MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPID_Datatype *dtp;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDI_msg_sz_t data_sz;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    /* an access epoch must be open on this window */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    /* nothing to transfer */
    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the put is a local operation, do it here */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count,
                                          target_datatype, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
        int use_immed_pkt = FALSE;
        int is_origin_contig, is_target_contig;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        /* FIXME: For contig and very short operations, use a streamlined op */
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }

        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

        /* Judge if we can use IMMED data packet: both datatypes predefined
         * and contiguous, and the payload fits in the immediate area */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) &&
            is_origin_contig && is_target_contig) {
            if (data_sz <= MPIDI_RMA_IMMED_BYTES)
                use_immed_pkt = TRUE;
        }

        /* Judge if this operation is an piggyback candidate */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE)
                op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        put_pkt = &(op_ptr->pkt.put);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

        /* absolute target address = base + disp_unit * displacement */
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
            win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->info.dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (use_immed_pkt) {
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
            mpi_errno = immed_copy(src, dest, data_sz);
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIR_ERR_POP(mpi_errno);

        /* throttle: poke the progress engine until the number of active
         * requests drops below the configured threshold */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 &&
            MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Aggressively free up a target slot (MPIU_* error-macro variant): pick
 * a non-empty slot, force a FLUSH on its first target, drive it to
 * remote completion, clean it up, then retry allocating a target until
 * one is obtained.
 *
 * win_ptr - window whose target pool is exhausted.
 * target  - out: a freshly allocated target.
 * returns MPI_SUCCESS or an MPI error code.
 *
 * Unlike the sibling implementation elsewhere in this file, this one
 * errors out up front ("**rmanotarget") if the window holds no targets,
 * which also guarantees the slot-search loop below finds one. */
int MPIDI_CH3I_RMA_Cleanup_target_aggressive(MPID_Win * win_ptr, MPIDI_RMA_Target_t ** target)
{
    int i, local_completed = 0, remote_completed = 0;
    int made_progress = 0;
    MPIDI_RMA_Target_t *curr_target = NULL;
    int mpi_errno = MPI_SUCCESS;

    (*target) = NULL;

    /* If we are in an aggressive cleanup, the window must be holding
     * up resources.  If it isn't, we are in the wrong window and
     * incorrectly entered this function. */
    MPIU_ERR_CHKANDJUMP(win_ptr->non_empty_slots == 0, mpi_errno, MPI_ERR_OTHER,
                        "**rmanotarget");

    if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_CALLED) {
        /* switch to window-wide protocol */
        MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;

        MPIDI_Comm_get_vc(win_ptr->comm_ptr, win_ptr->comm_ptr->rank, &orig_vc);
        for (i = 0; i < win_ptr->comm_ptr->local_size; i++) {
            if (i == win_ptr->comm_ptr->rank)
                continue;
            MPIDI_Comm_get_vc(win_ptr->comm_ptr, i, &target_vc);
            /* only off-node peers need an explicit lock message */
            if (orig_vc->node_id != target_vc->node_id) {
                mpi_errno = MPIDI_CH3I_Win_find_target(win_ptr, i, &curr_target);
                if (mpi_errno)
                    MPIU_ERR_POP(mpi_errno);
                if (curr_target == NULL) {
                    win_ptr->outstanding_locks++;
                    mpi_errno = send_lock_msg(i, MPI_LOCK_SHARED, win_ptr);
                    if (mpi_errno != MPI_SUCCESS)
                        MPIU_ERR_POP(mpi_errno);
                }
            }
        }
        win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_ISSUED;
    }

    do {
        /* find a non-empty slot and set the FLUSH flag on the first
         * target */
        /* TODO: we should think about better strategies on selecting the target */
        for (i = 0; i < win_ptr->num_slots; i++)
            if (win_ptr->slots[i].target_list != NULL)
                break;
        curr_target = win_ptr->slots[i].target_list;
        if (curr_target->sync.sync_flag < MPIDI_RMA_SYNC_FLUSH) {
            curr_target->sync.sync_flag = MPIDI_RMA_SYNC_FLUSH;
            curr_target->sync.have_remote_incomplete_ops = 0;
            curr_target->sync.outstanding_acks++;
        }

        /* Issue out all operations. */
        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, curr_target->target_rank,
                                                        &made_progress);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* Wait for remote completion. */
        do {
            mpi_errno = MPIDI_CH3I_RMA_Cleanup_ops_target(win_ptr, curr_target,
                                                          &local_completed, &remote_completed);
            if (mpi_errno != MPI_SUCCESS)
                MPIU_ERR_POP(mpi_errno);
            if (!remote_completed) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS)
                    MPIU_ERR_POP(mpi_errno);
            }
        } while (!remote_completed);

        /* Cleanup the target. */
        mpi_errno = MPIDI_CH3I_RMA_Cleanup_single_target(win_ptr, curr_target);
        if (mpi_errno != MPI_SUCCESS)
            MPIU_ERR_POP(mpi_errno);

        /* check if we got a target */
        (*target) = MPIDI_CH3I_Win_target_alloc(win_ptr);

    } while ((*target) == NULL);

  fn_exit:
    return mpi_errno;
  fn_fail:
    goto fn_exit;
}
/* Find-Dequeue-Unexpected or Allocate-and-Enqueue-Posted:
 * search the unexpected receive queue for a message matching
 * (source, tag, context_id); if found, dequeue it, attach the user
 * buffer and return it with *foundp = TRUE.  Otherwise allocate a new
 * receive request, enqueue it on the posted queue and return it with
 * *foundp = FALSE.
 *
 * Must be called with the MSGQUEUE critical section held (asserted).
 * MPI_ANY_TAG / MPI_ANY_SOURCE are handled by zeroing the corresponding
 * match fields and using a masked comparison.
 *
 * If the peer VC is MORIBUND (or the communicator is not AS-enabled for
 * an ANY_SOURCE receive), the new request is completed immediately with
 * a proc-fail error instead of being posted. */
MPID_Request * MPIDI_CH3U_Recvq_FDU_or_AEP(int source, int tag, int context_id,
                                           MPID_Comm *comm, void *user_buf,
                                           int user_count, MPI_Datatype datatype,
                                           int * foundp)
{
    MPID_Time_t timer_start;
    int found;
    MPID_Request *rreq, *prev_rreq;
    MPIDI_Message_match match;
    MPIDI_Message_match mask;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIU_THREAD_CS_ASSERT_HELD(MSGQUEUE);

    /* Store how much time is spent traversing the queue */
    MPIR_T_START_TIMER(RECVQ_STATISTICS, timer_start);

    /* Optimize this loop for an empty unexpected receive queue */
    rreq = recvq_unexpected_head;
    if (rreq) {
        prev_rreq = NULL;

        match.parts.context_id = context_id;
        match.parts.tag = tag;
        match.parts.rank = source;

        if (tag != MPI_ANY_TAG && source != MPI_ANY_SOURCE) {
            /* fully-specified receive: exact (unmasked) match */
            do {
                MPIR_T_INC(RECVQ_STATISTICS, unexpected_recvq_match_attempts);
                if (MATCH_WITH_NO_MASK(rreq->dev.match, match)) {
                    /* unlink rreq from the unexpected queue, fixing up
                     * head/tail as needed */
                    if (prev_rreq != NULL) {
                        prev_rreq->dev.next = rreq->dev.next;
                    }
                    else {
                        recvq_unexpected_head = rreq->dev.next;
                    }

                    if (rreq->dev.next == NULL) {
                        recvq_unexpected_tail = prev_rreq;
                    }
                    MPIR_T_DEC(RECVQ_STATISTICS, unexpected_qlen);

                    if (MPIDI_Request_get_msg_type(rreq) == MPIDI_REQUEST_EAGER_MSG)
                        MPIR_T_SUBTRACT(RECVQ_STATISTICS,
                                        MPIDI_CH3I_unexpected_recvq_buffer_size,
                                        rreq->dev.tmpbuf_sz);

                    /* attach the caller's receive buffer to the matched request */
                    rreq->comm = comm;
                    MPIR_Comm_add_ref(comm);
                    rreq->dev.user_buf = user_buf;
                    rreq->dev.user_count = user_count;
                    rreq->dev.datatype = datatype;
                    found = TRUE;
                    goto lock_exit;
                }
                prev_rreq = rreq;
                rreq = rreq->dev.next;
            } while (rreq);
        }
        else {
            /* wildcard receive: zero the wildcarded fields in both the
             * match value and the mask, then compare under the mask */
            mask.parts.context_id = mask.parts.rank = mask.parts.tag = ~0;
            if (tag == MPI_ANY_TAG)
                match.parts.tag = mask.parts.tag = 0;
            if (source == MPI_ANY_SOURCE)
                match.parts.rank = mask.parts.rank = 0;

            do {
                MPIR_T_INC(RECVQ_STATISTICS, unexpected_recvq_match_attempts);
                if (MATCH_WITH_LEFT_MASK(rreq->dev.match, match, mask)) {
                    /* unlink rreq from the unexpected queue, fixing up
                     * head/tail as needed */
                    if (prev_rreq != NULL) {
                        prev_rreq->dev.next = rreq->dev.next;
                    }
                    else {
                        recvq_unexpected_head = rreq->dev.next;
                    }
                    if (rreq->dev.next == NULL) {
                        recvq_unexpected_tail = prev_rreq;
                    }
                    MPIR_T_DEC(RECVQ_STATISTICS, unexpected_qlen);

                    if (MPIDI_Request_get_msg_type(rreq) == MPIDI_REQUEST_EAGER_MSG)
                        MPIR_T_SUBTRACT(RECVQ_STATISTICS,
                                        MPIDI_CH3I_unexpected_recvq_buffer_size,
                                        rreq->dev.tmpbuf_sz);

                    /* attach the caller's receive buffer to the matched request */
                    rreq->comm = comm;
                    MPIR_Comm_add_ref(comm);
                    rreq->dev.user_buf = user_buf;
                    rreq->dev.user_count = user_count;
                    rreq->dev.datatype = datatype;
                    found = TRUE;
                    goto lock_exit;
                }
                prev_rreq = rreq;
                rreq = rreq->dev.next;
            } while (rreq);
        }
    }
    MPIR_T_END_TIMER(RECVQ_STATISTICS, timer_start, time_matching_unexpectedq);

    /* A matching request was not found in the unexpected queue, so we
       need to allocate a new request and add it to the posted queue */
    {
        int mpi_errno = MPI_SUCCESS;

        found = FALSE;

        MPIDI_Request_create_rreq( rreq, mpi_errno, goto lock_exit );
        rreq->dev.match.parts.tag = tag;
        rreq->dev.match.parts.rank = source;
        rreq->dev.match.parts.context_id = context_id;

        /* Added a mask for faster search on 64-bit capable
         * platforms */
        rreq->dev.mask.parts.context_id = ~0;
        if (rreq->dev.match.parts.rank == MPI_ANY_SOURCE)
            rreq->dev.mask.parts.rank = 0;
        else
            rreq->dev.mask.parts.rank = ~0;
        if (rreq->dev.match.parts.tag == MPI_ANY_TAG)
            rreq->dev.mask.parts.tag = 0;
        else
            rreq->dev.mask.parts.tag = ~0;

        rreq->comm = comm;
        MPIR_Comm_add_ref(comm);
        rreq->dev.user_buf = user_buf;
        rreq->dev.user_count = user_count;
        rreq->dev.datatype = datatype;

        /* check whether VC has failed, or this is an ANY_SOURCE in a
           failed communicator */
        if (source != MPI_ANY_SOURCE) {
            MPIDI_VC_t *vc;
            MPIDI_Comm_get_vc(comm, source, &vc);
            if (vc->state == MPIDI_VC_STATE_MORIBUND) {
                /* peer is dead: complete the request immediately with an error */
                MPIU_ERR_SET1(mpi_errno, MPIX_ERR_PROC_FAIL_STOP, "**comm_fail",
                              "**comm_fail %d", vc->pg_rank);
                rreq->status.MPI_ERROR = mpi_errno;
                MPIDI_CH3U_Request_complete(rreq);
                goto lock_exit;
            }
        }
        else if (!MPIDI_CH3I_Comm_AS_enabled(comm)) {
            /* ANY_SOURCE receive on a communicator with failed processes */
            MPIU_ERR_SET(mpi_errno, MPIX_ERR_PROC_FAIL_STOP, "**comm_fail");
            rreq->status.MPI_ERROR = mpi_errno;
            MPIDI_CH3U_Request_complete(rreq);
            goto lock_exit;
        }

        /* append the new request to the posted queue */
        rreq->dev.next = NULL;
        if (recvq_posted_tail != NULL) {
            recvq_posted_tail->dev.next = rreq;
        }
        else {
            recvq_posted_head = rreq;
        }
        recvq_posted_tail = rreq;
        MPIR_T_INC(RECVQ_STATISTICS, posted_qlen);
        MPIDI_POSTED_RECV_ENQUEUE_HOOK(rreq);
    }

  lock_exit:
    *foundp = found;

    /* If a match was not found, the timer was stopped after the traversal */
    if (found)
        MPIR_T_END_TIMER(RECVQ_STATISTICS, timer_start, time_matching_unexpectedq);

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);
    return rreq;
}
/* Event handler for a PUT (or PUT_OVERFLOW) that matched a receive just
 * dequeued from the posted list, sent with the "large" protocol in mind.
 * The sender delivers at most the first PTL_LARGE_THRESHOLD bytes with the
 * PUT; this handler copies/unpacks that eager portion and, when the
 * NPTL_LARGE flag is set, issues a GET for the remainder straight from the
 * sender's buffer.  Returns an MPI error code. */
static int handler_recv_dequeue_large(const ptl_event_t *e)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Request *const rreq = e->user_ptr;  /* request attached to the matched entry */
    MPIDI_VC_t *vc;
    MPID_nem_ptl_vc_area *vc_ptl;
    int ret;
    int dt_contig;
    MPIDI_msg_sz_t data_sz;
    MPID_Datatype *dt_ptr;
    MPI_Aint dt_true_lb;
    MPI_Aint last;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);

    MPIDI_FUNC_ENTER(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);

    MPIU_Assert(e->type == PTL_EVENT_PUT || e->type == PTL_EVENT_PUT_OVERFLOW);

    /* The sender's rank is encoded in the match bits; resolve it to a VC. */
    MPIDI_Comm_get_vc(rreq->comm, NPTL_MATCH_GET_RANK(e->match_bits), &vc);
    vc_ptl = VC_PTL(vc);

    dequeue_req(e);

    MPIDI_Datatype_get_info(rreq->dev.user_count, rreq->dev.datatype, dt_contig,
                            data_sz, dt_ptr, dt_true_lb);

    /* unpack data from unexpected buffer first */
    if (e->type == PTL_EVENT_PUT_OVERFLOW) {
        /* Message landed in the overflow (unexpected) buffer at e->start;
         * move the eager portion into the user buffer. */
        if (dt_contig) {
            MPIU_Memcpy((char *)rreq->dev.user_buf + dt_true_lb, e->start, e->mlength);
        } else {
            last = e->mlength;
            MPID_Segment_unpack(rreq->dev.segment_ptr, 0, &last, e->start);
            MPIU_Assert(last == e->mlength);
            /* remember how far into the datatype stream we have unpacked */
            rreq->dev.segment_first = e->mlength;
        }
    }

    if (!(e->hdr_data & NPTL_LARGE)) {
        /* all data has already been received; we're done */
        mpi_errno = handler_recv_complete(e);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    /* Large protocol: exactly PTL_LARGE_THRESHOLD bytes arrived with the PUT. */
    MPIU_Assert (e->mlength == PTL_LARGE_THRESHOLD);

    /* we need to GET the rest of the data from the sender's buffer */
    if (dt_contig) {
        big_get((char *)rreq->dev.user_buf + dt_true_lb + PTL_LARGE_THRESHOLD,
                data_sz - PTL_LARGE_THRESHOLD, vc, e->match_bits, rreq);
        goto fn_exit;
    }

    /* noncontig recv buffer: build an IOV describing the rest of the buffer
     * (pack_vector generates the IOV list for the datatype stream) */
    last = rreq->dev.segment_size;
    rreq->dev.iov_count = MPL_IOV_LIMIT;
    MPID_Segment_pack_vector(rreq->dev.segment_ptr, rreq->dev.segment_first, &last,
                             rreq->dev.iov, &rreq->dev.iov_count);
    if (last == rreq->dev.segment_size &&
        rreq->dev.segment_size <= MPIDI_nem_ptl_ni_limits.max_msg_size + PTL_LARGE_THRESHOLD) {
        /* Rest of message fits in one IOV */
        ptl_md_t md;
        md.start = rreq->dev.iov;
        md.length = rreq->dev.iov_count;  /* with PTL_IOVEC, length = number of iovec entries */
        md.options = PTL_IOVEC;
        md.eq_handle = MPIDI_nem_ptl_origin_eq;
        md.ct_handle = PTL_CT_NONE;
        ret = PtlMDBind(MPIDI_nem_ptl_ni, &md, &REQ_PTL(rreq)->md);
        MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlmdbind",
                             "**ptlmdbind %s", MPID_nem_ptl_strerror(ret));

        REQ_PTL(rreq)->event_handler = handler_recv_complete;
        ret = MPID_nem_ptl_rptl_get(REQ_PTL(rreq)->md, 0,
                                    rreq->dev.segment_size - rreq->dev.segment_first,
                                    vc_ptl->id, vc_ptl->ptg, e->match_bits, 0, rreq);
        MPIR_ERR_CHKANDJUMP1(ret, mpi_errno, MPI_ERR_OTHER, "**ptlget",
                             "**ptlget %s", MPID_nem_ptl_strerror(ret));
        goto fn_exit;
    }

    /* message won't fit in a single IOV, allocate buffer and unpack when received */
    /* FIXME: For now, allocate a single large buffer to hold entire message */
    MPIU_CHKPMEM_MALLOC(REQ_PTL(rreq)->chunk_buffer[0], void *,
                        data_sz - PTL_LARGE_THRESHOLD, mpi_errno, "chunk_buffer");
    big_get(REQ_PTL(rreq)->chunk_buffer[0], data_sz - PTL_LARGE_THRESHOLD,
            vc, e->match_bits, rreq);

 fn_exit:
    MPIU_CHKPMEM_COMMIT();
 fn_exit2:
    MPIDI_FUNC_EXIT(MPID_STATE_HANDLER_RECV_DEQUEUE_LARGE);
    return mpi_errno;
 fn_fail:
    MPIU_CHKPMEM_REAP();   /* release chunk_buffer on error */
    goto fn_exit2;
}
/* Builds the two-level (intra-node "shmem" + inter-node "leader")
 * communicator structure for 'comm'.  Creates comm_ptr->ch.shmem_comm
 * (processes sharing this node) and comm_ptr->ch.leader_comm (one leader
 * per node), fills in the leader maps, and decides whether the shmem and
 * allgather collective optimizations may be used.  Collective over 'comm'.
 * Returns an MPI error code.
 *
 * Fixes vs. the previous version: shmem_group/leader_group were leaked on
 * every MPIU_ERR_POP error path (the frees sat before the fn_fail label),
 * and the "**nomem" branch returned directly, skipping both the frees and
 * MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT.  Cleanup is now shared by the
 * success and failure paths at fn_fail. */
int create_2level_comm (MPI_Comm comm, int size, int my_rank)
{
    static const char FCNAME[] = "create_2level_comm";
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm* comm_ptr;
    MPID_Comm* comm_world_ptr;
    MPI_Group subgroup1, comm_group;
    MPID_Group *group_ptr=NULL;
    int leader_comm_size, my_local_size, my_local_id, input_flag =0, output_flag=0;
    int errflag = FALSE;
    int leader_group_size=0;
    /* Hoisted and NULL-initialized so the fn_fail cleanup can free it
     * safely no matter where an error jump originates. */
    int* leader_group = NULL;
    MPIU_THREADPRIV_DECL;
    MPIU_THREADPRIV_GET;
    MPID_Comm_get_ptr( comm, comm_ptr );
    MPID_Comm_get_ptr( MPI_COMM_WORLD, comm_world_ptr );

    int* shmem_group = MPIU_Malloc(sizeof(int) * size);
    if (NULL == shmem_group){
        printf("Couldn't malloc shmem_group\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }

    /* Creating local shmem group: the ranks of comm that live on this node
     * (vc->smp.local_rank >= 0), plus ourselves. */
    int i = 0;
    int local_rank = 0;
    int grp_index = 0;
    comm_ptr->ch.leader_comm=MPI_COMM_NULL;
    comm_ptr->ch.shmem_comm=MPI_COMM_NULL;
    MPIDI_VC_t* vc = NULL;
    for (; i < size ; ++i){
        MPIDI_Comm_get_vc(comm_ptr, i, &vc);
        if (my_rank == i || vc->smp.local_rank >= 0){
            shmem_group[grp_index] = i;
            if (my_rank == i){
                local_rank = grp_index;  /* our position within the node */
            }
            ++grp_index;
        }
    }

    /* Creating leader group: the node's leader is its lowest rank. */
    int leader = 0;
    leader = shmem_group[0];

    /* Gives the mapping to any process's leader in comm */
    comm_ptr->ch.leader_map = MPIU_Malloc(sizeof(int) * size);
    if (NULL == comm_ptr->ch.leader_map){
        printf("Couldn't malloc group\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }
    mpi_errno = MPIR_Allgather_impl (&leader, 1, MPI_INT , comm_ptr->ch.leader_map, 1, MPI_INT, comm_ptr, &errflag);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    leader_group = MPIU_Malloc(sizeof(int) * size);
    if (NULL == leader_group){
        printf("Couldn't malloc leader_group\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }

    /* Gives the mapping from leader's rank in comm to
     * leader's rank in leader_comm */
    comm_ptr->ch.leader_rank = MPIU_Malloc(sizeof(int) * size);
    if (NULL == comm_ptr->ch.leader_rank){
        printf("Couldn't malloc marker\n");
        ibv_error_abort (GEN_EXIT_ERR, "create_2level_com");
    }
    for (i=0; i < size ; ++i){
        comm_ptr->ch.leader_rank[i] = -1;
    }

    /* Compact the leader map into a list of distinct leaders, assigning each
     * its rank within leader_comm in order of first appearance. */
    int* group = comm_ptr->ch.leader_map;
    grp_index = 0;
    for (i=0; i < size ; ++i){
        if (comm_ptr->ch.leader_rank[(group[i])] == -1){
            comm_ptr->ch.leader_rank[(group[i])] = grp_index;
            leader_group[grp_index++] = group[i];
        }
    }
    leader_group_size = grp_index;
    comm_ptr->ch.leader_group_size = leader_group_size;

    mpi_errno = PMPI_Comm_group(comm, &comm_group);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }
    mpi_errno = PMPI_Group_incl(comm_group, leader_group_size, leader_group, &subgroup1);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }
    mpi_errno = PMPI_Comm_create(comm, subgroup1, &(comm_ptr->ch.leader_comm));
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    MPID_Comm *leader_ptr;
    MPID_Comm_get_ptr( comm_ptr->ch.leader_comm, leader_ptr );

    MPIU_Free(leader_group);
    leader_group = NULL;  /* prevent a second free at fn_fail */

    MPID_Group_get_ptr( subgroup1, group_ptr );
    if(group_ptr != NULL) {
        mpi_errno = PMPI_Group_free(&subgroup1);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    }

    /* Split by leader rank: processes on one node share a leader, so they
     * land in the same shmem_comm; local_rank preserves node ordering. */
    mpi_errno = PMPI_Comm_split(comm, leader, local_rank, &(comm_ptr->ch.shmem_comm));
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    MPID_Comm *shmem_ptr;
    MPID_Comm_get_ptr(comm_ptr->ch.shmem_comm, shmem_ptr);

    mpi_errno = PMPI_Comm_rank(comm_ptr->ch.shmem_comm, &my_local_id);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }
    mpi_errno = PMPI_Comm_size(comm_ptr->ch.shmem_comm, &my_local_size);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    if(my_local_id == 0) {
        /* Node leaders gather every node's local size to detect uniformity. */
        int array_index=0;
        mpi_errno = PMPI_Comm_size(comm_ptr->ch.leader_comm, &leader_comm_size);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
        comm_ptr->ch.node_sizes = MPIU_Malloc(sizeof(int)*leader_comm_size);
        mpi_errno = PMPI_Allgather(&my_local_size, 1, MPI_INT,
                                   comm_ptr->ch.node_sizes, 1, MPI_INT, comm_ptr->ch.leader_comm);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
        comm_ptr->ch.is_uniform = 1;
        for(array_index=0; array_index < leader_comm_size; array_index++) {
            if(comm_ptr->ch.node_sizes[0] != comm_ptr->ch.node_sizes[array_index]) {
                comm_ptr->ch.is_uniform = 0;
                break;
            }
        }
    }

    comm_ptr->ch.is_global_block = 0;
    /* We need to check to see if the ranks are block or not. Each node leader
     * gets the global ranks of all of its children processes. It scans through
     * this array to see if the ranks are in block order. The node-leaders then
     * do an allreduce to see if all the other nodes are also in block order.
     * This is followed by an intra-node bcast to let the children processes
     * know of the result of this step */
    if(my_local_id == 0) {
        int is_local_block = 1;
        int index = 1;
        while( index < my_local_size) {
            if( (shmem_group[index] - 1) != shmem_group[index - 1]) {
                is_local_block = 0;
                break;
            }
            index++;
        }
        comm_ptr->ch.shmem_coll_ok = 0;/* To prevent Allreduce taking shmem route*/
        mpi_errno = MPIR_Allreduce_impl(&(is_local_block), &(comm_ptr->ch.is_global_block),
                                        1, MPI_INT, MPI_LAND, leader_ptr, &errflag);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
        mpi_errno = MPIR_Bcast_impl(&(comm_ptr->ch.is_global_block),1, MPI_INT, 0, shmem_ptr, &errflag);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    } else {
        mpi_errno = MPIR_Bcast_impl(&(comm_ptr->ch.is_global_block),1, MPI_INT, 0, shmem_ptr, &errflag);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }
    }

    /* Node leaders reserve a slot in the shared-memory collectives region. */
    if (my_local_id == 0){
        lock_shmem_region();
        increment_shmem_comm_count();
        shmem_comm_count = get_shmem_comm_count();
        unlock_shmem_region();
    }

    shmem_ptr->ch.shmem_coll_ok = 0;
    /* To prevent Bcast taking the knomial_2level_bcast route */
    mpi_errno = MPIR_Bcast_impl (&shmem_comm_count, 1, MPI_INT, 0, shmem_ptr, &errflag);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    if (shmem_comm_count <= g_shmem_coll_blocks){
        shmem_ptr->ch.shmem_comm_rank = shmem_comm_count-1;
        input_flag = 1;
    } else{
        input_flag = 0;
    }
    comm_ptr->ch.shmem_coll_ok = 0;/* To prevent Allreduce taking shmem route*/
    /* Every process must agree that a shmem block was available. */
    mpi_errno = MPIR_Allreduce_impl(&input_flag, &output_flag, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    comm_ptr->ch.allgather_comm_ok = 0;
    if (allgather_ranking){
        /* Try to build a rank-reordered communicator for allgather; this is
         * only valid when ranks form contiguous, equal-sized blocks per node. */
        int is_contig =1, check_leader =1, check_size=1, is_local_ok=0,is_block=0;
        int PPN;
        int shmem_grp_size = my_local_size;
        int leader_rank;
        MPI_Group allgather_group;
        comm_ptr->ch.allgather_comm=MPI_COMM_NULL;
        comm_ptr->ch.allgather_new_ranks=NULL;

        if(comm_ptr->ch.leader_comm != MPI_COMM_NULL) {
            PMPI_Comm_rank(comm_ptr->ch.leader_comm, &leader_rank);
        }
        mpi_errno=MPIR_Bcast_impl(&leader_rank, 1, MPI_INT, 0, shmem_ptr, &errflag);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        for (i=1; i < shmem_grp_size; i++ ){
            if (shmem_group[i] != shmem_group[i-1]+1){
                is_contig =0;
                break;
            }
        }
        if (leader != (shmem_grp_size*leader_rank)){
            check_leader=0;
        }
        if (shmem_grp_size != (size/leader_group_size)){
            check_size=0;
        }
        is_local_ok = is_contig && check_leader && check_size;
        mpi_errno = MPIR_Allreduce_impl(&is_local_ok, &is_block, 1, MPI_INT, MPI_LAND, comm_ptr, &errflag);
        if(mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        if (is_block){
            int counter=0,j;
            comm_ptr->ch.allgather_new_ranks = MPIU_Malloc(sizeof(int)*size);
            if (NULL == comm_ptr->ch.allgather_new_ranks){
                mpi_errno = MPIR_Err_create_code( MPI_SUCCESS, MPIR_ERR_RECOVERABLE,
                                                  FCNAME, __LINE__, MPI_ERR_OTHER,
                                                  "**nomem", 0 );
                /* previously a bare 'return mpi_errno;' that leaked the
                 * temporary buffers and skipped the thread-exit macro */
                goto fn_fail;
            }
            PPN = shmem_grp_size;
            /* New rank order: all node-local rank-j processes, node by node. */
            for (j=0; j < PPN; j++){
                for (i=0; i < leader_group_size; i++){
                    comm_ptr->ch.allgather_new_ranks[counter] = j + i*PPN;
                    counter++;
                }
            }
            mpi_errno = PMPI_Group_incl(comm_group, size, comm_ptr->ch.allgather_new_ranks, &allgather_group);
            if(mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
            mpi_errno = PMPI_Comm_create(comm_ptr->handle, allgather_group, &(comm_ptr->ch.allgather_comm));
            if(mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
            comm_ptr->ch.allgather_comm_ok = 1;
            mpi_errno=PMPI_Group_free(&allgather_group);
            if(mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
        }
    }
    mpi_errno=PMPI_Group_free(&comm_group);
    if(mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    if (output_flag == 1){
        /* A shmem-collectives block was secured; register this communicator. */
        comm_ptr->ch.shmem_coll_ok = 1;
        comm_registry[comm_registered++] = comm_ptr->context_id;
    } else{
        /* No block available: tear down everything built above. */
        comm_ptr->ch.shmem_coll_ok = 0;
        MPID_Group_get_ptr( subgroup1, group_ptr );
        if(group_ptr != NULL) {
            mpi_errno = PMPI_Group_free(&subgroup1);
            if(mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
        }
        MPID_Group_get_ptr( comm_group, group_ptr );
        if(group_ptr != NULL) {
            mpi_errno = PMPI_Group_free(&comm_group);
            if(mpi_errno) {
                MPIU_ERR_POP(mpi_errno);
            }
        }
        free_2level_comm(comm_ptr);
        comm_ptr->ch.shmem_comm = MPI_COMM_NULL;
        comm_ptr->ch.leader_comm = MPI_COMM_NULL;
    }
    ++comm_count;

  fn_fail:
    /* Cleanup shared by success (falls through) and error (jumps here)
     * paths; this is what fixes the former leaks. */
    if (leader_group != NULL) {
        MPIU_Free(leader_group);
    }
    MPIU_Free(shmem_group);
    MPIDU_ERR_CHECK_MULTIPLE_THREADS_EXIT( comm_ptr );
    return (mpi_errno);
}
/* Implements MPI_Accumulate for the CH3 device.  If the target is the local
 * rank, a shared window, or (when SHM is allocated) a process on the same
 * node, the operation is performed immediately through shared memory;
 * otherwise it is queued on the per-target RMA ops list to be issued at
 * synchronization time.  Returns an MPI error code. */
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype
                    origin_datatype, int target_rank, MPI_Aint target_disp,
                    int target_count, MPI_Datatype target_datatype, MPI_Op op,
                    MPID_Win *win_ptr)
{
    int mpi_errno=MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc, *target_vc;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE);

    /* MPI_PROC_NULL target: a no-op per the MPI standard. */
    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* A preceding fence implicitly opens a fence epoch on first RMA op. */
    if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) {
        win_ptr->epoch_state = MPIDI_EPOCH_FENCE;
    }

    /* RMA calls outside any epoch are a synchronization error. */
    MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    MPIDI_Datatype_get_info(origin_count, origin_datatype,
                            dt_contig, data_sz, dtp, dt_true_lb);

    /* Zero-byte accumulate: nothing to do. */
    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
           if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
           the same node. However, in ch3:sock, even if origin and target are on the same node, they do
           not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
           which is only set to TRUE when SHM region is allocated in nemesis.
           In future we need to figure out a way to check if origin and target are in the same "SHM comm".
        */
        /* Note: orig_vc/target_vc are only read below when this block ran;
           the || / && short-circuiting in the condition guarantees that. */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Do != rank first (most likely branch?) */
    if (target_rank == rank ||
        win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id))
    {
        /* Local or same-node target: apply the accumulate immediately. */
        mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype,
                                          target_rank, target_disp, target_count, target_datatype,
                                          op, win_ptr);
        if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    }
    else
    {
        /* Remote target: append the operation to the per-target ops list. */
        MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank);
        MPIDI_RMA_Op_t *new_ptr = NULL;

        /* queue it up */
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc);
        mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr);
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc);
        if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

        /* If predefined and contiguous, use a simplified element */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) &&
            MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) {
            MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
            new_ptr->type = MPIDI_RMA_ACC_CONTIG;
            /* Only the information needed for the contig/predefined acc */
            /* Cast away const'ness for origin_address as
             * MPIDI_RMA_Op_t contain both PUT and GET like ops */
            new_ptr->origin_addr = (void *) origin_addr;
            new_ptr->origin_count = origin_count;
            new_ptr->origin_datatype = origin_datatype;
            new_ptr->target_rank = target_rank;
            new_ptr->target_disp = target_disp;
            new_ptr->target_count = target_count;
            new_ptr->target_datatype = target_datatype;
            new_ptr->op = op;
            MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);
            goto fn_exit;
        }

        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);
        new_ptr->type = MPIDI_RMA_ACCUMULATE;
        /* Cast away const'ness for origin_address as MPIDI_RMA_Op_t
         * contain both PUT and GET like ops */
        new_ptr->origin_addr = (void *) origin_addr;
        new_ptr->origin_count = origin_count;
        new_ptr->origin_datatype = origin_datatype;
        new_ptr->target_rank = target_rank;
        new_ptr->target_disp = target_disp;
        new_ptr->target_count = target_count;
        new_ptr->target_datatype = target_datatype;
        new_ptr->op = op;
        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        /* if source or target datatypes are derived, increment their
           reference counts (released when the queued op is processed) */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
    }

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/* Find-Dequeue-Unexpected or Allocate-and-Enqueue-Posted:
 * Search the unexpected-message queue for a message matching
 * (source, tag, context_id).  If found, the request is unlinked from the
 * unexpected queue, filled in with the receive-side buffer information, and
 * returned with *foundp = TRUE.  Otherwise a fresh receive request is
 * created from the arguments, appended to the posted-receive queue, and
 * returned with *foundp = FALSE.  Caller must hold the MSGQUEUE critical
 * section (asserted below). */
MPID_Request * MPIDI_CH3U_Recvq_FDU_or_AEP(int source, int tag,
                                           int context_id, MPID_Comm *comm, void *user_buf,
                                           int user_count, MPI_Datatype datatype, int * foundp)
{
    int found;
    MPID_Request *rreq, *prev_rreq;
    MPIDI_Message_match match;
    MPIDI_Message_match mask;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIDI_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);

    MPIU_THREAD_CS_ASSERT_HELD(MSGQUEUE);

    /* Optimize this loop for an empty unexpected receive queue */
    rreq = recvq_unexpected_head;
    if (rreq) {
        prev_rreq = NULL;

        match.parts.context_id = context_id;
        match.parts.tag = tag;
        match.parts.rank = source;

        if (tag != MPI_ANY_TAG && source != MPI_ANY_SOURCE) {
            /* Fast path: fully specified envelope, no mask needed. */
            do {
                if (MATCH_WITH_NO_MASK(rreq->dev.match, match)) {
                    /* unlink rreq from the unexpected queue */
                    if (prev_rreq != NULL) {
                        prev_rreq->dev.next = rreq->dev.next;
                    }
                    else {
                        recvq_unexpected_head = rreq->dev.next;
                    }
                    if (rreq->dev.next == NULL) {
                        recvq_unexpected_tail = prev_rreq;
                    }

                    /* attach the receive-side buffer info for delivery */
                    rreq->comm = comm;
                    MPIR_Comm_add_ref(comm);
                    rreq->dev.user_buf = user_buf;
                    rreq->dev.user_count = user_count;
                    rreq->dev.datatype = datatype;
                    found = TRUE;
                    goto lock_exit;
                }
                prev_rreq = rreq;
                rreq = rreq->dev.next;
            } while (rreq);
        }
        else {
            /* Wildcard tag and/or source: zero the wildcard fields in both
             * match value and mask so they compare equal to anything. */
            mask.parts.context_id = mask.parts.rank = mask.parts.tag = ~0;
            if (tag == MPI_ANY_TAG)
                match.parts.tag = mask.parts.tag = 0;
            if (source == MPI_ANY_SOURCE)
                match.parts.rank = mask.parts.rank = 0;

            do {
                if (MATCH_WITH_LEFT_MASK(rreq->dev.match, match, mask)) {
                    /* unlink rreq from the unexpected queue */
                    if (prev_rreq != NULL) {
                        prev_rreq->dev.next = rreq->dev.next;
                    }
                    else {
                        recvq_unexpected_head = rreq->dev.next;
                    }
                    if (rreq->dev.next == NULL) {
                        recvq_unexpected_tail = prev_rreq;
                    }

                    /* attach the receive-side buffer info for delivery */
                    rreq->comm = comm;
                    MPIR_Comm_add_ref(comm);
                    rreq->dev.user_buf = user_buf;
                    rreq->dev.user_count = user_count;
                    rreq->dev.datatype = datatype;
                    found = TRUE;
                    goto lock_exit;
                }
                prev_rreq = rreq;
                rreq = rreq->dev.next;
            } while (rreq);
        }
    }

    /* A matching request was not found in the unexpected queue, so we
       need to allocate a new request and add it to the posted queue */
    {
        int mpi_errno = MPI_SUCCESS;
        found = FALSE;

        MPIDI_Request_create_rreq( rreq, mpi_errno, goto lock_exit );
        rreq->dev.match.parts.tag = tag;
        rreq->dev.match.parts.rank = source;
        rreq->dev.match.parts.context_id = context_id;

        /* Added a mask for faster search on 64-bit capable
         * platforms */
        rreq->dev.mask.parts.context_id = ~0;
        if (rreq->dev.match.parts.rank == MPI_ANY_SOURCE)
            rreq->dev.mask.parts.rank = 0;
        else
            rreq->dev.mask.parts.rank = ~0;
        if (rreq->dev.match.parts.tag == MPI_ANY_TAG)
            rreq->dev.mask.parts.tag = 0;
        else
            rreq->dev.mask.parts.tag = ~0;

        rreq->comm = comm;
        MPIR_Comm_add_ref(comm);
        rreq->dev.user_buf = user_buf;
        rreq->dev.user_count = user_count;
        rreq->dev.datatype = datatype;

        /* check whether VC has failed, or this is an ANY_SOURCE in a
           failed communicator */
        if (source != MPI_ANY_SOURCE) {
            MPIDI_VC_t *vc;
            MPIDI_Comm_get_vc(comm, source, &vc);
            if (vc->state == MPIDI_VC_STATE_MORIBUND) {
                /* The peer is dead: complete the request with an error
                   instead of posting it. */
                MPIU_ERR_SET1(mpi_errno, MPI_ERR_OTHER, "**comm_fail", "**comm_fail %d", vc->pg_rank);
                rreq->status.MPI_ERROR = mpi_errno;
                MPIDI_CH3U_Request_complete(rreq);
                goto lock_exit;
            }
        }
        else if (MPID_VCRT_Contains_failed_vc(comm->vcrt)) {
            /* ANY_SOURCE cannot be satisfied reliably if any peer failed. */
            MPIU_ERR_SET(mpi_errno, MPI_ERR_OTHER, "**comm_fail");
            rreq->status.MPI_ERROR = mpi_errno;
            MPIDI_CH3U_Request_complete(rreq);
            goto lock_exit;
        }

        /* append the new request to the posted queue */
        rreq->dev.next = NULL;
        if (recvq_posted_tail != NULL) {
            recvq_posted_tail->dev.next = rreq;
        }
        else {
            recvq_posted_head = rreq;
        }
        recvq_posted_tail = rreq;
        MPIDI_POSTED_RECV_ENQUEUE_HOOK(rreq);
    }

  lock_exit:
    *foundp = found;

    MPIDI_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_RECVQ_FDU_OR_AEP);
    return rreq;
}