/*
 * MPID_Get_accumulate - device-level entry point for get-accumulate.
 *
 * Hands every argument unchanged to MPIDI_CH3I_Get_accumulate(), supplying
 * NULL for that routine's trailing request argument, and returns the error
 * code produced by that call.
 */
int MPID_Get_accumulate(const void *origin_addr, int origin_count,
                        MPI_Datatype origin_datatype, void *result_addr,
                        int result_count, MPI_Datatype result_datatype,
                        int target_rank, MPI_Aint target_disp, int target_count,
                        MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_GET_ACCUMULATE);

    /* The final NULL means there is no request object to hand over. */
    mpi_errno = MPIDI_CH3I_Get_accumulate(origin_addr, origin_count, origin_datatype,
                                          result_addr, result_count, result_datatype,
                                          target_rank, target_disp, target_count,
                                          target_datatype, op, win_ptr, NULL);

    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPID_GET_ACCUMULATE);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Win_allocate - allocate the memory behind a window.
 *
 * If the window's info_args.alloc_shm flag is TRUE and an allocate_shm hook
 * is registered in MPIDI_CH3U_Win_fns, the hook performs the allocation.
 * In every other case MPIDI_CH3U_Win_allocate_no_shm() is used.
 *
 * Returns MPI_SUCCESS or the error code propagated from the chosen
 * allocator.
 */
int MPIDI_CH3U_Win_allocate(MPI_Aint size, int disp_unit, MPID_Info * info,
                            MPID_Comm * comm_ptr, void *baseptr, MPID_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);

    if ((*win_ptr)->info_args.alloc_shm == TRUE &&
        MPIDI_CH3U_Win_fns.allocate_shm != NULL) {
        /* shared-memory allocation requested and available */
        mpi_errno = MPIDI_CH3U_Win_fns.allocate_shm(size, disp_unit, info, comm_ptr,
                                                    baseptr, win_ptr);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }
    else {
        /* fall back to the non-shared-memory allocator */
        mpi_errno = MPIDI_CH3U_Win_allocate_no_shm(size, disp_unit, info, comm_ptr,
                                                   baseptr, win_ptr);
        if (mpi_errno != MPI_SUCCESS) {
            MPIR_ERR_POP(mpi_errno);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
/*
 * MPIDI_CH3U_Win_create - create a window over caller-provided memory.
 *
 * Always runs the gather_info hook from MPIDI_CH3U_Win_fns.  When the
 * window's info_args.alloc_shm flag is TRUE and a detect_shm hook is
 * registered, that hook is invoked afterwards.
 *
 * Returns MPI_SUCCESS or the error code propagated from either hook.
 */
int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
                          MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE);

    mpi_errno = MPIDI_CH3U_Win_fns.gather_info(base, size, disp_unit, info,
                                               comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

    /* Nothing further to do unless shm was requested and can be detected. */
    if ((*win_ptr)->info_args.alloc_shm != TRUE ||
        MPIDI_CH3U_Win_fns.detect_shm == NULL) {
        goto fn_exit;
    }

    /* Detect if shared buffers are specified for the processes in the
     * current node. If so, enable shm RMA. */
    mpi_errno = MPIDI_CH3U_Win_fns.detect_shm(win_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIR_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3_SHM_Win_free - free a window that may own a shared memory
 * segment.
 *
 * When the window's shm_allocated flag is set, the shm_base_addrs array is
 * freed, the segment is detached and its handle finalized.  The window is
 * then passed to MPIDI_Win_free().  A failing detach jumps to the error
 * exit before MPIDI_Win_free() is reached.
 *
 * Returns MPI_SUCCESS or the first error encountered.
 */
int MPIDI_CH3_SHM_Win_free(MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);

    /* Free shared memory region */
    if ((*win_ptr)->shm_allocated) {
        MPID_Win *shm_win = *win_ptr;

        /* free shm_base_addrs that's only used for shared memory windows */
        MPIU_Free(shm_win->shm_base_addrs);

        /* detach from shared memory segment */
        mpi_errno = MPIU_SHMW_Seg_detach(shm_win->shm_segment_handle,
                                         (char **) &shm_win->shm_base_addr,
                                         shm_win->shm_segment_len);
        if (mpi_errno) {
            MPIU_ERR_POP(mpi_errno);
        }

        MPIU_SHMW_Hnd_finalize(&shm_win->shm_segment_handle);
    }

    /* release everything else that belongs to the window */
    mpi_errno = MPIDI_Win_free(win_ptr);
    if (mpi_errno != MPI_SUCCESS) {
        MPIU_ERR_POP(mpi_errno);
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_SHM_WIN_FREE);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
/*
 * MPIDI_CH3U_Win_allocate_shared - baseline shared-window allocation.
 *
 * With HAVE_ERROR_CHECKING defined, the group of comm_ptr is compared with
 * the group of MPI_COMM_SELF; anything other than MPI_IDENT is rejected with
 * MPI_ERR_RMA_SHARED ("**ch3|win_shared_comm").
 *
 * The window memory is then obtained through MPIDI_CH3U_Win_allocate().  A
 * one-entry shm_base_addrs array is allocated and set to *base_ptr, and
 * MPIDI_SHM_Win_free is installed as the window's Win_free function.
 *
 * Returns MPI_SUCCESS or the first error encountered.
 */
int MPIDI_CH3U_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Info *info,
                                   MPID_Comm *comm_ptr, void **base_ptr,
                                   MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPID_Comm *comm_self_ptr = NULL;
    MPID_Group *caller_group, *self_group;
    int cmp_result;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_SHARED);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_SHARED);

#ifdef HAVE_ERROR_CHECKING
    /* The baseline CH3 implementation only works with MPI_COMM_SELF */
    MPID_Comm_get_ptr( MPI_COMM_SELF, comm_self_ptr );

    mpi_errno = MPIR_Comm_group_impl(comm_ptr, &caller_group);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Comm_group_impl(comm_self_ptr, &self_group);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Group_compare_impl(caller_group, self_group, &cmp_result);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* both temporary groups are released before the verdict is examined */
    mpi_errno = MPIR_Group_free_impl(caller_group);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Group_free_impl(self_group);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    if (cmp_result != MPI_IDENT) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_RMA_SHARED, "**ch3|win_shared_comm");
    }
#endif

    mpi_errno = MPIDI_CH3U_Win_allocate(size, disp_unit, info, comm_ptr, base_ptr, win_ptr);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    MPIU_CHKPMEM_MALLOC((*win_ptr)->shm_base_addrs, void **,
                        1 /* comm_size */ * sizeof(void *),
                        mpi_errno, "(*win_ptr)->shm_base_addrs");
    (*win_ptr)->shm_base_addrs[0] = *base_ptr;

    /* Register the shared memory window free function, which will free the
     * memory allocated here. */
    (*win_ptr)->RMAFns.Win_free = MPIDI_SHM_Win_free;

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_SHARED);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_Win_fns_init - populate a window-function table with the CH3U
 * defaults.
 *
 * Sets the create, allocate, allocate_shared and create_dynamic members of
 * *win_fns.  Always returns MPI_SUCCESS.
 */
int MPIDI_Win_fns_init(MPIDI_CH3U_Win_fns_t *win_fns)
{
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FNS_INIT);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FNS_INIT);

    win_fns->create          = MPIDI_CH3U_Win_create;
    win_fns->create_dynamic  = MPIDI_CH3U_Win_create_dynamic;
    win_fns->allocate        = MPIDI_CH3U_Win_allocate;
    win_fns->allocate_shared = MPIDI_CH3U_Win_allocate_shared;

    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FNS_INIT);
    return MPI_SUCCESS;
}
/*
 * MPIDI_Win_detach - detach memory from a window.
 *
 * Intentionally does nothing with either argument (the original note says
 * all of memory is exposed).  Always returns MPI_SUCCESS.
 */
int MPIDI_Win_detach(MPID_Win *win, const void *base)
{
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_DETACH);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_DETACH);

    /* no op, all of memory is exposed */

    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_DETACH);
    return MPI_SUCCESS;
}
/*
 * MPIDI_Win_free - tear down a window object.
 *
 * Fails with MPI_ERR_RMA_SYNC ("**rmasync") when the window's epoch_state is
 * not MPIDI_EPOCH_NONE.  Otherwise it waits for outstanding passive-target
 * operations via MPIDI_CH3I_Wait_for_pt_ops_finish(), frees the window's
 * communicator, releases the per-process arrays and finally returns the
 * window object to MPID_Win_mem.
 *
 * The window buffer itself is freed only for ALLOCATE/SHARED flavors when it
 * was not placed in shared memory (shm_allocated == FALSE) and size > 0.
 *
 * Returns MPI_SUCCESS or the first error encountered.
 */
int MPIDI_Win_free(MPID_Win **win_ptr)
{
    int mpi_errno=MPI_SUCCESS;
    int still_referenced;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FREE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);

    MPIU_ERR_CHKANDJUMP((*win_ptr)->epoch_state != MPIDI_EPOCH_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    mpi_errno = MPIDI_CH3I_Wait_for_pt_ops_finish(*win_ptr);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    mpi_errno = MPIR_Comm_free_impl((*win_ptr)->comm_ptr);
    if (mpi_errno) {
        MPIU_ERR_POP(mpi_errno);
    }

    /* per-process bookkeeping arrays */
    MPIU_Free((*win_ptr)->targets);
    MPIU_Free((*win_ptr)->base_addrs);
    MPIU_Free((*win_ptr)->sizes);
    MPIU_Free((*win_ptr)->disp_units);
    MPIU_Free((*win_ptr)->all_win_handles);
    MPIU_Free((*win_ptr)->pt_rma_puts_accs);

    /* Free the attached buffer for windows created with MPI_Win_allocate() */
    if (((*win_ptr)->create_flavor == MPI_WIN_FLAVOR_ALLOCATE ||
         (*win_ptr)->create_flavor == MPI_WIN_FLAVOR_SHARED) &&
        (*win_ptr)->shm_allocated == FALSE &&
        (*win_ptr)->size > 0) {
        MPIU_Free((*win_ptr)->base);
    }

    MPIU_Object_release_ref(*win_ptr, &still_referenced);
    /* MPI windows don't have reference count semantics, so this should always be true */
    MPIU_Assert(!still_referenced);
    MPIU_Handle_obj_free( &MPID_Win_mem, *win_ptr );

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FREE);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
/*
 * MPIDI_CH3U_Win_allocate_shared - placeholder variant.
 *
 * Ignores all of its arguments and unconditionally fails with MPI_ERR_OTHER
 * ("**notimpl").
 */
int MPIDI_CH3U_Win_allocate_shared(MPI_Aint size, int disp_unit, MPID_Info *info,
                                   MPID_Comm *comm_ptr, void **base_ptr,
                                   MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_SHARED);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_SHARED);

    /* shared windows are not available in this variant */
    MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_OTHER, "**notimpl");

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_ALLOCATE_SHARED);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3U_Win_create_dynamic - create a dynamic window.
 *
 * Runs MPIDI_CH3U_Win_create_gather() with base MPI_BOTTOM, size 0 and a
 * displacement unit of 1.
 *
 * Returns MPI_SUCCESS or the error propagated from the gather step.
 */
int MPIDI_CH3U_Win_create_dynamic(MPID_Info *info, MPID_Comm *comm_ptr, MPID_Win **win_ptr )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);

    mpi_errno = MPIDI_CH3U_Win_create_gather(MPI_BOTTOM, 0, 1, info, comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_DYNAMIC);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3_SHM_Win_shared_query - report size, displacement unit and base
 * address of a process's segment in a shared-memory window.
 *
 * Parameters:
 *   win_ptr     - window to query; its create_flavor must be
 *                 MPI_WIN_FLAVOR_SHARED
 *   target_rank - rank to query, or MPI_PROC_NULL to get the first rank whose
 *                 recorded size is greater than zero
 *   size        - [out] segment size
 *   disp_unit   - [out] displacement unit of the selected rank
 *   baseptr     - [out] really a (void **); receives the segment address
 *
 * With MPI_PROC_NULL and no rank of nonzero size, the outputs are 0, 0 and
 * NULL.
 *
 * Returns MPI_SUCCESS, or MPI_ERR_RMA_FLAVOR ("**winflavor") when the window
 * is not a shared-flavor window (outputs are left untouched in that case).
 *
 * Fix: disp_unit used to be left unwritten on every path, so callers read an
 * indeterminate value; it is now filled from win_ptr->disp_units[].
 */
int MPIDI_CH3_SHM_Win_shared_query(MPID_Win *win_ptr, int target_rank, MPI_Aint *size,
                                   int *disp_unit, void *baseptr)
{
    int comm_size;
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);

    comm_size = win_ptr->comm_ptr->local_size;

    if (win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        MPIU_ERR_SETANDJUMP(mpi_errno, MPI_ERR_RMA_FLAVOR, "**winflavor");
    }

    /* Scan the sizes to locate the first process that allocated a nonzero
     * amount of space */
    if (target_rank == MPI_PROC_NULL) {
        int i;

        /* Default, if no processes have size > 0. */
        *size               = 0;
        *disp_unit          = 0;
        *((void**) baseptr) = NULL;

        for (i = 0; i < comm_size; i++) {
            if (win_ptr->sizes[i] > 0) {
                *size               = win_ptr->sizes[i];
                *disp_unit          = win_ptr->disp_units[i];
                *((void**) baseptr) = win_ptr->shm_base_addrs[i];
                break;
            }
        }
    }
    else {
        *size               = win_ptr->sizes[target_rank];
        *disp_unit          = win_ptr->disp_units[target_rank];
        *((void**) baseptr) = win_ptr->shm_base_addrs[target_rank];
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3_WIN_SHARED_QUERY);
    return mpi_errno;

  fn_fail:
    goto fn_exit;
}
/*
 * MPIDI_CH3U_Win_create - create a window over caller-provided memory.
 *
 * Simply forwards all arguments to MPIDI_CH3U_Win_create_gather().
 *
 * Returns MPI_SUCCESS or the error propagated from the gather step.
 */
int MPIDI_CH3U_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info,
                          MPID_Comm *comm_ptr, MPID_Win **win_ptr )
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE);

    mpi_errno = MPIDI_CH3U_Win_create_gather(base, size, disp_unit, info,
                                             comm_ptr, win_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIU_ERR_POP(mpi_errno);

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_Win_shared_query - default shared-query implementation.
 *
 * Reports the calling process's own window: base, size and disp_unit are
 * copied out of win_ptr.  target_rank is not consulted.  baseptr is really
 * a (void **).  Always returns MPI_SUCCESS.
 */
int MPIDI_Win_shared_query(MPID_Win *win_ptr, int target_rank, MPI_Aint *size,
                           int *disp_unit, void *baseptr)
{
    void **base_out = (void **) baseptr;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_SHARED_QUERY);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_SHARED_QUERY);

    *base_out  = win_ptr->base;
    *size      = win_ptr->size;
    *disp_unit = win_ptr->disp_unit;

    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_SHARED_QUERY);
    return MPI_SUCCESS;
}
/*
 * MPID_Get - device-level entry point for get.
 *
 * Hands every argument unchanged to MPIDI_CH3I_Get(), supplying NULL as that
 * routine's trailing argument, and returns the error code produced by that
 * call.
 */
int MPID_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
             int target_rank, MPI_Aint target_disp, int target_count,
             MPI_Datatype target_datatype, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_MPID_GET);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_GET);

    mpi_errno = MPIDI_CH3I_Get(origin_addr, origin_count, origin_datatype,
                               target_rank, target_disp, target_count,
                               target_datatype, win_ptr, NULL);

    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPID_GET);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Win_create_gather - exchange per-process window information.
 *
 * Applies the info hints through the window's RMAFns.Win_set_info handler,
 * allocates the per-process arrays (base_addrs, sizes, disp_units,
 * all_win_handles, pt_rma_puts_accs) and fills them using one in-place
 * allgather over the window's communicator.  Each process contributes four
 * MPI_Aint values: base address, size, displacement unit and window handle.
 *
 * Parameters:
 *   base, size, disp_unit - this process's contribution to the exchange
 *   info                  - passed to the window's Win_set_info handler
 *   comm_ptr              - not used here; (*win_ptr)->comm_ptr is used
 *   win_ptr               - window being set up; its RMAFns must already be
 *                           populated
 *
 * Returns MPI_SUCCESS, or an error code.  On any failure the arrays
 * registered with MPIU_CHKPMEM_MALLOC are reaped.
 *
 * Fix: the result of Win_set_info used to be stored in mpi_errno and then
 * silently overwritten by the allgather result; it is now checked.
 */
int MPIDI_CH3U_Win_create_gather( void *base, MPI_Aint size, int disp_unit,
                                  MPID_Info *info, MPID_Comm *comm_ptr,
                                  MPID_Win **win_ptr )
{
    int mpi_errno=MPI_SUCCESS, i, k, comm_size, rank;
    MPI_Aint *tmp_buf;
    int errflag = FALSE;
    MPIU_CHKPMEM_DECL(5);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);

    comm_size = (*win_ptr)->comm_ptr->local_size;
    rank      = (*win_ptr)->comm_ptr->rank;

    /* RMA handlers should be set before calling this function */
    mpi_errno = (*win_ptr)->RMAFns.Win_set_info(*win_ptr, info);
    /* Do not let a failure here be masked by the calls that follow. */
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }

    MPIU_INSTR_DURATION_START(wincreate_allgather);

    /* allocate memory for the base addresses, disp_units, and
       completion counters of all processes */
    MPIU_CHKPMEM_MALLOC((*win_ptr)->base_addrs, void **,
                        comm_size*sizeof(void *),
                        mpi_errno, "(*win_ptr)->base_addrs");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->sizes, MPI_Aint *,
                        comm_size*sizeof(MPI_Aint),
                        mpi_errno, "(*win_ptr)->sizes");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->disp_units, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->disp_units");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->all_win_handles, MPI_Win *,
                        comm_size*sizeof(MPI_Win),
                        mpi_errno, "(*win_ptr)->all_win_handles");

    MPIU_CHKPMEM_MALLOC((*win_ptr)->pt_rma_puts_accs, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->pt_rma_puts_accs");
    for (i=0; i<comm_size; i++) (*win_ptr)->pt_rma_puts_accs[i] = 0;

    /* get the addresses of the windows, window objects, and completion
       counters of all processes.  allocate temp. buffer for communication */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4*comm_size*sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
       origin, we'd also need the window size. */
    /* Only this process's four slots are filled; the in-place allgather
       supplies the slots of every other rank. */
    tmp_buf[4*rank]   = MPIU_PtrToAint(base);
    tmp_buf[4*rank+1] = size;
    tmp_buf[4*rank+2] = (MPI_Aint) disp_unit;
    tmp_buf[4*rank+3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    MPIU_INSTR_DURATION_END(wincreate_allgather);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* unpack the four values contributed by each rank */
    k = 0;
    for (i=0; i<comm_size; i++) {
        (*win_ptr)->base_addrs[i]      = MPIU_AintToPtr(tmp_buf[k++]);
        (*win_ptr)->sizes[i]           = tmp_buf[k++];
        (*win_ptr)->disp_units[i]      = (int) tmp_buf[k++];
        (*win_ptr)->all_win_handles[i] = (MPI_Win) tmp_buf[k++];
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_CREATE_GATHER);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3I_Put - issue an RMA put on a window.
 *
 * Fails with MPI_ERR_RMA_SYNC ("**rmasync") when the window's access_state is
 * MPIDI_RMA_NONE.  Returns immediately with success when target_rank is
 * MPI_PROC_NULL or when the origin data size is zero.
 *
 * The operation is carried out directly through MPIDI_CH3I_Shm_put_op() when
 * any of the following holds:
 *   - the target is the calling rank,
 *   - the window flavor is MPI_WIN_FLAVOR_SHARED,
 *   - the window has shm_allocated == TRUE and origin and target VCs report
 *     the same node_id.
 * In that case a non-NULL ureq is completed right away.
 *
 * Otherwise an operation element is obtained from the window, filled with a
 * PUT or PUT_IMMED packet, enqueued, and progress is attempted for the
 * target.
 *
 * Parameters:
 *   origin_addr/origin_count/origin_datatype - data to be written
 *   target_rank/target_disp/target_count/target_datatype - destination
 *   win_ptr - window the operation is issued on
 *   ureq    - optional user request (may be NULL); stored in the queued
 *             operation or completed immediately on the local path
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Put(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPID_Win * win_ptr, MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    int dt_contig ATTRIBUTE((unused)), rank;
    MPID_Datatype *dtp;
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPIDI_msg_sz_t data_sz;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_PUT);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_PUT);

    /* An access_state of MPIDI_RMA_NONE is rejected as a synchronization error. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* Only data_sz is consumed below; dt_contig and dt_true_lb are marked unused. */
    MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb);

    if (data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank && win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* If the put is a local operation, do it here */
    /* The node_id comparison is only reached when the first two tests are
     * false and shm_allocated is TRUE, i.e. exactly the case in which both
     * VCs were looked up above. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_put_op(origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count, target_datatype, win_ptr);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_put_t *put_pkt = NULL;
        int use_immed_pkt = FALSE;
        int is_origin_contig, is_target_contig;

        /* queue it up */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);

        /* NOTE(review): the error jumps between here and the matching
         * TIMER_END skip the TIMER_END call - confirm that is acceptable. */
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        /* FIXME: For contig and very short operations, use a streamlined op */
        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* if source or target datatypes are derived, increment their
         * reference counts */
        if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, dtp);
            MPID_Datatype_add_ref(dtp);
        }

        MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);

        /* Judge if we can use IMMED data packet */
        /* Requires predefined, contiguous types on both sides and a payload
         * of at most MPIDI_RMA_IMMED_BYTES. */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig)
        {
            if (data_sz <= MPIDI_RMA_IMMED_BYTES) use_immed_pkt = TRUE;
        }

        /* Judge if this operation is an piggyback candidate */
        if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for both origin and target data. We should extend this optimization to derived
             * datatypes as well. */
            if (data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE) op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        put_pkt = &(op_ptr->pkt.put);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT_IMMED);
        }
        else {
            MPIDI_Pkt_init(put_pkt, MPIDI_CH3_PKT_PUT);
        }

        /* Target address = target's base address + target's disp_unit * target_disp. */
        put_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr + win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        put_pkt->count = target_count;
        put_pkt->datatype = target_datatype;
        put_pkt->info.dataloop_size = 0;
        put_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        put_pkt->source_win_handle = win_ptr->handle;
        put_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (use_immed_pkt) {
            /* The payload travels inside the packet's info.data area. */
            void *src = (void *) origin_addr, *dest = (void *) (put_pkt->info.data);
            mpi_errno = immed_copy(src, dest, data_sz);
            if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);

        /* With a non-negative threshold configured, keep driving the
         * progress engine until the active request count drops below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 && MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_PUT);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_CH3I_Get_accumulate - issue an RMA get-accumulate on a window.
 *
 * Fails with MPI_ERR_RMA_SYNC ("**rmasync") when the window's access_state is
 * MPIDI_RMA_NONE.  Returns immediately with success when target_rank is
 * MPI_PROC_NULL or when the target data size is zero.
 *
 * The operation is carried out directly through MPIDI_CH3I_Shm_get_acc_op()
 * when the target is the calling rank, the window flavor is
 * MPI_WIN_FLAVOR_SHARED, or the window has shm_allocated == TRUE and origin
 * and target VCs report the same node_id.  A non-NULL ureq is completed
 * right away in that case.
 *
 * Otherwise an operation element is obtained from the window, filled with a
 * GET_ACCUM or GET_ACCUM_IMMED packet, enqueued, and progress is attempted
 * for the target.  When op is MPI_NO_OP the origin buffer is treated as
 * empty: its datatype is not inspected and its data size is taken as 0.
 *
 * Parameters:
 *   origin_*  - data combined into the target (ignored for MPI_NO_OP)
 *   result_*  - buffer description recorded for the fetched data
 *   target_*  - location and layout at the target
 *   op        - reduction operation
 *   win_ptr   - window the operation is issued on
 *   ureq      - optional user request (may be NULL)
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_CH3I_Get_accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, void *result_addr, int result_count, MPI_Datatype result_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win * win_ptr, MPID_Request * ureq)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t orig_data_sz, target_data_sz;
    int rank;
    int dt_contig ATTRIBUTE((unused));
    MPI_Aint dt_true_lb ATTRIBUTE((unused));
    MPID_Datatype *dtp;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    int made_progress = 0;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);

    /* An access_state of MPIDI_RMA_NONE is rejected as a synchronization error. */
    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    /* The early-out below is driven by the target-side size, not the origin's. */
    MPIDI_Datatype_get_info(target_count, target_datatype, dt_contig, target_data_sz, dtp, dt_true_lb);

    if (target_data_sz == 0) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank && win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* check if target is local and shared memory is allocated on window,
         * if so, we directly perform this operation on shared memory region. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they do
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* Local path.  (Open question from the original author: test
     * target_rank != rank first, since it may be the more likely branch?)
     * The node_id comparison is only reached when the first two tests are
     * false and shm_allocated is TRUE, i.e. exactly the case in which both
     * VCs were looked up above. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_get_acc_op(origin_addr, origin_count, origin_datatype, result_addr, result_count, result_datatype, target_rank, target_disp, target_count, target_datatype, op, win_ptr);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        if (ureq) {
            /* Complete user request and release the ch3 ref */
            mpi_errno = MPID_Request_complete(ureq);
            if (mpi_errno != MPI_SUCCESS) {
                MPIR_ERR_POP(mpi_errno);
            }
        }
    }
    else {
        MPIDI_RMA_Op_t *op_ptr = NULL;
        MPIDI_CH3_Pkt_get_accum_t *get_accum_pkt;
        MPI_Aint origin_type_size;
        MPI_Aint target_type_size;
        int use_immed_pkt = FALSE, i;
        int is_origin_contig, is_target_contig, is_result_contig;
        MPI_Aint stream_elem_count, stream_unit_count;
        MPI_Aint predefined_dtp_size, predefined_dtp_count, predefined_dtp_extent;
        MPID_Datatype *origin_dtp = NULL, *target_dtp = NULL, *result_dtp = NULL;
        int is_empty_origin = FALSE;

        /* Judge if origin buffer is empty */
        if (op == MPI_NO_OP) is_empty_origin = TRUE;

        /* Append the operation to the window's RMA ops queue */
        mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
        if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);

        /* TODO: Can we use the MPIDI_RMA_ACC_CONTIG optimization? */

        /* NOTE(review): the error jumps between here and the matching
         * TIMER_END skip the TIMER_END call - confirm that is acceptable. */
        MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

        /******************** Setting operation struct areas ***********************/

        op_ptr->origin_addr = (void *) origin_addr;
        op_ptr->origin_count = origin_count;
        op_ptr->origin_datatype = origin_datatype;
        op_ptr->result_addr = result_addr;
        op_ptr->result_count = result_count;
        op_ptr->result_datatype = result_datatype;
        op_ptr->target_rank = target_rank;

        /* Remember user request */
        op_ptr->ureq = ureq;

        /* Look up the datatype objects of any derived datatypes.  Their
         * reference counts are not touched here; they are incremented in the
         * per-stream-unit loop further down. */
        if (is_empty_origin == FALSE && !MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) {
            MPID_Datatype_get_ptr(origin_datatype, origin_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(result_datatype)) {
            MPID_Datatype_get_ptr(result_datatype, result_dtp);
        }
        if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            MPID_Datatype_get_ptr(target_datatype, target_dtp);
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_get_size_macro(origin_datatype, origin_type_size);
            MPIU_Assign_trunc(orig_data_sz, origin_count * origin_type_size, MPIDI_msg_sz_t);
        }
        else {
            /* If origin buffer is empty, set origin data size to 0 */
            orig_data_sz = 0;
        }

        MPID_Datatype_get_size_macro(target_datatype, target_type_size);

        /* Get size and count for predefined datatype elements */
        if (MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            predefined_dtp_size = target_type_size;
            predefined_dtp_count = target_count;
            MPID_Datatype_get_extent_macro(target_datatype, predefined_dtp_extent);
        }
        else {
            /* Derived target type: work in units of its basic_type. */
            MPIU_Assert(target_dtp->basic_type != MPI_DATATYPE_NULL);
            MPID_Datatype_get_size_macro(target_dtp->basic_type, predefined_dtp_size);
            predefined_dtp_count = target_data_sz / predefined_dtp_size;
            MPID_Datatype_get_extent_macro(target_dtp->basic_type, predefined_dtp_extent);
        }
        MPIU_Assert(predefined_dtp_count > 0 && predefined_dtp_size > 0 && predefined_dtp_extent > 0);

        /* Calculate number of predefined elements in each stream unit, and
         * total number of stream units. */
        stream_elem_count = MPIDI_CH3U_Acc_stream_size / predefined_dtp_extent;
        /* ceiling division of predefined_dtp_count by stream_elem_count */
        stream_unit_count = (predefined_dtp_count - 1) / stream_elem_count + 1;
        MPIU_Assert(stream_elem_count > 0 && stream_unit_count > 0);

        /* One reference per stream unit is taken on every derived datatype
         * found above. */
        for (i = 0; i < stream_unit_count; i++) {
            if (origin_dtp != NULL) {
                MPID_Datatype_add_ref(origin_dtp);
            }
            if (target_dtp != NULL) {
                MPID_Datatype_add_ref(target_dtp);
            }
            if (result_dtp != NULL) {
                MPID_Datatype_add_ref(result_dtp);
            }
        }

        if (is_empty_origin == FALSE) {
            MPID_Datatype_is_contig(origin_datatype, &is_origin_contig);
        }
        else {
            /* If origin buffer is empty, mark origin data as contig data */
            is_origin_contig = 1;
        }
        MPID_Datatype_is_contig(target_datatype, &is_target_contig);
        MPID_Datatype_is_contig(result_datatype, &is_result_contig);

        /* Judge if we can use IMMED data packet */
        /* Note the size test uses target_data_sz, while the copy into the
         * packet below uses orig_data_sz. */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) && MPIR_DATATYPE_IS_PREDEFINED(result_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && is_origin_contig && is_target_contig && is_result_contig) {
            if (target_data_sz <= MPIDI_RMA_IMMED_BYTES) use_immed_pkt = TRUE;
        }

        /* Judge if this operation is a piggyback candidate */
        if ((is_empty_origin == TRUE || MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) && MPIR_DATATYPE_IS_PREDEFINED(result_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) {
            /* FIXME: currently we only piggyback LOCK flag with op using predefined datatypes
             * for origin, target and result data. We should extend this optimization to derived
             * datatypes as well. */
            if (orig_data_sz <= MPIR_CVAR_CH3_RMA_OP_PIGGYBACK_LOCK_DATA_SIZE) op_ptr->piggyback_lock_candidate = 1;
        }

        /************** Setting packet struct areas in operation ****************/

        get_accum_pkt = &(op_ptr->pkt.get_accum);

        if (use_immed_pkt) {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM_IMMED);
        }
        else {
            MPIDI_Pkt_init(get_accum_pkt, MPIDI_CH3_PKT_GET_ACCUM);
        }

        /* Target address = target's base address + target's disp_unit * target_disp. */
        get_accum_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr + win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
        get_accum_pkt->count = target_count;
        get_accum_pkt->datatype = target_datatype;
        get_accum_pkt->info.dataloop_size = 0;
        get_accum_pkt->op = op;
        get_accum_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
        get_accum_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

        if (use_immed_pkt) {
            /* The origin payload travels inside the packet's info.data area. */
            void *src = (void *) origin_addr, *dest = (void *) (get_accum_pkt->info.data);
            mpi_errno = immed_copy(src, dest, orig_data_sz);
            if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);
        }

        MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

        mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
        if (mpi_errno) MPIR_ERR_POP(mpi_errno);

        mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
        if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);

        /* With a non-negative threshold configured, keep driving the
         * progress engine until the active request count drops below it. */
        if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0 && MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
                mpi_errno = wait_progress_engine();
                if (mpi_errno != MPI_SUCCESS) MPIR_ERR_POP(mpi_errno);
            }
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3I_GET_ACCUMULATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * check_window_state - advance a window from an "issued" access state to the
 * matching "granted" state when its precondition is met.
 *
 *   FENCE_ISSUED    -> FENCE_GRANTED    once the fence sync request is
 *                                       complete
 *   PSCW_ISSUED     -> PSCW_GRANTED     when no start requests exist, or
 *                                       once all of them are complete
 *   LOCK_ALL_ISSUED -> LOCK_ALL_GRANTED when outstanding_locks is zero
 *
 * Any other state is left alone.  *made_progress is set to 1 when a
 * transition happened and to 0 otherwise.  Always returns MPI_SUCCESS.
 */
static inline int check_window_state(MPID_Win * win_ptr, int *made_progress)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_STATE_DECL(MPID_STATE_CHECK_WINDOW_STATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_CHECK_WINDOW_STATE);

    *made_progress = 0;

    if (win_ptr->states.access_state == MPIDI_RMA_FENCE_ISSUED) {
        MPID_Request *fence_req = NULL;

        MPID_Request_get_ptr(win_ptr->fence_sync_req, fence_req);
        if (MPID_Request_is_complete(fence_req)) {
            win_ptr->states.access_state = MPIDI_RMA_FENCE_GRANTED;
            MPID_Request_release(fence_req);
            win_ptr->fence_sync_req = MPI_REQUEST_NULL;

            num_active_issued_win--;
            MPIU_Assert(num_active_issued_win >= 0);

            *made_progress = 1;
        }
    }
    else if (win_ptr->states.access_state == MPIDI_RMA_PSCW_ISSUED) {
        if (win_ptr->start_req == NULL) {
            /* for MPI_MODE_NOCHECK and all targets on SHM,
             * we do not create PSCW requests on window. */
            win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;

            num_active_issued_win--;
            MPIU_Assert(num_active_issued_win >= 0);

            *made_progress = 1;
        }
        else {
            int idx = 0;

            /* Retire completed start requests in order; stop at the first
             * one that is still pending. */
            while (idx < win_ptr->start_grp_size) {
                if (win_ptr->start_req[idx] != MPI_REQUEST_NULL) {
                    MPID_Request *pending = NULL;

                    MPID_Request_get_ptr(win_ptr->start_req[idx], pending);
                    if (!MPID_Request_is_complete(pending)) {
                        break;
                    }
                    MPID_Request_release(pending);
                    win_ptr->start_req[idx] = MPI_REQUEST_NULL;
                }
                idx++;
            }

            /* the scan ran to the end: every start request is done */
            if (idx == win_ptr->start_grp_size) {
                win_ptr->states.access_state = MPIDI_RMA_PSCW_GRANTED;

                num_active_issued_win--;
                MPIU_Assert(num_active_issued_win >= 0);

                *made_progress = 1;

                MPIU_Free(win_ptr->start_req);
                win_ptr->start_req = NULL;
            }
        }
    }
    else if (win_ptr->states.access_state == MPIDI_RMA_LOCK_ALL_ISSUED) {
        if (win_ptr->outstanding_locks == 0) {
            win_ptr->states.access_state = MPIDI_RMA_LOCK_ALL_GRANTED;
            *made_progress = 1;
        }
    }

    MPIDI_RMA_FUNC_EXIT(MPID_STATE_CHECK_WINDOW_STATE);
    return mpi_errno;
}
/*
 * MPIDI_CH3U_Win_gather_info - collect basic window information from every
 * process of the window's communicator.
 *
 * Allocates (*win_ptr)->basic_info_table with one entry per process and
 * fills it using one in-place allgather.  Each process contributes four
 * MPI_Aint values: base address, size, displacement unit and window handle.
 * The info and comm_ptr arguments are not used here; the communicator is
 * taken from the window.
 *
 * Returns MPI_SUCCESS or an error code; on failure the table registered with
 * MPIU_CHKPMEM_MALLOC is reaped.
 */
int MPIDI_CH3U_Win_gather_info(void *base, MPI_Aint size, int disp_unit, MPID_Info * info,
                               MPID_Comm * comm_ptr, MPID_Win ** win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int nprocs, my_rank, p;
    MPI_Aint *tmp_buf;
    MPI_Aint *slot;
    MPIR_Errflag_t errflag = MPIR_ERR_NONE;
    MPIU_CHKPMEM_DECL(1);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);

    nprocs = (*win_ptr)->comm_ptr->local_size;
    my_rank = (*win_ptr)->comm_ptr->rank;

    MPIR_T_PVAR_TIMER_START(RMA, rma_wincreate_allgather);

    /* allocate memory for the base addresses, disp_units, and
     * completion counters of all processes */
    MPIU_CHKPMEM_MALLOC((*win_ptr)->basic_info_table, MPIDI_Win_basic_info_t *,
                        nprocs * sizeof(MPIDI_Win_basic_info_t),
                        mpi_errno, "(*win_ptr)->basic_info_table");

    /* get the addresses of the windows, window objects, and completion
     * counters of all processes.  allocate temp. buffer for communication */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 4 * nprocs * sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    /* FIXME: If we wanted to validate the transfer as within range at the
     * origin, we'd also need the window size. */
    /* fill in only this process's group of four values */
    slot = &tmp_buf[4 * my_rank];
    slot[0] = MPIU_PtrToAint(base);
    slot[1] = size;
    slot[2] = (MPI_Aint) disp_unit;
    slot[3] = (MPI_Aint) (*win_ptr)->handle;

    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 4, MPI_AINT,
                                    (*win_ptr)->comm_ptr, &errflag);
    MPIR_T_PVAR_TIMER_END(RMA, rma_wincreate_allgather);
    if (mpi_errno) {
        MPIR_ERR_POP(mpi_errno);
    }
    MPIR_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* unpack the group of four values contributed by each process */
    for (p = 0; p < nprocs; p++) {
        slot = &tmp_buf[4 * p];
        (*win_ptr)->basic_info_table[p].base_addr = MPIU_AintToPtr(slot[0]);
        (*win_ptr)->basic_info_table[p].size = slot[1];
        (*win_ptr)->basic_info_table[p].disp_unit = (int) slot[2];
        (*win_ptr)->basic_info_table[p].win_handle = (MPI_Win) slot[3];
    }

  fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_CH3U_WIN_GATHER_INFO);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_Win_create - allocate and initialise a window object.
 *
 * On first use (while initRMAoptions is set) the RMA options are read from
 * the environment.  A window object is then taken from MPID_Win_mem, its
 * fields are reset, the communicator is duplicated for the window, and the
 * base address, displacement unit and window handle of every process are
 * exchanged with an in-place allgather.
 *
 * Parameters:
 *   base, size, disp_unit - local window description
 *   info                  - currently unused
 *   comm_ptr              - communicator over which the window is created
 *   win_ptr               - receives the new window object
 *
 * Returns MPI_SUCCESS or an MPI error code.
 *
 * NOTE(review): the error path only reaps the four per-window arrays; the
 * window object and the duplicated communicator are not released here.
 */
int MPIDI_Win_create(void *base, MPI_Aint size, int disp_unit, MPID_Info *info,
                     MPID_Comm *comm_ptr, MPID_Win **win_ptr )
{
    int mpi_errno = MPI_SUCCESS;
    int i, comm_size, rank;
    MPI_Aint *tmp_buf;
    MPID_Comm *win_comm_ptr;
    MPID_Win *win;
    int errflag = FALSE;
    MPIU_CHKPMEM_DECL(4);
    MPIU_CHKLMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_CREATE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_CREATE);

    /* FIXME: There should be no unreferenced args */
    MPIU_UNREFERENCED_ARG(info);

    if (initRMAoptions) {
        int rc;
        MPIU_THREADSAFE_INIT_BLOCK_BEGIN(initRMAoptions);
        /* Immediate accumulate is enabled unless the environment says otherwise */
        if (!MPL_env2bool( "MPICH_RMA_ACC_IMMED", &rc ))
            rc = 1;
        MPIDI_CH3_RMA_SetAccImmed(rc);
#ifdef USE_MPIU_INSTR
        /* All instrumentation handles used by the CH3 RMA code are defined here */
        MPIU_INSTR_DURATION_INIT(wincreate_allgather,0,"WIN_CREATE:Allgather");
        MPIU_INSTR_DURATION_INIT(winfree_rs,0,"WIN_FREE:ReduceScatterBlock");
        MPIU_INSTR_DURATION_INIT(winfree_complete,0,"WIN_FREE:Complete");
        MPIU_INSTR_DURATION_INIT(rmaqueue_alloc,0,"Allocate RMA Queue element");
        MPIDI_CH3_RMA_InitInstr();
#endif
        MPIU_THREADSAFE_INIT_CLEAR(initRMAoptions);
        MPIU_THREADSAFE_INIT_BLOCK_END(initRMAoptions);
    }

    comm_size = comm_ptr->local_size;
    rank = comm_ptr->rank;

    *win_ptr = (MPID_Win *)MPIU_Handle_obj_alloc( &MPID_Win_mem );
    MPIU_ERR_CHKANDJUMP1(!(*win_ptr),mpi_errno,MPI_ERR_OTHER,"**nomem",
                         "**nomem %s","MPID_Win_mem");
    win = *win_ptr;

    MPIU_Object_set_ref(win, 1);

    /* Reset every per-window field to its initial state. */
    win->fence_cnt = 0;
    win->base = base;
    win->size = size;
    win->disp_unit = disp_unit;
    win->start_group_ptr = NULL;
    win->start_assert = 0;
    win->attributes = NULL;
    win->rma_ops_list_head = NULL;
    win->rma_ops_list_tail = NULL;
    win->lock_granted = 0;
    win->current_lock_type = MPID_LOCK_NONE;
    win->shared_lock_ref_cnt = 0;
    win->lock_queue = NULL;
    win->my_counter = 0;
    win->my_pt_rma_puts_accs = 0;

    /* The window gets its own duplicate of the communicator. */
    mpi_errno = MPIR_Comm_dup_impl(comm_ptr, &win_comm_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);
    win->comm_ptr = win_comm_ptr;
    win->myrank = rank;

    MPIU_INSTR_DURATION_START(wincreate_allgather);

    /* Per-process arrays: base addresses, displacement units, window
     * handles and completion counters. */
    MPIU_CHKPMEM_MALLOC(win->base_addrs, void **,
                        comm_size*sizeof(void *),
                        mpi_errno, "(*win_ptr)->base_addrs");

    MPIU_CHKPMEM_MALLOC(win->disp_units, int *, comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->disp_units");

    MPIU_CHKPMEM_MALLOC(win->all_win_handles, MPI_Win *,
                        comm_size*sizeof(MPI_Win),
                        mpi_errno, "(*win_ptr)->all_win_handles");

    MPIU_CHKPMEM_MALLOC(win->pt_rma_puts_accs, int *,
                        comm_size*sizeof(int),
                        mpi_errno, "(*win_ptr)->pt_rma_puts_accs");
    for (i = 0; i < comm_size; i++) {
        win->pt_rma_puts_accs[i] = 0;
    }

    /* Temporary buffer for the exchange: three MPI_Aint slots per process. */
    MPIU_CHKLMEM_MALLOC(tmp_buf, MPI_Aint *, 3*comm_size*sizeof(MPI_Aint),
                        mpi_errno, "tmp_buf");

    /* FIXME: This needs to be fixed for heterogeneous systems */
    tmp_buf[3*rank] = MPIU_PtrToAint(base);
    tmp_buf[3*rank+1] = (MPI_Aint) disp_unit;
    tmp_buf[3*rank+2] = (MPI_Aint) win->handle;

    /* NOTE(review): the exchange runs on the caller's communicator, not on
     * the duplicate stored in the window. */
    mpi_errno = MPIR_Allgather_impl(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                                    tmp_buf, 3 * sizeof(MPI_Aint), MPI_BYTE,
                                    comm_ptr, &errflag);
    MPIU_INSTR_DURATION_END(wincreate_allgather);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");

    /* Unpack the three gathered values of each rank. */
    for (i = 0; i < comm_size; i++) {
        win->base_addrs[i] = MPIU_AintToPtr(tmp_buf[3*i]);
        win->disp_units[i] = (int) tmp_buf[3*i + 1];
        win->all_win_handles[i] = (MPI_Win) tmp_buf[3*i + 2];
    }

 fn_exit:
    MPIU_CHKLMEM_FREEALL();
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_CREATE);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
int MPIDI_Accumulate(const void *origin_addr, int origin_count, MPI_Datatype origin_datatype, int target_rank, MPI_Aint target_disp, int target_count, MPI_Datatype target_datatype, MPI_Op op, MPID_Win *win_ptr) { int mpi_errno=MPI_SUCCESS; MPIDI_msg_sz_t data_sz; int dt_contig ATTRIBUTE((unused)), rank; MPI_Aint dt_true_lb ATTRIBUTE((unused)); MPID_Datatype *dtp; MPIDI_VC_t *orig_vc, *target_vc; MPIDI_STATE_DECL(MPID_STATE_MPIDI_ACCUMULATE); MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_ACCUMULATE); if (target_rank == MPI_PROC_NULL) { goto fn_exit; } if (win_ptr->epoch_state == MPIDI_EPOCH_NONE && win_ptr->fence_issued) { win_ptr->epoch_state = MPIDI_EPOCH_FENCE; } MPIU_ERR_CHKANDJUMP(win_ptr->epoch_state == MPIDI_EPOCH_NONE, mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync"); MPIDI_Datatype_get_info(origin_count, origin_datatype, dt_contig, data_sz, dtp, dt_true_lb); if (data_sz == 0) { goto fn_exit; } rank = win_ptr->comm_ptr->rank; if (win_ptr->shm_allocated == TRUE && target_rank != rank && win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) { /* check if target is local and shared memory is allocated on window, if so, we directly perform this operation on shared memory region. */ /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on the same node. However, in ch3:sock, even if origin and target are on the same node, they do not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first, which is only set to TRUE when SHM region is allocated in nemesis. In future we need to figure out a way to check if origin and target are in the same "SHM comm". */ MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc); MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc); } /* Do =! rank first (most likely branch?) 
*/ if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED || (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) { mpi_errno = MPIDI_CH3I_Shm_acc_op(origin_addr, origin_count, origin_datatype, target_rank, target_disp, target_count, target_datatype, op, win_ptr); if (mpi_errno) MPIU_ERR_POP(mpi_errno); } else { MPIDI_RMA_Ops_list_t *ops_list = MPIDI_CH3I_RMA_Get_ops_list(win_ptr, target_rank); MPIDI_RMA_Op_t *new_ptr = NULL; /* queue it up */ MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_alloc); mpi_errno = MPIDI_CH3I_RMA_Ops_alloc_tail(ops_list, &new_ptr); MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_alloc); if (mpi_errno) { MPIU_ERR_POP(mpi_errno); } /* If predefined and contiguous, use a simplified element */ if (MPIR_DATATYPE_IS_PREDEFINED(origin_datatype) && MPIR_DATATYPE_IS_PREDEFINED(target_datatype) && enableShortACC) { MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set); new_ptr->type = MPIDI_RMA_ACC_CONTIG; /* Only the information needed for the contig/predefined acc */ /* Cast away const'ness for origin_address as * MPIDI_RMA_Op_t contain both PUT and GET like ops */ new_ptr->origin_addr = (void *) origin_addr; new_ptr->origin_count = origin_count; new_ptr->origin_datatype = origin_datatype; new_ptr->target_rank = target_rank; new_ptr->target_disp = target_disp; new_ptr->target_count = target_count; new_ptr->target_datatype = target_datatype; new_ptr->op = op; MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set); goto fn_exit; } MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set); new_ptr->type = MPIDI_RMA_ACCUMULATE; /* Cast away const'ness for origin_address as MPIDI_RMA_Op_t * contain both PUT and GET like ops */ new_ptr->origin_addr = (void *) origin_addr; new_ptr->origin_count = origin_count; new_ptr->origin_datatype = origin_datatype; new_ptr->target_rank = target_rank; new_ptr->target_disp = target_disp; new_ptr->target_count = target_count; new_ptr->target_datatype = target_datatype; new_ptr->op = op; MPIR_T_PVAR_TIMER_END(RMA, 
rma_rmaqueue_set); /* if source or target datatypes are derived, increment their reference counts */ if (!MPIR_DATATYPE_IS_PREDEFINED(origin_datatype)) { MPID_Datatype_get_ptr(origin_datatype, dtp); MPID_Datatype_add_ref(dtp); } if (!MPIR_DATATYPE_IS_PREDEFINED(target_datatype)) { MPID_Datatype_get_ptr(target_datatype, dtp); MPID_Datatype_add_ref(dtp); } } fn_exit: MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_ACCUMULATE); return mpi_errno; /* --BEGIN ERROR HANDLING-- */ fn_fail: goto fn_exit; /* --END ERROR HANDLING-- */ }
/*
 * MPIDI_Get - perform or enqueue a get operation on a window.
 *
 * Returns immediately when the origin data size is zero or the target is
 * MPI_PROC_NULL.  A get that targets the calling process is carried out at
 * once with MPIR_Localcopy(); any other get is appended to the window's list
 * of pending RMA operations.
 *
 * Parameters:
 *   origin_addr, origin_count, origin_datatype - origin buffer description
 *   target_rank, target_disp, target_count, target_datatype - target description
 *   win_ptr - window object
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_Get(void *origin_addr, int origin_count, MPI_Datatype origin_datatype,
              int target_rank, MPI_Aint target_disp, int target_count,
              MPI_Datatype target_datatype, MPID_Win *win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    MPIDI_msg_sz_t data_sz;
    int dt_contig, rank, predefined;
    MPI_Aint dt_true_lb;
    MPIDI_RMA_ops *entry;
    MPID_Datatype *dtp;
    MPIU_CHKPMEM_DECL(1);
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_GET);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_GET);

    MPIDI_Datatype_get_info(origin_count, origin_datatype,
                            dt_contig, data_sz, dtp, dt_true_lb);

    /* Nothing to transfer. */
    if (data_sz == 0)
        goto fn_exit;
    if (target_rank == MPI_PROC_NULL)
        goto fn_exit;

    rank = win_ptr->myrank;

    /* Local target: copy straight out of the window memory. */
    if (target_rank == rank) {
        mpi_errno = MPIR_Localcopy((char *) win_ptr->base +
                                   win_ptr->disp_unit * target_disp,
                                   target_count, target_datatype,
                                   origin_addr, origin_count,
                                   origin_datatype);
        goto fn_exit;
    }

    /* Remote target: allocate a list element and append it at the tail. */
    MPIU_INSTR_DURATION_START(rmaqueue_alloc);
    MPIU_CHKPMEM_MALLOC(entry, MPIDI_RMA_ops *, sizeof(MPIDI_RMA_ops),
                        mpi_errno, "RMA operation entry");
    MPIU_INSTR_DURATION_END(rmaqueue_alloc);

    if (win_ptr->rma_ops_list_tail) {
        win_ptr->rma_ops_list_tail->next = entry;
    }
    else {
        win_ptr->rma_ops_list_head = entry;
    }
    win_ptr->rma_ops_list_tail = entry;

    /* FIXME: For contig and very short operations, use a streamlined op */
    entry->next = NULL;
    entry->type = MPIDI_RMA_GET;
    entry->origin_addr = origin_addr;
    entry->origin_count = origin_count;
    entry->origin_datatype = origin_datatype;
    entry->target_rank = target_rank;
    entry->target_disp = target_disp;
    entry->target_count = target_count;
    entry->target_datatype = target_datatype;

    /* Derived datatypes referenced by the queued element get an extra
     * reference. */
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(origin_datatype, predefined);
    if (!predefined) {
        MPID_Datatype_get_ptr(origin_datatype, dtp);
        MPID_Datatype_add_ref(dtp);
    }
    MPIDI_CH3I_DATATYPE_IS_PREDEFINED(target_datatype, predefined);
    if (!predefined) {
        MPID_Datatype_get_ptr(target_datatype, dtp);
        MPID_Datatype_add_ref(dtp);
    }

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_GET);
    return mpi_errno;

    /* --BEGIN ERROR HANDLING-- */
 fn_fail:
    MPIU_CHKPMEM_REAP();
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPID_Compare_and_swap - perform or issue a compare-and-swap on a window.
 *
 * Fails with MPI_ERR_RMA_SYNC when the window's access state is
 * MPIDI_RMA_NONE; does nothing for MPI_PROC_NULL.  The operation is executed
 * directly through MPIDI_CH3I_Shm_cas_op() when the target is this process,
 * the window has the shared flavor, or shared memory is allocated and origin
 * and target have the same node id.  Otherwise an RMA operation carrying a
 * CAS packet is built, enqueued, and progress is made on the target.
 *
 * Parameters:
 *   origin_addr  - value to store at the target
 *   compare_addr - value compared against the target
 *   result_addr  - buffer recorded as the operation's result buffer
 *   datatype     - datatype of all three buffers
 *   target_rank, target_disp - target location
 *   win_ptr      - window object
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPID_Compare_and_swap(const void *origin_addr, const void *compare_addr,
                          void *result_addr, MPI_Datatype datatype, int target_rank,
                          MPI_Aint target_disp, MPID_Win * win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int rank;
    int made_progress = 0;
    MPIDI_VC_t *orig_vc = NULL, *target_vc = NULL;
    MPIDI_RMA_Op_t *op_ptr = NULL;
    MPIDI_CH3_Pkt_cas_t *cas_pkt = NULL;
    MPI_Aint type_size;
    MPIDI_STATE_DECL(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPID_COMPARE_AND_SWAP);

    MPIR_ERR_CHKANDJUMP(win_ptr->states.access_state == MPIDI_RMA_NONE,
                        mpi_errno, MPI_ERR_RMA_SYNC, "**rmasync");

    if (target_rank == MPI_PROC_NULL) {
        goto fn_exit;
    }

    rank = win_ptr->comm_ptr->rank;

    if (win_ptr->shm_allocated == TRUE && target_rank != rank &&
        win_ptr->create_flavor != MPI_WIN_FLAVOR_SHARED) {
        /* The VCs are looked up so the node ids of origin and target can be
         * compared below. */

        /* FIXME: Here we decide whether to perform SHM operations by checking if origin and target are on
         * the same node. However, in ch3:sock, even if origin and target are on the same node, they are
         * not within the same SHM region. Here we filter out ch3:sock by checking shm_allocated flag first,
         * which is only set to TRUE when SHM region is allocated in nemesis.
         * In future we need to figure out a way to check if origin and target are in the same "SHM comm".
         */
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, rank, &orig_vc);
        MPIDI_Comm_get_vc(win_ptr->comm_ptr, target_rank, &target_vc);
    }

    /* The datatype must be predefined, and one of: C integer, Fortran integer,
     * Logical, Multi-language types, or Byte.  This is checked above the ADI,
     * so there's no need to check it again here. */

    /* FIXME: For shared memory windows, we should provide an implementation
     * that uses a processor atomic operation. */
    if (target_rank == rank || win_ptr->create_flavor == MPI_WIN_FLAVOR_SHARED ||
        (win_ptr->shm_allocated == TRUE && orig_vc->node_id == target_vc->node_id)) {
        mpi_errno = MPIDI_CH3I_Shm_cas_op(origin_addr, compare_addr, result_addr,
                                          datatype, target_rank, target_disp, win_ptr);
        if (mpi_errno)
            MPIR_ERR_POP(mpi_errno);
        goto fn_exit;
    }

    /* Remote path: obtain an operation element for the RMA ops queue. */
    mpi_errno = MPIDI_CH3I_Win_get_op(win_ptr, &op_ptr);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    MPIR_T_PVAR_TIMER_START(RMA, rma_rmaqueue_set);

    /* ---- operation struct ---- */
    op_ptr->origin_addr = (void *) origin_addr;
    op_ptr->origin_count = 1;
    op_ptr->origin_datatype = datatype;
    op_ptr->result_addr = result_addr;
    op_ptr->result_datatype = datatype;
    op_ptr->compare_addr = (void *) compare_addr;
    op_ptr->compare_datatype = datatype;
    op_ptr->target_rank = target_rank;
    op_ptr->piggyback_lock_candidate = 1;       /* CAS is always able to piggyback LOCK */

    /* ---- packet embedded in the operation ---- */
    cas_pkt = &(op_ptr->pkt.cas);
    MPIDI_Pkt_init(cas_pkt, MPIDI_CH3_PKT_CAS_IMMED);
    cas_pkt->addr = (char *) win_ptr->basic_info_table[target_rank].base_addr +
        win_ptr->basic_info_table[target_rank].disp_unit * target_disp;
    cas_pkt->datatype = datatype;
    cas_pkt->target_win_handle = win_ptr->basic_info_table[target_rank].win_handle;
    cas_pkt->flags = MPIDI_CH3_PKT_FLAG_NONE;

    /* REQUIRE: All datatype arguments must be of the same, builtin
     * type and counts must be 1. */
    MPID_Datatype_get_size_macro(datatype, type_size);
    MPIU_Assert(type_size <= sizeof(MPIDI_CH3_CAS_Immed_u));

    /* Origin and compare values travel inside the packet itself. */
    mpi_errno = immed_copy((void *) origin_addr, (void *) (&(cas_pkt->origin_data)), type_size);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    mpi_errno = immed_copy((void *) compare_addr, (void *) (&(cas_pkt->compare_data)), type_size);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    MPIR_T_PVAR_TIMER_END(RMA, rma_rmaqueue_set);

    mpi_errno = MPIDI_CH3I_Win_enqueue_op(win_ptr, op_ptr);
    if (mpi_errno)
        MPIR_ERR_POP(mpi_errno);

    mpi_errno = MPIDI_CH3I_RMA_Make_progress_target(win_ptr, target_rank, &made_progress);
    if (mpi_errno != MPI_SUCCESS)
        MPIR_ERR_POP(mpi_errno);

    /* With a non-negative threshold configured, wait on the progress engine
     * until the active request count is below it. */
    if (MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD >= 0) {
        while (MPIDI_CH3I_RMA_Active_req_cnt >= MPIR_CVAR_CH3_RMA_ACTIVE_REQ_THRESHOLD) {
            mpi_errno = wait_progress_engine();
            if (mpi_errno != MPI_SUCCESS)
                MPIR_ERR_POP(mpi_errno);
        }
    }

  fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPID_COMPARE_AND_SWAP);
    return mpi_errno;
    /* --BEGIN ERROR HANDLING-- */
  fn_fail:
    goto fn_exit;
    /* --END ERROR HANDLING-- */
}
/*
 * MPIDI_Win_free - release a window object and its per-process arrays.
 *
 * A reduce-scatter over the window communicator sums the pt_rma_puts_accs
 * arrays of all processes, yielding the total this process is compared
 * against.  The progress engine is driven until my_pt_rma_puts_accs equals
 * that total; then the communicator, the four per-process arrays and the
 * window object itself are freed.
 *
 * Parameters:
 *   win_ptr - window object to free
 *
 * Returns MPI_SUCCESS or an MPI error code.
 */
int MPIDI_Win_free(MPID_Win **win_ptr)
{
    int mpi_errno = MPI_SUCCESS;
    int total_pt_rma_puts_accs;
    int in_use;
    int errflag = FALSE;
    MPID_Comm *comm_ptr;
    MPID_Win *win = *win_ptr;
    MPIDI_STATE_DECL(MPID_STATE_MPIDI_WIN_FREE);

    MPIDI_RMA_FUNC_ENTER(MPID_STATE_MPIDI_WIN_FREE);

    comm_ptr = win->comm_ptr;

    MPIU_INSTR_DURATION_START(winfree_rs);
    mpi_errno = MPIR_Reduce_scatter_block_impl(win->pt_rma_puts_accs,
                                               &total_pt_rma_puts_accs, 1,
                                               MPI_INT, MPI_SUM, comm_ptr, &errflag);
    if (mpi_errno) { MPIU_ERR_POP(mpi_errno); }
    MPIU_ERR_CHKANDJUMP(errflag, mpi_errno, MPI_ERR_OTHER, "**coll_fail");
    MPIU_INSTR_DURATION_END(winfree_rs);

    if (total_pt_rma_puts_accs != win->my_pt_rma_puts_accs) {
        MPID_Progress_state progress_state;

        /* Drive the progress engine until the local counter catches up
         * with the expected total. */
        MPIU_INSTR_DURATION_START(winfree_complete);
        MPID_Progress_start(&progress_state);
        while (total_pt_rma_puts_accs != win->my_pt_rma_puts_accs) {
            mpi_errno = MPID_Progress_wait(&progress_state);
            /* --BEGIN ERROR HANDLING-- */
            if (mpi_errno != MPI_SUCCESS) {
                MPID_Progress_end(&progress_state);
                MPIU_ERR_SETANDJUMP(mpi_errno,MPI_ERR_OTHER,"**winnoprogress");
            }
            /* --END ERROR HANDLING-- */
        }
        MPID_Progress_end(&progress_state);
        MPIU_INSTR_DURATION_END(winfree_complete);
    }

    mpi_errno = MPIR_Comm_free_impl(comm_ptr);
    if (mpi_errno) MPIU_ERR_POP(mpi_errno);

    /* Per-process arrays owned by the window. */
    MPIU_Free(win->base_addrs);
    MPIU_Free(win->disp_units);
    MPIU_Free(win->all_win_handles);
    MPIU_Free(win->pt_rma_puts_accs);

    MPIU_Object_release_ref(win, &in_use);
    /* MPI windows don't have reference count semantics, so this should always be true */
    MPIU_Assert(!in_use);
    MPIU_Handle_obj_free( &MPID_Win_mem, win );

 fn_exit:
    MPIDI_RMA_FUNC_EXIT(MPID_STATE_MPIDI_WIN_FREE);
    return mpi_errno;

 fn_fail:
    goto fn_exit;
}