/*
 * clear_nacks - put every NACKed data operation of a target back into the
 * QUEUED state and mark the target as active again.
 *
 * target_id: process id of the target whose NACKs are being cleared.
 *
 * After the states are reset, poke_progress() is invoked so the re-queued
 * operations get a chance to be issued.
 *
 * Returns PTL_OK on success, otherwise the error code reported by
 * find_target() or poke_progress().
 */
static int clear_nacks(ptl_process_t target_id)
{
    int ret = PTL_OK;
    struct rptl_op *cur;
    struct rptl_target *target;
    MPIDI_STATE_DECL(MPID_STATE_CLEAR_NACKS);

    MPIDI_FUNC_ENTER(MPID_STATE_CLEAR_NACKS);

    ret = find_target(target_id, &target);
    RPTLU_ERR_POP(ret, "error finding target\n");

    for (cur = target->data_op_list; cur != NULL; cur = cur->next) {
        int same_target = 0;

        /* the target id lives in a different union member for PUTs and GETs */
        if (cur->op_type == RPTL_OP_PUT)
            same_target = IDS_ARE_EQUAL(cur->u.put.target_id, target_id);
        else if (cur->op_type == RPTL_OP_GET)
            same_target = IDS_ARE_EQUAL(cur->u.get.target_id, target_id);

        if (!same_target)
            continue;

        if (cur->state == RPTL_OP_STATE_NACKED)
            cur->state = RPTL_OP_STATE_QUEUED;
    }

    target->state = RPTL_TARGET_STATE_ACTIVE;

    ret = poke_progress();
    RPTLU_ERR_POP(ret, "error in poke_progress\n");

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_CLEAR_NACKS);
    return ret;

  fn_fail:
    goto fn_exit;
}
static int rptl_put(ptl_handle_md_t md_handle, ptl_size_t local_offset, ptl_size_t length, ptl_ack_req_t ack_req, ptl_process_t target_id, ptl_pt_index_t pt_index, ptl_match_bits_t match_bits, ptl_size_t remote_offset, void *user_ptr, ptl_hdr_data_t hdr_data, enum rptl_pt_type pt_type) { struct rptl_op *op; int ret = PTL_OK; struct rptl_target *target; MPIDI_STATE_DECL(MPID_STATE_RPTL_PUT); MPIDI_FUNC_ENTER(MPID_STATE_RPTL_PUT); ret = find_target(target_id, &target); RPTLU_ERR_POP(ret, "error finding target structure\n"); ret = rptli_op_alloc(&op, target); RPTLU_ERR_POP(ret, "error allocating op\n"); op->op_type = RPTL_OP_PUT; op->state = RPTL_OP_STATE_QUEUED; /* store the user parameters */ op->u.put.md_handle = md_handle; op->u.put.local_offset = local_offset; op->u.put.length = length; op->u.put.ack_req = ack_req; op->u.put.target_id = target_id; op->u.put.pt_index = pt_index; op->u.put.match_bits = match_bits; op->u.put.remote_offset = remote_offset; op->u.put.user_ptr = user_ptr; op->u.put.hdr_data = hdr_data; /* place to store the send and ack events */ op->u.put.send = NULL; op->u.put.ack = NULL; op->u.put.pt_type = pt_type; op->events_ready = 0; op->target = target; if (op->u.put.pt_type == RPTL_PT_DATA) MPL_DL_APPEND(target->data_op_list, op); else MPL_DL_APPEND(target->control_op_list, op); ret = poke_progress(); RPTLU_ERR_POP(ret, "Error from poke_progress\n"); fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_RPTL_PUT); return ret; fn_fail: goto fn_exit; }
int MPID_nem_ptl_rptl_ptinit(ptl_handle_ni_t ni_handle, ptl_handle_eq_t eq_handle, ptl_pt_index_t data_pt, ptl_pt_index_t control_pt) { int ret = PTL_OK; struct rptl *rptl; int mpi_errno = MPI_SUCCESS; int i; ptl_md_t md; MPIU_CHKPMEM_DECL(2); MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RPTL_PTINIT); MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RPTL_PTINIT); /* setup the parts of rptls that can be done before world size or * target information */ MPIU_CHKPMEM_MALLOC(rptl, struct rptl *, sizeof(struct rptl), mpi_errno, "rptl"); MPL_DL_APPEND(rptl_info.rptl_list, rptl); rptl->local_state = RPTL_LOCAL_STATE_ACTIVE; rptl->pause_ack_counter = 0; rptl->data.ob_max_count = 0; rptl->data.ob_curr_count = 0; rptl->data.pt = data_pt; rptl->control.pt = control_pt; rptl->ni = ni_handle; rptl->eq = eq_handle; md.start = 0; md.length = (ptl_size_t) (-1); md.options = 0x0; md.eq_handle = rptl->eq; md.ct_handle = PTL_CT_NONE; ret = PtlMDBind(rptl->ni, &md, &rptl->md); RPTLU_ERR_POP(ret, "Error binding new global MD\n"); /* post world_size number of empty buffers on the control portal */ if (rptl->control.pt != PTL_PT_ANY) { MPIU_CHKPMEM_MALLOC(rptl->control.me, ptl_handle_me_t *, 2 * rptl_info.world_size * sizeof(ptl_handle_me_t), mpi_errno, "rptl target info"); for (i = 0; i < 2 * rptl_info.world_size; i++) { ret = rptli_post_control_buffer(rptl->ni, rptl->control.pt, &rptl->control.me[i]); RPTLU_ERR_POP(ret, "Error in rptli_post_control_buffer\n"); } rptl->control.me_idx = 0; }
/*
 * MPID_nem_ptl_rptl_ptfini - tear down the rptl whose data portal is
 * pt_index.
 *
 * pt_index: data portal index identifying the rptl to destroy.  An rptl
 *           with this data portal must be present in rptl_info.rptl_list
 *           (enforced with assert).
 *
 * If the rptl has a control portal, its 2 * world_size control match
 * entries are unlinked and the handle array is freed.  The rptl is then
 * removed from the global list and freed.
 *
 * Returns PTL_OK on success, or the PtlMEUnlink() error code; on such an
 * error the function returns early, leaving the rptl on the list.
 */
int MPID_nem_ptl_rptl_ptfini(ptl_pt_index_t pt_index)
{
    struct rptl *victim;
    int ret = PTL_OK;
    int idx;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RPTL_PTFINI);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RPTL_PTFINI);

    /* locate the rptl that owns this data portal */
    victim = rptl_info.rptl_list;
    while (victim && victim->data.pt != pt_index)
        victim = victim->next;
    assert(victim);

    /* release the control buffers, if a control portal was configured */
    if (victim->control.pt != PTL_PT_ANY) {
        idx = 0;
        while (idx < rptl_info.world_size * 2) {
            ret = PtlMEUnlink(victim->control.me[idx]);
            RPTLU_ERR_POP(ret, "Error unlinking control buffers\n");
            idx++;
        }
        MPIU_Free(victim->control.me);
    }

    MPL_DL_DELETE(rptl_info.rptl_list, victim);
    MPIU_Free(victim);

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_RPTL_PTFINI);
    return ret;

  fn_fail:
    goto fn_exit;
}
/*
 * MPID_nem_ptl_rptl_get - record a GET request as a queued data operation
 * on its target and kick the progress engine.
 *
 * The GET is not handed to Portals here; the arguments are stored in a
 * freshly allocated op appended to the target's data list, and
 * poke_progress() is called.
 *
 * md_handle, local_offset, length, target_id, pt_index, match_bits,
 * remote_offset, user_ptr: the caller's GET arguments, stored verbatim.
 *
 * Returns PTL_OK on success, otherwise the error code from find_target(),
 * rptli_op_alloc() or poke_progress().
 */
int MPID_nem_ptl_rptl_get(ptl_handle_md_t md_handle, ptl_size_t local_offset, ptl_size_t length,
                          ptl_process_t target_id, ptl_pt_index_t pt_index,
                          ptl_match_bits_t match_bits, ptl_size_t remote_offset, void *user_ptr)
{
    struct rptl_target *tgt;
    struct rptl_op *get_op;
    int ret = PTL_OK;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RPTL_GET);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RPTL_GET);

    ret = find_target(target_id, &tgt);
    RPTLU_ERR_POP(ret, "error finding target structure\n");

    ret = rptli_op_alloc(&get_op, tgt);
    RPTLU_ERR_POP(ret, "error allocating op\n");

    /* bookkeeping common to every op */
    get_op->target = tgt;
    get_op->events_ready = 0;
    get_op->state = RPTL_OP_STATE_QUEUED;
    get_op->op_type = RPTL_OP_GET;

    /* remember the caller's arguments until the op is actually issued */
    get_op->u.get.user_ptr = user_ptr;
    get_op->u.get.remote_offset = remote_offset;
    get_op->u.get.match_bits = match_bits;
    get_op->u.get.pt_index = pt_index;
    get_op->u.get.target_id = target_id;
    get_op->u.get.length = length;
    get_op->u.get.local_offset = local_offset;
    get_op->u.get.md_handle = md_handle;

    /* GETs always travel on the data list */
    MPL_DL_APPEND(tgt->data_op_list, get_op);

    ret = poke_progress();
    RPTLU_ERR_POP(ret, "Error from poke_progress\n");

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_RPTL_GET);
    return ret;

  fn_fail:
    goto fn_exit;
}
static int send_pause_messages(struct rptl *rptl) { int i, mpi_errno = MPI_SUCCESS; ptl_process_t id; ptl_pt_index_t data_pt, control_pt; int ret = PTL_OK; MPIDI_STATE_DECL(MPID_STATE_SEND_PAUSE_MESSAGES); MPIDI_FUNC_ENTER(MPID_STATE_SEND_PAUSE_MESSAGES); /* if no control portal is setup for this rptl, we are doomed */ assert(rptl->control.pt != PTL_PT_ANY); /* set the max message count in the overflow buffers we can keep * before sending the unpause messages */ rptl->data.ob_max_count = rptl->data.ob_curr_count / 2; for (i = 0; i < rptl_info.world_size; i++) { if (i == MPIDI_Process.my_pg_rank) continue; mpi_errno = rptl_info.get_target_info(i, &id, rptl->data.pt, &data_pt, &control_pt); if (mpi_errno) { ret = PTL_FAIL; RPTLU_ERR_POP(ret, "Error getting target info while sending pause messages\n"); } /* make sure the user setup a control portal */ assert(control_pt != PTL_PT_ANY); ret = rptl_put(rptl->md, 0, 0, PTL_NO_ACK_REQ, id, control_pt, 0, 0, NULL, RPTL_CONTROL_MSG_PAUSE, RPTL_PT_CONTROL); RPTLU_ERR_POP(ret, "Error sending pause message\n"); } fn_exit: MPIDI_FUNC_EXIT(MPID_STATE_SEND_PAUSE_MESSAGES); return ret; fn_fail: goto fn_exit; }
/*
 * MPID_nem_ptl_rptl_drain_eq - wait for all outstanding operations to
 * finish, then free every target structure.
 *
 * eq_count: number of entries in eq.
 * eq:       event queues to poll while waiting.
 *
 * Phase 1: for each target, poll all event queues through
 * MPID_nem_ptl_rptl_eqget() until both of the target's op lists are empty.
 * The events themselves are discarded; PTL_EQ_EMPTY is not treated as an
 * error.  (Presumably eqget retires completed ops from the lists; that
 * code is not visible here -- TODO confirm.)
 *
 * Phase 2: free each target's op pool segments and the target itself, then
 * reset rptl_info.target_list so the global list head does not keep
 * pointing at freed memory.
 *
 * Returns PTL_OK on success, or the error code from
 * MPID_nem_ptl_rptl_eqget(); on error nothing is freed.
 */
int MPID_nem_ptl_rptl_drain_eq(int eq_count, ptl_handle_eq_t *eq)
{
    int ret = PTL_OK;
    ptl_event_t event;
    struct rptl_op_pool_segment *op_segment;
    int i;
    struct rptl_target *target, *t;
    MPIDI_STATE_DECL(MPID_STATE_MPID_NEM_PTL_RPTL_FINALIZE);

    MPIDI_FUNC_ENTER(MPID_STATE_MPID_NEM_PTL_RPTL_FINALIZE);

    for (target = rptl_info.target_list; target; target = target->next) {
        while (target->control_op_list || target->data_op_list) {
            for (i = 0; i < eq_count; i++) {
                /* read and ignore all events */
                ret = MPID_nem_ptl_rptl_eqget(eq[i], &event);
                if (ret == PTL_EQ_EMPTY)
                    ret = PTL_OK;
                RPTLU_ERR_POP(ret, "Error calling MPID_nem_ptl_rptl_eqget\n");
            }
        }
    }

    for (target = rptl_info.target_list; target;) {
        assert(target->data_op_list == NULL);
        assert(target->control_op_list == NULL);

        while (target->op_segment_list) {
            op_segment = target->op_segment_list;
            MPL_DL_DELETE(target->op_segment_list, op_segment);
            MPIU_Free(op_segment);
        }

        /* grab the successor before the node is released */
        t = target->next;
        MPIU_Free(target);
        target = t;
    }

    /* every node has been freed; do not leave a dangling list head behind */
    rptl_info.target_list = NULL;

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_MPID_NEM_PTL_RPTL_FINALIZE);
    return ret;

  fn_fail:
    goto fn_exit;
}
/*
 * get_event_info - classify a Portals event and look up the object it
 * belongs to.
 *
 * event:    the event to classify.
 * ret_rptl: out; for target-side events, the rptl whose data or control
 *           portal matches event->pt_index; NULL for origin-side events.
 * ret_op:   out; for origin-side events (SEND, REPLY, ACK), the op stored
 *           in event->user_ptr; NULL for target-side events.
 *
 * Origin-side events also return one origin event credit
 * (rptl_info.origin_events_left), and REPLY/ACK events decrement the
 * target's issued_data_ops counter.  poke_progress() is then called so
 * pending ops can use the freed resources.
 *
 * Returns PTL_OK on success, or the error code from poke_progress(); in
 * the error case *ret_rptl and *ret_op are not written.
 */
static int get_event_info(ptl_event_t * event, struct rptl **ret_rptl, struct rptl_op **ret_op)
{
    struct rptl *rptl;
    struct rptl_op *op;
    int ret = PTL_OK;
    MPIDI_STATE_DECL(MPID_STATE_GET_EVENT_INFO);

    MPIDI_FUNC_ENTER(MPID_STATE_GET_EVENT_INFO);

    if (event->type == PTL_EVENT_SEND || event->type == PTL_EVENT_REPLY ||
        event->type == PTL_EVENT_ACK) {
        op = (struct rptl_op *) event->user_ptr;

        /* validate the op before it is dereferenced below */
        assert(op);

        rptl_info.origin_events_left++;

        if (event->type != PTL_EVENT_SEND)
            op->target->issued_data_ops--;

        /* see if there are any pending ops to be issued */
        ret = poke_progress();
        RPTLU_ERR_POP(ret, "Error returned from poke_progress\n");

        rptl = NULL;
    }
    else {
        /* for all target-side events, we look up the rptl based on
         * the pt_index */
        for (rptl = rptl_info.rptl_list; rptl; rptl = rptl->next)
            if (rptl->data.pt == event->pt_index || rptl->control.pt == event->pt_index)
                break;

        assert(rptl);
        op = NULL;
    }

    *ret_rptl = rptl;
    *ret_op = op;

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_GET_EVENT_INFO);
    return ret;

  fn_fail:
    goto fn_exit;
}
/*
 * rptli_post_control_buffer - append one zero-length, use-once match entry
 * to the priority list of a portal.
 *
 * ni_handle: network interface the entry is appended on.
 * pt:        portal table index receiving the entry.
 * me_handle: out; handle of the appended match entry.
 *
 * The entry accepts PUT and GET from any nid/pid and any uid, with match
 * and ignore bits of 0, and has link/unlink events disabled.  The append
 * is retried for as long as PtlMEAppend() returns PTL_NO_SPACE.
 *
 * Returns the final PtlMEAppend() result.
 */
int rptli_post_control_buffer(ptl_handle_ni_t ni_handle, ptl_pt_index_t pt,
                              ptl_handle_me_t * me_handle)
{
    ptl_process_t any_proc;
    ptl_me_t entry;
    int ret;
    MPIDI_STATE_DECL(MPID_STATE_RPTLI_POST_CONTROL_BUFFER);

    MPIDI_FUNC_ENTER(MPID_STATE_RPTLI_POST_CONTROL_BUFFER);

    /* wildcard initiator */
    any_proc.phys.pid = PTL_PID_ANY;
    any_proc.phys.nid = PTL_NID_ANY;

    /* empty buffer: control messages carry no payload */
    entry.start = NULL;
    entry.length = 0;
    entry.min_free = 0;

    entry.match_id = any_proc;
    entry.match_bits = 0;
    entry.ignore_bits = 0;

    entry.uid = PTL_UID_ANY;
    entry.ct_handle = PTL_CT_NONE;
    entry.options = (PTL_ME_OP_PUT | PTL_ME_OP_GET | PTL_ME_USE_ONCE |
                     PTL_ME_IS_ACCESSIBLE | PTL_ME_EVENT_LINK_DISABLE |
                     PTL_ME_EVENT_UNLINK_DISABLE);

    /* keep trying until the append stops failing for lack of space */
    do {
        ret = PtlMEAppend(ni_handle, pt, &entry, PTL_PRIORITY_LIST, NULL, me_handle);
    } while (ret == PTL_NO_SPACE);
    RPTLU_ERR_POP(ret, "Error appending empty buffer to priority list\n");

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_RPTLI_POST_CONTROL_BUFFER);
    return ret;

  fn_fail:
    goto fn_exit;
}
/*
 * poke_progress - drive the reliability layer's state machines forward.
 *
 * Pass 1 walks every local rptl: one that is awaiting pause acks, has
 * heard from all world_size - 1 peers and is no longer over its overflow
 * limit gets its data portal re-enabled and sends UNPAUSE to every other
 * rank.
 *
 * Pass 2 walks every target: a target that received a PAUSE is sent a
 * PAUSE_ACK once none of its data ops is in the ISSUED state; otherwise
 * queued control ops are issued first, followed by queued data ops (unless
 * the target is disabled or pause-acked).
 *
 * Issuing is throttled by rptl_info.origin_events_left (a PUT takes 2, a
 * GET takes 1) and by the per-target issued_data_ops counter.
 *
 * NOTE: rptl_put() calls back into poke_progress(), so this function is
 * re-entered while sending control messages.  The state updates that
 * precede each rptl_put() call (local_state / target->state) keep the
 * nested invocation from taking the same branch again.
 *
 * Returns PTL_OK on success, PTL_FAIL if the target-info callback fails,
 * or the error code from PtlPTEnable(), rptl_put(), PtlPut() or PtlGet().
 */
static int poke_progress(void)
{
    int ret = PTL_OK;
    struct rptl_target *target;
    struct rptl_op *op;
    struct rptl *rptl;
    int i;
    int mpi_errno = MPI_SUCCESS;
    ptl_process_t id;
    ptl_pt_index_t data_pt, control_pt;
    MPIDI_STATE_DECL(MPID_STATE_POKE_PROGRESS);

    MPIDI_FUNC_ENTER(MPID_STATE_POKE_PROGRESS);

    /* make progress on local RPTLs */
    for (rptl = rptl_info.rptl_list; rptl; rptl = rptl->next) {
        /* if the local state is active, there's nothing to do */
        if (rptl->local_state == RPTL_LOCAL_STATE_ACTIVE)
            continue;

        /* if we are in a local AWAITING PAUSE ACKS state, see if we
         * can send out the unpause message; this requires an ack from
         * every rank other than ourselves */
        if (rptl->local_state == RPTL_LOCAL_STATE_AWAITING_PAUSE_ACKS &&
            rptl->pause_ack_counter == rptl_info.world_size - 1) {
            /* if we are over the max count limit, do not send an
             * unpause message yet */
            if (rptl->data.ob_curr_count > rptl->data.ob_max_count)
                continue;

            ret = PtlPTEnable(rptl->ni, rptl->data.pt);
            RPTLU_ERR_POP(ret, "Error returned while reenabling PT\n");

            /* go active before sending: rptl_put() re-enters this
             * function and must skip this rptl */
            rptl->local_state = RPTL_LOCAL_STATE_ACTIVE;

            for (i = 0; i < rptl_info.world_size; i++) {
                if (i == MPIDI_Process.my_pg_rank)
                    continue;
                mpi_errno = rptl_info.get_target_info(i, &id, rptl->data.pt, &data_pt,
                                                      &control_pt);
                if (mpi_errno) {
                    ret = PTL_FAIL;
                    RPTLU_ERR_POP(ret, "Error getting target info\n");
                }

                /* make sure the user setup a control portal */
                assert(control_pt != PTL_PT_ANY);

                ret = rptl_put(rptl->md, 0, 0, PTL_NO_ACK_REQ, id, control_pt, 0, 0,
                               NULL, RPTL_CONTROL_MSG_UNPAUSE, RPTL_PT_CONTROL);
                RPTLU_ERR_POP(ret, "Error sending unpause message\n");
            }
        }
    }

    /* make progress on targets */
    for (target = rptl_info.target_list; target; target = target->next) {
        if (target->state == RPTL_TARGET_STATE_RECEIVED_PAUSE) {
            /* hold the pause ack back while any data op is still in the
             * ISSUED state */
            for (op = target->data_op_list; op; op = op->next)
                if (op->state == RPTL_OP_STATE_ISSUED)
                    break;
            if (op)
                continue;

            /* send a pause ack message */
            assert(target->rptl);
            for (i = 0; i < rptl_info.world_size; i++) {
                if (i == MPIDI_Process.my_pg_rank)
                    continue;
                /* find the target that has this target id and get the
                 * control portal information for it */
                mpi_errno = rptl_info.get_target_info(i, &id, target->rptl->data.pt,
                                                      &data_pt, &control_pt);
                if (mpi_errno) {
                    ret = PTL_FAIL;
                    RPTLU_ERR_POP(ret, "Error getting target info\n");
                }
                if (IDS_ARE_EQUAL(id, target->id))
                    break;
            }
            /* NOTE(review): if no rank matched target->id, id and
             * control_pt hold whatever the last queried rank returned
             * (or are unset when no rank was queried) -- confirm that a
             * match is guaranteed */

            /* make sure the user setup a control portal */
            assert(control_pt != PTL_PT_ANY);

            /* record the ack before sending: rptl_put() re-enters this
             * function and must not ack this target twice */
            target->state = RPTL_TARGET_STATE_PAUSE_ACKED;

            ret = rptl_put(target->rptl->md, 0, 0, PTL_NO_ACK_REQ, id, control_pt, 0, 0,
                           NULL, RPTL_CONTROL_MSG_PAUSE_ACK, RPTL_PT_CONTROL);
            RPTLU_ERR_POP(ret, "Error sending pause ack message\n");

            continue;
        }

        /* issue out all the control messages first */
        for (op = target->control_op_list; op; op = op->next) {
            assert(op->op_type == RPTL_OP_PUT);

            /* skip all the issued ops */
            if (op->state == RPTL_OP_STATE_ISSUED)
                continue;

            /* we should not get any NACKs on the control portal */
            assert(op->state != RPTL_OP_STATE_NACKED);

            if (rptl_info.origin_events_left < 2 ||
                target->issued_data_ops > PER_TARGET_THRESHOLD) {
                /* too few origin events left (or too many ops already
                 * in flight to this target).  we can't issue this op
                 * or any following op to this target in order to
                 * maintain ordering */
                break;
            }

            /* a PUT takes two origin events (presumably one for SEND and
             * one for ACK); control ops are counted in issued_data_ops as
             * well */
            rptl_info.origin_events_left -= 2;
            target->issued_data_ops++;

            /* force request for an ACK even if the user didn't ask
             * for it.  replace the user pointer with the OP id. */
            ret = PtlPut(op->u.put.md_handle, op->u.put.local_offset, op->u.put.length,
                         PTL_ACK_REQ, op->u.put.target_id, op->u.put.pt_index,
                         op->u.put.match_bits, op->u.put.remote_offset, op,
                         op->u.put.hdr_data);
            RPTLU_ERR_POP(ret, "Error issuing PUT\n");

            op->state = RPTL_OP_STATE_ISSUED;
        }

        /* no data traffic to a target that is disabled or that we have
         * just acknowledged a pause for */
        if (target->state == RPTL_TARGET_STATE_DISABLED ||
            target->state == RPTL_TARGET_STATE_PAUSE_ACKED)
            continue;

        /* then issue out all the data messages */
        for (op = target->data_op_list; op; op = op->next) {
            if (op->op_type == RPTL_OP_PUT) {
                /* skip all the issued ops */
                if (op->state == RPTL_OP_STATE_ISSUED)
                    continue;

                /* if an op has been nacked, don't issue anything else
                 * to this target */
                if (op->state == RPTL_OP_STATE_NACKED)
                    break;

                if (rptl_info.origin_events_left < 2 ||
                    target->issued_data_ops > PER_TARGET_THRESHOLD) {
                    /* too few origin events left (or too many ops
                     * already in flight to this target).  we can't
                     * issue this op or any following op to this target
                     * in order to maintain ordering */
                    break;
                }

                rptl_info.origin_events_left -= 2;
                target->issued_data_ops++;

                /* force request for an ACK even if the user didn't
                 * ask for it.  replace the user pointer with the OP
                 * id. */
                ret = PtlPut(op->u.put.md_handle, op->u.put.local_offset, op->u.put.length,
                             PTL_ACK_REQ, op->u.put.target_id, op->u.put.pt_index,
                             op->u.put.match_bits, op->u.put.remote_offset, op,
                             op->u.put.hdr_data);
                RPTLU_ERR_POP(ret, "Error issuing PUT\n");
            }
            else if (op->op_type == RPTL_OP_GET) {
                /* skip all the issued ops */
                if (op->state == RPTL_OP_STATE_ISSUED)
                    continue;

                /* if an op has been nacked, don't issue anything else
                 * to this target */
                if (op->state == RPTL_OP_STATE_NACKED)
                    break;

                if (rptl_info.origin_events_left < 1 ||
                    target->issued_data_ops > PER_TARGET_THRESHOLD) {
                    /* too few origin events left (or too many ops
                     * already in flight to this target).  we can't
                     * issue this op or any following op to this target
                     * in order to maintain ordering */
                    break;
                }

                /* a GET takes a single origin event */
                rptl_info.origin_events_left--;
                target->issued_data_ops++;

                ret = PtlGet(op->u.get.md_handle, op->u.get.local_offset, op->u.get.length,
                             op->u.get.target_id, op->u.get.pt_index, op->u.get.match_bits,
                             op->u.get.remote_offset, op);
                RPTLU_ERR_POP(ret, "Error issuing GET\n");
            }

            /* reached only when the op was issued just above (the
             * continue/break paths skip it).  NOTE(review): an op whose
             * type is neither PUT nor GET would also be marked ISSUED
             * here without anything being sent */
            op->state = RPTL_OP_STATE_ISSUED;
        }
    }

  fn_exit:
    MPIDI_FUNC_EXIT(MPID_STATE_POKE_PROGRESS);
    return ret;

  fn_fail:
    goto fn_exit;
}