/**
 * Attach ob1 PML state to a newly created communicator.
 *
 * Allocates the per-communicator PML structure, sizes it for the
 * communicator's remote group, stores it in comm->c_pml_comm and retains
 * every peer proc of the remote group.  It then walks the PML-wide
 * non_existing_communicator_pending list and moves every fragment whose
 * context id equals comm->c_contextid onto the per-peer lists of the new
 * communicator: in-sequence fragments go to unexpected_frags, the others
 * are parked on frags_cant_match.
 *
 * @param comm  Communicator being added.
 *
 * @return OMPI_SUCCESS on success; OMPI_ERR_OUT_OF_RESOURCE if the PML
 *         structure cannot be allocated or if the communicator's context
 *         id exceeds mca_pml_ob1.super.pml_max_contextid.
 */
int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
{
    /* allocate pml specific comm data */
    mca_pml_ob1_comm_t* pml_comm = OBJ_NEW(mca_pml_ob1_comm_t);
    opal_list_item_t *item, *next_item;
    mca_pml_ob1_recv_frag_t *frag, *parked;
    mca_pml_ob1_comm_proc_t* pml_proc;
    mca_pml_ob1_match_hdr_t* hdr;
    int i;

    if (NULL == pml_comm) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* should never happen, but it was, so check */
    if (comm->c_contextid > mca_pml_ob1.super.pml_max_contextid) {
        OBJ_RELEASE(pml_comm);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* NOTE(review): the outcome of mca_pml_ob1_comm_init_size() is not
     * checked; if it can fail, the procs[] accesses below need a guard --
     * confirm against its definition. */
    mca_pml_ob1_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
    comm->c_pml_comm = pml_comm;

    /* Remember (and hold a reference on) the proc behind every remote rank. */
    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
        pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
        OBJ_RETAIN(pml_comm->procs[i].ompi_proc);
    }

    /* Grab all related messages from the non_existing_communicator pending queue */
    for( item = opal_list_get_first(&mca_pml_ob1.non_existing_communicator_pending);
         item != opal_list_get_end(&mca_pml_ob1.non_existing_communicator_pending);
         item = next_item ) {
        /* Fetch the successor first: item may be unlinked below. */
        next_item = opal_list_get_next(item);
        frag = (mca_pml_ob1_recv_frag_t*)item;
        hdr = &frag->hdr.hdr_match;

        /* Is this fragment for the current communicator ? */
        if( hdr->hdr_ctx != comm->c_contextid ) {
            continue;
        }

        /* As we now know we work on a fragment for this communicator
         * we should remove it from the
         * non_existing_communicator_pending list. */
        opal_list_remove_item( &mca_pml_ob1.non_existing_communicator_pending,
                               item );

        /* We generate the MSG_ARRIVED event as soon as the PML is aware
         * of a matching fragment arrival, independently of whether it is
         * received in the correct order or not.  This allows tools to
         * figure out if messages are received out of order (e.g. with
         * multiple network interfaces).  It is emitted exactly once per
         * fragment, here, when the fragment leaves the pending queue. */
        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

        /* There is no matching to be done, and no lock to be held on the
         * communicator as we know at this point that the communicator has
         * not yet been returned to the user.  The only required protection
         * is around the non_existing_communicator_pending queue.  We just
         * have to push the fragment into the unexpected list of the
         * corresponding proc, or into the out-of-order (cant_match) list.
         *
         * NOTE(review): hdr_src comes from the fragment header and is used
         * as an index without a range check -- presumably validated before
         * the fragment was queued; confirm. */
        pml_proc = &(pml_comm->procs[hdr->hdr_src]);

        if( ((uint16_t)hdr->hdr_seq) != ((uint16_t)pml_proc->expected_sequence) ) {
            /* Out of order: park it until its predecessors show up. */
            opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
            continue;
        }

        /* In order.  Queue it as unexpected and then, as some fragments
         * may have been parked in the cant_match list, keep pulling the
         * next expected sequence number out of that list.  Otherwise we
         * would end up in a deadlock, as cant_match is only checked when
         * a new fragment is received from the network. */
        while( NULL != frag ) {
            hdr = &frag->hdr.hdr_match;

            /* We're now expecting the next sequence number. */
            pml_proc->expected_sequence++;
            opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
            PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
                                   hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

            /* Look for a parked fragment carrying the sequence number we
             * now expect.  Its arrival was already reported when it was
             * parked, so only the insert event above is emitted for it.
             * The comparison truncates to 16 bits exactly like the
             * in-order test above, so it stays correct across wrap. */
            frag = NULL;
            for( parked = (mca_pml_ob1_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
                 parked != (mca_pml_ob1_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
                 parked = (mca_pml_ob1_recv_frag_t *)opal_list_get_next(parked) ) {
                if( ((uint16_t)parked->hdr.hdr_match.hdr_seq) !=
                    ((uint16_t)pml_proc->expected_sequence) ) {
                    continue;
                }
                opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)parked);
                frag = parked;
                break;
            }
        }
    }
    return OMPI_SUCCESS;
}
void mca_pml_csum_recv_frag_callback_match(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { mca_btl_base_segment_t* segments = des->des_dst; mca_pml_csum_match_hdr_t* hdr = (mca_pml_csum_match_hdr_t*)segments->seg_addr.pval; ompi_communicator_t *comm_ptr; mca_pml_csum_recv_request_t *match = NULL; mca_pml_csum_comm_t *comm; mca_pml_csum_comm_proc_t *proc; mca_pml_csum_recv_frag_t* frag = NULL; size_t num_segments = des->des_dst_cnt; size_t bytes_received = 0; uint16_t csum_received, csum=0; uint32_t csum_data; if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_CSUM_MATCH_HDR_LEN) ) { return; } csum_hdr_ntoh(((mca_pml_csum_hdr_t*) hdr), MCA_PML_CSUM_HDR_TYPE_MATCH); csum_received = hdr->hdr_common.hdr_csum; hdr->hdr_common.hdr_csum = 0; #if OPAL_ENABLE_HETEROGENEOUS_SUPPORT hdr->hdr_common.hdr_flags &= ~MCA_PML_CSUM_HDR_FLAGS_NBO; #endif csum = opal_csum16(hdr, OMPI_PML_CSUM_MATCH_HDR_LEN); hdr->hdr_common.hdr_csum = csum_received; OPAL_OUTPUT_VERBOSE((5, mca_pml_base_output, "%s:%s:%d common_hdr: %02x:%02x:%04x match_hdr: %04x:%04x:%08x:%08x:%08x", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, hdr->hdr_common.hdr_type, hdr->hdr_common.hdr_flags, hdr->hdr_common.hdr_csum, hdr->hdr_ctx, hdr->hdr_seq, hdr->hdr_src, hdr->hdr_tag, hdr->hdr_csum)); if (csum_received != csum) { opal_output(0, "%s:%s:%d: Invalid \'match header\' - received csum:0x%04x != computed csum:0x%04x\n", ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum); orte_notifier.log(ORTE_NOTIFIER_CRIT, 1, "Checksum header violation: job %s file %s line %d", (NULL == orte_job_ident) ? "UNKNOWN" : orte_job_ident, __FILE__, __LINE__); dump_csum_error_data(segments, 1); orte_errmgr.abort(-1,NULL); } /* communicator pointer */ comm_ptr = ompi_comm_lookup(hdr->hdr_ctx); if(OPAL_UNLIKELY(NULL == comm_ptr)) { /* This is a special case. A message for a not yet existing * communicator can happens. 
Instead of doing a matching we * will temporarily add it the a pending queue in the PML. * Later on, when the communicator is completely instantiated, * this pending queue will be searched and all matching fragments * moved to the right communicator. */ append_frag_to_list( &mca_pml_csum.non_existing_communicator_pending, btl, hdr, segments, num_segments, frag ); return; } comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm; /* source sequence number */ proc = &comm->procs[hdr->hdr_src]; /* We generate the MSG_ARRIVED event as soon as the PML is aware * of a matching fragment arrival. Independing if it is received * on the correct order or not. This will allow the tools to * figure out if the messages are not received in the correct * order (if multiple network interfaces). */ PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); /* get next expected message sequence number - if threaded * run, lock to make sure that if another thread is processing * a frag from the same message a match is made only once. * Also, this prevents other posted receives (for a pair of * end points) from being processed, and potentially "loosing" * the fragment. */ OPAL_THREAD_LOCK(&comm->matching_lock); /* get sequence number of next message that can be processed */ if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) || (opal_list_get_size(&proc->frags_cant_match) > 0 ))) { goto slow_path; } /* This is the sequence number we were expecting, so we can try * matching it to already posted receives. */ /* We're now expecting the next sequence number. */ proc->expected_sequence++; /* We generate the SEARCH_POSTED_QUEUE only when the message is * received in the correct sequence. Otherwise, we delay the event * generation until we reach the correct sequence number. 
*/ PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag); /* The match is over. We generate the SEARCH_POSTED_Q_END here, * before going into the mca_pml_csum_check_cantmatch_for_match so * we can make a difference for the searching time for all * messages. */ PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); /* release matching lock before processing fragment */ OPAL_THREAD_UNLOCK(&comm->matching_lock); if(OPAL_LIKELY(match)) { bytes_received = segments->seg_len - OMPI_PML_CSUM_MATCH_HDR_LEN; match->req_recv.req_bytes_packed = bytes_received; MCA_PML_CSUM_RECV_REQUEST_MATCHED(match, hdr); if(bytes_received > 0) { struct iovec iov[2]; uint32_t iov_count = 1; /* * Make user buffer accessable(defined) before unpacking. */ MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_defined, match->req_recv.req_base.req_addr, match->req_recv.req_base.req_count, match->req_recv.req_base.req_datatype); ); iov[0].iov_len = bytes_received; iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval + OMPI_PML_CSUM_MATCH_HDR_LEN); while (iov_count < num_segments) { bytes_received += segments[iov_count].seg_len; iov[iov_count].iov_len = segments[iov_count].seg_len; iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval); iov_count++; } opal_convertor_unpack( &match->req_recv.req_base.req_convertor, iov, &iov_count, &bytes_received ); match->req_bytes_received = bytes_received; /* * Unpacking finished, make the user buffer unaccessable again. */ MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, match->req_recv.req_base.req_addr, match->req_recv.req_base.req_count, match->req_recv.req_base.req_datatype); ); }
void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, mca_btl_base_tag_t tag, mca_btl_base_descriptor_t* des, void* cbdata ) { mca_btl_base_segment_t* segments = des->des_dst; mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval; ompi_communicator_t *comm_ptr; mca_pml_ob1_recv_request_t *match = NULL; mca_pml_ob1_comm_t *comm; mca_pml_ob1_comm_proc_t *proc; mca_pml_ob1_recv_frag_t* frag = NULL; size_t num_segments = des->des_dst_cnt; size_t bytes_received = 0; if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) { return; } ob1_hdr_ntoh(((mca_pml_ob1_hdr_t*) hdr), MCA_PML_OB1_HDR_TYPE_MATCH); /* communicator pointer */ comm_ptr = ompi_comm_lookup(hdr->hdr_ctx); if(OPAL_UNLIKELY(NULL == comm_ptr)) { /* This is a special case. A message for a not yet existing * communicator can happens. Instead of doing a matching we * will temporarily add it the a pending queue in the PML. * Later on, when the communicator is completely instantiated, * this pending queue will be searched and all matching fragments * moved to the right communicator. */ append_frag_to_list( &mca_pml_ob1.non_existing_communicator_pending, btl, hdr, segments, num_segments, frag ); return; } comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm; /* source sequence number */ proc = &comm->procs[hdr->hdr_src]; /* We generate the MSG_ARRIVED event as soon as the PML is aware * of a matching fragment arrival. Independing if it is received * on the correct order or not. This will allow the tools to * figure out if the messages are not received in the correct * order (if multiple network interfaces). */ PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); /* get next expected message sequence number - if threaded * run, lock to make sure that if another thread is processing * a frag from the same message a match is made only once. 
* Also, this prevents other posted receives (for a pair of * end points) from being processed, and potentially "loosing" * the fragment. */ OPAL_THREAD_LOCK(&comm->matching_lock); /* get sequence number of next message that can be processed */ if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) || (opal_list_get_size(&proc->frags_cant_match) > 0 ))) { goto slow_path; } /* This is the sequence number we were expecting, so we can try * matching it to already posted receives. */ /* We're now expecting the next sequence number. */ proc->expected_sequence++; /* We generate the SEARCH_POSTED_QUEUE only when the message is * received in the correct sequence. Otherwise, we delay the event * generation until we reach the correct sequence number. */ PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag); /* The match is over. We generate the SEARCH_POSTED_Q_END here, * before going into the mca_pml_ob1_check_cantmatch_for_match so * we can make a difference for the searching time for all * messages. */ PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr, hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV); /* release matching lock before processing fragment */ OPAL_THREAD_UNLOCK(&comm->matching_lock); if(OPAL_LIKELY(match)) { bytes_received = segments->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN; match->req_recv.req_bytes_packed = bytes_received; MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr); if(match->req_bytes_delivered > 0) { struct iovec iov[2]; uint32_t iov_count = 1; /* * Make user buffer accessable(defined) before unpacking. 
*/ MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_defined, match->req_recv.req_base.req_addr, match->req_recv.req_base.req_count, match->req_recv.req_base.req_datatype); ); iov[0].iov_len = bytes_received; iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval + OMPI_PML_OB1_MATCH_HDR_LEN); while (iov_count < num_segments) { bytes_received += segments[iov_count].seg_len; iov[iov_count].iov_len = segments[iov_count].seg_len; iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval); iov_count++; } ompi_convertor_unpack( &match->req_recv.req_base.req_convertor, iov, &iov_count, &bytes_received ); match->req_bytes_received = bytes_received; /* * Unpacking finished, make the user buffer unaccessable again. */ MEMCHECKER( memchecker_call(&opal_memchecker_base_mem_noaccess, match->req_recv.req_base.req_addr, match->req_recv.req_base.req_count, match->req_recv.req_base.req_datatype); ); }