Example #1
0
int mca_pml_ob1_add_comm(ompi_communicator_t* comm)
{
    /* allocate pml specific comm data */
    mca_pml_ob1_comm_t* pml_comm = OBJ_NEW(mca_pml_ob1_comm_t);
    opal_list_item_t *item, *next_item;
    mca_pml_ob1_recv_frag_t* frag;
    mca_pml_ob1_comm_proc_t* pml_proc;
    mca_pml_ob1_match_hdr_t* hdr;
    int i;

    if (NULL == pml_comm) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    /* should never happen, but it was, so check */
    if (comm->c_contextid > mca_pml_ob1.super.pml_max_contextid) {
        OBJ_RELEASE(pml_comm);
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    mca_pml_ob1_comm_init_size(pml_comm, comm->c_remote_group->grp_proc_count);
    comm->c_pml_comm = pml_comm;

    for( i = 0; i < comm->c_remote_group->grp_proc_count; i++ ) {
        pml_comm->procs[i].ompi_proc = ompi_group_peer_lookup(comm->c_remote_group,i);
        OBJ_RETAIN(pml_comm->procs[i].ompi_proc);
    }
    /* Grab all related messages from the non_existing_communicator pending queue */
    for( item = opal_list_get_first(&mca_pml_ob1.non_existing_communicator_pending);
         item != opal_list_get_end(&mca_pml_ob1.non_existing_communicator_pending);
         item = next_item ) {
        frag = (mca_pml_ob1_recv_frag_t*)item;
        next_item = opal_list_get_next(item);
        hdr = &frag->hdr.hdr_match;

        /* Is this fragment for the current communicator ? */
        if( frag->hdr.hdr_match.hdr_ctx != comm->c_contextid )
            continue;

        /* As we now know we work on a fragment for this communicator
         * we should remove it from the
         * non_existing_communicator_pending list. */
        opal_list_remove_item( &mca_pml_ob1.non_existing_communicator_pending, 
                               item );

      add_fragment_to_unexpected:

        /* We generate the MSG_ARRIVED event as soon as the PML is aware
         * of a matching fragment arrival. Independing if it is received
         * on the correct order or not. This will allow the tools to
         * figure out if the messages are not received in the correct
         * order (if multiple network interfaces).
         */
        PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm,
                               hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);

        /* There is no matching to be done, and no lock to be held on the communicator as
         * we know at this point that the communicator has not yet been returned to the user.
         * The only required protection is around the non_existing_communicator_pending queue.
         * We just have to push the fragment into the unexpected list of the corresponding
         * proc, or into the out-of-order (cant_match) list.
         */
        pml_proc = &(pml_comm->procs[hdr->hdr_src]);

        if( ((uint16_t)hdr->hdr_seq) == ((uint16_t)pml_proc->expected_sequence) ) {
            /* We're now expecting the next sequence number. */
            pml_proc->expected_sequence++;
            opal_list_append( &pml_proc->unexpected_frags, (opal_list_item_t*)frag );
            PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_INSERT_IN_UNEX_Q, comm,
                                   hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
            /* And now the ugly part. As some fragments can be inserted in the cant_match list,
             * every time we succesfully add a fragment in the unexpected list we have to make
             * sure the next one is not in the cant_match. Otherwise, we will endup in a deadlock
             * situation as the cant_match is only checked when a new fragment is received from
             * the network.
             */
           for(frag = (mca_pml_ob1_recv_frag_t *)opal_list_get_first(&pml_proc->frags_cant_match);
               frag != (mca_pml_ob1_recv_frag_t *)opal_list_get_end(&pml_proc->frags_cant_match);
               frag = (mca_pml_ob1_recv_frag_t *)opal_list_get_next(frag)) {
               hdr = &frag->hdr.hdr_match;
               /* If the message has the next expected seq from that proc...  */
               if(hdr->hdr_seq != pml_proc->expected_sequence)
                   continue;

               opal_list_remove_item(&pml_proc->frags_cant_match, (opal_list_item_t*)frag);
               goto add_fragment_to_unexpected;
           }
        } else {
            opal_list_append( &pml_proc->frags_cant_match, (opal_list_item_t*)frag );
        }
    }
    return OMPI_SUCCESS;
}
void mca_pml_csum_recv_frag_callback_match(mca_btl_base_module_t* btl, 
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) { 
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_csum_match_hdr_t* hdr = (mca_pml_csum_match_hdr_t*)segments->seg_addr.pval;
    ompi_communicator_t *comm_ptr;
    mca_pml_csum_recv_request_t *match = NULL;
    mca_pml_csum_comm_t *comm;
    mca_pml_csum_comm_proc_t *proc;
    mca_pml_csum_recv_frag_t* frag = NULL;
    size_t num_segments = des->des_dst_cnt;
    size_t bytes_received = 0;
    uint16_t csum_received, csum=0;
    uint32_t csum_data;
    
    if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_CSUM_MATCH_HDR_LEN) ) {
        return;
     }
    csum_hdr_ntoh(((mca_pml_csum_hdr_t*) hdr), MCA_PML_CSUM_HDR_TYPE_MATCH);

    csum_received = hdr->hdr_common.hdr_csum;
    hdr->hdr_common.hdr_csum = 0;
#if OPAL_ENABLE_HETEROGENEOUS_SUPPORT
    hdr->hdr_common.hdr_flags &= ~MCA_PML_CSUM_HDR_FLAGS_NBO;
#endif
    csum = opal_csum16(hdr, OMPI_PML_CSUM_MATCH_HDR_LEN);
    hdr->hdr_common.hdr_csum = csum_received;
    
    OPAL_OUTPUT_VERBOSE((5, mca_pml_base_output,
                         "%s:%s:%d common_hdr: %02x:%02x:%04x   match_hdr:  %04x:%04x:%08x:%08x:%08x",
                         ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__,
                         hdr->hdr_common.hdr_type, hdr->hdr_common.hdr_flags, hdr->hdr_common.hdr_csum,
                         hdr->hdr_ctx, hdr->hdr_seq, hdr->hdr_src, hdr->hdr_tag, hdr->hdr_csum));
    
    if (csum_received != csum) {
        opal_output(0, "%s:%s:%d: Invalid \'match header\' - received csum:0x%04x  != computed csum:0x%04x\n",
                    ORTE_NAME_PRINT(ORTE_PROC_MY_NAME), __FILE__, __LINE__, csum_received, csum);
        orte_notifier.log(ORTE_NOTIFIER_CRIT, 1,
                          "Checksum header violation: job %s file %s line %d",
                          (NULL == orte_job_ident) ? "UNKNOWN" : orte_job_ident,
                          __FILE__, __LINE__);
        dump_csum_error_data(segments, 1);
        orte_errmgr.abort(-1,NULL);
    }
    
    /* communicator pointer */
    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
        /* This is a special case. A message for a not yet existing
         * communicator can happens. Instead of doing a matching we
         * will temporarily add it the a pending queue in the PML.
         * Later on, when the communicator is completely instantiated,
         * this pending queue will be searched and all matching fragments
         * moved to the right communicator.
         */
        append_frag_to_list( &mca_pml_csum.non_existing_communicator_pending,
                             btl, hdr, segments, num_segments, frag );
        return;
    }
    comm = (mca_pml_csum_comm_t *)comm_ptr->c_pml_comm;
    
    /* source sequence number */
    proc = &comm->procs[hdr->hdr_src];
 
    /* We generate the MSG_ARRIVED event as soon as the PML is aware
     * of a matching fragment arrival. Independing if it is received
     * on the correct order or not. This will allow the tools to
     * figure out if the messages are not received in the correct
     * order (if multiple network interfaces).
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
 
    /* get next expected message sequence number - if threaded
     * run, lock to make sure that if another thread is processing
     * a frag from the same message a match is made only once.
     * Also, this prevents other posted receives (for a pair of
     * end points) from being processed, and potentially "loosing"
     * the fragment.
     */
    OPAL_THREAD_LOCK(&comm->matching_lock);
    
     /* get sequence number of next message that can be processed */
    if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
                     (opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
        goto slow_path;
    }
    
    /* This is the sequence number we were expecting, so we can try
     * matching it to already posted receives.
     */
    
    /* We're now expecting the next sequence number. */
    proc->expected_sequence++;

    /* We generate the SEARCH_POSTED_QUEUE only when the message is
     * received in the correct sequence. Otherwise, we delay the event
     * generation until we reach the correct sequence number.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
                            hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
    
    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
    
    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
     * before going into the mca_pml_csum_check_cantmatch_for_match so
     * we can make a difference for the searching time for all
     * messages.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
    
    /* release matching lock before processing fragment */
    OPAL_THREAD_UNLOCK(&comm->matching_lock);
    
    if(OPAL_LIKELY(match)) {
        bytes_received = segments->seg_len - OMPI_PML_CSUM_MATCH_HDR_LEN;
        match->req_recv.req_bytes_packed = bytes_received;
        
        MCA_PML_CSUM_RECV_REQUEST_MATCHED(match, hdr);
        if(bytes_received > 0) { 
            struct iovec iov[2];
            uint32_t iov_count = 1;
            
            /*
             *  Make user buffer accessable(defined) before unpacking.
             */
            MEMCHECKER(
                       memchecker_call(&opal_memchecker_base_mem_defined,
                                       match->req_recv.req_base.req_addr,
                                       match->req_recv.req_base.req_count,
                                       match->req_recv.req_base.req_datatype);
                       );
            
            iov[0].iov_len = bytes_received;
            iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
                                              OMPI_PML_CSUM_MATCH_HDR_LEN);
            while (iov_count < num_segments) {
                bytes_received += segments[iov_count].seg_len;
                iov[iov_count].iov_len = segments[iov_count].seg_len;
                iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
                iov_count++;
            }
            opal_convertor_unpack( &match->req_recv.req_base.req_convertor,
                                   iov,
                                   &iov_count,
                                   &bytes_received );
            match->req_bytes_received = bytes_received;
            /*
             *  Unpacking finished, make the user buffer unaccessable again.
             */
            MEMCHECKER(
                       memchecker_call(&opal_memchecker_base_mem_noaccess,
                                       match->req_recv.req_base.req_addr,
                                       match->req_recv.req_base.req_count,
                                       match->req_recv.req_base.req_datatype);
                       );
        }
void mca_pml_ob1_recv_frag_callback_match(mca_btl_base_module_t* btl, 
                                          mca_btl_base_tag_t tag,
                                          mca_btl_base_descriptor_t* des,
                                          void* cbdata ) { 
    mca_btl_base_segment_t* segments = des->des_dst;
    mca_pml_ob1_match_hdr_t* hdr = (mca_pml_ob1_match_hdr_t*)segments->seg_addr.pval;
    ompi_communicator_t *comm_ptr;
    mca_pml_ob1_recv_request_t *match = NULL;
    mca_pml_ob1_comm_t *comm;
    mca_pml_ob1_comm_proc_t *proc;
    mca_pml_ob1_recv_frag_t* frag = NULL;
    size_t num_segments = des->des_dst_cnt;
    size_t bytes_received = 0;
    
    if( OPAL_UNLIKELY(segments->seg_len < OMPI_PML_OB1_MATCH_HDR_LEN) ) {
        return;
    }
    ob1_hdr_ntoh(((mca_pml_ob1_hdr_t*) hdr), MCA_PML_OB1_HDR_TYPE_MATCH);
    
    /* communicator pointer */
    comm_ptr = ompi_comm_lookup(hdr->hdr_ctx);
    if(OPAL_UNLIKELY(NULL == comm_ptr)) {
        /* This is a special case. A message for a not yet existing
         * communicator can happens. Instead of doing a matching we
         * will temporarily add it the a pending queue in the PML.
         * Later on, when the communicator is completely instantiated,
         * this pending queue will be searched and all matching fragments
         * moved to the right communicator.
         */
        append_frag_to_list( &mca_pml_ob1.non_existing_communicator_pending,
                             btl, hdr, segments, num_segments, frag );
        return;
    }
    comm = (mca_pml_ob1_comm_t *)comm_ptr->c_pml_comm;
    
    /* source sequence number */
    proc = &comm->procs[hdr->hdr_src];
 
    /* We generate the MSG_ARRIVED event as soon as the PML is aware
     * of a matching fragment arrival. Independing if it is received
     * on the correct order or not. This will allow the tools to
     * figure out if the messages are not received in the correct
     * order (if multiple network interfaces).
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_MSG_ARRIVED, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
 
    /* get next expected message sequence number - if threaded
     * run, lock to make sure that if another thread is processing
     * a frag from the same message a match is made only once.
     * Also, this prevents other posted receives (for a pair of
     * end points) from being processed, and potentially "loosing"
     * the fragment.
     */
    OPAL_THREAD_LOCK(&comm->matching_lock);
    
     /* get sequence number of next message that can be processed */
    if(OPAL_UNLIKELY((((uint16_t) hdr->hdr_seq) != ((uint16_t) proc->expected_sequence)) ||
                     (opal_list_get_size(&proc->frags_cant_match) > 0 ))) {
        goto slow_path;
    }
    
    /* This is the sequence number we were expecting, so we can try
     * matching it to already posted receives.
     */
    
    /* We're now expecting the next sequence number. */
    proc->expected_sequence++;

    /* We generate the SEARCH_POSTED_QUEUE only when the message is
     * received in the correct sequence. Otherwise, we delay the event
     * generation until we reach the correct sequence number.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_BEGIN, comm_ptr,
                            hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
    
    match = match_one(btl, hdr, segments, num_segments, comm_ptr, proc, frag);
    
    /* The match is over. We generate the SEARCH_POSTED_Q_END here,
     * before going into the mca_pml_ob1_check_cantmatch_for_match so
     * we can make a difference for the searching time for all
     * messages.
     */
    PERUSE_TRACE_MSG_EVENT(PERUSE_COMM_SEARCH_POSTED_Q_END, comm_ptr,
                           hdr->hdr_src, hdr->hdr_tag, PERUSE_RECV);
    
    /* release matching lock before processing fragment */
    OPAL_THREAD_UNLOCK(&comm->matching_lock);
    
    if(OPAL_LIKELY(match)) {
        bytes_received = segments->seg_len - OMPI_PML_OB1_MATCH_HDR_LEN;
        match->req_recv.req_bytes_packed = bytes_received;
        
        MCA_PML_OB1_RECV_REQUEST_MATCHED(match, hdr);
        if(match->req_bytes_delivered > 0) { 
            struct iovec iov[2];
            uint32_t iov_count = 1;
            
            /*
             *  Make user buffer accessable(defined) before unpacking.
             */
            MEMCHECKER(
                       memchecker_call(&opal_memchecker_base_mem_defined,
                                       match->req_recv.req_base.req_addr,
                                       match->req_recv.req_base.req_count,
                                       match->req_recv.req_base.req_datatype);
                       );
            
            iov[0].iov_len = bytes_received;
            iov[0].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments->seg_addr.pval +
                                              OMPI_PML_OB1_MATCH_HDR_LEN);
            while (iov_count < num_segments) {
                bytes_received += segments[iov_count].seg_len;
                iov[iov_count].iov_len = segments[iov_count].seg_len;
                iov[iov_count].iov_base = (IOVBASE_TYPE*)((unsigned char*)segments[iov_count].seg_addr.pval);
                iov_count++;
            }
            ompi_convertor_unpack( &match->req_recv.req_base.req_convertor,
                                   iov,
                                   &iov_count,
                                   &bytes_received );
            match->req_bytes_received = bytes_received;
            /*
             *  Unpacking finished, make the user buffer unaccessable again.
             */
            MEMCHECKER(
                       memchecker_call(&opal_memchecker_base_mem_noaccess,
                                       match->req_recv.req_base.req_addr,
                                       match->req_recv.req_base.req_count,
                                       match->req_recv.req_base.req_datatype);
                       );
        }