Beispiel #1
0
static void rcd_allgather_process_data(scon_collectives_tracker_t *coll,
                                       uint32_t distance)
{
    /* Communication step:
     At every step i, rank r:
     - exchanges message containing all data collected so far with rank peer = (r ^ 2^i).
     */
    uint32_t log2nprocs = (uint32_t) log2(coll->sig->nprocs);
    scon_proc_t peer;
    int rc, peer_index;
    while (distance < log2nprocs) {
        scon_output_verbose(2,  scon_collectives_base_framework.framework_output,
                            "%s rcd: allgather process distance =%d",
                            SCON_PRINT_PROC(SCON_PROC_MY_NAME),
                            distance);

        /* first send my current contents */
        peer_index = coll->my_rank ^ (1 << distance);
        assert (peer_index < (int)coll->sig->nprocs);
        peer = coll->sig->procs[peer_index];

        rcd_allgather_send_dist(coll, &peer, distance);

        /* check whether data for next distance is available */
        if (NULL == coll->buffers || NULL == coll->buffers[distance]) {
            break;
        }

        scon_output_verbose(2,  scon_collectives_base_framework.framework_output,
                            "%s rcd: allgather DATA found for distance =%d",
                            SCON_PRINT_PROC(SCON_PROC_MY_NAME),
                            distance);
        if (SCON_SUCCESS != (rc = scon_bfrop.copy_payload(&coll->bucket, coll->buffers[distance]))) {
            SCON_ERROR_LOG(rc);
            rcd_finalize_coll(coll, rc);
            return;
        }
        coll->nreported += 1 << distance;
        scon_collectives_base_mark_distance_recv(coll, distance);
        SCON_RELEASE(coll->buffers[distance]);
        coll->buffers[distance] = NULL;
        ++distance;
    }

    scon_output_verbose(2, scon_collectives_base_framework.framework_output,
                        "%s collectives rcd allgather reported %lu process from %lu",
                         SCON_PRINT_PROC(SCON_PROC_MY_NAME), (unsigned long)coll->nreported,
                        (unsigned long)coll->sig->nprocs);

    /* if we are done, then complete things */
    if (coll->nreported == coll->sig->nprocs) {
        rcd_finalize_coll(coll, SCON_SUCCESS);
    }
}
Beispiel #2
0
static void rcd_allgather_recv_dist(int status, orte_process_name_t* sender,
                                    opal_buffer_t* buffer, orte_rml_tag_t tag,
                                    void* cbdata)
{
    int32_t cnt, num_remote;
    int rc;
    orte_grpcomm_signature_t *sig;
    orte_grpcomm_coll_t *coll;
    orte_vpid_t distance, new_distance;

    OPAL_OUTPUT_VERBOSE((5, orte_grpcomm_base_framework.framework_output,
                             "%s grpcomm:coll:recdub received data",
                             ORTE_NAME_PRINT(ORTE_PROC_MY_NAME)));

    /* unpack the signature */
    cnt = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &sig, &cnt, ORTE_SIGNATURE))) {
        ORTE_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = orte_grpcomm_base_get_tracker(sig, true))) {
        ORTE_ERROR_LOG(ORTE_ERR_NOT_FOUND);
        OBJ_RELEASE(sig);
        return;
    }

    /* unpack the distance */
    distance = 1;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &distance, &cnt, OPAL_INT32))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        rcd_finalize_coll(coll, rc);
        return;
    }

    /* unpack number of reported */
    num_remote = 0;
    if (OPAL_SUCCESS != (rc = opal_dss.unpack(buffer, &num_remote, &cnt, OPAL_INT32))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        rcd_finalize_coll(coll, rc);
        return;
    }
    coll->nreported += num_remote;

    /* capture any provided content */
    if (OPAL_SUCCESS != (rc = opal_dss.copy_payload(&coll->bucket, buffer))) {
        OBJ_RELEASE(sig);
        ORTE_ERROR_LOG(rc);
        rcd_finalize_coll(coll, rc);
        return;
    }

    //update distance and send
    new_distance = distance <<= 1;
    if (new_distance < coll->ndmns) {
        rcd_allgather_send_dist(coll, new_distance);
    } else {
        rcd_finalize_coll(coll, ORTE_SUCCESS);
    }

    OBJ_RELEASE(sig);

    return;
}
Beispiel #3
0
static void rcd_allgather_recv_dist(int status,
                                       scon_handle_t handle,
                                       scon_proc_t* sender,
                                       scon_buffer_t* buffer,
                                       scon_msg_tag_t tag,
                                       void* cbdata)
{
    int32_t cnt;
    int rc;
    scon_collectives_signature_t *sig;
    scon_collectives_tracker_t *coll;
    uint32_t distance;
    scon_output_verbose(2,  scon_collectives_base_framework.framework_output,
                        "%s rcd: allgather receiving from %s",
                        SCON_PRINT_PROC(SCON_PROC_MY_NAME),
                        SCON_PRINT_PROC(sender));

    /* unpack the signature */
    cnt = 1;
    if (SCON_SUCCESS != (rc = scon_bfrop.unpack(buffer, &sig, &cnt, SCON_COLLECTIVES_SIGNATURE))) {
        SCON_ERROR_LOG(rc);
        return;
    }

    /* check for the tracker and create it if not found */
    if (NULL == (coll = scon_collectives_base_get_tracker (sig, true))) {
        SCON_ERROR_LOG(SCON_ERR_NOT_FOUND);
        SCON_RELEASE(sig);
        return;
    }
    /* unpack the distance */
    distance = -1;
    cnt = 1;
    if (SCON_SUCCESS != (rc = scon_bfrop.unpack(buffer, &distance, &cnt, SCON_INT32))) {
        SCON_RELEASE(sig);
        SCON_ERROR_LOG(rc);
        rcd_finalize_coll(coll, rc);
        return;
    }
    assert(0 == scon_collectives_base_check_distance_recv(coll, distance));
    /* Check whether we can process next distance */
    if (coll->nreported && (!distance || scon_collectives_base_check_distance_recv(coll, distance - 1))) {
        scon_output_verbose(2,  scon_collectives_base_framework.framework_output,
                             "%s grpcomm:coll:rcd data from %d distance received, "
                             "Process the next distance.",
                             SCON_PRINT_PROC(SCON_PROC_MY_NAME), distance);
        /* capture any provided content */
        if (SCON_SUCCESS != (rc = scon_bfrop.copy_payload(&coll->bucket, buffer))) {
            SCON_RELEASE(sig);
            SCON_ERROR_LOG(rc);
            rcd_finalize_coll(coll, rc);
            return;
        }
        coll->nreported += (1 << distance);
        scon_collectives_base_mark_distance_recv(coll, distance);
        rcd_allgather_process_data(coll, distance + 1);
    } else {
        scon_output_verbose(2,  scon_collectives_base_framework.framework_output,
                             "%s grpcomm:coll:rcd data from %d distance received, "
                             "still waiting for data.",
                             SCON_PRINT_PROC(SCON_PROC_MY_NAME), distance);
        if (NULL == coll->buffers) {
            if (NULL == (coll->buffers = (scon_buffer_t **) calloc ((uint32_t) log2 (coll->sig->nprocs) , sizeof(scon_buffer_t *)))) {
                rc = SCON_ERR_OUT_OF_RESOURCE;
                SCON_RELEASE(sig);
                SCON_ERROR_LOG(rc);
                rcd_finalize_coll(coll, rc);
                return;
            }
        }
        //if (NULL == (coll->buffers[distance] = SCON_NEW(scon_buffer_t))) {
        if (NULL == (coll->buffers[distance] = (scon_buffer_t*) malloc (sizeof(scon_buffer_t)))) {
            rc = SCON_ERR_OUT_OF_RESOURCE;
            SCON_RELEASE(sig);
            SCON_ERROR_LOG(rc);
            rcd_finalize_coll(coll, rc);
            return;
        }
        if (SCON_SUCCESS != (rc = scon_bfrop.copy_payload(coll->buffers[distance], buffer))) {
            SCON_RELEASE(sig);
            SCON_ERROR_LOG(rc);
            rcd_finalize_coll(coll, rc);
            return;
        }
    }
    SCON_RELEASE(sig);
}