Example #1
int	sendBytesByTCP(int *blockSocket, char *from, int length,
		struct sockaddr *sn)
{
	int	bytesWritten;

	while (1)	/*	Continue until not interrupted.		*/
	{
		bytesWritten = isend(*blockSocket, from, length, 0);
		if (bytesWritten < 0)
		{
			switch (errno)
			{
			case EINTR:	/*	Interrupted; retry.	*/
				continue;

			case EPIPE:	/*	Lost connection.	*/
			case EBADF:
			case ETIMEDOUT:
			case ECONNRESET:
				closesocket(*blockSocket);
				*blockSocket = -1;
				bytesWritten = 0;
			}

			putSysErrmsg("TCP BSO write() error on socket", NULL);
		}

		return bytesWritten;
	}
}
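As an aside, the same retry-on-EINTR idiom can be sketched with the plain POSIX send() call; the helper name send_retry_eintr and the exact set of handled errno values are illustrative here, not part of ION.

#include <errno.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <unistd.h>

/*	Hypothetical helper: send on a connected socket, retrying when a
 *	signal interrupts the call.  Returns bytes sent, 0 if the peer
 *	connection was lost, -1 on any other error.			*/
static ssize_t	send_retry_eintr(int sock, const char *buf, size_t len)
{
	ssize_t	n;

	while (1)	/*	Continue until not interrupted.		*/
	{
		n = send(sock, buf, len, 0);
		if (n < 0 && errno == EINTR)
		{
			continue;	/*	Interrupted; retry.	*/
		}

		if (n < 0 && (errno == EPIPE || errno == ECONNRESET
				|| errno == ETIMEDOUT))
		{
			close(sock);	/*	Lost connection.	*/
			return 0;
		}

		return n;
	}
}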
/*
 * do zero byte IRECV / ISEND: the lower rank of each pair sends, the
 * upper rank receives (i.e. do a ping, not a ping pong)
 */
int ompi_init_do_preconnect(void)
{
    int comm_size = ompi_comm_size(MPI_COMM_WORLD);
    int my_rank =  ompi_comm_rank(MPI_COMM_WORLD);
    int i, j, ret;
    struct ompi_request_t **requests;

    requests = (ompi_request_t**)malloc(comm_size * sizeof(struct ompi_request_t *));
    if (NULL == requests) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }

    for (i = j = 0; i < comm_size; ++i) {
        if (i == my_rank) {
            continue;
        } else if (my_rank < i) {
            ret = MCA_PML_CALL(isend(MPI_BOTTOM, 0, MPI_BYTE,
                                     i, 1,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     MPI_COMM_WORLD,
                                     &requests[j++]));
        } else {
            ret = MCA_PML_CALL(irecv(MPI_BOTTOM,0, MPI_BYTE, i,
                                     1, MPI_COMM_WORLD,
                                     &requests[j++]));
        }
        if (OMPI_SUCCESS != ret) {
            free(requests);   /* don't leak the request array on error */
            return ret;
        }
    }
    ret = ompi_request_wait_all(j, requests, MPI_STATUSES_IGNORE);
    free(requests);

    return ret;
}
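For comparison, a minimal sketch of the same zero-byte ping against the public MPI API instead of the internal PML calls; the name preconnect_ping and the tag value are illustrative.

#include <mpi.h>
#include <stdlib.h>

/* Illustrative only: each lower-ranked process sends a zero-byte message
 * to every higher-ranked one, which posts the matching receive; completing
 * the pair forces the underlying connection to be established. */
static int preconnect_ping(MPI_Comm comm)
{
    int rank, size, i, j = 0;
    MPI_Request *reqs;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    reqs = malloc(size * sizeof(MPI_Request));
    if (NULL == reqs) {
        return MPI_ERR_NO_MEM;
    }

    for (i = 0; i < size; ++i) {
        if (i == rank) {
            continue;
        } else if (rank < i) {
            MPI_Isend(MPI_BOTTOM, 0, MPI_BYTE, i, 1, comm, &reqs[j++]);
        } else {
            MPI_Irecv(MPI_BOTTOM, 0, MPI_BYTE, i, 1, comm, &reqs[j++]);
        }
    }

    MPI_Waitall(j, reqs, MPI_STATUSES_IGNORE);
    free(reqs);
    return MPI_SUCCESS;
}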
static int ompi_coll_tuned_barrier_intra_basic_linear(struct ompi_communicator_t *comm,
                                                      mca_coll_base_module_t *module)
{
    int i, err, rank, size;

    rank = ompi_comm_rank(comm);
    size = ompi_comm_size(comm);

    /* All non-root ranks send & receive a zero-length message. */
    if (rank > 0) {
        err = MCA_PML_CALL(send (NULL, 0, MPI_BYTE, 0, 
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 MCA_PML_BASE_SEND_STANDARD, comm));
        if (MPI_SUCCESS != err) {
            return err;
        }

        err = MCA_PML_CALL(recv (NULL, 0, MPI_BYTE, 0, 
                                 MCA_COLL_BASE_TAG_BARRIER,
                                 comm, MPI_STATUS_IGNORE));
        if (MPI_SUCCESS != err) {
            return err;
        }
    }

    /* The root collects and broadcasts the messages. */

    else {
        ompi_request_t** requests;

        requests = (ompi_request_t**)malloc( size * sizeof(ompi_request_t*) );
        if (NULL == requests) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(irecv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE,
                                     MCA_COLL_BASE_TAG_BARRIER, comm, 
                                     &(requests[i])));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
        ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );

        for (i = 1; i < size; ++i) {
            err = MCA_PML_CALL(isend(NULL, 0, MPI_BYTE, i,
                                     MCA_COLL_BASE_TAG_BARRIER, 
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     &(requests[i])));
            if (MPI_SUCCESS != err) {
                return err;
            }
        }
        ompi_request_wait_all( size-1, requests+1, MPI_STATUSES_IGNORE );
        free( requests );
    }

    /* All done */

    return MPI_SUCCESS;

}
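The same fan-in/fan-out synchronization can be written at the application level with zero-byte point-to-point messages; a sketch only (real code would simply call MPI_Barrier), with linear_barrier as an illustrative name.

#include <mpi.h>

/* Illustrative linear barrier: every rank reports to rank 0, then rank 0
 * releases everyone.  The zero-byte messages carry no payload, only
 * synchronization. */
static void linear_barrier(MPI_Comm comm)
{
    int rank, size, i;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (rank > 0) {
        MPI_Send(NULL, 0, MPI_BYTE, 0, 0, comm);
        MPI_Recv(NULL, 0, MPI_BYTE, 0, 0, comm, MPI_STATUS_IGNORE);
    } else {
        for (i = 1; i < size; ++i) {
            MPI_Recv(NULL, 0, MPI_BYTE, MPI_ANY_SOURCE, 0, comm,
                     MPI_STATUS_IGNORE);
        }
        for (i = 1; i < size; ++i) {
            MPI_Send(NULL, 0, MPI_BYTE, i, 0, comm);
        }
    }
}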
Example #4
static int
mca_coll_basic_neighbor_allgather_dist_graph(const void *sbuf, int scount,
                                             struct ompi_datatype_t *sdtype, void *rbuf,
                                             int rcount, struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm,
                                             mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_dist_graph_2_2_0_t *dist_graph = comm->c_topo->mtc.dist_graph;
    const int *inedges, *outedges;
    int indegree, outdegree;
    ompi_request_t **reqs, **preqs;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, neighbor;

    indegree = dist_graph->indegree;
    outdegree = dist_graph->outdegree;
    if( 0 == (indegree + outdegree) ) return OMPI_SUCCESS;

    inedges = dist_graph->in;
    outedges = dist_graph->out;

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    reqs = preqs = coll_base_comm_get_reqs( module->base_data, indegree + outdegree);
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    for (neighbor = 0; neighbor < indegree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, inedges[neighbor],
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
        rbuf = (char *) rbuf + extent * rcount;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, neighbor + 1);
        return rc;
    }

    for (neighbor = 0 ; neighbor < outdegree ; ++neighbor) {
        /* remove cast from const when the pml layer is updated to take
         * a const for the send buffer. */
        rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, outedges[neighbor],
                                MCA_COLL_BASE_TAG_ALLGATHER,
                                MCA_PML_BASE_SEND_STANDARD,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, indegree + neighbor + 1);
        return rc;
    }

    rc = ompi_request_wait_all (indegree + outdegree, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, indegree + outdegree);
    }
    return rc;
}
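This routine backs MPI_Neighbor_allgather on a distributed-graph communicator. A hedged caller-side sketch, assuming a simple ring topology; ring_neighbor_allgather is an illustrative name.

#include <mpi.h>

/* Sketch: build a ring as a distributed graph and gather one int from each
 * neighbor (indegree == outdegree == 2, so result needs two slots). */
static void ring_neighbor_allgather(MPI_Comm comm, int value, int result[2])
{
    int rank, size, nbrs[2];
    MPI_Comm ring;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    nbrs[0] = (rank + size - 1) % size;   /* left neighbor  */
    nbrs[1] = (rank + 1) % size;          /* right neighbor */

    MPI_Dist_graph_create_adjacent(comm, 2, nbrs, MPI_UNWEIGHTED,
                                   2, nbrs, MPI_UNWEIGHTED,
                                   MPI_INFO_NULL, 0, &ring);
    MPI_Neighbor_allgather(&value, 1, MPI_INT, result, 1, MPI_INT, ring);
    MPI_Comm_free(&ring);
}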
static int
mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[],
                                        struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                        const int rdisps[], struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph;
    int rc = MPI_SUCCESS, neighbor, degree;
    const int rank = ompi_comm_rank (comm);
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs;
    const int *edges;

    mca_topo_base_graph_neighbors_count (comm, rank, &degree);

    /* ensure we have enough storage for requests */
    rc = mca_coll_basic_check_for_requests (basic_module, 2 * degree);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    edges = graph->edges;
    if (rank > 0) {
        edges += graph->index[rank - 1];
    }

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);

    /* post all receives first */
    for (neighbor = 0, reqs = basic_module->mccb_reqs ; neighbor < degree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    for (neighbor = 0 ; neighbor < degree ; ++neighbor) {
        /* remove cast from const when the pml layer is updated to take a const for the send buffer */
        rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                comm, reqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    return ompi_request_wait_all (degree * 2, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
Example #6
static int
mca_coll_basic_neighbor_alltoallv_graph(const void *sbuf, const int scounts[], const int sdisps[],
                                        struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                        const int rdisps[], struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph;
    int rc = MPI_SUCCESS, neighbor, degree;
    const int rank = ompi_comm_rank (comm);
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs, **preqs;
    const int *edges;

    mca_topo_base_graph_neighbors_count (comm, rank, &degree);
    if( 0 == degree ) return OMPI_SUCCESS;

    edges = graph->edges;
    if (rank > 0) {
        edges += graph->index[rank - 1];
    }

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);
    reqs = preqs = ompi_coll_base_comm_get_reqs( module->base_data, 2 * degree );
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    /* post all receives first */
    for (neighbor = 0; neighbor < degree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, neighbor + 1);
        return rc;
    }

    for (neighbor = 0 ; neighbor < degree ; ++neighbor) {
        /* remove cast from const when the pml layer is updated to take a const for the send buffer */
        rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype,
                                edges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, degree + neighbor + 1);
        return rc;
    }

    rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, degree * 2);
    }
    return rc;
}
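Both graph variants above serve MPI_Neighbor_alltoallv. A caller-side sketch, assuming gcomm already carries a graph topology (e.g. from MPI_Graph_create) and that sendvals/recvvals hold one int per neighbor; the helper name is illustrative.

#include <mpi.h>
#include <stdlib.h>

/* Sketch: send one int to and receive one int from each graph neighbor,
 * packed contiguously with per-neighbor counts of 1. */
static int graph_alltoallv_one_int(MPI_Comm gcomm,
                                   const int *sendvals, int *recvvals)
{
    int rank, degree, i, *counts, *displs;

    MPI_Comm_rank(gcomm, &rank);
    MPI_Graph_neighbors_count(gcomm, rank, &degree);

    counts = malloc(degree * sizeof(int));
    displs = malloc(degree * sizeof(int));
    if (NULL == counts || NULL == displs) {
        free(counts);
        free(displs);
        return MPI_ERR_NO_MEM;
    }
    for (i = 0; i < degree; ++i) {
        counts[i] = 1;      /* one element per neighbor */
        displs[i] = i;      /* packed contiguously      */
    }

    MPI_Neighbor_alltoallv(sendvals, counts, displs, MPI_INT,
                           recvvals, counts, displs, MPI_INT, gcomm);

    free(counts);
    free(displs);
    return MPI_SUCCESS;
}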
Example #7
/*
 *	scatterv_inter
 *
 *	Function:	- scatterv operation
 *	Accepts:	- same arguments as MPI_Scatterv()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_scatterv_inter(const void *sbuf, const int *scounts,
                              const int *disps, struct ompi_datatype_t *sdtype,
                              void *rbuf, int rcount,
                              struct ompi_datatype_t *rdtype, int root,
                              struct ompi_communicator_t *comm,
                              mca_coll_base_module_t *module)
{
    int i, size, err;
    char *ptmp;
    ptrdiff_t lb, extent;
    ompi_request_t **reqs;

    /* Initialize */
    size = ompi_comm_remote_size(comm);

    /* If not root, receive data.  Note that we will only get here if
     * rcount > 0 or rank == root. */

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        /* If not root, receive data. */
        err = MCA_PML_CALL(recv(rbuf, rcount, rdtype,
                                root, MCA_COLL_BASE_TAG_SCATTERV,
                                comm, MPI_STATUS_IGNORE));
    } else {
        /* I am the root, loop sending data. */
        err = ompi_datatype_get_extent(sdtype, &lb, &extent);
        if (OMPI_SUCCESS != err) {
            return OMPI_ERROR;
        }

        reqs = coll_base_comm_get_reqs(module->base_data, size);
        if (NULL == reqs) {
            return OMPI_ERR_OUT_OF_RESOURCE;
        }
        for (i = 0; i < size; ++i) {
            ptmp = ((char *) sbuf) + (extent * disps[i]);
            err = MCA_PML_CALL(isend(ptmp, scounts[i], sdtype, i,
                                     MCA_COLL_BASE_TAG_SCATTERV,
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     &(reqs[i])));
            if (OMPI_SUCCESS != err) {
                ompi_coll_base_free_reqs(reqs, i);
                return err;
            }
        }

        err = ompi_request_wait_all(size, reqs, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            ompi_coll_base_free_reqs(reqs, size);
        }
    }

    /* All done */
    return err;
}
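The loop above is the root side of MPI_Scatterv. For orientation, a user-level sketch of the matching call (intra-communicator case for brevity; the 64-element receive buffer and the choice of i+1 ints per rank are illustrative assumptions).

#include <mpi.h>
#include <stdlib.h>

/* Sketch: rank 0 scatters i+1 ints to each rank i; every rank receives
 * rank+1 ints (assumed to fit in rbuf). */
static void scatterv_example(MPI_Comm comm)
{
    int rank, size, i;
    int *scounts = NULL, *displs = NULL, *sbuf = NULL;
    int rbuf[64];

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    if (0 == rank) {
        scounts = malloc(size * sizeof(int));
        displs = malloc(size * sizeof(int));
        for (i = 0; i < size; ++i) {
            scounts[i] = i + 1;
            displs[i] = (0 == i) ? 0 : displs[i - 1] + scounts[i - 1];
        }
        sbuf = calloc(displs[size - 1] + scounts[size - 1], sizeof(int));
    }

    MPI_Scatterv(sbuf, scounts, displs, MPI_INT,
                 rbuf, rank + 1, MPI_INT, 0, comm);

    if (0 == rank) {
        free(sbuf);
        free(scounts);
        free(displs);
    }
}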
Example #8
ompi_dpm_base_disconnect_obj *ompi_dpm_base_disconnect_init ( ompi_communicator_t *comm)
{
    ompi_dpm_base_disconnect_obj *obj=NULL;
    int ret;
    int i;

    obj = (ompi_dpm_base_disconnect_obj *) calloc(1,sizeof(ompi_dpm_base_disconnect_obj));
    if ( NULL == obj ) {
        printf("Could not allocate disconnect object\n");
        return NULL;
    }

    if ( OMPI_COMM_IS_INTER(comm) ) {
        obj->size = ompi_comm_remote_size (comm);
    } else {
        obj->size = ompi_comm_size (comm);
    }

    obj->comm = comm;
    obj->reqs = (ompi_request_t **) malloc(2*obj->size*sizeof(ompi_request_t *));
    if ( NULL == obj->reqs ) {
        printf("Could not allocate request array for disconnect object\n");
        free (obj);
        return NULL;
    }

    /* initiate all isend_irecvs. We use a dummy buffer stored on
       the object, since we are sending zero size messages anyway. */
    for ( i=0; i < obj->size; i++ ) {
        ret = MCA_PML_CALL(irecv (&(obj->buf), 0, MPI_INT, i,
                     OMPI_COMM_BARRIER_TAG, comm,
                     &(obj->reqs[2*i])));

        if ( OMPI_SUCCESS != ret ) {
            printf("dpm_base_disconnect_init: error %d in irecv to process %d\n", ret, i);
            free (obj->reqs);
            free (obj);
            return NULL;
        }
        ret = MCA_PML_CALL(isend (&(obj->buf), 0, MPI_INT, i,
                     OMPI_COMM_BARRIER_TAG,
                     MCA_PML_BASE_SEND_SYNCHRONOUS,
                     comm, &(obj->reqs[2*i+1])));

        if ( OMPI_SUCCESS != ret ) {
            printf("dpm_base_disconnect_init: error %d in isend to process %d\n", ret, i);
            free (obj->reqs);
            free (obj);
            return NULL;
        }
    }

    /* return handle */
    return obj;
}
int
ompi_init_preconnect_mpi(void)
{
    int comm_size = ompi_comm_size(MPI_COMM_WORLD);
    int comm_rank =  ompi_comm_rank(MPI_COMM_WORLD);
    int param, next, prev, i, ret = OMPI_SUCCESS;
    struct ompi_request_t * requests[2];
    char inbuf[1], outbuf[1];
    const bool *value;

    param = mca_base_var_find("ompi", "mpi", NULL, "preconnect_mpi");
    if (0 > param) return OMPI_SUCCESS;
    ret = mca_base_var_get_value(param, &value, NULL, NULL);
    if (OMPI_SUCCESS != ret || 0 == value[0]) {
        return OMPI_SUCCESS;
    }

    inbuf[0] = outbuf[0] = '\0';

    /* Each iteration, every process sends to its neighbor i hops to
       the right and receives from its neighbor i hops to the left.
       Because send_complete is used, there will only ever be one
       outstanding send and one outstanding receive in the network at
       a time for any given process.  This limits any "flooding"
       effect that can occur with other connection algorithms.  While
       the flooding algorithms may be a more efficient use of
       resources, they can overwhelm the out-of-band connection system
       used to wire up some networks, leading to poor performance and
       hangs. */
    for (i = 1 ; i <= comm_size / 2 ; ++i) {
        next = (comm_rank + i) % comm_size;
        prev = (comm_rank - i + comm_size) % comm_size;

        ret = MCA_PML_CALL(isend(outbuf, 1, MPI_CHAR,
                                 next, 1,
                                 MCA_PML_BASE_SEND_COMPLETE,
                                 MPI_COMM_WORLD, 
                                 &requests[1]));
        if (OMPI_SUCCESS != ret) return ret;

        ret = MCA_PML_CALL(irecv(inbuf, 1, MPI_CHAR,
                                 prev, 1,
                                 MPI_COMM_WORLD, 
                                 &requests[0]));
        if(OMPI_SUCCESS != ret) return ret;

        ret = ompi_request_wait_all(2, requests, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != ret) return ret;
    }

    return ret;
}
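A user-level rendering of the ring pattern described in the comment above, using only the public nonblocking API; send-complete semantics are internal to the PML, so plain MPI_Isend stands in here, and ring_warmup is an illustrative name.

#include <mpi.h>

/* Illustrative connection warm-up: on iteration i, send one byte to the
 * neighbor i hops to the right and receive one byte from the neighbor i
 * hops to the left, so each process has at most one send and one receive
 * outstanding at any time. */
static int ring_warmup(MPI_Comm comm)
{
    int rank, size, i, next, prev;
    char inbuf = 0, outbuf = 0;
    MPI_Request reqs[2];

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    for (i = 1; i <= size / 2; ++i) {
        next = (rank + i) % size;
        prev = (rank - i + size) % size;

        MPI_Irecv(&inbuf, 1, MPI_CHAR, prev, 1, comm, &reqs[0]);
        MPI_Isend(&outbuf, 1, MPI_CHAR, next, 1, comm, &reqs[1]);
        MPI_Waitall(2, reqs, MPI_STATUSES_IGNORE);
    }
    return MPI_SUCCESS;
}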
static int
mca_coll_basic_neighbor_allgather_graph(const void *sbuf, int scount,
                                        struct ompi_datatype_t *sdtype, void *rbuf,
                                        int rcount, struct ompi_datatype_t *rdtype,
                                        struct ompi_communicator_t *comm,
                                        mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_graph_2_2_0_t *graph = comm->c_topo->mtc.graph;
    const int rank = ompi_comm_rank (comm);
    const int *edges;
    int degree;
    ompi_request_t **reqs, **preqs;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, neighbor;

    mca_topo_base_graph_neighbors_count (comm, rank, &degree);

    edges = graph->edges;
    if (rank > 0) {
        edges += graph->index[rank - 1];
    }

    ompi_datatype_get_extent(rdtype, &lb, &extent);
    reqs = preqs = coll_base_comm_get_reqs( module->base_data, 2 * degree);
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    for (neighbor = 0; neighbor < degree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, edges[neighbor], MCA_COLL_BASE_TAG_ALLGATHER,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
        rbuf = (char *) rbuf + extent * rcount;

        /* remove cast from const when the pml layer is updated to take
         * a const for the send buffer. */
        rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, edges[neighbor],
                                MCA_COLL_BASE_TAG_ALLGATHER, MCA_PML_BASE_SEND_STANDARD,
                                comm, preqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, (2 * neighbor + 1));
        return rc;
    }

    rc = ompi_request_wait_all (degree * 2, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs( reqs, degree * 2);
    }
    return rc;
}
Example #11
static int
mca_coll_basic_neighbor_alltoallv_dist_graph(const void *sbuf, const int scounts[], const int sdisps[],
                                             struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                             const int rdisps[], struct ompi_datatype_t *rdtype,
                                             struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_dist_graph_2_1_0_t *dist_graph = comm->c_topo->mtc.dist_graph;
    ptrdiff_t lb, rdextent, sdextent;
    int rc = MPI_SUCCESS, neighbor;
    const int *inedges, *outedges;
    int indegree, outdegree;
    ompi_request_t **reqs;

    indegree = dist_graph->indegree;
    outdegree = dist_graph->outdegree;

    inedges = dist_graph->in;
    outedges = dist_graph->out;

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);

    /* post all receives first */
    for (neighbor = 0, reqs = basic_module->mccb_reqs ; neighbor < indegree ; ++neighbor) {
        rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[neighbor] * rdextent, rcounts[neighbor], rdtype,
                                inedges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    for (neighbor = 0 ; neighbor < outdegree ; ++neighbor) {
        /* remove cast from const when the pml layer is updated to take a const for the send buffer */
        rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[neighbor] * sdextent, scounts[neighbor], sdtype,
                                outedges[neighbor], MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD,
                                comm, reqs++));
        if (OMPI_SUCCESS != rc) break;
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    return ompi_request_wait_all (indegree + outdegree, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
Example #12
/*
 *	scatter_inter
 *
 *	Function:	- scatter operation
 *	Accepts:	- same arguments as MPI_Scatter()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_scatter_inter(void *sbuf, int scount,
                             struct ompi_datatype_t *sdtype,
                             void *rbuf, int rcount,
                             struct ompi_datatype_t *rdtype,
                             int root, struct ompi_communicator_t *comm,
                             mca_coll_base_module_t *module)
{
    int i, size, err;
    char *ptmp;
    ptrdiff_t lb, incr;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    /* Initialize */
    size = ompi_comm_remote_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        /* If not root, receive data. */
        err = MCA_PML_CALL(recv(rbuf, rcount, rdtype, root,
                                MCA_COLL_BASE_TAG_SCATTER,
                                comm, MPI_STATUS_IGNORE));
    } else {
        /* I am the root, loop sending data. */
        err = ompi_datatype_get_extent(sdtype, &lb, &incr);
        if (OMPI_SUCCESS != err) {
            return OMPI_ERROR;
        }

        incr *= scount;
        for (i = 0, ptmp = (char *) sbuf; i < size; ++i, ptmp += incr) {
            err = MCA_PML_CALL(isend(ptmp, scount, sdtype, i,
                                     MCA_COLL_BASE_TAG_SCATTER,
                                     MCA_PML_BASE_SEND_STANDARD, comm,
                                     reqs++));
            if (OMPI_SUCCESS != err) {
                return err;
            }
        }

        err =
            ompi_request_wait_all(size, basic_module->mccb_reqs,
                                  MPI_STATUSES_IGNORE);
    }

    return err;
}
Example #13
/**
 * A quick version of MPI_Sendrecv implemented for the barrier.
 * No actual data is moved across the wire; we use 0-byte messages to
 * signal a two-peer synchronization.
 */
static inline int
ompi_coll_base_sendrecv_zero(int dest, int stag,
                              int source, int rtag,
                              MPI_Comm comm)

{
    int err, line = 0;
    ompi_request_t* reqs[2];
    ompi_status_public_t statuses[2];

    /* post new irecv */
    err = MCA_PML_CALL(irecv( NULL, 0, MPI_BYTE, source, rtag,
                              comm, &reqs[0]));
    if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

    /* send data to children */
    err = MCA_PML_CALL(isend( NULL, 0, MPI_BYTE, dest, stag,
                              MCA_PML_BASE_SEND_STANDARD, comm, &reqs[1]));
    if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

    err = ompi_request_wait_all( 2, reqs, statuses );
    if( MPI_ERR_IN_STATUS == err ) { line = __LINE__;
        /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error
         * code that we can propagate up the stack. Instead, look for the real
         * error code from the MPI_ERROR in the status.
         */
        int err_index = 0;
        if( MPI_SUCCESS == statuses[0].MPI_ERROR
         || MPI_ERR_PENDING == statuses[0].MPI_ERROR ) {
            err_index = 1;
        }
        err = statuses[err_index].MPI_ERROR;
        OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
                                              " stage of ompi_coll_base_sendrecv_zero\n",
                      __FILE__, line, err, (0 == err_index ? "receive" : "send")));
        return err;
    }
    if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

    return (MPI_SUCCESS);

 error_handler:
    /* Error discovered during the posting of the irecv or isend,
     * and no status is available.
     */
    OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                  __FILE__, line, err));
    (void)line;  // silence compiler warning
    return err;
}
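At the application level the same two-peer synchronization collapses to a zero-byte MPI_Sendrecv; a minimal sketch with sync_with_peer as an illustrative name.

#include <mpi.h>

/* Sketch: synchronize with one peer without moving any data. */
static int sync_with_peer(int peer, int tag, MPI_Comm comm)
{
    return MPI_Sendrecv(NULL, 0, MPI_BYTE, peer, tag,
                        NULL, 0, MPI_BYTE, peer, tag,
                        comm, MPI_STATUS_IGNORE);
}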
Example #14
int	sendSegmentByAOS(int linkSocket, char *from, int length)
{
	int	bytesWritten;

	while (1)	/*	Continue until not interrupted.		*/
	{
		bytesWritten = isend(linkSocket, from, length, 0);
		if (bytesWritten < 0)
		{
			if (errno == EINTR)	/*	Interrupted.	*/
			{
				continue;	/*	Retry.		*/
			}

			putSysErrmsg("LSO send() error on socket", NULL);
		}

		return bytesWritten;
	}
}
Example #15
static int ompi_comm_allreduce_group_broadcast (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_comm_cid_context_t *cid_context = context->cid_context;
    ompi_request_t *subreq[2];
    int subreq_count = 0;
    int rc;

    for (int i = 0 ; i < 2 ; ++i) {
        if (MPI_PROC_NULL != context->peers_comm[i + 1]) {
            rc = MCA_PML_CALL(isend(context->outbuf, context->count, MPI_INT, context->peers_comm[i+1],
                                    cid_context->pml_tag, MCA_PML_BASE_SEND_STANDARD,
                                    cid_context->comm, subreq + subreq_count++));
            if (OMPI_SUCCESS != rc) {
                return rc;
            }
        }
    }

    return ompi_comm_request_schedule_append (request, NULL, subreq, subreq_count);
}
Example #16
		/**
		 * The object is serialized using the messageCallback.
		 * @note This function may return immediately.
		 */
		 virtual BareMessage * isend(const void * object, const void * user_ptr = nullptr ){
			uint64_t msg_size = headerSize() + messageCallback->serializeMessageLen(object);
			char * payload = (char*) malloc(msg_size);

			assert(payload);

			uint64_t pos = 0;

			// memset(payload, 255, msg_size);

			serializeHeader(payload, pos, msg_size);
			assert(headerSize() == pos);

			messageCallback->serializeMessage(object, payload, pos);
			assert(pos == msg_size);

			BareMessage * msg = new BareMessage(payload, msg_size, user_ptr);

			isend(msg);
			return msg;
		}		
Example #17
/*
 *	bcast_lin_inter
 *
 *	Function:	- broadcast using O(N) algorithm
 *	Accepts:	- same arguments as MPI_Bcast()
 *	Returns:	- MPI_SUCCESS or error code
 */
int
mca_coll_basic_bcast_lin_inter(void *buff, int count,
                               struct ompi_datatype_t *datatype, int root,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int i;
    int rsize;
    int err;
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t*) module;
    ompi_request_t **reqs = basic_module->mccb_reqs;

    rsize = ompi_comm_remote_size(comm);

    if (MPI_PROC_NULL == root) {
        /* do nothing */
        err = OMPI_SUCCESS;
    } else if (MPI_ROOT != root) {
        /* Non-root receive the data. */
        err = MCA_PML_CALL(recv(buff, count, datatype, root,
                                MCA_COLL_BASE_TAG_BCAST, comm,
                                MPI_STATUS_IGNORE));
    } else {
        /* root section */
        for (i = 0; i < rsize; i++) {
            err = MCA_PML_CALL(isend(buff, count, datatype, i,
                                     MCA_COLL_BASE_TAG_BCAST,
                                     MCA_PML_BASE_SEND_STANDARD,
                                     comm, &(reqs[i])));
            if (OMPI_SUCCESS != err) {
                return err;
            }
        }
        err = ompi_request_wait_all(rsize, reqs, MPI_STATUSES_IGNORE);
    }


    /* All done */
    return err;
}
int
ompi_osc_pt2pt_component_isend(void *buf,
                               size_t count,
                               struct ompi_datatype_t *datatype,
                               int dest,
                               int tag,
                               struct ompi_communicator_t *comm,
                               ompi_request_t **request,
                               ompi_request_complete_fn_t callback,
                               void *cbdata)
{
    int ret;
    bool missed_callback;
    ompi_request_complete_fn_t tmp;

    ret = MCA_PML_CALL(isend(buf, count, datatype,
                             dest, tag, MCA_PML_BASE_SEND_STANDARD, comm, request));
    if (OMPI_SUCCESS != ret) return ret;

    /* lock the giant request mutex to update the callback data so
       that the PML can't mark the request as complete while we're
       updating the callback data, which means we can
       deterministically ensure the callback is only fired once and
       that we didn't miss it.  */
    OPAL_THREAD_LOCK(&ompi_request_lock);
    (*request)->req_complete_cb = callback;
    (*request)->req_complete_cb_data = cbdata;
    missed_callback = (*request)->req_complete;
    OPAL_THREAD_UNLOCK(&ompi_request_lock);

    if (missed_callback) {
        tmp = (*request)->req_complete_cb;
        (*request)->req_complete_cb = NULL;
        tmp(*request);
    }

    return OMPI_SUCCESS;
}
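The locking discipline in the comment above generalizes beyond OMPI's request objects. A generic pthreads illustration of the install-callback-under-lock pattern, with hypothetical types; this is not the OMPI request machinery.

#include <pthread.h>
#include <stdbool.h>

/* Hypothetical request type: 'lock' is assumed to be initialized (e.g. with
 * PTHREAD_MUTEX_INITIALIZER), and the completing thread sets 'complete' and
 * invokes 'cb' (if non-NULL) while holding the same lock. */
struct fake_req {
    pthread_mutex_t lock;
    bool complete;
    void (*cb)(struct fake_req *);
};

static void install_callback(struct fake_req *r, void (*cb)(struct fake_req *))
{
    bool missed;

    pthread_mutex_lock(&r->lock);
    r->cb = cb;                  /* publish the callback          */
    missed = r->complete;        /* did completion beat us here?  */
    pthread_mutex_unlock(&r->lock);

    if (missed) {
        /* The completer ran before the callback was installed and thus
         * could not have fired it; fire it exactly once ourselves. */
        r->cb = NULL;
        cb(r);
    }
}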
Example #19
static int ompi_comm_allreduce_inter_leader_exchange (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_communicator_t *intercomm = context->cid_context->comm;
    ompi_request_t *subreqs[2];
    int rc;

    /* local leaders exchange their data and determine the overall result
       for both groups */
    rc = MCA_PML_CALL(irecv (context->outbuf, context->count, MPI_INT, 0, OMPI_COMM_ALLREDUCE_TAG,
                             intercomm, subreqs));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    rc = MCA_PML_CALL(isend (context->tmpbuf, context->count, MPI_INT, 0, OMPI_COMM_ALLREDUCE_TAG,
                             MCA_PML_BASE_SEND_STANDARD, intercomm, subreqs + 1));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_inter_leader_reduce, subreqs, 2);
}
Example #20
static int ompi_comm_allreduce_bridged_reduce_complete (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_communicator_t *bridgecomm = context->cid_context->bridgecomm;
    ompi_request_t *subreq[2];
    int rc;

    /* step 2: leader exchange */
    rc = MCA_PML_CALL(irecv (context->outbuf, context->count, MPI_INT, context->cid_context->remote_leader,
                             OMPI_COMM_ALLREDUCE_TAG, bridgecomm, subreq + 1));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    rc = MCA_PML_CALL(isend (context->tmpbuf, context->count, MPI_INT, context->cid_context->remote_leader,
                             OMPI_COMM_ALLREDUCE_TAG, MCA_PML_BASE_SEND_STANDARD, bridgecomm,
                             subreq));
    if (OPAL_UNLIKELY(OMPI_SUCCESS != rc)) {
        return rc;
    }

    return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_bridged_xchng_complete, subreq, 2);
}
Example #21
static int ompi_comm_allreduce_group_recv_complete (ompi_comm_request_t *request)
{
    ompi_comm_allreduce_context_t *context = (ompi_comm_allreduce_context_t *) request->context;
    ompi_comm_cid_context_t *cid_context = context->cid_context;
    int *tmp = context->tmpbuf;
    ompi_request_t *subreq[2];
    int rc;

    for (int i = 0 ; i < 2 ; ++i) {
        if (MPI_PROC_NULL != context->peers_comm[i + 1]) {
            ompi_op_reduce (context->op, tmp, context->outbuf, context->count, MPI_INT);
            tmp += context->count;
        }
    }

    if (MPI_PROC_NULL != context->peers_comm[0]) {
        /* interior node */
        rc = MCA_PML_CALL(isend(context->outbuf, context->count, MPI_INT, context->peers_comm[0],
                                cid_context->pml_tag, MCA_PML_BASE_SEND_STANDARD,
                                cid_context->comm, subreq));
        if (OMPI_SUCCESS != rc) {
            return rc;
        }

        rc = MCA_PML_CALL(irecv(context->outbuf, context->count, MPI_INT, context->peers_comm[0],
                                cid_context->pml_tag, cid_context->comm, subreq + 1));
        if (OMPI_SUCCESS != rc) {
            return rc;
        }

        return ompi_comm_request_schedule_append (request, ompi_comm_allreduce_group_broadcast, subreq, 2);
    }

    /* root */
    return ompi_comm_allreduce_group_broadcast (request);
}
Example #22
static int
mca_coll_basic_neighbor_allgather_cart(const void *sbuf, int scount,
                                       struct ompi_datatype_t *sdtype, void *rbuf,
                                       int rcount, struct ompi_datatype_t *rdtype,
                                       struct ompi_communicator_t *comm,
                                       mca_coll_base_module_t *module)
{
    const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    ompi_request_t **reqs, **preqs;
    ptrdiff_t lb, extent;
    int rc = MPI_SUCCESS, dim, nreqs;

    if( 0 == cart->ndims ) return OMPI_SUCCESS;

    ompi_datatype_get_extent(rdtype, &lb, &extent);

    reqs = preqs = coll_base_comm_get_reqs( module->base_data, 4 * cart->ndims );
    if( NULL == reqs ) { return OMPI_ERR_OUT_OF_RESOURCE; }

    /* The ordering is defined as -1 then +1 in each dimension in
     * order of dimension. */
    for (dim = 0, nreqs = 0 ; dim < cart->ndims ; ++dim) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            nreqs++;
            rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;

            nreqs++;
            /* remove cast from const when the pml layer is updated to take
             * a const for the send buffer. */
            rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }

        rbuf = (char *) rbuf + extent * rcount;

        if (MPI_PROC_NULL != drank) {
            nreqs++;
            rc = MCA_PML_CALL(irecv(rbuf, rcount, rdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;

            nreqs++;
            rc = MCA_PML_CALL(isend((void *) sbuf, scount, sdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLGATHER,
                                    MCA_PML_BASE_SEND_STANDARD,
                                    comm, preqs++));
            if (OMPI_SUCCESS != rc) break;
        }

        rbuf = (char *) rbuf + extent * rcount;
    }

    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, nreqs);
        return rc;
    }

    rc = ompi_request_wait_all (nreqs, reqs, MPI_STATUSES_IGNORE);
    if (OMPI_SUCCESS != rc) {
        ompi_coll_base_free_reqs(reqs, nreqs);
    }
    return rc;
}
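The -1/+1 per-dimension ordering above matches what MPI_Cart_shift reports. A caller-side sketch on a 1-D periodic Cartesian communicator; the helper name is illustrative.

#include <mpi.h>

/* Sketch: 1-D periodic ring built with the Cartesian topology routines;
 * each rank gathers one int from its -1 and +1 neighbors. */
static void cart_neighbor_allgather(MPI_Comm comm, int value, int result[2])
{
    int dims[1] = {0}, periods[1] = {1}, size;
    MPI_Comm cart;

    MPI_Comm_size(comm, &size);
    MPI_Dims_create(size, 1, dims);
    MPI_Cart_create(comm, 1, dims, periods, 0, &cart);

    /* result[0] comes from the -1 neighbor, result[1] from the +1 one. */
    MPI_Neighbor_allgather(&value, 1, MPI_INT, result, 1, MPI_INT, cart);
    MPI_Comm_free(&cart);
}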
Example #23
static inline int NBC_Start_round(NBC_Handle *handle) {
  int num; /* number of operations */
  int res;
  char* ptr;
  MPI_Request *tmp;
  NBC_Fn_type type;
  NBC_Args_send     sendargs;
  NBC_Args_recv     recvargs;
  NBC_Args_op         opargs;
  NBC_Args_copy     copyargs;
  NBC_Args_unpack unpackargs;
  void *buf1,  *buf2;

  /* get round-schedule address */
  ptr = handle->schedule->data + handle->row_offset;

  NBC_GET_BYTES(ptr,num);
  NBC_DEBUG(10, "start_round round at offset %d : posting %i operations\n", handle->row_offset, num);

  for (int i = 0 ; i < num ; ++i) {
    int offset = (intptr_t)(ptr - handle->schedule->data);

    memcpy (&type, ptr, sizeof (type));
    switch(type) {
      case SEND:
        NBC_DEBUG(5,"  SEND (offset %li) ", offset);
        NBC_GET_BYTES(ptr,sendargs);
        NBC_DEBUG(5,"*buf: %p, count: %i, type: %p, dest: %i, tag: %i)\n", sendargs.buf,
                  sendargs.count, sendargs.datatype, sendargs.dest, handle->tag);
        /* get an additional request */
        handle->req_count++;
        /* get buffer */
        if(sendargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)sendargs.buf;
        } else {
          buf1=(void *)sendargs.buf;
        }
#ifdef NBC_TIMING
        Isend_time -= MPI_Wtime();
#endif
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;

        res = MCA_PML_CALL(isend(buf1, sendargs.count, sendargs.datatype, sendargs.dest, handle->tag,
                                 MCA_PML_BASE_SEND_STANDARD, sendargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count - 1));
        if (OMPI_SUCCESS != res) {
          NBC_Error ("Error in MPI_Isend(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, sendargs.count,
                     sendargs.datatype, sendargs.dest, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Isend_time += MPI_Wtime();
#endif
        break;
      case RECV:
        NBC_DEBUG(5, "  RECV (offset %li) ", offset);
        NBC_GET_BYTES(ptr,recvargs);
        NBC_DEBUG(5, "*buf: %p, count: %i, type: %p, source: %i, tag: %i)\n", recvargs.buf, recvargs.count,
                  recvargs.datatype, recvargs.source, handle->tag);
        /* get an additional request - TODO: req_count NOT thread safe */
        handle->req_count++;
        /* get buffer */
        if(recvargs.tmpbuf) {
          buf1=(char*)handle->tmpbuf+(long)recvargs.buf;
        } else {
          buf1=recvargs.buf;
        }
#ifdef NBC_TIMING
        Irecv_time -= MPI_Wtime();
#endif
        tmp = (MPI_Request *) realloc ((void *) handle->req_array, handle->req_count * sizeof (MPI_Request));
        if (NULL == tmp) {
          return OMPI_ERR_OUT_OF_RESOURCE;
        }

        handle->req_array = tmp;

        res = MCA_PML_CALL(irecv(buf1, recvargs.count, recvargs.datatype, recvargs.source, handle->tag, recvargs.local?handle->comm->c_local_comm:handle->comm,
                                 handle->req_array+handle->req_count-1));
        if (OMPI_SUCCESS != res) {
          NBC_Error("Error in MPI_Irecv(%lu, %i, %p, %i, %i, %lu) (%i)", (unsigned long)buf1, recvargs.count,
                    recvargs.datatype, recvargs.source, handle->tag, (unsigned long)handle->comm, res);
          return res;
        }
#ifdef NBC_TIMING
        Irecv_time += MPI_Wtime();
#endif
        break;
      case OP:
        NBC_DEBUG(5, "  OP2  (offset %li) ", offset);
        NBC_GET_BYTES(ptr,opargs);
        NBC_DEBUG(5, "*buf1: %p, buf2: %p, count: %i, type: %p)\n", opargs.buf1, opargs.buf2,
                  opargs.count, opargs.datatype);
        /* get buffers */
        if(opargs.tmpbuf1) {
          buf1=(char*)handle->tmpbuf+(long)opargs.buf1;
        } else {
          buf1=(void *)opargs.buf1;
        }
        if(opargs.tmpbuf2) {
          buf2=(char*)handle->tmpbuf+(long)opargs.buf2;
        } else {
          buf2=opargs.buf2;
        }
        ompi_op_reduce(opargs.op, buf1, buf2, opargs.count, opargs.datatype);
        break;
      case COPY:
        NBC_DEBUG(5, "  COPY   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,copyargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu, tgtcount: %i, tgttype: %p)\n",
                  (unsigned long) copyargs.src, copyargs.srccount, copyargs.srctype,
                  (unsigned long) copyargs.tgt, copyargs.tgtcount, copyargs.tgttype);
        /* get buffers */
        if(copyargs.tmpsrc) {
          buf1=(char*)handle->tmpbuf+(long)copyargs.src;
        } else {
          buf1=copyargs.src;
        }
        if(copyargs.tmptgt) {
          buf2=(char*)handle->tmpbuf+(long)copyargs.tgt;
        } else {
          buf2=copyargs.tgt;
        }
        res = NBC_Copy (buf1, copyargs.srccount, copyargs.srctype, buf2, copyargs.tgtcount, copyargs.tgttype,
                        handle->comm);
        if (OPAL_UNLIKELY(OMPI_SUCCESS != res)) {
          return res;
        }
        break;
      case UNPACK:
        NBC_DEBUG(5, "  UNPACK   (offset %li) ", offset);
        NBC_GET_BYTES(ptr,unpackargs);
        NBC_DEBUG(5, "*src: %lu, srccount: %i, srctype: %p, *tgt: %lu\n", (unsigned long) unpackargs.inbuf,
                  unpackargs.count, unpackargs.datatype, (unsigned long) unpackargs.outbuf);
        /* get buffers */
        if(unpackargs.tmpinbuf) {
          buf1=(char*)handle->tmpbuf+(long)unpackargs.inbuf;
        } else {
          buf1=unpackargs.inbuf;
        }
        if(unpackargs.tmpoutbuf) {
          buf2=(char*)handle->tmpbuf+(long)unpackargs.outbuf;
        } else {
          buf2=unpackargs.outbuf;
        }
        res = NBC_Unpack (buf1, unpackargs.count, unpackargs.datatype, buf2, handle->comm);
        if (OMPI_SUCCESS != res) {
          NBC_Error ("NBC_Unpack() failed (code: %i)", res);
          return res;
        }

        break;
      default:
        NBC_Error ("NBC_Start_round: bad type %li at offset %li", (long)type, offset);
        return OMPI_ERROR;
    }
  }

  /* check if we can make progress - not in the first round; this allows us to leave the
   * initialization faster and to reach more overlap
   *
   * threaded case: calling progress in the first round can lead to a
   * deadlock if NBC_Free is called in this round :-( */
  if (handle->row_offset) {
    res = NBC_Progress(handle);
    if ((NBC_OK != res) && (NBC_CONTINUE != res)) {
      return OMPI_ERROR;
    }
  }

  return OMPI_SUCCESS;
}
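From the application's side, a schedule like this is driven through the nonblocking collective interface and progressed with test calls; a minimal sketch of overlapping work with an MPI_Ibarrier.

#include <mpi.h>

/* Sketch: start a nonblocking barrier and progress it while doing other
 * work, instead of blocking in MPI_Barrier. */
static void overlap_with_barrier(MPI_Comm comm)
{
    MPI_Request req;
    int done = 0;

    MPI_Ibarrier(comm, &req);
    while (!done) {
        /* ... application work that overlaps the collective ... */
        MPI_Test(&req, &done, MPI_STATUS_IGNORE);
    }
}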
Example #24
static int send_nb( dte_data_representation_t data,
                    uint32_t count,
                    void *buffer,
                    rte_ec_handle_t ec_h,
                    rte_grp_handle_t grp_h,
                    uint32_t tag,
                    rte_request_handle_t *req)
{
    ompi_communicator_t *comm = (ompi_communicator_t *)grp_h;

    if (! ec_h.handle) {
        fprintf(stderr,"***Error in hcolrte_rml_send_nb: wrong null argument: "
                "ec_h.handle = %p, ec_h.rank = %d\n",ec_h.handle,ec_h.rank);
        return 1;
    }
    if (HCOL_DTE_IS_INLINE(data)) {
        /*do inline nb recv*/
        size_t size;
        ompi_request_t *ompi_req;
        if (!buffer && !HCOL_DTE_IS_ZERO(data)) {
            fprintf(stderr, "***Error in hcolrte_rml_send_nb: buffer pointer is NULL"
                    " for non DTE_ZERO INLINE data representation\n");
            return 1;
        }
        size = (size_t)data.rep.in_line_rep.data_handle.in_line.packed_size*count/8;
        HCOL_VERBOSE(30,"PML_ISEND: dest = %d: buf = %p: size = %u: comm = %p",
                     ec_h.rank, buffer, (unsigned int)size, (void *)comm);
        if (MCA_PML_CALL(isend(buffer,size,&(ompi_mpi_unsigned_char.dt),ec_h.rank,
                               tag,MCA_PML_BASE_SEND_STANDARD,comm,&ompi_req)))
        {
            return 1;
        }
        req->data = (void *)ompi_req;
        req->status = HCOLRTE_REQUEST_ACTIVE;
    } else {
        int total_entries_number;
        int i;
        unsigned int j;
        void *buf;
        uint64_t len;
        int repeat_count;
        struct dte_struct_t * repeat;
        if (NULL != buffer) {
            /* We have a full data description & buffer pointer simultaneously.
               It is ambiguous. Throw a warning since the user might have made a
               mistake with data reps*/
            fprintf(stderr,"Warning: buffer_pointer != NULL for NON-inline data representation: buffer_pointer is ignored.\n");
        }
        total_entries_number = count_total_dte_repeat_entries(&data);
        repeat = data.rep.general_rep->data_representation.data->repeat;
        repeat_count = data.rep.general_rep->data_representation.data->repeat_count;
        for (i=0; i< repeat_count; i++) {
            for (j=0; j<repeat[i].n_elements; j++) {
                char *repeat_unit = (char *)&repeat[i];
                buf = (void *)(repeat_unit+repeat[i].elements[j].base_offset);
                len = repeat[i].elements[j].packed_size;
                send_nb(DTE_BYTE,len,buf,ec_h,grp_h,tag,req);
            }
        }
    }
    return HCOLL_SUCCESS;
}
Example #25
int ompi_coll_base_sendrecv_nonzero_actual( void* sendbuf, size_t scount,
                                             ompi_datatype_t* sdatatype,
                                             int dest, int stag,
                                             void* recvbuf, size_t rcount,
                                             ompi_datatype_t* rdatatype,
                                             int source, int rtag,
                                             struct ompi_communicator_t* comm,
                                             ompi_status_public_t* status )

{ /* post receive first, then send, then waitall... should be fast (I hope) */
    int err, line = 0, nreqs = 0;
    size_t typesize;
    ompi_request_t* reqs[2], **req = reqs;
    ompi_status_public_t statuses[2];

    /* post new irecv */
    ompi_datatype_type_size(rdatatype, &typesize);
    if (0 != rcount && 0 != typesize) {
        err = MCA_PML_CALL(irecv( recvbuf, rcount, rdatatype, source, rtag,
                                  comm, req++));
        ++nreqs;
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
    }

    /* send data to children */
    ompi_datatype_type_size(sdatatype, &typesize);
    if (0 != scount && 0 != typesize) {
        err = MCA_PML_CALL(isend( sendbuf, scount, sdatatype, dest, stag,
                                  MCA_PML_BASE_SEND_STANDARD, comm, req++));
        ++nreqs;
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }
    }

    if (0 != nreqs) {
        err = ompi_request_wait_all( nreqs, reqs, statuses );
        if( MPI_ERR_IN_STATUS == err ) {
            /* As we use wait_all we will get MPI_ERR_IN_STATUS which is not an error
             * code that we can propagate up the stack. Instead, look for the real
             * error code from the MPI_ERROR in the status.
             */
            int err_index = 0;
            if( MPI_SUCCESS == statuses[0].MPI_ERROR ) {
                err_index = 1;
            }
            if (MPI_STATUS_IGNORE != status) {
                *status = statuses[err_index];
            }
            err = statuses[err_index].MPI_ERROR;
            OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred in the %s"
                                                  " stage of ompi_coll_base_sendrecv_zero\n",
                          __FILE__, line, err, (0 == err_index ? "receive" : "send")));
            return err;
        }
        if (err != MPI_SUCCESS) { line = __LINE__; goto error_handler; }

        if (MPI_STATUS_IGNORE != status) {
            *status = statuses[0];
        }
    } else {
        if( MPI_STATUS_IGNORE != status )
            *status = ompi_status_empty;
    }

    return (MPI_SUCCESS);

 error_handler:
    /* Error discovered during the posting of the irecv or isend,
     * and no status is available.
     */
    OPAL_OUTPUT ((ompi_coll_base_framework.framework_output, "%s:%d: Error %d occurred\n",
                  __FILE__, line, err));
    if (MPI_STATUS_IGNORE != status) {
        status->MPI_ERROR = err;
    }
    return (err);
}
Example #26
 int
 mca_fcoll_dynamic_file_read_all (mca_io_ompio_file_t *fh,
				  void *buf,
				  int count,
				  struct ompi_datatype_t *datatype,
				  ompi_status_public_t *status)
 {
     MPI_Aint position = 0;
     MPI_Aint total_bytes = 0;          /* total bytes to be read */
     MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
     MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
     int index = 0, ret=OMPI_SUCCESS;
     int cycles = 0;
     int i=0, j=0, l=0;
     int n=0; /* current position in total_bytes_per_process array */
     MPI_Aint bytes_remaining = 0; /* how many bytes have been read from the current
					 value of total_bytes_per_process */
     int *sorted_file_offsets=NULL, entries_per_aggregator=0;
     int bytes_received = 0;
     int blocks = 0;
     /* iovec structure and count of the buffer passed in */
     uint32_t iov_count = 0;
     struct iovec *decoded_iov = NULL;
     int iov_index = 0;
     size_t current_position = 0;
     struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
     char *receive_buf = NULL;
     MPI_Aint *memory_displacements=NULL;
     /* global iovec at the readers that contain the iovecs created from
	file_set_view */
     uint32_t total_fview_count = 0;
     int local_count = 0;
     int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
     int current_index=0, temp_index=0;
     int **blocklen_per_process=NULL;
     MPI_Aint **displs_per_process=NULL;
     char *global_buf = NULL;
     MPI_Aint global_count = 0;
     local_io_array *file_offsets_for_agg=NULL;

     /* array that contains the sorted indices of the global_iov */
     int *sorted = NULL;
     int *displs = NULL;
     int dynamic_num_io_procs;
     size_t max_data = 0;
     int *bytes_per_process = NULL;
     MPI_Aint *total_bytes_per_process = NULL;
     ompi_datatype_t **sendtype = NULL;
     MPI_Request *send_req=NULL, *recv_req=NULL;


 #if TIME_BREAKDOWN
     double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
     double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
     double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
     print_entry nentry;
 #endif


//     if (opal_datatype_is_contiguous_memory_layout(&datatype->super,1)) {
//	 fh->f_flags |= OMPIO_CONTIGUOUS_MEMORY;
//     }
     /**************************************************************************
      ** In case the data is not contiguous in memory, decode it into an iovec **
      **************************************************************************/
     if (! (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	 ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
				    datatype,
				    count,
				    buf,
				    &max_data,
				    &decoded_iov,
				    &iov_count);
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }
     }
     else {
	 max_data = count * datatype->super.size;
     }

     if ( MPI_STATUS_IGNORE != status ) {
	 status->_ucount = max_data;
     }

     fh->f_get_num_aggregators ( &dynamic_num_io_procs);
     ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
				       dynamic_num_io_procs,
				       max_data);
     if (OMPI_SUCCESS != ret){
	 goto exit;
     }

     total_bytes_per_process = (MPI_Aint*)malloc
	 (fh->f_procs_per_group*sizeof(MPI_Aint));
     if (NULL == total_bytes_per_process) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     ret = fh->f_allgather_array (&max_data,
				  1,
				  MPI_LONG,
				  total_bytes_per_process,
				  1,
				  MPI_LONG,
				  fh->f_aggregator_index,
				  fh->f_procs_in_group,
				  fh->f_procs_per_group,
				  fh->f_comm);
     if (OMPI_SUCCESS != ret){
       goto exit;
     }

     for (i=0 ; i<fh->f_procs_per_group ; i++) {
	 total_bytes += total_bytes_per_process[i];
     }

     if (NULL != total_bytes_per_process) {
	 free (total_bytes_per_process);
	 total_bytes_per_process = NULL;
     }

     /*********************************************************************
      *** Generate the File offsets/lengths corresponding to this read ***
      ********************************************************************/
     ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
					     max_data,
					     &local_iov_array,
					     &local_count);

     if (ret != OMPI_SUCCESS){
	 goto exit;
     }



     /* #########################################################*/

     /*************************************************************
      *** ALLGather the File View information at all processes ***
      *************************************************************/

     fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
     if (NULL == fview_count) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     ret = fh->f_allgather_array (&local_count,
				  1,
				  MPI_INT,
				  fview_count,
				  1,
				  MPI_INT,
				  fh->f_aggregator_index,
				  fh->f_procs_in_group,
				  fh->f_procs_per_group,
				  fh->f_comm);

     if (OMPI_SUCCESS != ret){
	 goto exit;
     }

     displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
     if (NULL == displs) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
     }

     displs[0] = 0;
     total_fview_count = fview_count[0];
     for (i=1 ; i<fh->f_procs_per_group ; i++) {
	 total_fview_count += fview_count[i];
	 displs[i] = displs[i-1] + fview_count[i-1];
     }

 #if DEBUG_ON
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 for (i=0 ; i<fh->f_procs_per_group ; i++) {
	     printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
		     fh->f_rank,
		     i,
		     fview_count[i],
		     displs[i]);
	 }
     }
 #endif

     /* allocate the global iovec  */
     if (0 != total_fview_count) {
       global_iov_array = (struct iovec*)malloc (total_fview_count *
						 sizeof(struct iovec));
       if (NULL == global_iov_array) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }
     }

     ret =  fh->f_allgatherv_array (local_iov_array,
				    local_count,
				    fh->f_iov_type,
				    global_iov_array,
				    fview_count,
				    displs,
				    fh->f_iov_type,
				    fh->f_aggregator_index,
				    fh->f_procs_in_group,
				    fh->f_procs_per_group,
				    fh->f_comm);

     if (OMPI_SUCCESS != ret){
       goto exit;
     }

     /* sort it */
     if (0 != total_fview_count) {
	 sorted = (int *)malloc (total_fview_count * sizeof(int));
	 if (NULL == sorted) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	 }
	 fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
     }

     if (NULL != local_iov_array) {
	 free (local_iov_array);
	 local_iov_array = NULL;
     }

 #if DEBUG_ON
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 for (i=0 ; i<total_fview_count ; i++) {
	     printf("%d: OFFSET: %p   LENGTH: %d\n",
		    fh->f_rank,
		    global_iov_array[sorted[i]].iov_base,
		    global_iov_array[sorted[i]].iov_len);
	 }
     }
 #endif

     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

       disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
       if (NULL == disp_index) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
       if (NULL == blocklen_per_process) {
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
       if (NULL == displs_per_process){
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       for (i=0;i<fh->f_procs_per_group;i++){
	 blocklen_per_process[i] = NULL;
	 displs_per_process[i] = NULL;
       }
     }


     /*
      * Calculate how many bytes are read in each cycle
      */
     fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
     cycles = ceil((double)total_bytes/bytes_per_cycle);

     n = 0;
     bytes_remaining = 0;
     current_index = 0;


 #if TIME_BREAKDOWN
     start_rexch = MPI_Wtime();
 #endif
     for (index = 0; index < cycles; index++) {
       /* Getting ready for next cycle
	  Initializing and freeing buffers */
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 if (NULL == sendtype){
	   sendtype = (ompi_datatype_t **)
	     malloc (fh->f_procs_per_group  * sizeof(ompi_datatype_t *));
	   if (NULL == sendtype) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	 }

	 for(l=0;l<fh->f_procs_per_group;l++){

	   disp_index[l] =  1;

	   if (NULL != blocklen_per_process[l]){
	     free(blocklen_per_process[l]);
	     blocklen_per_process[l] = NULL;
	   }
	   if (NULL != displs_per_process[l]){
	     free(displs_per_process[l]);
	     displs_per_process[l] = NULL;
	   }
	   blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
	   if (NULL == blocklen_per_process[l]) {
	     opal_output (1, "OUT OF MEMORY for blocklen\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
	   if (NULL == displs_per_process[l]){
	     opal_output (1, "OUT OF MEMORY for displs\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	 }

	 if (NULL != sorted_file_offsets){
	   free(sorted_file_offsets);
	   sorted_file_offsets = NULL;
	 }

	 if(NULL != file_offsets_for_agg){
	   free(file_offsets_for_agg);
	   file_offsets_for_agg = NULL;
	 }
	 if (NULL != memory_displacements){
	   free(memory_displacements);
	   memory_displacements = NULL;
	 }
       }


       if (cycles-1 == index) {
	 bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
       }
       else {
	 bytes_to_read_in_cycle = bytes_per_cycle;
       }

 #if DEBUG_ON
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 printf ("****%d: CYCLE %d   Bytes %d**********\n",
		 fh->f_rank,
		 index,
		 bytes_to_read_in_cycle);
       }
 #endif

       /* Calculate how much data will be contributed in this cycle
	    by each process*/
       bytes_received = 0;

       while (bytes_to_read_in_cycle) {
	 blocks = fview_count[0];
	 for (j=0 ; j<fh->f_procs_per_group ; j++) {
	   if (sorted[current_index] < blocks) {
	     n = j;
	     break;
	   }
	   else {
	     blocks += fview_count[j+1];
	   }
	 }
	 if (bytes_remaining) {
	   if (bytes_remaining <= bytes_to_read_in_cycle) {

	     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
		 (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_remaining;
	     }
	     current_index ++;
	     bytes_to_read_in_cycle -= bytes_remaining;
	     bytes_remaining = 0;
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {
	       blocklen_per_process[n] = (int *) realloc
		 ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	       displs_per_process[n] = (MPI_Aint *) realloc
		 ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	       blocklen_per_process[n][disp_index[n]] = 0;
	       displs_per_process[n][disp_index[n]] = 0;
	       disp_index[n] += 1;
	     }
	     continue;
	   }
	   else {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
		 (global_iov_array[sorted[current_index]].iov_len
		  - bytes_remaining);
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_to_read_in_cycle;
	     }
	     bytes_remaining -= bytes_to_read_in_cycle;
	     bytes_to_read_in_cycle = 0;
	     break;
	   }
	 }
	 else {
	   if (bytes_to_read_in_cycle <
		   (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {

	       blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
	       displs_per_process[n][disp_index[n] - 1] =
		 (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
	     }

	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received += bytes_to_read_in_cycle;
	     }
	     bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
	       bytes_to_read_in_cycle;
	     bytes_to_read_in_cycle = 0;
	     break;
	   }
	   else {
	     if (fh->f_procs_in_group[fh->f_aggregator_index] ==
		 fh->f_rank) {
	       blocklen_per_process[n][disp_index[n] - 1] =
		 global_iov_array[sorted[current_index]].iov_len;
	       displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
		 global_iov_array[sorted[current_index]].iov_base;
	       blocklen_per_process[n] =
		 (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
	       displs_per_process[n] = (MPI_Aint *)realloc
		 ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
	       blocklen_per_process[n][disp_index[n]] = 0;
	       displs_per_process[n][disp_index[n]] = 0;
	       disp_index[n] += 1;
	     }
	     if (fh->f_procs_in_group[n] == fh->f_rank) {
	       bytes_received +=
		 global_iov_array[sorted[current_index]].iov_len;
	     }
	     bytes_to_read_in_cycle -=
	       global_iov_array[sorted[current_index]].iov_len;
	     current_index ++;
	     continue;
	   }
	 }
       }
       /* Calculate the displacement on where to put the data and allocate
	  the receive buffer (global_buf) */
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {
	 entries_per_aggregator=0;
	 for (i=0;i<fh->f_procs_per_group; i++){
	   for (j=0;j<disp_index[i];j++){
	     if (blocklen_per_process[i][j] > 0)
	       entries_per_aggregator++ ;
	   }
	 }
	 if (entries_per_aggregator > 0){
	   file_offsets_for_agg = (local_io_array *)
	     malloc(entries_per_aggregator*sizeof(local_io_array));
	   if (NULL == file_offsets_for_agg) {
	     opal_output (1, "OUT OF MEMORY\n");
	     ret = OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   sorted_file_offsets = (int *)
	     malloc (entries_per_aggregator*sizeof(int));
	   if (NULL == sorted_file_offsets){
	     opal_output (1, "OUT OF MEMORY\n");
	     ret =  OMPI_ERR_OUT_OF_RESOURCE;
	     goto exit;
	   }
	   /*Moving file offsets to an IO array!*/
	   temp_index = 0;
	   global_count = 0;
	   for (i=0;i<fh->f_procs_per_group; i++){
	     for(j=0;j<disp_index[i];j++){
	       if (blocklen_per_process[i][j] > 0){
		   file_offsets_for_agg[temp_index].length =
		     blocklen_per_process[i][j];
		   global_count += blocklen_per_process[i][j];
		   file_offsets_for_agg[temp_index].process_id = i;
		   file_offsets_for_agg[temp_index].offset =
		     displs_per_process[i][j];
		   temp_index++;
	       }
	     }
	   }
	 }
	 else{
	   continue;
	 }

	 read_heap_sort (file_offsets_for_agg,
			 entries_per_aggregator,
			 sorted_file_offsets);

	 memory_displacements = (MPI_Aint *) malloc
	   (entries_per_aggregator * sizeof(MPI_Aint));
	 memory_displacements[sorted_file_offsets[0]] = 0;
	 for (i=1; i<entries_per_aggregator; i++){
	   memory_displacements[sorted_file_offsets[i]] =
	     memory_displacements[sorted_file_offsets[i-1]] +
	     file_offsets_for_agg[sorted_file_offsets[i-1]].length;
	 }

	 global_buf = (char *) malloc (global_count * sizeof(char));
	 if (NULL == global_buf){
	   opal_output(1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }

	  fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
	    (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
	  if (NULL == fh->f_io_array) {
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	  }

	 fh->f_num_of_io_entries = 0;
	 fh->f_io_array[fh->f_num_of_io_entries].offset =
	     (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
	 fh->f_io_array[fh->f_num_of_io_entries].length =
	   file_offsets_for_agg[sorted_file_offsets[0]].length;
	 fh->f_io_array[fh->f_num_of_io_entries].memory_address =
	   global_buf+memory_displacements[sorted_file_offsets[0]];
	 fh->f_num_of_io_entries++;
	 for (i=1;i<entries_per_aggregator;i++){
	   if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
	       file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
	       file_offsets_for_agg[sorted_file_offsets[i]].offset){
	     fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
	       file_offsets_for_agg[sorted_file_offsets[i]].length;
	   }
	   else{
	     fh->f_io_array[fh->f_num_of_io_entries].offset =
		 (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
	     fh->f_io_array[fh->f_num_of_io_entries].length =
	       file_offsets_for_agg[sorted_file_offsets[i]].length;
	     fh->f_io_array[fh->f_num_of_io_entries].memory_address =
	       global_buf+memory_displacements[sorted_file_offsets[i]];
	     fh->f_num_of_io_entries++;
	   }
	 }


 #if TIME_BREAKDOWN
	 start_read_time = MPI_Wtime();
 #endif

	 if (fh->f_num_of_io_entries) {
	   if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
	     opal_output (1, "READ FAILED\n");
	     ret = OMPI_ERROR;
	     goto exit;
	   }
	 }

 #if TIME_BREAKDOWN
	 end_read_time = MPI_Wtime();
	 read_time += end_read_time - start_read_time;
 #endif
	 /**********************************************************
	  ******************** DONE READING ************************
	  *********************************************************/

	 temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
	 if (NULL == temp_disp_index) {
	   opal_output (1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
	 for (i=0; i<entries_per_aggregator; i++){
	   temp_index =
	     file_offsets_for_agg[sorted_file_offsets[i]].process_id;
	   displs_per_process[temp_index][temp_disp_index[temp_index]] =
	     memory_displacements[sorted_file_offsets[i]];
	   if (temp_disp_index[temp_index] < disp_index[temp_index]){
	     temp_disp_index[temp_index] += 1;
	   }
	   else{
	     printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
		    temp_index, temp_disp_index[temp_index],
		    temp_index, disp_index[temp_index]);
	   }
	 }
	 if (NULL != temp_disp_index){
	   free(temp_disp_index);
	   temp_disp_index = NULL;
	 }

	 send_req = (MPI_Request *)
	   malloc (fh->f_procs_per_group * sizeof(MPI_Request));
	 if (NULL == send_req){
	   opal_output ( 1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
 #if TIME_BREAKDOWN
	 start_rcomm_time = MPI_Wtime();
 #endif
	 for (i=0;i<fh->f_procs_per_group;i++){
	   ompi_datatype_create_hindexed(disp_index[i],
					 blocklen_per_process[i],
					 displs_per_process[i],
					 MPI_BYTE,
					 &sendtype[i]);
	   ompi_datatype_commit(&sendtype[i]);
	   ret = MCA_PML_CALL (isend(global_buf,
				     1,
				     sendtype[i],
				     fh->f_procs_in_group[i],
				     123,
				     MCA_PML_BASE_SEND_STANDARD,
				     fh->f_comm,
				     &send_req[i]));
	   if(OMPI_SUCCESS != ret){
	       goto exit;
	   }
	 }
 #if TIME_BREAKDOWN
	 end_rcomm_time = MPI_Wtime();
	 rcomm_time += end_rcomm_time - start_rcomm_time;
 #endif
       }

       /**********************************************************
	********* Scatter the Data from the readers **************
	*********************************************************/
       if (fh->f_flags & OMPIO_CONTIGUOUS_MEMORY) {
	 receive_buf = &((char*)buf)[position];
       }
       else if (bytes_received) {
	 /* allocate a receive buffer and copy the data that needs
	    to be received into it in case the data is non-contiguous
	    in memory */
	 receive_buf = malloc (bytes_received);
	 if (NULL == receive_buf) {
	   opal_output (1, "OUT OF MEMORY\n");
	   ret = OMPI_ERR_OUT_OF_RESOURCE;
	   goto exit;
	 }
       }

 #if TIME_BREAKDOWN
       start_rcomm_time = MPI_Wtime();
 #endif
       recv_req = (MPI_Request *) malloc (sizeof (MPI_Request));
       if (NULL == recv_req){
	 opal_output (1, "OUT OF MEMORY\n");
	 ret = OMPI_ERR_OUT_OF_RESOURCE;
	 goto exit;
       }

       ret = MCA_PML_CALL(irecv(receive_buf,
				bytes_received,
				MPI_BYTE,
				fh->f_procs_in_group[fh->f_aggregator_index],
				123,
				fh->f_comm,
				recv_req));
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }


       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	 ret = ompi_request_wait_all (fh->f_procs_per_group,
				      send_req,
				      MPI_STATUSES_IGNORE);
	 if (OMPI_SUCCESS != ret){
	   goto exit;
	 }
       }

       ret = ompi_request_wait (recv_req, MPI_STATUS_IGNORE);
       if (OMPI_SUCCESS != ret){
	 goto exit;
       }
       position += bytes_received;

       /* If data is not contiguous in memory, copy the data from the
	  receive buffer into the buffer passed in */
       if (!(fh->f_flags & OMPIO_CONTIGUOUS_MEMORY)) {
	 OPAL_PTRDIFF_TYPE mem_address;
	 size_t remaining = 0;
	 size_t temp_position = 0;

	 remaining = bytes_received;

	 while (remaining) {
	   mem_address = (OPAL_PTRDIFF_TYPE)
	     (decoded_iov[iov_index].iov_base) + current_position;

	   if (remaining >=
	       (decoded_iov[iov_index].iov_len - current_position)) {
	     memcpy ((IOVBASE_TYPE *) mem_address,
		     receive_buf+temp_position,
		     decoded_iov[iov_index].iov_len - current_position);
	     remaining = remaining -
	       (decoded_iov[iov_index].iov_len - current_position);
	     temp_position = temp_position +
	       (decoded_iov[iov_index].iov_len - current_position);
	     iov_index = iov_index + 1;
	     current_position = 0;
	   }
	   else {
	     memcpy ((IOVBASE_TYPE *) mem_address,
		     receive_buf+temp_position,
		     remaining);
	     current_position = current_position + remaining;
	     remaining = 0;
	   }
	 }

	 if (NULL != receive_buf) {
	   free (receive_buf);
	   receive_buf = NULL;
	 }
       }
#if TIME_BREAKDOWN
       end_rcomm_time = MPI_Wtime();
       rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

       if (NULL != recv_req){
	 free(recv_req);
	 recv_req = NULL;
       }
       if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank){
	 fh->f_num_of_io_entries = 0;
	 if (NULL != fh->f_io_array) {
	   free (fh->f_io_array);
	   fh->f_io_array = NULL;
	 }
	 if (NULL != global_buf) {
	   free (global_buf);
	   global_buf = NULL;
	 }
	 for (i = 0; i < fh->f_procs_per_group; i++)
	   ompi_datatype_destroy(sendtype+i);
	 if (NULL != sendtype){
	   free(sendtype);
	   sendtype=NULL;
	 }
	 if (NULL != send_req){
	   free(send_req);
	   send_req = NULL;
	 }
	 if (NULL != sorted_file_offsets){
	   free(sorted_file_offsets);
	   sorted_file_offsets = NULL;
	 }
	 if (NULL != file_offsets_for_agg){
	   free(file_offsets_for_agg);
	   file_offsets_for_agg = NULL;
	 }
	 if (NULL != bytes_per_process){
	   free(bytes_per_process);
	   bytes_per_process =NULL;
	 }
	 if (NULL != memory_displacements){
	   free(memory_displacements);
	   memory_displacements= NULL;
	 }
       }
     }

 #if TIME_BREAKDOWN
     end_rexch = MPI_Wtime();
     read_exch += end_rexch - start_rexch;
     nentry.time[0] = read_time;
     nentry.time[1] = rcomm_time;
     nentry.time[2] = read_exch;
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank)
       nentry.aggregator = 1;
     else
       nentry.aggregator = 0;
     nentry.nprocs_for_coll = dynamic_num_io_procs;
     if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
       fh->f_register_print_entry(READ_PRINT_QUEUE,
				  nentry);
     }
 #endif

 exit:
     if (NULL != sorted) {
       free (sorted);
       sorted = NULL;
     }
     if (NULL != global_iov_array) {
       free (global_iov_array);
       global_iov_array = NULL;
     }
     if (NULL != fview_count) {
       free (fview_count);
       fview_count = NULL;
     }
     if (NULL != decoded_iov) {
       free (decoded_iov);
       decoded_iov = NULL;
     }
     if (NULL != local_iov_array){
       free(local_iov_array);
       local_iov_array=NULL;
     }

     if (NULL != displs) {
       free (displs);
       displs = NULL;
     }
     if (fh->f_procs_in_group[fh->f_aggregator_index] == fh->f_rank) {

       if (NULL != disp_index){
	 free(disp_index);
	 disp_index = NULL;
       }

       if ( NULL != blocklen_per_process){
	 for(l=0;l<fh->f_procs_per_group;l++){
	   if (NULL != blocklen_per_process[l]){
	     free(blocklen_per_process[l]);
	     blocklen_per_process[l] = NULL;
	   }
	 }

	 free(blocklen_per_process);
	 blocklen_per_process = NULL;
       }

       if (NULL != displs_per_process){
	 for (l=0; l<fh->f_procs_per_group; l++){
	   if (NULL != displs_per_process[l]){
	     free(displs_per_process[l]);
	     displs_per_process[l] = NULL;
	   }
	 }
	 free(displs_per_process);
	 displs_per_process = NULL;
       }

     }
     return ret;
 }
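The scatter phase of the collective read above has the aggregator describe the non-contiguous pieces of its read buffer with a hindexed datatype and isend one instance of it to each group member, while every member posts a plain-byte irecv. Below is a minimal sketch of that pattern using only the public MPI API; the two-rank layout, buffer contents, and tag are invented for illustration and are not the OMPIO internals.

#include <mpi.h>

/* rank 0 plays the aggregator, rank 1 a group member */
void scatter_from_aggregator(void)
{
    int rank;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    char global_buf[] = "abcdef";   /* data the aggregator read from file */
    char recv_buf[4];

    if (0 == rank) {
        /* two non-contiguous pieces of global_buf go to rank 1 */
        int          blocklen[2] = { 2, 2 };
        MPI_Aint     disps[2]    = { 0, 4 };   /* byte offsets into global_buf */
        MPI_Datatype sendtype;
        MPI_Request  req;

        MPI_Type_create_hindexed(2, blocklen, disps, MPI_BYTE, &sendtype);
        MPI_Type_commit(&sendtype);
        MPI_Isend(global_buf, 1, sendtype, 1, 123, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);
        MPI_Type_free(&sendtype);
    } else if (1 == rank) {
        /* the receiver just asks for the raw byte count it expects */
        MPI_Request req;
        MPI_Irecv(recv_buf, 4, MPI_BYTE, 0, 123, MPI_COMM_WORLD, &req);
        MPI_Wait(&req, MPI_STATUS_IGNORE);    /* recv_buf now holds "abef" */
    }
}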
static int
mca_coll_basic_neighbor_alltoallv_cart(const void *sbuf, const int scounts[], const int sdisps[],
                                       struct ompi_datatype_t *sdtype, void *rbuf, const int rcounts[],
                                       const int rdisps[], struct ompi_datatype_t *rdtype,
                                       struct ompi_communicator_t *comm, mca_coll_base_module_t *module)
{
    mca_coll_basic_module_t *basic_module = (mca_coll_basic_module_t *) module;
    const mca_topo_base_comm_cart_2_2_0_t *cart = comm->c_topo->mtc.cart;
    const int rank = ompi_comm_rank (comm);
    int rc = MPI_SUCCESS, dim, i, nreqs;
    ptrdiff_t lb, rdextent, sdextent;
    ompi_request_t **reqs;

    /* ensure we have enough storage for requests */
    rc = mca_coll_basic_check_for_requests (basic_module, cart->ndims * 4);
    if (OMPI_SUCCESS != rc) {
        return rc;
    }

    ompi_datatype_get_extent(rdtype, &lb, &rdextent);
    ompi_datatype_get_extent(sdtype, &lb, &sdextent);

    /* post receives first */
    for (dim = 0, nreqs = 0, i = 0, reqs = basic_module->mccb_reqs ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i] * rdextent, rcounts[i], rdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(irecv((char *) rbuf + rdisps[i+1] * rdextent, rcounts[i+1], rdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    for (dim = 0, i = 0 ; dim < cart->ndims ; ++dim, i += 2) {
        int srank = MPI_PROC_NULL, drank = MPI_PROC_NULL;

        if (cart->dims[dim] > 1) {
            mca_topo_base_cart_shift (comm, dim, 1, &srank, &drank);
        } else if (1 == cart->dims[dim] && cart->periods[dim]) {
            srank = drank = rank;
        }

        if (MPI_PROC_NULL != srank) {
            /* remove cast from const when the pml layer is updated to take a const for the send buffer */
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i] * sdextent, scounts[i], sdtype, srank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }

        if (MPI_PROC_NULL != drank) {
            rc = MCA_PML_CALL(isend((char *) sbuf + sdisps[i+1] * sdextent, scounts[i+1], sdtype, drank,
                                    MCA_COLL_BASE_TAG_ALLTOALL, MCA_PML_BASE_SEND_STANDARD, comm, reqs++));
            if (OMPI_SUCCESS != rc) break;
            nreqs++;
        }
    }

    if (OMPI_SUCCESS != rc) {
        /* should probably try to clean up here */
        return rc;
    }

    return ompi_request_wait_all (nreqs, basic_module->mccb_reqs, MPI_STATUSES_IGNORE);
}
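The routine above is the Cartesian-topology path of the basic collective component behind MPI_Neighbor_alltoallv: for each dimension it posts one irecv/isend pair, so the counts and displacements arrays are indexed two per dimension (negative direction first, then positive). A hedged usage sketch of the user-level call it serves follows; the 2-D periodic grid and the one-int-per-neighbor layout are assumptions made for illustration.

#include <mpi.h>

void cart_halo_exchange(void)
{
    MPI_Comm cart;
    int nprocs, dims[2] = { 0, 0 }, periods[2] = { 1, 1 };

    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);
    MPI_Dims_create(nprocs, 2, dims);
    MPI_Cart_create(MPI_COMM_WORLD, 2, dims, periods, 0, &cart);

    /* one int to/from each of the 4 neighbors, ordered (-d0, +d0, -d1, +d1) */
    int sbuf[4] = { 10, 11, 12, 13 }, rbuf[4];
    int counts[4] = { 1, 1, 1, 1 };
    int displs[4] = { 0, 1, 2, 3 };

    MPI_Neighbor_alltoallv(sbuf, counts, displs, MPI_INT,
                           rbuf, counts, displs, MPI_INT, cart);

    MPI_Comm_free(&cart);
}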
Example No. 28
0
static int parse_file(const char *section, char *key, const char *buf,int *sec_s,int *sec_e,
					  int *key_s,int *key_e, int *value_s, int *value_e)
{
	const char *p = buf;
	int i=0;

	assert(buf!=NULL);
	assert(section != NULL && strlen(section));
	assert(key != NULL && strlen(key));

	*sec_e = *sec_s = *key_e = *key_s = *value_s = *value_e = -1;

	while( !isend(p[i]) ) {
		//find the section
		if( ( 0==i ||  isnewline(p[i-1]) ) && isleftbarce(p[i]) )
		{
			int section_start=i+1;

			//find the ']'
			do {
				i++;
			} while( !isrightbrace(p[i]) && !isend(p[i]));

			if( 0 == strncmp(p+section_start,section, i-section_start)) {
				int newline_start=0;

				i++;

				//Skip over space char after ']'
				while(isspace(p[i])) {
					i++;
				}

				//find the section
				*sec_s = section_start;
				*sec_e = i;

				while( ! (isnewline(p[i-1]) && isleftbarce(p[i])) 
				&& !isend(p[i]) ) {
					int j=0;
					//get a new line
					newline_start = i;

					while( !isnewline(p[i]) &&  !isend(p[i]) ) {
						i++;
					}
					
					//now i  is equal to end of the line
					j = newline_start;

					if(';' != p[j]) //skip over comment
					{
						while(j < i && p[j]!='=') {
							j++;
							if('=' == p[j]) {
								if(strncmp(key,p+newline_start,j-newline_start)==0)
								{
									//find the key ok
									*key_s = newline_start;
									*key_e = j-1;

									*value_s = j+1;
									*value_e = i;

									return 1;
								}
							}
						}
					}

					i++;
				}
			}
		}
		else
		{
			i++;
		}
	}
	return 0;
}
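In this snippet isend() is not the MPI call but an end-of-buffer predicate used by the ini parser, alongside isnewline(), isleftbarce() and isrightbrace(), none of which appear in the excerpt. A plausible set of definitions, given only as an assumption so the fragment can be read on its own (the misspelled isleftbarce name is kept exactly as the parser uses it):

/* assumed helpers for the parser above; the ini text is NUL-terminated */
#define isend(c)        ('\0' == (c))                  /* end of buffer      */
#define isnewline(c)    ('\n' == (c) || '\r' == (c))   /* line terminator    */
#define isleftbarce(c)  ('[' == (c))                   /* section opener '[' */
#define isrightbrace(c) (']' == (c))                   /* section closer ']' */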
int
mca_fcoll_dynamic_gen2_file_read_all (mca_io_ompio_file_t *fh,
                                 void *buf,
                                 int count,
                                 struct ompi_datatype_t *datatype,
                                 ompi_status_public_t *status)
{
    MPI_Aint position = 0;
    MPI_Aint total_bytes = 0;          /* total bytes to be read */
    MPI_Aint bytes_to_read_in_cycle = 0; /* left to be read in a cycle*/
    MPI_Aint bytes_per_cycle = 0;      /* total read in each cycle by each process*/
    int index = 0, ret=OMPI_SUCCESS;
    int cycles = 0;
    int i=0, j=0, l=0;
    int n=0; /* current position in total_bytes_per_process array */
    MPI_Aint bytes_remaining = 0; /* how many bytes of the current file-view entry
                                     are left over from the previous cycle */
    int *sorted_file_offsets=NULL, entries_per_aggregator=0;
    int bytes_received = 0;
    int blocks = 0;
    /* iovec structure and count of the buffer passed in */
    uint32_t iov_count = 0;
    struct iovec *decoded_iov = NULL;
    int iov_index = 0;
    size_t current_position = 0;
    struct iovec *local_iov_array=NULL, *global_iov_array=NULL;
    char *receive_buf = NULL;
    MPI_Aint *memory_displacements=NULL;
    /* global iovec at the readers that contains the iovecs created from
       file_set_view */
    uint32_t total_fview_count = 0;
    int local_count = 0;
    int *fview_count = NULL, *disp_index=NULL, *temp_disp_index=NULL;
    int current_index=0, temp_index=0;
    int **blocklen_per_process=NULL;
    MPI_Aint **displs_per_process=NULL;
    char *global_buf = NULL;
    MPI_Aint global_count = 0;
    mca_io_ompio_local_io_array *file_offsets_for_agg=NULL;

    /* array that contains the sorted indices of the global_iov */
    int *sorted = NULL;
    int *displs = NULL;
    int dynamic_gen2_num_io_procs;
    size_t max_data = 0;
    MPI_Aint *total_bytes_per_process = NULL;
    ompi_datatype_t **sendtype = NULL;
    MPI_Request *send_req=NULL, recv_req=NULL;
    int my_aggregator =-1;
    bool recvbuf_is_contiguous=false;
    size_t ftype_size;
    OPAL_PTRDIFF_TYPE ftype_extent, lb;


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    double read_time = 0.0, start_read_time = 0.0, end_read_time = 0.0;
    double rcomm_time = 0.0, start_rcomm_time = 0.0, end_rcomm_time = 0.0;
    double read_exch = 0.0, start_rexch = 0.0, end_rexch = 0.0;
    mca_io_ompio_print_entry nentry;
#endif

    /**************************************************************************
     ** 1. In case the data is not contiguous in memory, decode it into an iovec
     **************************************************************************/

    opal_datatype_type_size ( &datatype->super, &ftype_size );
    opal_datatype_get_extent ( &datatype->super, &lb, &ftype_extent );

    if ( (ftype_extent == (OPAL_PTRDIFF_TYPE) ftype_size)             &&
        opal_datatype_is_contiguous_memory_layout(&datatype->super,1) &&
        0 == lb ) {
        recvbuf_is_contiguous = true;
    }


    if (! recvbuf_is_contiguous ) {
        ret = fh->f_decode_datatype ((struct mca_io_ompio_file_t *)fh,
                                     datatype,
                                     count,
                                     buf,
                                     &max_data,
                                     &decoded_iov,
                                     &iov_count);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
    }
    else {
        max_data = count * datatype->super.size;
    }

    if ( MPI_STATUS_IGNORE != status ) {
        status->_ucount = max_data;
    }

    fh->f_get_num_aggregators ( &dynamic_gen2_num_io_procs);
    ret = fh->f_set_aggregator_props ((struct mca_io_ompio_file_t *) fh,
                                      dynamic_gen2_num_io_procs,
                                      max_data);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
    my_aggregator = fh->f_procs_in_group[fh->f_aggregator_index];

    /**************************************************************************
     ** 2. Determine the total amount of data to be read
     **************************************************************************/
    total_bytes_per_process = (MPI_Aint*)malloc(fh->f_procs_per_group*sizeof(MPI_Aint));
    if (NULL == total_bytes_per_process) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&max_data,
                                           1,
                                           MPI_LONG,
                                           total_bytes_per_process,
                                           1,
                                           MPI_LONG,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    for (i=0 ; i<fh->f_procs_per_group ; i++) {
        total_bytes += total_bytes_per_process[i];
    }

    if (NULL != total_bytes_per_process) {
        free (total_bytes_per_process);
        total_bytes_per_process = NULL;
    }

    /*********************************************************************
     *** 3. Generate the File offsets/lengths corresponding to this read
     ********************************************************************/
    ret = fh->f_generate_current_file_view ((struct mca_io_ompio_file_t *) fh,
                                            max_data,
                                            &local_iov_array,
                                            &local_count);

    if (ret != OMPI_SUCCESS){
        goto exit;
    }

    /*************************************************************
     *** 4. Allgather the File View information at all processes
     *************************************************************/

    fview_count = (int *) malloc (fh->f_procs_per_group * sizeof (int));
    if (NULL == fview_count) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret = fcoll_base_coll_allgather_array (&local_count,
                                           1,
                                           MPI_INT,
                                           fview_count,
                                           1,
                                           MPI_INT,
                                           fh->f_aggregator_index,
                                           fh->f_procs_in_group,
                                           fh->f_procs_per_group,
                                           fh->f_comm);
    
    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    displs = (int*)malloc (fh->f_procs_per_group*sizeof(int));
    if (NULL == displs) {
        opal_output (1, "OUT OF MEMORY\n");
        ret = OMPI_ERR_OUT_OF_RESOURCE;
        goto exit;
    }

    displs[0] = 0;
    total_fview_count = fview_count[0];
    for (i=1 ; i<fh->f_procs_per_group ; i++) {
        total_fview_count += fview_count[i];
        displs[i] = displs[i-1] + fview_count[i-1];
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<fh->f_procs_per_group ; i++) {
            printf ("%d: PROCESS: %d  ELEMENTS: %d  DISPLS: %d\n",
                    fh->f_rank,
                    i,
                    fview_count[i],
                    displs[i]);
        }
    }
#endif

    /* allocate the global iovec  */
    if (0 != total_fview_count) {
        global_iov_array = (struct iovec*)malloc (total_fview_count *
                                                  sizeof(struct iovec));
        if (NULL == global_iov_array) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rcomm_time = MPI_Wtime();
#endif
    ret =  fcoll_base_coll_allgatherv_array (local_iov_array,
                                             local_count,
                                             fh->f_iov_type,
                                             global_iov_array,
                                             fview_count,
                                             displs,
                                             fh->f_iov_type,
                                             fh->f_aggregator_index,
                                             fh->f_procs_in_group,
                                             fh->f_procs_per_group,
                                             fh->f_comm);

    if (OMPI_SUCCESS != ret){
        goto exit;
    }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rcomm_time = MPI_Wtime();
    rcomm_time += end_rcomm_time - start_rcomm_time;
#endif

    /****************************************************************************************
     *** 5. Sort the global offset/length list based on the offsets.
     *** The result of the sort operation is 'sorted', an integer array that
     *** contains the indices into global_iov_array ordered by file offset.
     *** For example, if global_iov_array[x].offset is followed by global_iov_array[y].offset
     *** in the file, and that one is followed by global_iov_array[z].offset, then
     *** sorted[0] = x, sorted[1] = y and sorted[2] = z.
     ******************************************************************************************/
    if (0 != total_fview_count) {
        sorted = (int *)malloc (total_fview_count * sizeof(int));
        if (NULL == sorted) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }
        fh->f_sort_iovec (global_iov_array, total_fview_count, sorted);
    }

    if (NULL != local_iov_array) {
        free (local_iov_array);
        local_iov_array = NULL;
    }

#if DEBUG_ON
    if (my_aggregator == fh->f_rank) {
        for (i=0 ; i<total_fview_count ; i++) {
            printf("%d: OFFSET: %p   LENGTH: %d\n",
                   fh->f_rank,
                   global_iov_array[sorted[i]].iov_base,
                   global_iov_array[sorted[i]].iov_len);
        }
    }
#endif

    /*************************************************************
     *** 6. Determine the number of cycles required to execute this
     ***    operation
     *************************************************************/
    fh->f_get_bytes_per_agg ( (int *) &bytes_per_cycle);
    cycles = ceil((double)total_bytes/bytes_per_cycle);

    if (my_aggregator == fh->f_rank) {
        disp_index = (int *)malloc (fh->f_procs_per_group * sizeof (int));
        if (NULL == disp_index) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        blocklen_per_process = (int **)malloc (fh->f_procs_per_group * sizeof (int*));
        if (NULL == blocklen_per_process) {
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        displs_per_process = (MPI_Aint **)malloc (fh->f_procs_per_group * sizeof (MPI_Aint*));
        if (NULL == displs_per_process){
            opal_output (1, "OUT OF MEMORY\n");
            ret = OMPI_ERR_OUT_OF_RESOURCE;
            goto exit;
        }

        for (i=0;i<fh->f_procs_per_group;i++){
            blocklen_per_process[i] = NULL;
            displs_per_process[i] = NULL;
        }

	send_req = (MPI_Request *) malloc (fh->f_procs_per_group * sizeof(MPI_Request));
	if (NULL == send_req){
	    opal_output ( 1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	global_buf = (char *) malloc (bytes_per_cycle);
	if (NULL == global_buf){
	    opal_output(1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	sendtype = (ompi_datatype_t **) malloc (fh->f_procs_per_group * sizeof(ompi_datatype_t *));
	if (NULL == sendtype) {
            opal_output (1, "OUT OF MEMORY\n");
	    ret = OMPI_ERR_OUT_OF_RESOURCE;
	    goto exit;
	}

	for(l=0;l<fh->f_procs_per_group;l++){
            sendtype[l] = MPI_DATATYPE_NULL;
	}
    }




#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    start_rexch = MPI_Wtime();
#endif
    n = 0;
    bytes_remaining = 0;
    current_index = 0;

    for (index = 0; index < cycles; index++) {
        /**********************************************************************
         ***  7a. Getting ready for next cycle: initializing and freeing buffers
	 **********************************************************************/
        if (my_aggregator == fh->f_rank) {
             if (NULL != fh->f_io_array) {
                free (fh->f_io_array);
                fh->f_io_array = NULL;
            }
            fh->f_num_of_io_entries = 0;

            if (NULL != sendtype){
                for (i =0; i< fh->f_procs_per_group; i++) {
		    if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                        ompi_datatype_destroy(&sendtype[i]);
                        sendtype[i] = MPI_DATATYPE_NULL;
                    }
		}
            }

            for(l=0;l<fh->f_procs_per_group;l++){
                disp_index[l] =  1;

                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
                blocklen_per_process[l] = (int *) calloc (1, sizeof(int));
                if (NULL == blocklen_per_process[l]) {
                    opal_output (1, "OUT OF MEMORY for blocklen\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                displs_per_process[l] = (MPI_Aint *) calloc (1, sizeof(MPI_Aint));
                if (NULL == displs_per_process[l]){
                    opal_output (1, "OUT OF MEMORY for displs\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
            }

            if (NULL != sorted_file_offsets){
                free(sorted_file_offsets);
                sorted_file_offsets = NULL;
            }

            if(NULL != file_offsets_for_agg){
                free(file_offsets_for_agg);
                file_offsets_for_agg = NULL;
            }
            if (NULL != memory_displacements){
                free(memory_displacements);
                memory_displacements = NULL;
            }
        }  /* (my_aggregator == fh->f_rank */

        /**************************************************************************
         ***  7b. Determine the number of bytes to be actually read in this cycle
	 **************************************************************************/
        if (cycles-1 == index) {
            bytes_to_read_in_cycle = total_bytes - bytes_per_cycle*index;
        }
        else {
            bytes_to_read_in_cycle = bytes_per_cycle;
        }

#if DEBUG_ON
        if (my_aggregator == fh->f_rank) {
            printf ("****%d: CYCLE %d   Bytes %d**********\n",
                    fh->f_rank,
                    index,
                    bytes_to_read_in_cycle);
        }
#endif

        /*****************************************************************
         *** 7c. Calculate how much data will be contributed in this cycle
	 ***     by each process
         *****************************************************************/
        bytes_received = 0;

        while (bytes_to_read_in_cycle) {
            /* This next block identifies which process is the holder
            ** of the sorted[current_index] element;
            */
            blocks = fview_count[0];
            for (j=0 ; j<fh->f_procs_per_group ; j++) {
                if (sorted[current_index] < blocks) {
                    n = j;
                    break;
                }
                else {
                    blocks += fview_count[j+1];
                }
            }

            if (bytes_remaining) {
                /* Finish up a partially used buffer from the previous  cycle */
                if (bytes_remaining <= bytes_to_read_in_cycle) {
                    /* Data fits completely into the block */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_remaining;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len - bytes_remaining);

                        blocklen_per_process[n] = (int *) realloc
                            ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *) realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_remaining;
                    }
                    current_index ++;
                    bytes_to_read_in_cycle -= bytes_remaining;
                    bytes_remaining = 0;
                    continue;
                }
                else {
                    /* the remaining data from the previous cycle is larger than
                       bytes_to_read_in_cycle, so we have to segment again */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base +
                            (global_iov_array[sorted[current_index]].iov_len
                             - bytes_remaining);
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining -= bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
            }
            else {
                /* No partially used entry available, have to start a new one */
                if (bytes_to_read_in_cycle <
                    (MPI_Aint) global_iov_array[sorted[current_index]].iov_len) {
                    /* This entry has more data than we can send in one cycle */
                    if (my_aggregator == fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] = bytes_to_read_in_cycle;
                        displs_per_process[n][disp_index[n] - 1] =
                            (OPAL_PTRDIFF_TYPE)global_iov_array[sorted[current_index]].iov_base ;
                    }

                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received += bytes_to_read_in_cycle;
                    }
                    bytes_remaining = global_iov_array[sorted[current_index]].iov_len -
                        bytes_to_read_in_cycle;
                    bytes_to_read_in_cycle = 0;
                    break;
                }
                else {
                    /* Next data entry is less than bytes_to_read_in_cycle */
                    if (my_aggregator ==  fh->f_rank) {
                        blocklen_per_process[n][disp_index[n] - 1] =
                            global_iov_array[sorted[current_index]].iov_len;
                        displs_per_process[n][disp_index[n] - 1] = (OPAL_PTRDIFF_TYPE)
                            global_iov_array[sorted[current_index]].iov_base;
                        blocklen_per_process[n] =
                            (int *) realloc ((void *)blocklen_per_process[n], (disp_index[n]+1)*sizeof(int));
                        displs_per_process[n] = (MPI_Aint *)realloc
                            ((void *)displs_per_process[n], (disp_index[n]+1)*sizeof(MPI_Aint));
                        blocklen_per_process[n][disp_index[n]] = 0;
                        displs_per_process[n][disp_index[n]] = 0;
                        disp_index[n] += 1;
                    }
                    if (fh->f_procs_in_group[n] == fh->f_rank) {
                        bytes_received +=
                            global_iov_array[sorted[current_index]].iov_len;
                    }
                    bytes_to_read_in_cycle -=
                        global_iov_array[sorted[current_index]].iov_len;
                    current_index ++;
                    continue;
                }
            }
        } /* end while (bytes_to_read_in_cycle) */

        /*************************************************************************
	 *** 7d. Calculate the displacement on where to put the data and allocate
         ***     the receive buffer (global_buf)
	 *************************************************************************/
        if (my_aggregator == fh->f_rank) {
            entries_per_aggregator=0;
            for (i=0;i<fh->f_procs_per_group; i++){
                for (j=0;j<disp_index[i];j++){
                    if (blocklen_per_process[i][j] > 0)
                        entries_per_aggregator++ ;
                }
            }
            if (entries_per_aggregator > 0){
                file_offsets_for_agg = (mca_io_ompio_local_io_array *)
                    malloc(entries_per_aggregator*sizeof(mca_io_ompio_local_io_array));
                if (NULL == file_offsets_for_agg) {
                    opal_output (1, "OUT OF MEMORY\n");
                    ret = OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                sorted_file_offsets = (int *)
                    malloc (entries_per_aggregator*sizeof(int));
                if (NULL == sorted_file_offsets){
                    opal_output (1, "OUT OF MEMORY\n");
                    ret =  OMPI_ERR_OUT_OF_RESOURCE;
                    goto exit;
                }
                /*Moving file offsets to an IO array!*/
                temp_index = 0;
                global_count = 0;
                for (i=0;i<fh->f_procs_per_group; i++){
                    for(j=0;j<disp_index[i];j++){
                        if (blocklen_per_process[i][j] > 0){
                            file_offsets_for_agg[temp_index].length =
                                blocklen_per_process[i][j];
                            global_count += blocklen_per_process[i][j];
                            file_offsets_for_agg[temp_index].process_id = i;
                            file_offsets_for_agg[temp_index].offset =
                                displs_per_process[i][j];
                            temp_index++;
                        }
                    }
                }
            }
            else{
                continue;
            }

             /* Sort the displacements for each aggregator */
            read_heap_sort (file_offsets_for_agg,
                            entries_per_aggregator,
                            sorted_file_offsets);

            memory_displacements = (MPI_Aint *) malloc
                (entries_per_aggregator * sizeof(MPI_Aint));
            memory_displacements[sorted_file_offsets[0]] = 0;
            for (i=1; i<entries_per_aggregator; i++){
                memory_displacements[sorted_file_offsets[i]] =
                    memory_displacements[sorted_file_offsets[i-1]] +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length;
            }

             /**********************************************************
	     *** 7e. Create the io array, and pass it to fbtl
	     *********************************************************/
            fh->f_io_array = (mca_io_ompio_io_array_t *) malloc
                (entries_per_aggregator * sizeof (mca_io_ompio_io_array_t));
            if (NULL == fh->f_io_array) {
                opal_output(1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }

            fh->f_num_of_io_entries = 0;
            fh->f_io_array[0].offset =
                (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[0]].offset;
            fh->f_io_array[0].length =
                file_offsets_for_agg[sorted_file_offsets[0]].length;
            fh->f_io_array[0].memory_address =
                global_buf+memory_displacements[sorted_file_offsets[0]];
            fh->f_num_of_io_entries++;
            for (i=1;i<entries_per_aggregator;i++){
                if (file_offsets_for_agg[sorted_file_offsets[i-1]].offset +
                    file_offsets_for_agg[sorted_file_offsets[i-1]].length ==
                    file_offsets_for_agg[sorted_file_offsets[i]].offset){
                    fh->f_io_array[fh->f_num_of_io_entries - 1].length +=
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                }
                else{
                    fh->f_io_array[fh->f_num_of_io_entries].offset =
                        (IOVBASE_TYPE *)(intptr_t)file_offsets_for_agg[sorted_file_offsets[i]].offset;
                    fh->f_io_array[fh->f_num_of_io_entries].length =
                        file_offsets_for_agg[sorted_file_offsets[i]].length;
                    fh->f_io_array[fh->f_num_of_io_entries].memory_address =
                        global_buf+memory_displacements[sorted_file_offsets[i]];
                    fh->f_num_of_io_entries++;
                }
            }


#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_read_time = MPI_Wtime();
#endif

            if (fh->f_num_of_io_entries) {
                if ( 0 >  fh->f_fbtl->fbtl_preadv (fh)) {
                    opal_output (1, "READ FAILED\n");
                    ret = OMPI_ERROR;
                    goto exit;
                }
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_read_time = MPI_Wtime();
            read_time += end_read_time - start_read_time;
#endif
            /**********************************************************
             ******************** DONE READING ************************
             *********************************************************/

            temp_disp_index = (int *)calloc (1, fh->f_procs_per_group * sizeof (int));
            if (NULL == temp_disp_index) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
            for (i=0; i<entries_per_aggregator; i++){
                temp_index =
                    file_offsets_for_agg[sorted_file_offsets[i]].process_id;
                displs_per_process[temp_index][temp_disp_index[temp_index]] =
                    memory_displacements[sorted_file_offsets[i]];
                if (temp_disp_index[temp_index] < disp_index[temp_index]){
                    temp_disp_index[temp_index] += 1;
                }
                else{
                    printf("temp_disp_index[%d]: %d is greater than disp_index[%d]: %d\n",
                           temp_index, temp_disp_index[temp_index],
                           temp_index, disp_index[temp_index]);
                }
            }
            if (NULL != temp_disp_index){
                free(temp_disp_index);
                temp_disp_index = NULL;
            }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            start_rcomm_time = MPI_Wtime();
#endif
            for (i=0;i<fh->f_procs_per_group;i++){
                send_req[i] = MPI_REQUEST_NULL;
                if ( 0 < disp_index[i] ) {
                    ompi_datatype_create_hindexed(disp_index[i],
                                                  blocklen_per_process[i],
                                                  displs_per_process[i],
                                                  MPI_BYTE,
                                                  &sendtype[i]);
                    ompi_datatype_commit(&sendtype[i]);
                    ret = MCA_PML_CALL (isend(global_buf,
                                              1,
                                              sendtype[i],
                                              fh->f_procs_in_group[i],
                                              123,
                                              MCA_PML_BASE_SEND_STANDARD,
                                              fh->f_comm,
                                              &send_req[i]));
                    if(OMPI_SUCCESS != ret){
                        goto exit;
                    }
                }
            }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
            end_rcomm_time = MPI_Wtime();
            rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
        }

        /**********************************************************
         *** 7f.  Scatter the Data from the readers
         *********************************************************/
        if ( recvbuf_is_contiguous ) {
            receive_buf = &((char*)buf)[position];
        }
        else if (bytes_received) {
            /* allocate a receive buffer and copy the data that needs
               to be received into it in case the data is non-contiguous
               in memory */
            receive_buf = malloc (bytes_received);
            if (NULL == receive_buf) {
                opal_output (1, "OUT OF MEMORY\n");
                ret = OMPI_ERR_OUT_OF_RESOURCE;
                goto exit;
            }
        }

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        start_rcomm_time = MPI_Wtime();
#endif
        ret = MCA_PML_CALL(irecv(receive_buf,
                                 bytes_received,
                                 MPI_BYTE,
                                 my_aggregator,
                                 123,
                                 fh->f_comm,
                                 &recv_req));
        if (OMPI_SUCCESS != ret){
            goto exit;
        }


        if (my_aggregator == fh->f_rank){
            ret = ompi_request_wait_all (fh->f_procs_per_group,
                                         send_req,
                                         MPI_STATUSES_IGNORE);
            if (OMPI_SUCCESS != ret){
                goto exit;
            }
        }

        ret = ompi_request_wait (&recv_req, MPI_STATUS_IGNORE);
        if (OMPI_SUCCESS != ret){
            goto exit;
        }
        position += bytes_received;

        /* If data is not contiguous in memory, copy the data from the
           receive buffer into the buffer passed in */
        if (!recvbuf_is_contiguous ) {
            OPAL_PTRDIFF_TYPE mem_address;
            size_t remaining = 0;
            size_t temp_position = 0;

            remaining = bytes_received;

            while (remaining) {
                mem_address = (OPAL_PTRDIFF_TYPE)
                    (decoded_iov[iov_index].iov_base) + current_position;

                if (remaining >=
                    (decoded_iov[iov_index].iov_len - current_position)) {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            decoded_iov[iov_index].iov_len - current_position);
                    remaining = remaining -
                        (decoded_iov[iov_index].iov_len - current_position);
                    temp_position = temp_position +
                        (decoded_iov[iov_index].iov_len - current_position);
                    iov_index = iov_index + 1;
                    current_position = 0;
                }
                else {
                    memcpy ((IOVBASE_TYPE *) mem_address,
                            receive_buf+temp_position,
                            remaining);
                    current_position = current_position + remaining;
                    remaining = 0;
                }
            }

            if (NULL != receive_buf) {
                free (receive_buf);
                receive_buf = NULL;
            }
        }
#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
        end_rcomm_time = MPI_Wtime();
        rcomm_time += end_rcomm_time - start_rcomm_time;
#endif
    } /* end for (index=0; index < cycles; index ++) */

#if OMPIO_FCOLL_WANT_TIME_BREAKDOWN
    end_rexch = MPI_Wtime();
    read_exch += end_rexch - start_rexch;
    nentry.time[0] = read_time;
    nentry.time[1] = rcomm_time;
    nentry.time[2] = read_exch;
    if (my_aggregator == fh->f_rank)
        nentry.aggregator = 1;
    else
        nentry.aggregator = 0;
    nentry.nprocs_for_coll = dynamic_gen2_num_io_procs;
    if (!fh->f_full_print_queue(READ_PRINT_QUEUE)){
        fh->f_register_print_entry(READ_PRINT_QUEUE,
                                   nentry);
    }
#endif

exit:
    if (!recvbuf_is_contiguous) {
        if (NULL != receive_buf) {
            free (receive_buf);
            receive_buf = NULL;
        }
    }
    if (NULL != global_buf) {
        free (global_buf);
        global_buf = NULL;
    }
    if (NULL != sorted) {
        free (sorted);
        sorted = NULL;
    }
    if (NULL != global_iov_array) {
        free (global_iov_array);
        global_iov_array = NULL;
    }
    if (NULL != fview_count) {
        free (fview_count);
        fview_count = NULL;
    }
    if (NULL != decoded_iov) {
        free (decoded_iov);
        decoded_iov = NULL;
    }
    if (NULL != local_iov_array){
        free(local_iov_array);
        local_iov_array=NULL;
    }

    if (NULL != displs) {
        free (displs);
        displs = NULL;
    }
    if (my_aggregator == fh->f_rank) {

        if (NULL != sorted_file_offsets){
            free(sorted_file_offsets);
            sorted_file_offsets = NULL;
        }
        if (NULL != file_offsets_for_agg){
            free(file_offsets_for_agg);
            file_offsets_for_agg = NULL;
        }
        if (NULL != memory_displacements){
            free(memory_displacements);
            memory_displacements= NULL;
        }
        if (NULL != sendtype){
            for (i = 0; i < fh->f_procs_per_group; i++) {
                if ( MPI_DATATYPE_NULL != sendtype[i] ) {
                    ompi_datatype_destroy(&sendtype[i]);
                }
            }
            free(sendtype);
            sendtype=NULL;
        }

        if (NULL != disp_index){
            free(disp_index);
            disp_index = NULL;
        }

        if ( NULL != blocklen_per_process){
            for (l = 0; l < fh->f_procs_per_group; l++) {
                if (NULL != blocklen_per_process[l]){
                    free(blocklen_per_process[l]);
                    blocklen_per_process[l] = NULL;
                }
            }

            free(blocklen_per_process);
            blocklen_per_process = NULL;
        }

        if (NULL != displs_per_process){
            for (l = 0; l < fh->f_procs_per_group; l++) {
                if (NULL != displs_per_process[l]){
                    free(displs_per_process[l]);
                    displs_per_process[l] = NULL;
                }
            }
            free(displs_per_process);
            displs_per_process = NULL;
        }
        if ( NULL != send_req ) {
            free ( send_req );
            send_req = NULL;
        }
    }
    return ret;
}
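
The non-contiguous copy loop above walks the decoded iovec: when the received bytes cover the rest of the current entry it advances iov_index and resets current_position, otherwise it records a partial offset and stops. Below is a minimal standalone sketch of that pattern; scatter_to_iov, staging, and the buffers in main are illustrative names, not part of the OMPIO code.

#include <stdio.h>
#include <string.h>
#include <sys/uio.h>

/* Copy nbytes from a contiguous staging buffer into a non-contiguous iovec,
 * resuming at (*iov_index, *current_position) and updating both so a later
 * call can continue where this one stopped. */
static void scatter_to_iov(const char *staging, size_t nbytes,
                           struct iovec *iov, size_t *iov_index,
                           size_t *current_position)
{
    size_t temp_position = 0;

    while (nbytes > 0) {
        char  *dst  = (char *) iov[*iov_index].iov_base + *current_position;
        size_t room = iov[*iov_index].iov_len - *current_position;

        if (nbytes >= room) {            /* fill the rest of this entry */
            memcpy(dst, staging + temp_position, room);
            temp_position += room;
            nbytes -= room;
            (*iov_index)++;
            *current_position = 0;
        } else {                         /* partial fill; remember the offset */
            memcpy(dst, staging + temp_position, nbytes);
            *current_position += nbytes;
            nbytes = 0;
        }
    }
}

int main(void)
{
    char a[4] = {0}, b[6] = {0};
    struct iovec iov[2] = { { a, sizeof a }, { b, sizeof b } };
    size_t idx = 0, pos = 0;

    scatter_to_iov("0123456789", 10, iov, &idx, &pos);
    printf("%.4s %.6s\n", a, b);         /* prints: 0123 456789 */
    return 0;
}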
Example No. 30
/*
 *	allreduce_inter
 *
 *	Function:	- allreduce using other MPI collectives
 *	Accepts:	- same as MPI_Allreduce()
 *	Returns:	- MPI_SUCCESS or error code
 */
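/*
 * Outline of the routine below: each group reduces its data to the local
 * root, the two roots swap partial results with a nonblocking recv/send
 * pair, and each group then broadcasts the value it received from the
 * other group.
 */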
int
mca_coll_inter_allreduce_inter(const void *sbuf, void *rbuf, int count,
                               struct ompi_datatype_t *dtype,
                               struct ompi_op_t *op,
                               struct ompi_communicator_t *comm,
                               mca_coll_base_module_t *module)
{
    int err, rank, root = 0;
    char *tmpbuf = NULL, *pml_buffer = NULL;
    ompi_request_t *req[2];
    ptrdiff_t gap, span;

    rank = ompi_comm_rank(comm);

    /* Perform the reduction locally */
    span = opal_datatype_span(&dtype->super, count, &gap);

    tmpbuf = (char *) malloc(span);
    if (NULL == tmpbuf) {
        return OMPI_ERR_OUT_OF_RESOURCE;
    }
    pml_buffer = tmpbuf - gap;

    err = comm->c_local_comm->c_coll->coll_reduce(sbuf, pml_buffer, count,
                                                  dtype, op, root,
                                                  comm->c_local_comm,
                                                  comm->c_local_comm->c_coll->coll_reduce_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

    if (rank == root) {
        /* Do a send-recv between the two root procs. to avoid deadlock */
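        /* Posting the irecv before the isend and waiting on both lets the
           two roots complete the exchange without depending on eager
           buffering of a blocking send. */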
        err = MCA_PML_CALL(irecv(rbuf, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE, comm,
                                 &(req[0])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = MCA_PML_CALL(isend(pml_buffer, count, dtype, 0,
                                 MCA_COLL_BASE_TAG_ALLREDUCE,
                                 MCA_PML_BASE_SEND_STANDARD,
                                 comm, &(req[1])));
        if (OMPI_SUCCESS != err) {
            goto exit;
        }

        err = ompi_request_wait_all(2, req, MPI_STATUSES_IGNORE);
        if (OMPI_SUCCESS != err) {
            goto exit;
        }
    }

    /* bcast the message to all the local processes */
    err = comm->c_local_comm->c_coll->coll_bcast(rbuf, count, dtype,
                                                 root, comm->c_local_comm,
                                                 comm->c_local_comm->c_coll->coll_bcast_module);
    if (OMPI_SUCCESS != err) {
        goto exit;
    }

  exit:
    if (NULL != tmpbuf) {
        free(tmpbuf);
    }

    return err;
}
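
As a usage illustration (a hypothetical driver, not taken from the source above): a routine like mca_coll_inter_allreduce_inter() services MPI_Allreduce on an inter-communicator, where each group receives the reduction of the values contributed by the other group. A minimal sketch, assuming an MPI run with at least two processes:

#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int world_rank, color, my_val, remote_sum;
    MPI_Comm local_comm, inter_comm;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);

    /* Split MPI_COMM_WORLD into two groups by rank parity. */
    color = world_rank % 2;
    MPI_Comm_split(MPI_COMM_WORLD, color, world_rank, &local_comm);

    /* Connect the two groups; the leaders are world ranks 0 and 1. */
    MPI_Intercomm_create(local_comm, 0, MPI_COMM_WORLD,
                         (color == 0) ? 1 : 0, 99, &inter_comm);

    /* On an inter-communicator each group receives the reduction of the
     * values contributed by the other group. */
    my_val = world_rank + 1;
    MPI_Allreduce(&my_val, &remote_sum, 1, MPI_INT, MPI_SUM, inter_comm);

    printf("rank %d (group %d): sum from remote group = %d\n",
           world_rank, color, remote_sum);

    MPI_Comm_free(&inter_comm);
    MPI_Comm_free(&local_comm);
    MPI_Finalize();
    return 0;
}

Run with four processes, ranks 0 and 2 report 6 (the values 2 and 4 contributed by ranks 1 and 3), while ranks 1 and 3 report 4.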