Example #1
FORT_DLL_SPEC void FORT_CALL mpi_iscatterv_ ( void*v1, MPI_Fint v2[], MPI_Fint v3[], MPI_Fint *v4, void*v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8, MPI_Fint *v9, MPI_Fint *v10, MPI_Fint *ierr ){

#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif
    if (v5 == MPIR_F_MPI_IN_PLACE) v5 = MPI_IN_PLACE;
    *ierr = MPI_Iscatterv( v1, v2, v3, (MPI_Datatype)(*v4), v5, (int)*v6, (MPI_Datatype)(*v7), (int)*v8, (MPI_Comm)(*v9), (MPI_Request *)(v10) );
}
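The wrapper above only converts the Fortran-side arguments (the MPI_Fint handles and the Fortran MPI_IN_PLACE sentinel) before forwarding to the C routine MPI_Iscatterv. For context, the following is a minimal, self-contained C sketch of calling MPI_Iscatterv directly; the "rank i receives i+1 ints" layout and the use of MPI_COMM_WORLD are illustrative assumptions, not part of the wrapper.

#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    int rank, size, i;
    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);

    /* rank i receives i+1 ints; only the root's send arguments are significant */
    int recvcount = rank + 1;
    int *recvbuf = malloc(recvcount * sizeof(int));
    int *sendbuf = NULL, *sendcounts = NULL, *displs = NULL;

    if (rank == 0) {
        int total = size * (size + 1) / 2;
        sendbuf = malloc(total * sizeof(int));
        sendcounts = malloc(size * sizeof(int));
        displs = malloc(size * sizeof(int));
        for (i = 0; i < size; ++i) {
            sendcounts[i] = i + 1;
            displs[i] = (i * (i + 1)) / 2;   /* prefix sum of the counts */
        }
        for (i = 0; i < total; ++i)
            sendbuf[i] = i;
    }

    MPI_Request req;
    MPI_Iscatterv(sendbuf, sendcounts, displs, MPI_INT,
                  recvbuf, recvcount, MPI_INT, 0, MPI_COMM_WORLD, &req);
    /* independent work could overlap with the scatter here */
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    printf("rank %d received %d ints starting with %d\n", rank, recvcount, recvbuf[0]);

    free(recvbuf);
    if (rank == 0) {
        free(sendbuf);
        free(sendcounts);
        free(displs);
    }
    MPI_Finalize();
    return 0;
}

Only the root's sendbuf, sendcounts and displs are read by MPI_Iscatterv, which is why the non-root ranks can pass NULL for them.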
Example #2
int main(int argc, char **argv)
{
    int errs = 0;
    int i;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL;
    MPI_Comm comm;
    MPI_Request req;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);

    comm = MPI_COMM_WORLD;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    /* note: this fill pattern writes 2 * size ints, matching NUM_INTS == 2 */
    for (i = 0; i < size; ++i) {
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    MPI_Ibarrier(comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ibcast(sbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Igather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Igather(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    else
        MPI_Igather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Igatherv(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, 0, comm,
                     &req);
    else
        MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, 0, comm, &req);
    else
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, 0, comm,
                      &req);
    else
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgather(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgatherv(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, comm,
                    &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoall(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoall(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallv(sbuf, scounts, sdispls, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallv(MPI_IN_PLACE, NULL, NULL, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT,
                   comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallw(sbuf, scounts, sdispls, types, rbuf, rcounts, rdispls, types, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallw(MPI_IN_PLACE, NULL, NULL, NULL, rbuf, rcounts, rdispls, types, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Ireduce(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req);
    else
        MPI_Ireduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallreduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallreduce(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter(sbuf, rbuf, rcounts, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter(MPI_IN_PLACE, rbuf, rcounts, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter_block(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter_block(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscan(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscan(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iexscan(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iexscan(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (sbuf)
        free(sbuf);
    if (rbuf)
        free(rbuf);
    if (scounts)
        free(scounts);
    if (rcounts)
        free(rcounts);
    if (sdispls)
        free(sdispls);
    if (rdispls)
        free(rdispls);
    if (types)
        free(types);

    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }
    MPI_Finalize();
    return 0;
}
Example #3
/* Starts a "random" operation on "comm" corresponding to "rndnum" and returns
 * in (*req) a request handle corresonding to that operation.  This call should
 * be considered collective over comm (with a consistent value for "rndnum"),
 * even though the operation may only be a point-to-point request. */
static void start_random_nonblocking(MPI_Comm comm, unsigned int rndnum, MPI_Request *req, struct laundry *l)
{
    int i, j;
    int rank, size;
    int *buf = NULL;
    int *recvbuf = NULL;
    int *sendcounts = NULL;
    int *recvcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *sendtypes = NULL;
    MPI_Datatype *recvtypes = NULL;
    signed char *buf_alias = NULL;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    *req = MPI_REQUEST_NULL;

    l->case_num = -1;
    l->comm = comm;

    l->buf        = buf        = malloc(COUNT*size*sizeof(int));
    l->recvbuf    = recvbuf    = malloc(COUNT*size*sizeof(int));
    l->sendcounts = sendcounts = malloc(size*sizeof(int));
    l->recvcounts = recvcounts = malloc(size*sizeof(int));
    l->sdispls    = sdispls    = malloc(size*sizeof(int));
    l->rdispls    = rdispls    = malloc(size*sizeof(int));
    l->sendtypes  = sendtypes  = malloc(size*sizeof(MPI_Datatype));
    l->recvtypes  = recvtypes  = malloc(size*sizeof(MPI_Datatype));

#define NUM_CASES (21)
    l->case_num = rand_range(rndnum, 0, NUM_CASES);
    switch (l->case_num) {
        case 0: /* MPI_Ibcast */
            for (i = 0; i < COUNT; ++i) {
                if (rank == 0) {
                    buf[i] = i;
                }
                else {
                    buf[i] = 0xdeadbeef;
                }
            }
            MPI_Ibcast(buf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 1: /* MPI_Ibcast (again, but designed to stress scatter/allgather impls) */
            /* FIXME fiddle with PRIME and buffer allocation s.t. PRIME is much larger (1021?) */
            buf_alias = (signed char *)buf;
            my_assert(COUNT*size*sizeof(int) > PRIME); /* sanity */
            for (i = 0; i < PRIME; ++i) {
                if (rank == 0)
                    buf_alias[i] = i;
                else
                    buf_alias[i] = 0xdb;
            }
            for (i = PRIME; i < COUNT * size * sizeof(int); ++i) {
                buf_alias[i] = 0xbf;
            }
            MPI_Ibcast(buf_alias, PRIME, MPI_SIGNED_CHAR, 0, comm, req);
            break;

        case 2: /* MPI_Ibarrier */
            MPI_Ibarrier(comm, req);
            break;

        case 3: /* MPI_Ireduce */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Ireduce(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, 0, comm, req);
            break;

        case 4: /* same again, use a user op and free it before the wait */
            {
                MPI_Op op = MPI_OP_NULL;
                MPI_Op_create(sum_fn, /*commute=*/1, &op);
                for (i = 0; i < COUNT; ++i) {
                    buf[i] = rank + i;
                    recvbuf[i] = 0xdeadbeef;
                }
                MPI_Ireduce(buf, recvbuf, COUNT, MPI_INT, op, 0, comm, req);
                MPI_Op_free(&op);
            }
            break;

        case 5: /* MPI_Iallreduce */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iallreduce(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 6: /* MPI_Ialltoallv (a weak test, neither irregular nor sparse) */
            for (i = 0; i < size; ++i) {
                sendcounts[i] = COUNT;
                recvcounts[i] = COUNT;
                sdispls[i] = COUNT * i;
                rdispls[i] = COUNT * i;
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + (i * j);
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ialltoallv(buf, sendcounts, sdispls, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, comm, req);
            break;

        case 7: /* MPI_Igather */
            for (i = 0; i < size*COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Igather(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 8: /* same test again, just use a dup'ed datatype and free it before the wait */
            {
                MPI_Datatype type = MPI_DATATYPE_NULL;
                MPI_Type_dup(MPI_INT, &type);
                for (i = 0; i < size*COUNT; ++i) {
                    buf[i] = rank + i;
                    recvbuf[i] = 0xdeadbeef;
                }
                MPI_Igather(buf, COUNT, MPI_INT, recvbuf, COUNT, type, 0, comm, req);
                MPI_Type_free(&type); /* should cause implementations that don't refcount
                                         correctly to blow up or hang in the wait */
            }
            break;

        case 9: /* MPI_Iscatter */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    if (rank == 0)
                        buf[i*COUNT+j] = i + j;
                    else
                        buf[i*COUNT+j] = 0xdeadbeef;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Iscatter(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 10: /* MPI_Iscatterv */
            for (i = 0; i < size; ++i) {
                /* weak test, just test the regular case where all counts are equal */
                sendcounts[i] = COUNT;
                sdispls[i] = i * COUNT;
                for (j = 0; j < COUNT; ++j) {
                    if (rank == 0)
                        buf[i*COUNT+j] = i + j;
                    else
                        buf[i*COUNT+j] = 0xdeadbeef;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Iscatterv(buf, sendcounts, sdispls, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 11: /* MPI_Ireduce_scatter */
            for (i = 0; i < size; ++i) {
                recvcounts[i] = COUNT;
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + i;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ireduce_scatter(buf, recvbuf, recvcounts, MPI_INT, MPI_SUM, comm, req);
            break;

        case 12: /* MPI_Ireduce_scatter_block */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + i;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ireduce_scatter_block(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 13: /* MPI_Igatherv */
            for (i = 0; i < size*COUNT; ++i) {
                buf[i] = 0xdeadbeef;
                recvbuf[i] = 0xdeadbeef;
            }
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
            }
            for (i = 0; i < size; ++i) {
                recvcounts[i] = COUNT;
                rdispls[i] = i * COUNT;
            }
            MPI_Igatherv(buf, COUNT, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, 0, comm, req);
            break;

        case 14: /* MPI_Ialltoall */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + (i * j);
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ialltoall(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, comm, req);
            break;

        case 15: /* MPI_Iallgather */
            for (i = 0; i < size*COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iallgather(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, comm, req);
            break;

        case 16: /* MPI_Iallgatherv */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
                recvcounts[i] = COUNT;
                rdispls[i] = i * COUNT;
            }
            for (i = 0; i < COUNT; ++i)
                buf[i] = rank + i;
            MPI_Iallgatherv(buf, COUNT, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, comm, req);
            break;

        case 17: /* MPI_Iscan */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iscan(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 18: /* MPI_Iexscan */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iexscan(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 19: /* MPI_Ialltoallw (a weak test, neither irregular nor sparse) */
            for (i = 0; i < size; ++i) {
                sendcounts[i] = COUNT;
                recvcounts[i] = COUNT;
                sdispls[i] = COUNT * i * sizeof(int);
                rdispls[i] = COUNT * i * sizeof(int);
                sendtypes[i] = MPI_INT;
                recvtypes[i] = MPI_INT;
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + (i * j);
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ialltoallw(buf, sendcounts, sdispls, sendtypes, recvbuf, recvcounts, rdispls, recvtypes, comm, req);
            break;

        case 20: /* basic pt2pt MPI_Isend/MPI_Irecv pairing */
            /* even ranks send to odd ranks, but only if we have a full pair */
            if ((rank % 2 != 0) || (rank != size-1)) {
                for (j = 0; j < COUNT; ++j) {
                    buf[j] = j;
                    recvbuf[j] = 0xdeadbeef;
                }
                if (rank % 2 == 0)
                    MPI_Isend(buf, COUNT, MPI_INT, rank+1, 5, comm, req);
                else
                    MPI_Irecv(recvbuf, COUNT, MPI_INT, rank-1, 5, comm, req);
            }
            break;

        default:
            fprintf(stderr, "unexpected value for l->case_num=%d)\n", (l->case_num));
            MPI_Abort(comm, 1);
            exit(1);
            break;
    }
}
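The comment at the top of start_random_nonblocking() states its contract: every rank must call it collectively with the same rndnum, and the returned request must be completed later. Below is a hedged sketch of a possible caller; struct laundry is not defined in this excerpt, so the cleanup helper and driver are assumptions based only on the fields assigned above, not the test suite's actual code.

/* free the scratch buffers that start_random_nonblocking() stored in the laundry */
static void cleanup_laundry(struct laundry *l)
{
    free(l->buf);
    free(l->recvbuf);
    free(l->sendcounts);
    free(l->recvcounts);
    free(l->sdispls);
    free(l->rdispls);
    free(l->sendtypes);
    free(l->recvtypes);
}

static void run_one_random_nbc(MPI_Comm comm, unsigned int rndnum)
{
    MPI_Request req;
    struct laundry l;

    /* collective over comm: all ranks must pass the same rndnum */
    start_random_nonblocking(comm, rndnum, &req, &l);

    /* a wait on MPI_REQUEST_NULL (e.g. the skipped pt2pt case) returns immediately */
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    cleanup_laundry(&l);
}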
Example #4
int main(int argc, char *argv[])
{
    setbuf(stdout, NULL);
    int i = 0, rank, size, disp;
    int numprocs;
    double latency = 0.0, t_start = 0.0, t_stop = 0.0;
    double tcomp = 0.0, tcomp_total=0.0, latency_in_secs=0.0;
    double test_time = 0.0, test_total = 0.0;
    double timer=0.0;
    double wait_time = 0.0, init_time = 0.0;
    double init_total = 0.0, wait_total = 0.0;
    char *sendbuf=NULL;
    char *recvbuf=NULL;
    int *sdispls=NULL, *sendcounts=NULL;
    int po_ret;
    size_t bufsize;

    set_header(HEADER);
    set_benchmark_name("osu_iscatterv");
    enable_accel_support();
    po_ret = process_options(argc, argv);

    if (po_okay == po_ret && none != options.accel) {
        if (init_accel()) {
            fprintf(stderr, "Error initializing device\n");
            exit(EXIT_FAILURE);
        }
    }

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &numprocs);
    MPI_Request request;
    MPI_Status status;

    switch (po_ret) {
        case po_bad_usage:
            print_bad_usage_message(rank);
            MPI_Finalize();
            exit(EXIT_FAILURE);
        case po_help_message:
            print_help_message(rank);
            MPI_Finalize();
            exit(EXIT_SUCCESS);
        case po_version_message:
            print_version_message(rank);
            MPI_Finalize();
            exit(EXIT_SUCCESS);
        case po_okay:
            break;
    }

    if(numprocs < 2) {
        if (rank == 0) {
            fprintf(stderr, "This test requires at least two processes\n");
        }

        MPI_Finalize();
        exit(EXIT_FAILURE);
    }

    if ((options.max_message_size * numprocs) > options.max_mem_limit) {
        options.max_message_size = options.max_mem_limit / numprocs;
    }
    
    if (0 == rank) {
        if (allocate_buffer((void**)&sendcounts, numprocs*sizeof(int), none)) {
            fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
        if (allocate_buffer((void**)&sdispls, numprocs*sizeof(int), none)) {
            fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }

        bufsize = options.max_message_size * numprocs;
        if (allocate_buffer((void**)&sendbuf, bufsize, options.accel)) {
            fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
            MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
        }
        set_buffer(sendbuf, options.accel, 1, bufsize);
    }

    if (allocate_buffer((void**)&recvbuf, options.max_message_size,
                options.accel)) {
        fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank);
        MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE);
    }
    set_buffer(recvbuf, options.accel, 0, options.max_message_size);

    print_preamble_nbc(rank);

    for(size=options.min_message_size; size <=options.max_message_size; size *= 2) {
        if(size > LARGE_MESSAGE_SIZE) {
            options.skip = SKIP_LARGE;
            options.iterations = options.iterations_large;
        }
        else {
            options.skip = SKIP;
        }
        if (0 == rank) {
            disp =0;
            for ( i = 0; i < numprocs; i++) {
                sendcounts[i] = size;
                sdispls[i] = disp;
                disp += size;
            }
        }
	
        MPI_Barrier(MPI_COMM_WORLD);
        
        timer = 0.0;     
        
        for(i=0; i < options.iterations + options.skip ; i++) {
            t_start = MPI_Wtime();
            
            MPI_Iscatterv(sendbuf, sendcounts, sdispls, MPI_CHAR, recvbuf,
                      size, MPI_CHAR, 0, MPI_COMM_WORLD, &request);
            MPI_Wait(&request,&status);

            t_stop = MPI_Wtime();
            
            if(i>=options.skip){
                timer += t_stop-t_start;
            } 
            MPI_Barrier(MPI_COMM_WORLD);
        }  
        
        MPI_Barrier(MPI_COMM_WORLD);

        latency = (timer * 1e6) / options.iterations;
        
        latency_in_secs = timer/options.iterations;

        init_arrays(latency_in_secs);
        
        if (0 == rank) {
            disp =0;
            for ( i = 0; i < numprocs; i++) {
                sendcounts[i] = size;
                sdispls[i] = disp;
                disp += size;
            }
        }

        MPI_Barrier(MPI_COMM_WORLD);

        timer = 0.0; tcomp_total = 0; tcomp = 0;        
        init_total = 0.0; wait_total = 0.0;
        test_time = 0.0; test_total = 0.0;

        for(i=0; i < options.iterations + options.skip ; i++) {
            t_start = MPI_Wtime();

            init_time = MPI_Wtime();
            MPI_Iscatterv(sendbuf, sendcounts, sdispls, MPI_CHAR, recvbuf,
                      size, MPI_CHAR, 0, MPI_COMM_WORLD, &request);
            init_time = MPI_Wtime() - init_time;

            tcomp = MPI_Wtime();             
            test_time = dummy_compute(latency_in_secs, &request); 
            tcomp = MPI_Wtime() - tcomp;

            wait_time = MPI_Wtime();
            MPI_Wait(&request,&status);
            wait_time = MPI_Wtime() - wait_time;

            t_stop = MPI_Wtime();
            
            if(i>=options.skip){
                timer += t_stop-t_start;
                tcomp_total += tcomp;
                test_total += test_time;
                init_total += init_time;
                wait_total += wait_time;
            }
            MPI_Barrier(MPI_COMM_WORLD);
        }  
       
        MPI_Barrier (MPI_COMM_WORLD);
        
        calculate_and_print_stats(rank, size, numprocs,
                                  timer, latency,
                                  test_total, tcomp_total,
                                  wait_total, init_total);
    }  
    
    if (0 == rank) {
        free_buffer(sendcounts, none);
        free_buffer(sdispls, none);
        free_buffer(sendbuf, options.accel);
    }
    free_buffer(recvbuf, options.accel);

    MPI_Finalize();

    if (none != options.accel) {
        if (cleanup_accel()) {
            fprintf(stderr, "Error cleaning up device\n");
            exit(EXIT_FAILURE);
        }
    }
   
    return EXIT_SUCCESS;
}
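The second timing loop above separates three phases: posting MPI_Iscatterv (init_time), overlapping computation via dummy_compute (tcomp), and completing the operation with MPI_Wait (wait_time). dummy_compute is an OSU helper that is not shown in this listing; the sketch below is a generic, hedged illustration of the same overlap pattern using MPI_Test polling, not the benchmark's actual implementation.

#include <mpi.h>

/* Busy-loop for roughly target_secs while periodically calling MPI_Test so the
 * outstanding nonblocking collective can make progress; returns the elapsed time. */
static double overlap_compute(double target_secs, MPI_Request *request)
{
    int flag = 0;
    double t0 = MPI_Wtime();
    while (MPI_Wtime() - t0 < target_secs) {
        /* real application work would go here */
        MPI_Test(request, &flag, MPI_STATUS_IGNORE);   /* drive asynchronous progress */
    }
    return MPI_Wtime() - t0;
}

Once the request completes, MPI_Test sets it to MPI_REQUEST_NULL, so both further MPI_Test calls and the final MPI_Wait return immediately.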
Example #5
int main (int argc, char **argv) {
	FILE *fp;
	double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
	double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
	int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
	int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
	int rank, size, sqrt_size, matrices_a_b_dimensions[4];
	MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
	MPI_Status status;

	// used to manage the cartesian grid
	int dimensions[2], periods[2], coordinates[2], remain_dims[2];

	MPI_Init(&argc, &argv);
	MPI_Comm_size(MPI_COMM_WORLD, &size);
	MPI_Comm_rank(MPI_COMM_WORLD, &rank);

	/* For square mesh */
	sqrt_size = (int)sqrt((double) size);
	if(sqrt_size * sqrt_size != size){
		if( rank == 0 ) fprintf(stderr, "need to run mpiexec with a perfect square number of processes\n");
		MPI_Abort(MPI_COMM_WORLD, -1);
	}

	// create a 2D cartesian grid
	dimensions[0] = dimensions[1] = sqrt_size;
	periods[0] = periods[1] = 1;
	MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
	MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // coordinates now holds the grid coordinates of process rank

	// create a row communicator
	remain_dims[0] = 0;
	remain_dims[1] = 1;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

	// create a column communicator
	remain_dims[0] = 1;
	remain_dims[1] = 0;
	MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);


	// set time variables for different MPI parts
	double read_time, send_dim_time, send_blocks_time, gather_time, write_time, dod_cajt; // dod_cajt = "additional time" (root-side packing of the full matrices)


	read_time = MPI_Wtime();
	// getting matrices from files at rank 0 only
	// example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
	if (rank == 0){
		int row, column;
		if ((fp = fopen (argv[1], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
			A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
			for (row = 0; row < matrices_a_b_dimensions[0]; row++){
				A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
				for (column = 0; column < matrices_a_b_dimensions[1]; column++)
					fscanf(fp, "%lf", &A[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
		if((fp = fopen (argv[2], "r")) != NULL){
			fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
			B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
			for(row = 0; row < matrices_a_b_dimensions[2]; row++){
				B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double));
				for(column = 0; column < matrices_a_b_dimensions[3]; column++)
					fscanf(fp, "%lf", &B[row][column]);
			}
			fclose(fp);
		} else {
			if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// need to check that the multiplication is possible given dimensions
		// matrices_a_b_dimensions[0] = row size of A
		// matrices_a_b_dimensions[1] = column size of A
		// matrices_a_b_dimensions[2] = row size of B
		// matrices_a_b_dimensions[3] = column size of B
		if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
			if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n",
					matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
			MPI_Abort(MPI_COMM_WORLD, -1);
		}

		// this implementation is limited to cases where the matrices can be partitioned perfectly
		if( matrices_a_b_dimensions[0] % sqrt_size != 0
				|| matrices_a_b_dimensions[1] % sqrt_size != 0
				|| matrices_a_b_dimensions[2] % sqrt_size != 0
				|| matrices_a_b_dimensions[3] % sqrt_size != 0 ){
			if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processes\n"
					"all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
					matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
					matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
			MPI_Abort(MPI_COMM_WORLD, -1);
		}
	}

	read_time -= MPI_Wtime();

	send_dim_time = MPI_Wtime();
	// send dimensions to all peers
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	// has to be blocking, because the data is used right afterwards...
	MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator);
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	send_dim_time -= MPI_Wtime();


	A_rows = matrices_a_b_dimensions[0];
	A_columns = matrices_a_b_dimensions[1];
	B_rows = matrices_a_b_dimensions[2];
	B_columns = matrices_a_b_dimensions[3];

	// local metadata for A
	A_local_block_rows = A_rows / sqrt_size;
	A_local_block_columns = A_columns / sqrt_size;
	A_local_block_size = A_local_block_rows * A_local_block_columns;
	A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

	// local metadata for B
	B_local_block_rows = B_rows / sqrt_size;
	B_local_block_columns = B_columns / sqrt_size;
	B_local_block_size = B_local_block_rows * B_local_block_columns;
	B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

	// local metadata for C
	C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));
	// C needs to be initialized at 0 (accumulates partial dot-products)
	int i,j;
	for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
		C_local_block[i] = 0;
	}

	dod_cajt = MPI_Wtime();
	// full arrays only needed at root
	if(rank == 0){
		A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
		B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
		C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);
		// generate the 1D arrays of the matrices at root
		int row, column, i, j;
		for (i = 0; i < sqrt_size; i++){
			for (j = 0; j < sqrt_size; j++){
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < A_local_block_columns; column++){
						A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column] =
							A[i * A_local_block_rows + row][j * A_local_block_columns + column];
					}
				}
				for (row = 0; row < B_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column] =
							B[i * B_local_block_rows + row][j * B_local_block_columns + column];
					}
				}
			}
		}
		// allocate output matrix C
		C = (double **) malloc(A_rows * sizeof(double *));
		for(i=0; i<A_rows ;i++){
			C[i] = (double *) malloc(B_columns * sizeof(double));
		}
	}

	
	dod_cajt -= MPI_Wtime(); // should this be counted as part of send_blocks_time instead?

	// send a block to each process
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	/*MPI_Scatter(A_array, A_local_block_size, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);
	MPI_Scatter(B_array, B_local_block_size, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);
	*/

	send_blocks_time = MPI_Wtime();

	int displsA[size];
	int displsB[size];
	int localblsizA[size];
	int localblsizB[size];
	MPI_Request requests[2];
	MPI_Status statuses[2];
	for (i=0; i<sqrt_size; i++){
		for (j=0; j<sqrt_size; j++){
			displsA[i*sqrt_size + j] = (i*sqrt_size + j)*A_local_block_size; //(i*sqrt_size + (j+i)%sqrt_size)*A_local_block_size;
			displsB[i*sqrt_size + j] = (i*sqrt_size + j)*B_local_block_size; //(j + ((j+i)%size)*sqrt_size)*B_local_block_size;
			localblsizA[i*sqrt_size+j] = A_local_block_size;
			localblsizB[i*sqrt_size+j] = B_local_block_size;
		}
	}
	MPI_Iscatterv(A_array, localblsizA, displsA, MPI_DOUBLE, A_local_block, A_local_block_size,
        		MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[0]);
	MPI_Iscatterv(B_array, localblsizB, displsB, MPI_DOUBLE, B_local_block, B_local_block_size,
        		MPI_DOUBLE, 0, cartesian_grid_communicator, &requests[1]);

	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


	// fix initial arrangements before the core algorithm starts - the point is that the blocks must already be shifted before the first compute phase of the algorithm begins...
	/*if(coordinates[0] != 0){
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
				(coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0,
				(coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status);
	}
	if(coordinates[1] != 0){
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
				(coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0,
				(coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status);
	}*/

	// cannon's algorithm
	int cannon_block_cycle;
	double compute_time = 0, mpi_time = 0, start;
	int C_index, A_row, A_column, B_column;

	MPI_Waitall(2, requests, statuses);
	send_blocks_time -= MPI_Wtime();


	for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
		// compute partial result for this block cycle
		start = MPI_Wtime();
		for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
			for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
				for(A_column = 0; A_column < A_local_block_columns; A_column++){
					C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
						B_local_block[A_column * B_local_block_columns + B_column];
				}
			}
		}
		compute_time += MPI_Wtime() - start;
		start = MPI_Wtime();
		// rotate blocks horizontally
		MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE, // this could be done with MPI_Alltoallv and its in-place/replace variant, but it would be inefficient - see the comments!
				(coordinates[1] + sqrt_size - 1) % sqrt_size, 0,
				(coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
		// rotate blocks vertically
		MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
				(coordinates[0] + sqrt_size - 1) % sqrt_size, 0,
				(coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
		mpi_time += MPI_Wtime() - start;
	}


	// get C parts from other processes at rank 0
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
	gather_time =  MPI_Wtime();
	MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
               		 C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
                   0, cartesian_grid_communicator);  // blocking, because we immediately go on to use the result afterwards... right?
	gather_time -= MPI_Wtime();
	//%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

	// generating output at rank 0
	if (rank == 0) {
		write_time = MPI_Wtime();
		// convert the ID array into the actual C matrix
		int i, j, k, row, column;
		for (i = 0; i < sqrt_size; i++){  // block row index
			for (j = 0; j < sqrt_size; j++){ // block column index
				for (row = 0; row < A_local_block_rows; row++){
					for (column = 0; column < B_local_block_columns; column++){
						C[i * A_local_block_rows + row] [j * B_local_block_columns + column] =
							C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns)
							+ (row * B_local_block_columns) + column];
					}
				}
			}
		}
		write_time -= MPI_Wtime();

		printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
		printf("Computation time: %lf\n", compute_time);
		printf("MPI time:         %lf\n", mpi_time);
		printf("Read time:        %lf\n", -read_time);
		printf("Send dims time:   %lf\n", -send_dim_time);
		printf("Send blocks time: %lf\n", -send_blocks_time);
		printf("Gather time:      %lf\n", -gather_time);
		printf("Addit. time:      %lf\n", -dod_cajt);
		printf("Write time:       %lf\n", -write_time);

		if (argc == 4){
			// present results on the screen
			printf("\nA( %d x %d ):\n", A_rows, A_columns);
			for(row = 0; row < A_rows; row++) {
				for(column = 0; column < A_columns; column++)
					printf ("%7.3f ", A[row][column]);
				printf ("\n");
			}
			printf("\nB( %d x %d ):\n", B_rows, B_columns);
			for(row = 0; row < B_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ", B[row][column]);
				printf("\n");
			}
			printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
			for(row = 0; row < A_rows; row++){
				for(column = 0; column < B_columns; column++)
					printf("%7.3f ",C[row][column]);
				printf("\n");
			}


			printf("\nPerforming serial consistency check. Be patient...\n");
			fflush(stdout);
			int pass = 1;
			double temp;
			for(i=0; i<A_rows; i++){
				for(j=0; j<B_columns; j++){
					temp = 0;
					for(k=0; k<B_rows; k++){
						temp += A[i][k] * B[k][j];
					}
					//printf("%7.3f ", temp);
					printf("%7.3f ", temp-C[i][j]);
					if(fabs(temp - C[i][j]) > 1e-6 * fabs(temp) + 1e-9){ // tolerate floating-point rounding from the different summation order
						pass = 0;
					}
				}
				printf("\n");
			}
			if (pass) printf("Consistency check: PASS\n");
			else printf("Consistency check: FAIL\n");
		}
	}

	// free all memory
	if(rank == 0){
		int i;
		for(i = 0; i < A_rows; i++){
			free(A[i]);
		}
		for(i = 0; i < B_rows; i++){
			free(B[i]);
		}
		for(i = 0; i < A_rows; i++){
			free(C[i]);
		}
		free(A);
		free(B);
		free(C);
		free(A_array);
		free(B_array);
		free(C_array);
	}
	free(A_local_block);
	free(B_local_block);
	free(C_local_block);

	// finalize MPI
	MPI_Finalize();

	return 0;
}
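As listed, the MPI_Iscatterv calls above use identity displacements and the initial-alignment MPI_Sendrecv_replace block is commented out, yet Cannon's algorithm needs that initial skew applied one way or the other (which is what the inline comments next to displsA/displsB hint at). Below is a hedged sketch of folding the skew into the scatter displacements instead; it assumes the block-row-major packing of A_array/B_array built at rank 0 and is an alternative to the commented-out exchange, not what the program above executes. Note that the inline hint next to displsB uses "% size" where the block-index arithmetic suggests "% sqrt_size".

	/* alternative initial alignment: process (i,j) receives A block (i, (i+j) mod q)
	 * and B block ((i+j) mod q, j) straight from the scatter, with q = sqrt_size */
	for (i = 0; i < sqrt_size; i++){
		for (j = 0; j < sqrt_size; j++){
			displsA[i*sqrt_size + j] = (i*sqrt_size + (i + j) % sqrt_size) * A_local_block_size;
			displsB[i*sqrt_size + j] = (((i + j) % sqrt_size) * sqrt_size + j) * B_local_block_size;
			localblsizA[i*sqrt_size + j] = A_local_block_size;
			localblsizB[i*sqrt_size + j] = B_local_block_size;
		}
	}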
Example #6
int main(int argc, char **argv)
{
    int errs = 0;
    int i;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL;
    MPI_Comm comm;
    MPI_Request req;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);

    comm = MPI_COMM_WORLD;

    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    for (i = 0; i < size; ++i) {
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Igather(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm, &req))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, sbuf, rcounts, rdispls, MPI_INT, 0, comm, &req))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm, &req))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm, &req))
        errs++;

    if (MPI_SUCCESS == MPI_Iallgather(&sbuf[rank], 1, MPI_INT, sbuf, 1, MPI_INT, comm, &req))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Iallgatherv(&sbuf[rank * rcounts[rank]], rcounts[rank], MPI_INT, sbuf, rcounts, rdispls,
                        MPI_INT, comm, &req))
        errs++;

    if (MPI_SUCCESS == MPI_Ialltoall(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, comm, &req))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Ialltoallv(sbuf, scounts, sdispls, MPI_INT, sbuf, scounts, sdispls, MPI_INT, comm,
                       &req))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Ialltoallw(sbuf, scounts, sdispls, types, sbuf, scounts, sdispls, types, comm, &req))
        errs++;

    if (rank == 0 && MPI_SUCCESS ==
        MPI_Ireduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req))
        errs++;

    if (MPI_SUCCESS == MPI_Iallreduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req))
        errs++;

    if (MPI_SUCCESS == MPI_Ireduce_scatter(sbuf, sbuf, rcounts, MPI_INT, MPI_SUM, comm, &req))
        errs++;

    if (MPI_SUCCESS ==
        MPI_Ireduce_scatter_block(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req))
        errs++;

    if (MPI_SUCCESS == MPI_Iscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req))
        errs++;

    if (MPI_SUCCESS == MPI_Iexscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req))
        errs++;

    if (sbuf)
        free(sbuf);
    if (rbuf)
        free(rbuf);
    if (scounts)
        free(scounts);
    if (rcounts)
        free(rcounts);
    if (sdispls)
        free(sdispls);
    if (rdispls)
        free(rdispls);
    if (types)
        free(types);

    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }
    MPI_Finalize();
    return 0;
}