FORT_DLL_SPEC void FORT_CALL mpi_iscatterv_ ( void*v1, MPI_Fint v2[], MPI_Fint v3[], MPI_Fint *v4, void*v5, MPI_Fint *v6, MPI_Fint *v7, MPI_Fint *v8, MPI_Fint *v9, MPI_Fint *v10, MPI_Fint *ierr ){
#ifndef HAVE_MPI_F_INIT_WORKS_WITH_C
    if (MPIR_F_NeedInit){ mpirinitf_(); MPIR_F_NeedInit = 0; }
#endif

    if (v5 == MPIR_F_MPI_IN_PLACE) v5 = MPI_IN_PLACE;
    *ierr = MPI_Iscatterv( v1, v2, v3, (MPI_Datatype)(*v4), v5, (int)*v6, (MPI_Datatype)(*v7), (int)*v8, (MPI_Comm)(*v9), (MPI_Request *)(v10) );
}
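/* The test programs below rely on a my_assert() helper and on NUM_INTS, COUNT,
 * and PRIME constants that are defined elsewhere in the MPICH test suite (along
 * with rand_range(), sum_fn(), and struct laundry, which are not reproduced
 * here).  A minimal stand-in for running the tests on their own might look like
 * the following; the values are assumptions, not the suite's actual definitions
 * (NUM_INTS is assumed to be 2 to match the initialization loops below). */
#include <stdio.h>
#include <stdlib.h>
#include <mpi.h>

#define NUM_INTS (2)        /* assumed: ints contributed per process */
#define COUNT (1024)        /* assumed: per-process element count for the random NBC test */
#define PRIME (17)          /* assumed: must satisfy COUNT*size*sizeof(int) > PRIME */

/* abort the whole job if a test-internal condition does not hold */
#define my_assert(cond_)                                                    \
    do {                                                                    \
        if (!(cond_)) {                                                     \
            fprintf(stderr, "assertion (%s) failed, aborting\n", #cond_);   \
            MPI_Abort(MPI_COMM_WORLD, 1);                                   \
        }                                                                   \
    } while (0)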
int main(int argc, char **argv)
{
    int errs = 0;
    int i;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL;
    MPI_Comm comm;
    MPI_Request req;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);
    comm = MPI_COMM_WORLD;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int)); my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int)); my_assert(rbuf);
    scounts = malloc(size * sizeof(int)); my_assert(scounts);
    rcounts = malloc(size * sizeof(int)); my_assert(rcounts);
    sdispls = malloc(size * sizeof(int)); my_assert(sdispls);
    rdispls = malloc(size * sizeof(int)); my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype)); my_assert(types);

    for (i = 0; i < size; ++i) {
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    MPI_Ibarrier(comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ibcast(sbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Igather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Igather(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    else
        MPI_Igather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Igatherv(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    else
        MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, 0, comm, &req);
    else
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, 0, comm, &req);
    else
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgather(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgatherv(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoall(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoall(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);
MPI_Ialltoallv(sbuf, scounts, sdispls, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ialltoallv(MPI_IN_PLACE, NULL, NULL, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ialltoallw(sbuf, scounts, sdispls, types, rbuf, rcounts, rdispls, types, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ialltoallw(MPI_IN_PLACE, NULL, NULL, NULL, rbuf, rcounts, rdispls, types, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ireduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); if (0 == rank) MPI_Ireduce(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req); else MPI_Ireduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Iallreduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Iallreduce(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ireduce_scatter(sbuf, rbuf, rcounts, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ireduce_scatter(MPI_IN_PLACE, rbuf, rcounts, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ireduce_scatter_block(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Ireduce_scatter_block(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Iscan(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Iscan(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Iexscan(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); MPI_Iexscan(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req); MPI_Wait(&req, MPI_STATUS_IGNORE); if (sbuf) free(sbuf); if (rbuf) free(rbuf); if (scounts) free(scounts); if (rcounts) free(rcounts); if (sdispls) free(sdispls); if (rdispls) free(rdispls); if (rank == 0) { if (errs) fprintf(stderr, "Found %d errors\n", errs); else printf(" No errors\n"); } MPI_Finalize(); return 0; }
/* Starts a "random" operation on "comm" corresponding to "rndnum" and returns
 * in (*req) a request handle corresponding to that operation. This call should
 * be considered collective over comm (with a consistent value for "rndnum"),
 * even though the operation may only be a point-to-point request. */
static void start_random_nonblocking(MPI_Comm comm, unsigned int rndnum, MPI_Request *req, struct laundry *l)
{
    int i, j;
    int rank, size;
    int *buf = NULL;
    int *recvbuf = NULL;
    int *sendcounts = NULL;
    int *recvcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *sendtypes = NULL;
    MPI_Datatype *recvtypes = NULL;
    signed char *buf_alias = NULL;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    *req = MPI_REQUEST_NULL;

    l->case_num = -1;
    l->comm = comm;
    l->buf = buf = malloc(COUNT*size*sizeof(int));
    l->recvbuf = recvbuf = malloc(COUNT*size*sizeof(int));
    l->sendcounts = sendcounts = malloc(size*sizeof(int));
    l->recvcounts = recvcounts = malloc(size*sizeof(int));
    l->sdispls = sdispls = malloc(size*sizeof(int));
    l->rdispls = rdispls = malloc(size*sizeof(int));
    l->sendtypes = sendtypes = malloc(size*sizeof(MPI_Datatype));
    l->recvtypes = recvtypes = malloc(size*sizeof(MPI_Datatype));

#define NUM_CASES (21)
    l->case_num = rand_range(rndnum, 0, NUM_CASES);

    switch (l->case_num) {
    case 0: /* MPI_Ibcast */
        for (i = 0; i < COUNT; ++i) { if (rank == 0) { buf[i] = i; } else { buf[i] = 0xdeadbeef; } }
        MPI_Ibcast(buf, COUNT, MPI_INT, 0, comm, req);
        break;
    case 1: /* MPI_Ibcast (again, but designed to stress scatter/allgather impls) */
        /* FIXME fiddle with PRIME and buffer allocation s.t. PRIME is much larger (1021?) */
        buf_alias = (signed char *)buf;
        my_assert(COUNT*size*sizeof(int) > PRIME); /* sanity */
        for (i = 0; i < PRIME; ++i) { if (rank == 0) buf_alias[i] = i; else buf_alias[i] = 0xdb; }
        for (i = PRIME; i < COUNT * size * sizeof(int); ++i) { buf_alias[i] = 0xbf; }
        MPI_Ibcast(buf_alias, PRIME, MPI_SIGNED_CHAR, 0, comm, req);
        break;
    case 2: /* MPI_Ibarrier */
        MPI_Ibarrier(comm, req);
        break;
    case 3: /* MPI_Ireduce */
        for (i = 0; i < COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; }
        MPI_Ireduce(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, 0, comm, req);
        break;
    case 4: /* same again, use a user op and free it before the wait */
        {
            MPI_Op op = MPI_OP_NULL;
            MPI_Op_create(sum_fn, /*commute=*/1, &op);
            for (i = 0; i < COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; }
            MPI_Ireduce(buf, recvbuf, COUNT, MPI_INT, op, 0, comm, req);
            MPI_Op_free(&op);
        }
        break;
    case 5: /* MPI_Iallreduce */
        for (i = 0; i < COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; }
        MPI_Iallreduce(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
        break;
    case 6: /* MPI_Ialltoallv (a weak test, neither irregular nor sparse) */
        for (i = 0; i < size; ++i) {
            sendcounts[i] = COUNT;
            recvcounts[i] = COUNT;
            sdispls[i] = COUNT * i;
            rdispls[i] = COUNT * i;
            for (j = 0; j < COUNT; ++j) { buf[i*COUNT+j] = rank + (i * j); recvbuf[i*COUNT+j] = 0xdeadbeef; }
        }
        MPI_Ialltoallv(buf, sendcounts, sdispls, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, comm, req);
        break;
    case 7: /* MPI_Igather */
        for (i = 0; i < size*COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; }
        MPI_Igather(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req);
        break;
    case 8: /* same test again, just use a dup'ed datatype and free it before the wait */
        {
            MPI_Datatype type = MPI_DATATYPE_NULL;
            MPI_Type_dup(MPI_INT, &type);
            for (i = 0; i < size*COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; }
            MPI_Igather(buf, COUNT, MPI_INT, recvbuf, COUNT, type, 0,
comm, req); MPI_Type_free(&type); /* should cause implementations that don't refcount correctly to blow up or hang in the wait */ } break; case 9: /* MPI_Iscatter */ for (i = 0; i < size; ++i) { for (j = 0; j < COUNT; ++j) { if (rank == 0) buf[i*COUNT+j] = i + j; else buf[i*COUNT+j] = 0xdeadbeef; recvbuf[i*COUNT+j] = 0xdeadbeef; } } MPI_Iscatter(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req); break; case 10: /* MPI_Iscatterv */ for (i = 0; i < size; ++i) { /* weak test, just test the regular case where all counts are equal */ sendcounts[i] = COUNT; sdispls[i] = i * COUNT; for (j = 0; j < COUNT; ++j) { if (rank == 0) buf[i*COUNT+j] = i + j; else buf[i*COUNT+j] = 0xdeadbeef; recvbuf[i*COUNT+j] = 0xdeadbeef; } } MPI_Iscatterv(buf, sendcounts, sdispls, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req); break; case 11: /* MPI_Ireduce_scatter */ for (i = 0; i < size; ++i) { recvcounts[i] = COUNT; for (j = 0; j < COUNT; ++j) { buf[i*COUNT+j] = rank + i; recvbuf[i*COUNT+j] = 0xdeadbeef; } } MPI_Ireduce_scatter(buf, recvbuf, recvcounts, MPI_INT, MPI_SUM, comm, req); break; case 12: /* MPI_Ireduce_scatter_block */ for (i = 0; i < size; ++i) { for (j = 0; j < COUNT; ++j) { buf[i*COUNT+j] = rank + i; recvbuf[i*COUNT+j] = 0xdeadbeef; } } MPI_Ireduce_scatter_block(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req); break; case 13: /* MPI_Igatherv */ for (i = 0; i < size*COUNT; ++i) { buf[i] = 0xdeadbeef; recvbuf[i] = 0xdeadbeef; } for (i = 0; i < COUNT; ++i) { buf[i] = rank + i; } for (i = 0; i < size; ++i) { recvcounts[i] = COUNT; rdispls[i] = i * COUNT; } MPI_Igatherv(buf, COUNT, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, 0, comm, req); break; case 14: /* MPI_Ialltoall */ for (i = 0; i < size; ++i) { for (j = 0; j < COUNT; ++j) { buf[i*COUNT+j] = rank + (i * j); recvbuf[i*COUNT+j] = 0xdeadbeef; } } MPI_Ialltoall(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, comm, req); break; case 15: /* MPI_Iallgather */ for (i = 0; i < size*COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; } MPI_Iallgather(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, comm, req); break; case 16: /* MPI_Iallgatherv */ for (i = 0; i < size; ++i) { for (j = 0; j < COUNT; ++j) { recvbuf[i*COUNT+j] = 0xdeadbeef; } recvcounts[i] = COUNT; rdispls[i] = i * COUNT; } for (i = 0; i < COUNT; ++i) buf[i] = rank + i; MPI_Iallgatherv(buf, COUNT, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, comm, req); break; case 17: /* MPI_Iscan */ for (i = 0; i < COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; } MPI_Iscan(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req); break; case 18: /* MPI_Iexscan */ for (i = 0; i < COUNT; ++i) { buf[i] = rank + i; recvbuf[i] = 0xdeadbeef; } MPI_Iexscan(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req); break; case 19: /* MPI_Ialltoallw (a weak test, neither irregular nor sparse) */ for (i = 0; i < size; ++i) { sendcounts[i] = COUNT; recvcounts[i] = COUNT; sdispls[i] = COUNT * i * sizeof(int); rdispls[i] = COUNT * i * sizeof(int); sendtypes[i] = MPI_INT; recvtypes[i] = MPI_INT; for (j = 0; j < COUNT; ++j) { buf[i*COUNT+j] = rank + (i * j); recvbuf[i*COUNT+j] = 0xdeadbeef; } } MPI_Ialltoallw(buf, sendcounts, sdispls, sendtypes, recvbuf, recvcounts, rdispls, recvtypes, comm, req); break; case 20: /* basic pt2pt MPI_Isend/MPI_Irecv pairing */ /* even ranks send to odd ranks, but only if we have a full pair */ if ((rank % 2 != 0) || (rank != size-1)) { for (j = 0; j < COUNT; ++j) { buf[j] = j; recvbuf[j] = 0xdeadbeef; } if (rank % 2 == 0) MPI_Isend(buf, COUNT, MPI_INT, rank+1, 5, comm, req); 
            else
                MPI_Irecv(recvbuf, COUNT, MPI_INT, rank-1, 5, comm, req);
        }
        break;
    default:
        fprintf(stderr, "unexpected value for l->case_num=%d\n", (l->case_num));
        MPI_Abort(comm, 1);
        exit(1);
        break;
    }
}
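/* A minimal driver sketch (not part of the original test) showing how
 * start_random_nonblocking() might be exercised: every rank of "comm" must
 * pass the same "rndnum", the returned request is completed with MPI_Wait,
 * and the scratch buffers recorded in the laundry struct are then released.
 * The real suite additionally verifies the received data, which is omitted
 * here. */
static void run_one_random_nbc(MPI_Comm comm, unsigned int rndnum)
{
    MPI_Request req;
    struct laundry l;

    start_random_nonblocking(comm, rndnum, &req, &l);

    /* unrelated work could be overlapped with the pending operation here */
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    /* release the per-operation scratch storage allocated by
     * start_random_nonblocking() */
    free(l.buf);
    free(l.recvbuf);
    free(l.sendcounts);
    free(l.recvcounts);
    free(l.sdispls);
    free(l.rdispls);
    free(l.sendtypes);
    free(l.recvtypes);
}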
int main(int argc, char *argv[]) { setbuf(stdout, NULL); int i = 0, rank, size, disp; int numprocs; double latency = 0.0, t_start = 0.0, t_stop = 0.0; double tcomp = 0.0, tcomp_total=0.0, latency_in_secs=0.0; double test_time = 0.0, test_total = 0.0; double timer=0.0; double wait_time = 0.0, init_time = 0.0; double init_total = 0.0, wait_total = 0.0; char *sendbuf=NULL; char *recvbuf=NULL; int *sdispls=NULL, *sendcounts=NULL; int po_ret; size_t bufsize; set_header(HEADER); set_benchmark_name("osu_iscatterv"); enable_accel_support(); po_ret = process_options(argc, argv); if (po_okay == po_ret && none != options.accel) { if (init_accel()) { fprintf(stderr, "Error initializing device\n"); exit(EXIT_FAILURE); } } MPI_Init(&argc, &argv); MPI_Comm_rank(MPI_COMM_WORLD, &rank); MPI_Comm_size(MPI_COMM_WORLD, &numprocs); MPI_Request request; MPI_Status status; switch (po_ret) { case po_bad_usage: print_bad_usage_message(rank); MPI_Finalize(); exit(EXIT_FAILURE); case po_help_message: print_help_message(rank); MPI_Finalize(); exit(EXIT_SUCCESS); case po_version_message: print_version_message(rank); MPI_Finalize(); exit(EXIT_SUCCESS); case po_okay: break; } if(numprocs < 2) { if (rank == 0) { fprintf(stderr, "This test requires at least two processes\n"); } MPI_Finalize(); exit(EXIT_FAILURE); } if ((options.max_message_size * numprocs) > options.max_mem_limit) { options.max_message_size = options.max_mem_limit / numprocs; } if (0 == rank) { if (allocate_buffer((void**)&sendcounts, numprocs*sizeof(int), none)) { fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } if (allocate_buffer((void**)&sdispls, numprocs*sizeof(int), none)) { fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } bufsize = options.max_message_size * numprocs; if (allocate_buffer((void**)&sendbuf, bufsize, options.accel)) { fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } set_buffer(sendbuf, options.accel, 1, bufsize); } if (allocate_buffer((void**)&recvbuf, options.max_message_size, options.accel)) { fprintf(stderr, "Could Not Allocate Memory [rank %d]\n", rank); MPI_Abort(MPI_COMM_WORLD, EXIT_FAILURE); } set_buffer(recvbuf, options.accel, 0, options.max_message_size); print_preamble_nbc(rank); for(size=options.min_message_size; size <=options.max_message_size; size *= 2) { if(size > LARGE_MESSAGE_SIZE) { options.skip = SKIP_LARGE; options.iterations = options.iterations_large; } else { options.skip = SKIP; } if (0 == rank) { disp =0; for ( i = 0; i < numprocs; i++) { sendcounts[i] = size; sdispls[i] = disp; disp += size; } } MPI_Barrier(MPI_COMM_WORLD); timer = 0.0; for(i=0; i < options.iterations + options.skip ; i++) { t_start = MPI_Wtime(); MPI_Iscatterv(sendbuf, sendcounts, sdispls, MPI_CHAR, recvbuf, size, MPI_CHAR, 0, MPI_COMM_WORLD, &request); MPI_Wait(&request,&status); t_stop = MPI_Wtime(); if(i>=options.skip){ timer += t_stop-t_start; } MPI_Barrier(MPI_COMM_WORLD); } MPI_Barrier(MPI_COMM_WORLD); latency = (timer * 1e6) / options.iterations; latency_in_secs = timer/options.iterations; init_arrays(latency_in_secs); if (0 == rank) { disp =0; for ( i = 0; i < numprocs; i++) { sendcounts[i] = size; sdispls[i] = disp; disp += size; } } MPI_Barrier(MPI_COMM_WORLD); timer = 0.0; tcomp_total = 0; tcomp = 0; init_total = 0.0; wait_total = 0.0; test_time = 0.0, test_total = 0.0; for(i=0; i < options.iterations + options.skip ; i++) { t_start = MPI_Wtime(); init_time = 
MPI_Wtime(); MPI_Iscatterv(sendbuf, sendcounts, sdispls, MPI_CHAR, recvbuf, size, MPI_CHAR, 0, MPI_COMM_WORLD, &request); init_time = MPI_Wtime() - init_time; tcomp = MPI_Wtime(); test_time = dummy_compute(latency_in_secs, &request); tcomp = MPI_Wtime() - tcomp; wait_time = MPI_Wtime(); MPI_Wait(&request,&status); wait_time = MPI_Wtime() - wait_time; t_stop = MPI_Wtime(); if(i>=options.skip){ timer += t_stop-t_start; tcomp_total += tcomp; test_total += test_time; init_total += init_time; wait_total += wait_time; } MPI_Barrier(MPI_COMM_WORLD); } MPI_Barrier (MPI_COMM_WORLD); calculate_and_print_stats(rank, size, numprocs, timer, latency, test_total, tcomp_total, wait_total, init_total); } if (0 == rank) { free_buffer(sendcounts, none); free_buffer(sdispls, none); free_buffer(sendbuf, options.accel); } free_buffer(recvbuf, options.accel); MPI_Finalize(); if (none != options.accel) { if (cleanup_accel()) { fprintf(stderr, "Error cleaning up device\n"); exit(EXIT_FAILURE); } } return EXIT_SUCCESS; }
int main (int argc, char **argv)
{
    FILE *fp;
    double **A = NULL, **B = NULL, **C = NULL, *A_array = NULL, *B_array = NULL, *C_array = NULL;
    double *A_local_block = NULL, *B_local_block = NULL, *C_local_block = NULL;
    int A_rows, A_columns, A_local_block_rows, A_local_block_columns, A_local_block_size;
    int B_rows, B_columns, B_local_block_rows, B_local_block_columns, B_local_block_size;
    int rank, size, sqrt_size, matrices_a_b_dimensions[4];
    MPI_Comm cartesian_grid_communicator, row_communicator, column_communicator;
    MPI_Status status;

    // used to manage the cartesian grid
    int dimensions[2], periods[2], coordinates[2], remain_dims[2];

    MPI_Init(&argc, &argv);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);

    /* For square mesh */
    sqrt_size = (int)sqrt((double) size);
    if(sqrt_size * sqrt_size != size){
        if( rank == 0 ) fprintf(stderr, "need to run mpiexec with a perfect square number of processes\n");
        MPI_Abort(MPI_COMM_WORLD, -1);
    }

    // create a 2D cartesian grid
    dimensions[0] = dimensions[1] = sqrt_size;
    periods[0] = periods[1] = 1;
    MPI_Cart_create(MPI_COMM_WORLD, 2, dimensions, periods, 1, &cartesian_grid_communicator);
    MPI_Cart_coords(cartesian_grid_communicator, rank, 2, coordinates); // coordinates[] now holds the grid coordinates of process rank

    // create a row communicator
    remain_dims[0] = 0;
    remain_dims[1] = 1;
    MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &row_communicator);

    // create a column communicator
    remain_dims[0] = 1;
    remain_dims[1] = 0;
    MPI_Cart_sub(cartesian_grid_communicator, remain_dims, &column_communicator);

    // set time variables for different MPI parts
    double read_time, send_dim_time, send_blocks_time, gather_time, write_time, dod_cajt;

    read_time = MPI_Wtime();
    // getting matrices from files at rank 0 only
    // example: mpiexec -n 64 ./cannon matrix1 matrix2 [test]
    if (rank == 0){
        int row, column;
        if ((fp = fopen (argv[1], "r")) != NULL){
            fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[0], &matrices_a_b_dimensions[1]);
            A = (double **) malloc (matrices_a_b_dimensions[0] * sizeof(double *));
            for (row = 0; row < matrices_a_b_dimensions[0]; row++){
                A[row] = (double *) malloc(matrices_a_b_dimensions[1] * sizeof(double));
                for (column = 0; column < matrices_a_b_dimensions[1]; column++)
                    fscanf(fp, "%lf", &A[row][column]);
            }
            fclose(fp);
        } else {
            if(rank == 0) fprintf(stderr, "error opening file for matrix A (%s)\n", argv[1]);
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
        if((fp = fopen (argv[2], "r")) != NULL){
            fscanf(fp, "%d %d\n", &matrices_a_b_dimensions[2], &matrices_a_b_dimensions[3]);
            B = (double **) malloc (matrices_a_b_dimensions[2] * sizeof(double *));
            for(row = 0; row < matrices_a_b_dimensions[2]; row++){
                B[row] = (double *) malloc(matrices_a_b_dimensions[3] * sizeof(double));
                for(column = 0; column < matrices_a_b_dimensions[3]; column++)
                    fscanf(fp, "%lf", &B[row][column]);
            }
            fclose(fp);
        } else {
            if(rank == 0) fprintf(stderr, "error opening file for matrix B (%s)\n", argv[2]);
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        // need to check that the multiplication is possible given dimensions
        // matrices_a_b_dimensions[0] = row size of A
        // matrices_a_b_dimensions[1] = column size of A
        // matrices_a_b_dimensions[2] = row size of B
        // matrices_a_b_dimensions[3] = column size of B
        if(matrices_a_b_dimensions[1] != matrices_a_b_dimensions[2]){
            if(rank == 0) fprintf(stderr, "A's column size (%d) must match B's row size (%d)\n",
                    matrices_a_b_dimensions[1], matrices_a_b_dimensions[2]);
            MPI_Abort(MPI_COMM_WORLD, -1);
        }

        // this implementation is limited to cases where the matrices can be partitioned perfectly
        if( matrices_a_b_dimensions[0] % sqrt_size != 0
                || matrices_a_b_dimensions[1] % sqrt_size != 0
                || matrices_a_b_dimensions[2] % sqrt_size != 0
                || matrices_a_b_dimensions[3] % sqrt_size != 0 ){
            if(rank == 0) fprintf(stderr, "cannot distribute work evenly among processes\n"
                    "all dimensions (A: r:%d c:%d; B: r:%d c:%d) need to be divisible by %d\n",
                    matrices_a_b_dimensions[0],matrices_a_b_dimensions[1],
                    matrices_a_b_dimensions[2],matrices_a_b_dimensions[3], sqrt_size );
            MPI_Abort(MPI_COMM_WORLD, -1);
        }
    }
    read_time -= MPI_Wtime();

    send_dim_time = MPI_Wtime();
    // send dimensions to all peers
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    // has to be blocking, because the data is used right afterwards...
    MPI_Bcast(matrices_a_b_dimensions, 4, MPI_INT, 0, cartesian_grid_communicator);
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    send_dim_time -= MPI_Wtime();

    A_rows = matrices_a_b_dimensions[0];
    A_columns = matrices_a_b_dimensions[1];
    B_rows = matrices_a_b_dimensions[2];
    B_columns = matrices_a_b_dimensions[3];

    // local metadata for A
    A_local_block_rows = A_rows / sqrt_size;
    A_local_block_columns = A_columns / sqrt_size;
    A_local_block_size = A_local_block_rows * A_local_block_columns;
    A_local_block = (double *) malloc (A_local_block_size * sizeof(double));

    // local metadata for B
    B_local_block_rows = B_rows / sqrt_size;
    B_local_block_columns = B_columns / sqrt_size;
    B_local_block_size = B_local_block_rows * B_local_block_columns;
    B_local_block = (double *) malloc (B_local_block_size * sizeof(double));

    // local metadata for C
    C_local_block = (double *) malloc (A_local_block_rows * B_local_block_columns * sizeof(double));

    // C needs to be initialized at 0 (accumulates partial dot-products)
    int i,j;
    for(i=0; i < A_local_block_rows * B_local_block_columns; i++){
        C_local_block[i] = 0;
    }

    dod_cajt = MPI_Wtime();
    // full arrays only needed at root
    if(rank == 0){
        A_array = (double *) malloc(sizeof(double) * A_rows * A_columns);
        B_array = (double *) malloc(sizeof(double) * B_rows * B_columns);
        C_array = (double *) malloc(sizeof(double) * A_rows * B_columns);

        // generate the 1D arrays of the matrices at root
        int row, column, i, j;
        for (i = 0; i < sqrt_size; i++){
            for (j = 0; j < sqrt_size; j++){
                for (row = 0; row < A_local_block_rows; row++){
                    for (column = 0; column < A_local_block_columns; column++){
                        A_array[((i * sqrt_size + j) * A_local_block_size) + (row * A_local_block_columns) + column]
                            = A[i * A_local_block_rows + row][j * A_local_block_columns + column];
                    }
                }
                for (row = 0; row < B_local_block_rows; row++){
                    for (column = 0; column < B_local_block_columns; column++){
                        B_array[((i * sqrt_size + j) * B_local_block_size) + (row * B_local_block_columns) + column]
                            = B[i * B_local_block_rows + row][j * B_local_block_columns + column];
                    }
                }
            }
        }

        // allocate output matrix C
        C = (double **) malloc(A_rows * sizeof(double *));
        for(i=0; i<A_rows ;i++){
            C[i] = (double *) malloc(B_columns * sizeof(double));
        }
    }
    dod_cajt -= MPI_Wtime(); // should this be counted in send_blocks_time?
    // send a block to each process
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    /*MPI_Scatter(A_array, A_local_block_size, MPI_DOUBLE, A_local_block,
            A_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator);
    MPI_Scatter(B_array, B_local_block_size, MPI_DOUBLE, B_local_block,
            B_local_block_size, MPI_DOUBLE, 0, cartesian_grid_communicator); */
    send_blocks_time = MPI_Wtime();
    int displsA[size];
    int displsB[size];
    int localblsizA[size];
    int localblsizB[size];
    MPI_Request requests[2];
    MPI_Status statuses[2];
    for (i=0; i<sqrt_size; i++){
        for (j=0; j<sqrt_size; j++){
            // skewed displacements perform Cannon's initial alignment during the scatter itself:
            // process (i,j) starts with A block (i, (i+j) mod sqrt_size) and B block ((i+j) mod sqrt_size, j)
            displsA[i*sqrt_size + j] = (i*sqrt_size + (j+i)%sqrt_size)*A_local_block_size; //(i*sqrt_size + j)*A_local_block_size;
            displsB[i*sqrt_size + j] = (j + ((j+i)%sqrt_size)*sqrt_size)*B_local_block_size; //(i*sqrt_size + j)*B_local_block_size;
            localblsizA[i*sqrt_size+j] = A_local_block_size;
            localblsizB[i*sqrt_size+j] = B_local_block_size;
        }
    }
    MPI_Iscatterv(A_array, localblsizA, displsA, MPI_DOUBLE, A_local_block, A_local_block_size, MPI_DOUBLE,
            0, cartesian_grid_communicator, &requests[0]);
    MPI_Iscatterv(B_array, localblsizB, displsB, MPI_DOUBLE, B_local_block, B_local_block_size, MPI_DOUBLE,
            0, cartesian_grid_communicator, &requests[1]);
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    // fix initial arrangements before the core algorithm starts - the point is that the blocks
    // already have to be shifted before the first computational part of the algorithm begins;
    // the skewed scatter displacements above take care of this, so the exchanges below stay disabled
    /*if(coordinates[0] != 0){
        MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
                (coordinates[1] + sqrt_size - coordinates[0]) % sqrt_size, 0,
                (coordinates[1] + coordinates[0]) % sqrt_size, 0, row_communicator, &status);
    }
    if(coordinates[1] != 0){
        MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
                (coordinates[0] + sqrt_size - coordinates[1]) % sqrt_size, 0,
                (coordinates[0] + coordinates[1]) % sqrt_size, 0, column_communicator, &status);
    }*/

    // Cannon's algorithm
    int cannon_block_cycle;
    double compute_time = 0, mpi_time = 0, start;
    int C_index, A_row, A_column, B_column;
    MPI_Waitall(2, requests, statuses);
    send_blocks_time -= MPI_Wtime();
    for(cannon_block_cycle = 0; cannon_block_cycle < sqrt_size; cannon_block_cycle++){
        // compute partial result for this block cycle
        start = MPI_Wtime();
        for(C_index = 0, A_row = 0; A_row < A_local_block_rows; A_row++){
            for(B_column = 0; B_column < B_local_block_columns; B_column++, C_index++){
                for(A_column = 0; A_column < A_local_block_columns; A_column++){
                    C_local_block[C_index] += A_local_block[A_row * A_local_block_columns + A_column] *
                        B_local_block[A_column * B_local_block_columns + B_column];
                }
            }
        }
        compute_time += MPI_Wtime() - start;
        start = MPI_Wtime();
        // rotate blocks horizontally; this could also be done with MPI_Alltoallv and an in-place
        // (replace-style) buffer, but it would be inefficient - see the comments!
        MPI_Sendrecv_replace(A_local_block, A_local_block_size, MPI_DOUBLE,
                (coordinates[1] + sqrt_size - 1) % sqrt_size, 0,
                (coordinates[1] + 1) % sqrt_size, 0, row_communicator, &status);
        // rotate blocks vertically
        MPI_Sendrecv_replace(B_local_block, B_local_block_size, MPI_DOUBLE,
                (coordinates[0] + sqrt_size - 1) % sqrt_size, 0,
                (coordinates[0] + 1) % sqrt_size, 0, column_communicator, &status);
        mpi_time += MPI_Wtime() - start;
    }

    // get C parts from other processes at rank 0
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
    gather_time = MPI_Wtime();
    MPI_Gather(C_local_block, A_local_block_rows * B_local_block_columns, MPI_DOUBLE,
            C_array, A_local_block_rows * B_local_block_columns, MPI_DOUBLE, 0,
            cartesian_grid_communicator); // blocking, because the data is used right afterwards... right?
    gather_time -= MPI_Wtime();
    //%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

    // generating output at rank 0
    if (rank == 0) {
        write_time = MPI_Wtime();
        // convert the ID array into the actual C matrix
        int i, j, k, row, column;
        for (i = 0; i < sqrt_size; i++){ // block row index
            for (j = 0; j < sqrt_size; j++){ // block column index
                for (row = 0; row < A_local_block_rows; row++){
                    for (column = 0; column < B_local_block_columns; column++){
                        C[i * A_local_block_rows + row][j * B_local_block_columns + column] =
                            C_array[((i * sqrt_size + j) * A_local_block_rows * B_local_block_columns)
                                + (row * B_local_block_columns) + column];
                    }
                }
            }
        }
        write_time -= MPI_Wtime();

        printf("(%d,%d)x(%d,%d)=(%d,%d)\n", A_rows, A_columns, B_rows, B_columns, A_rows, B_columns);
        printf("Computation time: %lf\n", compute_time);
        printf("MPI time: %lf\n", mpi_time);
        printf("Read time: %lf\n", -read_time);
        printf("Send dims time: %lf\n", -send_dim_time);
        printf("Send blocks time: %lf\n", -send_blocks_time);
        printf("Gather time: %lf\n", -gather_time);
        printf("Addit. time: %lf\n", -dod_cajt);
        printf("Write time: %lf\n", -write_time);

        if (argc == 4){
            // present results on the screen
            printf("\nA( %d x %d ):\n", A_rows, A_columns);
            for(row = 0; row < A_rows; row++) {
                for(column = 0; column < A_columns; column++)
                    printf ("%7.3f ", A[row][column]);
                printf ("\n");
            }
            printf("\nB( %d x %d ):\n", B_rows, B_columns);
            for(row = 0; row < B_rows; row++){
                for(column = 0; column < B_columns; column++)
                    printf("%7.3f ", B[row][column]);
                printf("\n");
            }
            printf("\nC( %d x %d ) = AxB:\n", A_rows, B_columns);
            for(row = 0; row < A_rows; row++){
                for(column = 0; column < B_columns; column++)
                    printf("%7.3f ",C[row][column]);
                printf("\n");
            }
            printf("\nPerforming serial consistency check. Be patient...\n");
            fflush(stdout);
            int pass = 1;
            double temp;
            for(i=0; i<A_rows; i++){
                for(j=0; j<B_columns; j++){
                    temp = 0;
                    for(k=0; k<B_rows; k++){
                        temp += A[i][k] * B[k][j];
                    }
                    //printf("%7.3f ", temp);
                    printf("%7.3f ", temp-C[i][j]);
                    if(temp != C[i][j]){
                        pass = 0;
                    }
                }
                printf("\n");
            }
            if (pass) printf("Consistency check: PASS\n");
            else printf("Consistency check: FAIL\n");
        }
    }

    // free all memory
    if(rank == 0){
        int i;
        for(i = 0; i < A_rows; i++){ free(A[i]); }
        for(i = 0; i < B_rows; i++){ free(B[i]); }
        for(i = 0; i < A_rows; i++){ free(C[i]); }
        free(A);
        free(B);
        free(C);
        free(A_array);
        free(B_array);
        free(C_array);
    }
    free(A_local_block);
    free(B_local_block);
    free(C_local_block);

    // finalize MPI
    MPI_Finalize();

    return 0;
}
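/* A possible refinement (an assumption, not part of the original program): the serial
 * consistency check above compares doubles with exact equality (temp != C[i][j]), but the
 * blocked algorithm accumulates each dot product in a different order than the serial loop,
 * so the two results can legitimately differ in their last bits.  A tolerance-based
 * comparison such as this hypothetical helper avoids spurious FAIL reports; in the check
 * loop, "if(temp != C[i][j])" would become "if(!doubles_match(temp, C[i][j]))". */
#include <math.h>

static int doubles_match(double reference, double computed)
{
    /* accept a small relative error, falling back to an absolute bound near zero */
    double scale = fabs(reference) > 1.0 ? fabs(reference) : 1.0;
    return fabs(reference - computed) <= 1e-9 * scale;
}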
int main(int argc, char **argv) { int errs = 0; int i; int rank, size; int *sbuf = NULL; int *rbuf = NULL; int *scounts = NULL; int *rcounts = NULL; int *sdispls = NULL; int *rdispls = NULL; MPI_Datatype *types = NULL; MPI_Comm comm; MPI_Request req; /* intentionally not using MTest_Init/MTest_Finalize in order to make it * easy to take this test and use it as an NBC sanity test outside of the * MPICH test suite */ MPI_Init(&argc, &argv); comm = MPI_COMM_WORLD; MPI_Comm_size(comm, &size); MPI_Comm_rank(comm, &rank); MPI_Comm_set_errhandler(MPI_COMM_WORLD, MPI_ERRORS_RETURN); /* enough space for every process to contribute at least NUM_INTS ints to any * collective operation */ sbuf = malloc(NUM_INTS * size * sizeof(int)); my_assert(sbuf); rbuf = malloc(NUM_INTS * size * sizeof(int)); my_assert(rbuf); scounts = malloc(size * sizeof(int)); my_assert(scounts); rcounts = malloc(size * sizeof(int)); my_assert(rcounts); sdispls = malloc(size * sizeof(int)); my_assert(sdispls); rdispls = malloc(size * sizeof(int)); my_assert(rdispls); types = malloc(size * sizeof(MPI_Datatype)); my_assert(types); for (i = 0; i < size; ++i) { sbuf[2 * i] = i; sbuf[2 * i + 1] = i; rbuf[2 * i] = i; rbuf[2 * i + 1] = i; scounts[i] = NUM_INTS; rcounts[i] = NUM_INTS; sdispls[i] = i * NUM_INTS; rdispls[i] = i * NUM_INTS; types[i] = MPI_INT; } if (rank == 0 && MPI_SUCCESS == MPI_Igather(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm, &req)) errs++; if (rank == 0 && MPI_SUCCESS == MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, sbuf, rcounts, rdispls, MPI_INT, 0, comm, &req)) errs++; if (rank == 0 && MPI_SUCCESS == MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm, &req)) errs++; if (rank == 0 && MPI_SUCCESS == MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, sbuf, NUM_INTS, MPI_INT, 0, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Iallgather(&sbuf[rank], 1, MPI_INT, sbuf, 1, MPI_INT, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Iallgatherv(&sbuf[rank * rcounts[rank]], rcounts[rank], MPI_INT, sbuf, rcounts, rdispls, MPI_INT, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Ialltoall(sbuf, NUM_INTS, MPI_INT, sbuf, NUM_INTS, MPI_INT, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Ialltoallv(sbuf, scounts, sdispls, MPI_INT, sbuf, scounts, sdispls, MPI_INT, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Ialltoallw(sbuf, scounts, sdispls, types, sbuf, scounts, sdispls, types, comm, &req)) errs++; if (rank == 0 && MPI_SUCCESS == MPI_Ireduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Iallreduce(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Ireduce_scatter(sbuf, sbuf, rcounts, MPI_INT, MPI_SUM, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Ireduce_scatter_block(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Iscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req)) errs++; if (MPI_SUCCESS == MPI_Iexscan(sbuf, sbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req)) errs++; if (sbuf) free(sbuf); if (rbuf) free(rbuf); if (scounts) free(scounts); if (rcounts) free(rcounts); if (sdispls) free(sdispls); if (rdispls) free(rdispls); if (types) free(types); if (rank == 0) { if (errs) fprintf(stderr, "Found %d errors\n", errs); else printf(" No errors\n"); } MPI_Finalize(); return 0; }