int main(int argc, char **argv)
{
    int rank, nproc;
    int errs = 0;
    int array[1024];
    int val = 0;
    int target_rank;
    MPI_Aint bases[2];
    MPI_Aint disp, offset;
    MPI_Win win;

    MTest_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nproc);

    if (rank == 0 && nproc != 2) {
        MTestError("Must run with 2 ranks\n");
    }

    /* Get the base address in the middle of the array */
    if (rank == 0) {
        target_rank = 1;
        array[0] = 1234;
        MPI_Get_address(&array[512], &bases[0]);
    } else if (rank == 1) {
        target_rank = 0;
        array[1023] = 1234;
        MPI_Get_address(&array[512], &bases[1]);
    }

    /* Exchange bases */
    MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL, bases, 1, MPI_AINT, MPI_COMM_WORLD);

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &win);
    MPI_Win_attach(win, array, sizeof(int) * 1024);

    /* Do MPI_Aint addressing arithmetic */
    if (rank == 0) {
        disp = sizeof(int) * 511;
        offset = MPIX_Aint_add(bases[1], disp);         /* offset points to array[1023] */
    } else if (rank == 1) {
        disp = sizeof(int) * 512;
        offset = MPIX_Aint_diff(bases[0], disp);        /* offset points to array[0] */
    }

    /* Get val and verify it */
    MPI_Win_fence(MPI_MODE_NOPRECEDE, win);
    MPI_Get(&val, 1, MPI_INT, target_rank, offset, 1, MPI_INT, win);
    MPI_Win_fence(MPI_MODE_NOSUCCEED, win);

    if (val != 1234) {
        errs++;
        printf("%d -- Got %d, expected 1234\n", rank, val);
    }

    MPI_Win_detach(win, array);
    MPI_Win_free(&win);

    MTest_Finalize(errs);
    MPI_Finalize();
    return 0;
}
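/* A minimal companion sketch (not part of the test above) using the MPI-3.1
 * standard names MPI_Aint_add / MPI_Aint_diff, which replaced the MPIX_-
 * prefixed versions the test calls. The point: displacements for a dynamic
 * window must be combined with these calls rather than with raw +/-, so the
 * arithmetic stays valid even on segmented address spaces. Assumes an
 * MPI-3.1 (or newer) library. */
#include <mpi.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int buf[4] = { 10, 11, 12, 13 };
    MPI_Aint base, third;

    MPI_Init(&argc, &argv);
    MPI_Get_address(&buf[0], &base);

    /* Portable equivalent of (char *) &buf[0] + 2 * sizeof(int) */
    third = MPI_Aint_add(base, 2 * sizeof(int));
    printf("displacement back to base = %ld (expected %ld)\n",
           (long) MPI_Aint_diff(third, base), (long) (2 * sizeof(int)));

    MPI_Finalize();
    return 0;
}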
void run_rma_test(int nprocs_per_node)
{
    int myrank, nprocs;
    int mem_rank;
    MPI_Win win;
    int *baseptr;
    MPI_Aint local_size;

    MPI_Comm_rank(MPI_COMM_WORLD, &myrank);
    MPI_Comm_size(MPI_COMM_WORLD, &nprocs);

    if (nprocs < nprocs_per_node * 2) {
        if (!myrank)
            printf("should start program with at least %d processes\n",
                   nprocs_per_node * 2);
        MPI_Finalize();
        exit(EXIT_FAILURE);
    }

    mem_rank = nprocs_per_node + nprocs_per_node / 2;
    local_size = (myrank == mem_rank) ? COUNT : 0;      /* unused below: every rank attaches a full buffer */

    MPI_Win_create_dynamic(MPI_INFO_NULL, MPI_COMM_WORLD, &win);
    MPI_Win_lock_all(0, win);

    int type_size;
    MPI_Type_size(MPI_INT, &type_size);
    size_t nbytes = COUNT * type_size;

    /* Note: wrapping MPI calls in assert() means they vanish under -DNDEBUG;
     * acceptable for a test, not for production code */
    assert(MPI_Alloc_mem(nbytes, MPI_INFO_NULL, &baseptr) == MPI_SUCCESS);
    assert(MPI_Win_attach(win, baseptr, nbytes) == MPI_SUCCESS);

    MPI_Aint ldisp;
    MPI_Aint *disps = malloc(nprocs * sizeof(MPI_Aint));
    assert(MPI_Get_address(baseptr, &ldisp) == MPI_SUCCESS);
    /* recvcount is the per-process count: 1 MPI_AINT from each rank,
     * not nprocs as in the original */
    assert(MPI_Allgather(&ldisp, 1, MPI_AINT, disps, 1, MPI_AINT,
                         MPI_COMM_WORLD) == MPI_SUCCESS);

    if (myrank == 0) {
        for (size_t idx = 0; idx < COUNT; ++idx) {
            baseptr[idx] = idx * COUNT + 1;
        }
    }
    MPI_Barrier(MPI_COMM_WORLD);

    if (myrank == mem_rank) {
        /* Fetch all COUNT elements written by rank 0 and verify them with
         * the same formula used to initialize them (the original mixed a
         * literal 10 with COUNT) */
        assert(MPI_Get(baseptr, COUNT, MPI_INT, 0, disps[0], COUNT, MPI_INT,
                       win) == MPI_SUCCESS);
        assert(MPI_Win_flush(0, win) == MPI_SUCCESS);
        for (size_t idx = 0; idx < COUNT; ++idx) {
            assert(baseptr[idx] == idx * COUNT + 1);
        }
    }

    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Win_unlock_all(win);
    MPI_Barrier(MPI_COMM_WORLD);
    MPI_Win_free(&win);
    MPI_Free_mem(baseptr);
    free(disps);
    printf("Test finished\n");
}
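/* Hypothetical driver for run_rma_test() above. COUNT must already be
 * defined for the function itself to compile; the guard and value here are
 * only an assumption for standalone reading, as is the choice of two
 * processes per node. */
#include <mpi.h>

#ifndef COUNT
#define COUNT 10                /* assumption: matches the test's COUNT */
#endif

int main(int argc, char **argv)
{
    MPI_Init(&argc, &argv);
    run_rma_test(2);            /* e.g. mpiexec -n 4 ./rma_test */
    MPI_Finalize();
    return 0;
}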
int main(int argc, char **argv)
{
    int i, j, rank, nranks, peer, bufsize, errors;
    double *win_buf, *src_buf, *dst_buf;
    MPI_Win buf_win;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &src_buf);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &dst_buf);

    if (rank == 0)
        if (verbose)
            printf("MPI RMA Strided Put Test:\n");

    for (i = 0; i < XDIM * YDIM; i++) {
        *(win_buf + i) = 1.0 + rank;
        *(src_buf + i) = 1.0 + rank;
    }

    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);

    peer = (rank + 1) % nranks;

    /* Perform ITERATIONS strided put operations */
    for (i = 0; i < ITERATIONS; i++) {
        MPI_Aint idx_loc[SUB_YDIM];
        int idx_rem[SUB_YDIM];
        int blk_len[SUB_YDIM];
        MPI_Datatype src_type, dst_type;

        if (rank == 0)
            if (verbose)
                printf(" + iteration %d\n", i);

        /* Absolute addresses on the origin side (used with MPI_BOTTOM),
         * byte displacements on the target side */
        for (j = 0; j < SUB_YDIM; j++) {
            MPI_Get_address(&src_buf[j * XDIM], &idx_loc[j]);
            idx_rem[j] = j * XDIM * sizeof(double);
            blk_len[j] = SUB_XDIM * sizeof(double);
        }

        MPI_Type_create_hindexed(SUB_YDIM, blk_len, idx_loc, MPI_BYTE, &src_type);
        MPI_Type_create_indexed_block(SUB_YDIM, SUB_XDIM * sizeof(double),
                                      idx_rem, MPI_BYTE, &dst_type);

        MPI_Type_commit(&src_type);
        MPI_Type_commit(&dst_type);

        MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);
        MPI_Put(MPI_BOTTOM, 1, src_type, peer, 0, 1, dst_type, buf_win);
        MPI_Win_unlock(peer, buf_win);

        MPI_Type_free(&src_type);
        MPI_Type_free(&dst_type);
    }

    MPI_Barrier(MPI_COMM_WORLD);

    /* Verify that the results are correct */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, rank, 0, buf_win);
    errors = 0;
    for (i = 0; i < SUB_XDIM; i++) {
        for (j = 0; j < SUB_YDIM; j++) {
            const double actual = *(win_buf + i + j * XDIM);
            const double expected = (1.0 + ((rank + nranks - 1) % nranks));
            if (actual - expected > 1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }
    for (i = SUB_XDIM; i < XDIM; i++) {
        for (j = 0; j < SUB_YDIM; j++) {
            const double actual = *(win_buf + i + j * XDIM);
            const double expected = 1.0 + rank;
            if (actual - expected > 1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }
    for (i = 0; i < XDIM; i++) {
        for (j = SUB_YDIM; j < YDIM; j++) {
            const double actual = *(win_buf + i + j * XDIM);
            const double expected = 1.0 + rank;
            if (actual - expected > 1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }
    MPI_Win_unlock(rank, buf_win);

    MPI_Win_free(&buf_win);
    MPI_Free_mem(win_buf);
    MPI_Free_mem(src_buf);
    MPI_Free_mem(dst_buf);

    MPI_Finalize();

    if (errors == 0) {
        if (rank == 0)
            printf(" No Errors\n");
        return 0;
    } else {
        printf("%d: Fail\n", rank);
        return 1;
    }
}
/** Optimized implementation of the ARMCI IOV operation that uses an MPI
  * datatype to achieve a one-sided gather/scatter. Does not use MPI_BOTTOM
  * on the origin side.
  */
int ARMCII_Iov_op_datatype_no_bottom(enum ARMCII_Op_e op, void **src, void **dst,
                                     int count, int elem_count, MPI_Datatype type,
                                     int proc)
{
    gmr_t *mreg;
    MPI_Datatype type_loc, type_rem;
    MPI_Aint disp_loc[count];
    int disp_rem[count];
    int block_len[count];
    void *dst_win_base;
    int dst_win_size, i, type_size;
    void **buf_rem, **buf_loc;
    MPI_Aint base_rem;
    MPI_Aint base_loc;
    void *base_loc_ptr;

    switch (op) {
        case ARMCII_OP_ACC:
        case ARMCII_OP_PUT:
            buf_rem = dst;
            buf_loc = src;
            break;
        case ARMCII_OP_GET:
            buf_rem = src;
            buf_loc = dst;
            break;
        default:
            ARMCII_Error("unknown operation (%d)", op);
            return 1;
    }

    MPI_Type_size(type, &type_size);

    mreg = gmr_lookup(buf_rem[0], proc);
    ARMCII_Assert_msg(mreg != NULL, "Invalid remote pointer");

    dst_win_base = mreg->slices[proc].base;
    dst_win_size = mreg->slices[proc].size;

    MPI_Get_address(dst_win_base, &base_rem);

    /* Pick a base address for the start of the origin's datatype */
    base_loc_ptr = buf_loc[0];
    MPI_Get_address(base_loc_ptr, &base_loc);

    for (i = 0; i < count; i++) {
        MPI_Aint target_rem, target_loc;
        MPI_Get_address(buf_loc[i], &target_loc);
        MPI_Get_address(buf_rem[i], &target_rem);
        disp_loc[i] = target_loc - base_loc;
        disp_rem[i] = (target_rem - base_rem) / type_size;
        block_len[i] = elem_count;

        ARMCII_Assert_msg((target_rem - base_rem) % type_size == 0,
                          "Transfer size is not a multiple of type size");
        ARMCII_Assert_msg(disp_rem[i] >= 0 && disp_rem[i] < dst_win_size,
                          "Invalid remote pointer");
        /* block_len[i] counts elements, so scale by type_size for the
         * byte-level bounds check (the original compared elements to bytes) */
        ARMCII_Assert_msg(((uint8_t *) buf_rem[i]) + block_len[i] * type_size <=
                          ((uint8_t *) dst_win_base) + dst_win_size,
                          "Transfer exceeds buffer length");
    }

    MPI_Type_create_hindexed(count, block_len, disp_loc, type, &type_loc);
    MPI_Type_create_indexed_block(count, elem_count, disp_rem, type, &type_rem);
    /* MPI_Type_indexed(count, block_len, disp_rem, type, &type_rem); */

    MPI_Type_commit(&type_loc);
    MPI_Type_commit(&type_rem);

    gmr_lock(mreg, proc);

    switch (op) {
        case ARMCII_OP_ACC:
            gmr_accumulate_typed(mreg, base_loc_ptr, 1, type_loc,
                                 MPI_BOTTOM, 1, type_rem, proc);
            break;
        case ARMCII_OP_PUT:
            gmr_put_typed(mreg, base_loc_ptr, 1, type_loc,
                          MPI_BOTTOM, 1, type_rem, proc);
            break;
        case ARMCII_OP_GET:
            gmr_get_typed(mreg, MPI_BOTTOM, 1, type_rem,
                          base_loc_ptr, 1, type_loc, proc);
            break;
        default:
            ARMCII_Error("unknown operation (%d)", op);
            return 1;
    }

    gmr_unlock(mreg, proc);

    MPI_Type_free(&type_loc);
    MPI_Type_free(&type_rem);

    return 0;
}
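/* Standalone sketch of the addressing trick ARMCII_Iov_op_datatype_no_bottom
 * relies on for the origin side: pick the first buffer as the base, express
 * every other buffer as an MPI_Aint displacement from it, and describe the
 * whole I/O vector with a single hindexed datatype. The buffers and counts
 * here are illustrative only. */
#include <mpi.h>

int main(int argc, char **argv)
{
    double a[4], b[4];                  /* two scattered local buffers */
    void *bufs[2] = { a, b };
    int lens[2] = { 4, 4 };
    MPI_Aint base, addr, disps[2];
    MPI_Datatype iov_type;

    MPI_Init(&argc, &argv);

    MPI_Get_address(bufs[0], &base);
    for (int i = 0; i < 2; i++) {
        MPI_Get_address(bufs[i], &addr);
        disps[i] = MPI_Aint_diff(addr, base);   /* offset from the chosen base */
    }

    MPI_Type_create_hindexed(2, lens, disps, MPI_DOUBLE, &iov_type);
    MPI_Type_commit(&iov_type);
    /* (bufs[0], 1, iov_type) now describes both arrays in one put/get/
     * accumulate, with no need for MPI_BOTTOM on the origin side. */
    MPI_Type_free(&iov_type);

    MPI_Finalize();
    return 0;
}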
int main(int argc, char *argv[])
{
    int *bodies_off;
    int *n_bodies_split;
    int n_local_bodies;
    const MPI_Comm comm = MPI_COMM_WORLD;
    FILE *inputf;
    FILE *outputf;
    double clockStart, clockEnd;
    int rc, n_proc, rank;

    rc = MPI_Init(&argc, &argv);
    if (rc != MPI_SUCCESS) {
        puts("MPI_Init failed");
        exit(-1);
    }

    MPI_Comm_size(comm, &n_proc);
    MPI_Comm_rank(comm, &rank);

    /* Create the MPI datatype for a body. Note: the MPI_LB/MPI_UB markers
     * are deprecated and were removed in MPI-3; see the
     * MPI_Type_create_resized sketch after this program for the modern
     * equivalent. */
    MPI_Datatype bodytype;
    MPI_Datatype type[6] = { MPI_LB, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE,
                             MPI_DOUBLE, MPI_UB };
    int block_len[6] = { 1, 1, 3, 3, 3, 1 };
    MPI_Aint disp[6];
    leaf_t example[2];

    MPI_Get_address(&example[0], &disp[0]);
    MPI_Get_address(&(example[0].mass), &disp[1]);
    MPI_Get_address(&(example[0].pos), &disp[2]);
    MPI_Get_address(&(example[0].vel), &disp[3]);
    MPI_Get_address(&(example[0].acc), &disp[4]);
    MPI_Get_address(&(example[1].acc), &disp[5]);
    /* Make every displacement relative to the start of the struct. (The
     * commented-out loop "for (i = 6; i >= 0; --i)" in the original also
     * overran the array.) */
    disp[5] -= disp[0];
    disp[4] -= disp[0];
    disp[3] -= disp[0];
    disp[2] -= disp[0];
    disp[1] -= disp[0];
    disp[0] = 0;        /* the LB marker must sit at displacement 0, not at
                         * the absolute address the original left here */

    MPI_Type_create_struct(6, block_len, disp, type, &bodytype);
    MPI_Type_commit(&bodytype);

    /* Read the problem size first: nbodies must be known before the
     * nbodies-sized allocations below (the original allocated them with
     * nbodies still uninitialized) */
    char *inputfile = argv[1];
    inputf = fopen(inputfile, "r");
    if (inputf == NULL) {
        printf("cannot read from input file\n");
        exit(1);
    }
    fscanf(inputf, "%d", &nbodies);
    fscanf(inputf, "%d", &steps);
    fscanf(inputf, "%lf", &dt);
    fscanf(inputf, "%lf", &eps);
    fscanf(inputf, "%lf", &tol);
    fclose(inputf);

    bodies_off = malloc((n_proc + 1) * sizeof(int));
    n_bodies_split = malloc(n_proc * sizeof(int));
    bodies = malloc(nbodies * sizeof(node_t *));
    leafs = malloc(nbodies * sizeof(leaf_t));

    if (rank == 0) {
        int i;
        create_bodies();
        quicksort(0, nbodies - 1);
        /* bublesort(); */
        /* debug: print all body positions */

        n_local_bodies = nbodies / n_proc;
        /* split the particles following the shark & fish scheme:
         * split_bodies(n_proc, bodies_off, n_bodies_split);
         * n_local_bodies = n_bodies_split[rank];
         * MPI_Bcast(n_bodies_split, n_proc, MPI_INT, 0, comm); */
        MPI_Bcast(leafs, nbodies, bodytype, 0, comm);
        dthf = 0.5 * dt;
        epssq = eps * eps;
        itolsq = 1.0 / (tol * tol);

        clockStart = MPI_Wtime();
        int step = 0;
        root = NULL;
        for (step = 0; step < steps; step++) {
            compute_center_and_diameter();

            root = malloc(sizeof(struct node_t));       /* "new" is like "malloc" */
            double mass_root = 0.0;
            root->type = 1;
            root->mass = &mass_root;
            root->pos = center;
            root->cell.childs[0] = NULL;
            root->cell.childs[1] = NULL;
            root->cell.childs[2] = NULL;
            root->cell.childs[3] = NULL;
            root->cell.childs[4] = NULL;
            root->cell.childs[5] = NULL;
            root->cell.childs[6] = NULL;
            root->cell.childs[7] = NULL;
            double radius = diameter * 0.5;

            int i = 0;
            for (i = 0; i < nbodies; i++) {
                /* pass the data by reference, i.e. hand over the address of
                 * the structure the pointer refers to */
                insert(root, bodies[i], radius);
            }
            curr = 0;
            compute_center_of_mass(&(*root));

            for (i = 0; i < n_local_bodies; i++) {
                compute_force(&(*root), &(*bodies[i]), diameter, step);
            }
            deallocate_tree(root);

            /* all-gather the updated leafs; rank 0's slice is already in
             * place, and MPI forbids overlapping send/receive buffers, so
             * use MPI_IN_PLACE instead of passing leafs twice */
            MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                          leafs, n_local_bodies, bodytype, comm);

            for (i = 0; i < nbodies; i++) {
                advance(&(*bodies[i]));
            }
            /* debug: print all body positions */
        }
        /* after the run */
        /* alternative: receive the slices back explicitly instead of the
         * all-gather above:
         * int proc_rec = 1;
         * while (proc_rec < n_proc) {
         *     MPI_Status status;
         *     int proc_rank;
         *     int cap = nbodies / n_proc;
         *     node_t temp[cap];
         *     MPI_Recv(temp, cap, bodytype, MPI_ANY_SOURCE, MPI_ANY_TAG,
         *              comm, &status);
         *     proc_rank = status.MPI_SOURCE;
         *     int idx = 0;
         *     for (idx = proc_rec * cap; idx < cap; idx++)
         *         *bodies[idx] = temp[idx];
         *     proc_rec++;
         * } */

        clockEnd = MPI_Wtime();

        /* Choose the output file by problem size: output16384, output32768,
         * output65536, or plain "output" (same behavior as the original
         * four near-identical branches) */
        char fname[32];
        if (nbodies == 16384 || nbodies == 32768 || nbodies == 65536)
            snprintf(fname, sizeof(fname), "output%d", nbodies);
        else
            snprintf(fname, sizeof(fname), "output");

        char cmd[64];
        snprintf(cmd, sizeof(cmd), "echo 'Host:' `hostname` >> %s ", fname);
        system(cmd);

        outputf = fopen(fname, "a");
        fprintf(outputf, "Execution time: %lf \n", clockEnd - clockStart);
        for (i = 0; i < nbodies; i++) {
            fprintf(outputf, "%lf, %lf, %lf \n", bodies[i]->pos[0],
                    bodies[i]->pos[1], bodies[i]->pos[2]);
        }
        fflush(outputf);
        fclose(outputf);
        printf("Execution completed\n");
    } else {
        int low = 0, up = 0;    /* was "low = 1", which skipped element
                                 * rank * n_local_bodies on every rank and
                                 * misaligned the all-gather slices */
        int i;
        dthf = 0.5 * dt;
        epssq = eps * eps;
        itolsq = 1.0 / (tol * tol);

        /* (disabled PAPI performance-counter setup)
         * if (PAPI_library_init(PAPI_VER_CURRENT) != PAPI_VER_CURRENT) {
         *     printf("PAPI library initialization failed\n"); exit(1); }
         * if (PAPI_create_eventset(&event_set) != PAPI_OK) {
         *     printf("event set creation failed\n"); exit(1); }
         * if (PAPI_add_events(event_set, events, 2) != PAPI_OK) {
         *     printf("failed to add events\n"); exit(1); } */

        n_local_bodies = nbodies / n_proc;
        MPI_Bcast(leafs, nbodies, bodytype, 0, comm);
        int step = 0;
        root = NULL;
        low += rank * n_local_bodies;
        up = low + n_local_bodies;

        /* PAPI_start(event_set); */
        /* clockStart = PAPI_get_real_usec(); */
        for (step = 0; step < steps; step++) {
            compute_center_and_diameter();

            root = malloc(sizeof(struct node_t));       /* "new" is like "malloc" */
            double mass_root = 0.0;     /* the original did *(root->mass) = 0.0
                                         * through an uninitialized pointer;
                                         * point it at valid storage first,
                                         * as the rank-0 branch does */
            root->type = 1;
            root->mass = &mass_root;
            root->pos = center;
            root->cell.childs[0] = NULL;
            root->cell.childs[1] = NULL;
            root->cell.childs[2] = NULL;
            root->cell.childs[3] = NULL;
            root->cell.childs[4] = NULL;
            root->cell.childs[5] = NULL;
            root->cell.childs[6] = NULL;
            root->cell.childs[7] = NULL;
            double radius = diameter * 0.5;

            for (i = 0; i < nbodies; i++) {
                bodies[i] = malloc(sizeof(node_t));
                bodies[i]->cell.leaf = &leafs[i];
                bodies[i]->mass = &leafs[i].mass;
                bodies[i]->pos = leafs[i].pos;
                /* pass the data by reference, i.e. hand over the address of
                 * the structure the pointer refers to */
                insert(&(*root), &(*bodies[i]), radius);
            }
            curr = 0;
            compute_center_of_mass(&(*root));

            for (i = low; i < up; i++) {
                compute_force(&(*root), &(*bodies[i]), diameter, step);
            }
            deallocate_tree(root);

            local_leafs = &leafs[low];
            /* all-gather: this rank's slice already sits at offset low in
             * leafs, and MPI forbids overlapping send/receive buffers, so
             * use MPI_IN_PLACE rather than the aliasing local_leafs pointer */
            MPI_Allgather(MPI_IN_PLACE, 0, MPI_DATATYPE_NULL,
                          leafs, up - low, bodytype, comm);

            for (i = 0; i < nbodies; i++) {
                advance(&(*bodies[i]));
            }
            /* debug: print all body positions */
        }
        /* clockEnd = PAPI_get_real_usec(); */
        /* PAPI_stop(event_set, values); */
        /* MPI_Send(bodies[low], up - low + 1, bodytype, 0, 0, comm);
         * (the original commented line passed MPI_ANY_TAG, which is not a
         * valid tag for a send) */
    }

    MPI_Finalize();
    return 0;
}
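/* Sketch of the MPI-3 replacement for the deprecated MPI_LB/MPI_UB markers
 * used in the N-body code above: build the struct type from the data fields
 * only, then pin its extent with MPI_Type_create_resized. Assumes <mpi.h>
 * and the same leaf_t layout as above (double mass; double pos[3], vel[3],
 * acc[3]); not part of the original program. */
static MPI_Datatype make_bodytype(void)
{
    leaf_t example[2];
    MPI_Datatype tmp, bodytype;
    MPI_Datatype type[4] = { MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE, MPI_DOUBLE };
    int block_len[4] = { 1, 3, 3, 3 };
    MPI_Aint base, next, disp[4];

    MPI_Get_address(&example[0], &base);
    MPI_Get_address(&example[0].mass, &disp[0]);
    MPI_Get_address(&example[0].pos, &disp[1]);
    MPI_Get_address(&example[0].vel, &disp[2]);
    MPI_Get_address(&example[0].acc, &disp[3]);
    for (int i = 0; i < 4; i++)
        disp[i] = MPI_Aint_diff(disp[i], base);

    /* the true extent is the distance between consecutive array elements */
    MPI_Get_address(&example[1], &next);

    MPI_Type_create_struct(4, block_len, disp, type, &tmp);
    MPI_Type_create_resized(tmp, 0, MPI_Aint_diff(next, base), &bodytype);
    MPI_Type_commit(&bodytype);
    MPI_Type_free(&tmp);
    return bodytype;
}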
int main(int argc, char **argv)
{
    int i, j, rank, nranks, peer, bufsize, errors;
    double *win_buf, *loc_buf;
    MPI_Win buf_win;
    MPI_Aint idx_loc[SUB_YDIM];
    int idx_rem[SUB_YDIM];
    int blk_len[SUB_YDIM];
    MPI_Datatype loc_type, rem_type;

    MPI_Init(&argc, &argv);

    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nranks);

    bufsize = XDIM * YDIM * sizeof(double);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &win_buf);
    MPI_Alloc_mem(bufsize, MPI_INFO_NULL, &loc_buf);

    if (rank == 0)
        if (verbose)
            printf("MPI RMA Strided Get Test:\n");

    /* Initialize both buffers: the verification below checks that the
     * parts of loc_buf the get does not touch still hold 1.0 + rank, so
     * loc_buf cannot be left uninitialized as in the original */
    for (i = 0; i < XDIM * YDIM; i++) {
        *(win_buf + i) = 1.0 + rank;
        *(loc_buf + i) = 1.0 + rank;
    }

    MPI_Win_create(win_buf, bufsize, 1, MPI_INFO_NULL, MPI_COMM_WORLD, &buf_win);

    peer = (rank + 1) % nranks;

    /* Build the datatype (idx_loc is computed but unused: the local type
     * uses the same relative displacements as the remote one) */
    for (i = 0; i < SUB_YDIM; i++) {
        MPI_Get_address(&loc_buf[i * XDIM], &idx_loc[i]);
        idx_rem[i] = i * XDIM;
        blk_len[i] = SUB_XDIM;
    }

    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &loc_type);
    MPI_Type_indexed(SUB_YDIM, blk_len, idx_rem, MPI_DOUBLE, &rem_type);

    MPI_Type_commit(&loc_type);
    MPI_Type_commit(&rem_type);

    /* Perform get operation */
    MPI_Win_lock(MPI_LOCK_EXCLUSIVE, peer, 0, buf_win);

    MPI_Get(loc_buf, 1, loc_type, peer, 0, 1, rem_type, buf_win);

    /* Use the datatype only on the remote side (must have SUB_XDIM == XDIM) */
    /* MPI_Get(loc_buf, SUB_XDIM*SUB_YDIM, MPI_DOUBLE, peer, 0, 1, rem_type, buf_win); */

    MPI_Win_unlock(peer, buf_win);

    MPI_Type_free(&loc_type);
    MPI_Type_free(&rem_type);

    MPI_Barrier(MPI_COMM_WORLD);

    /* Verify that the results are correct */
    errors = 0;
    for (i = 0; i < SUB_XDIM; i++) {
        for (j = 0; j < SUB_YDIM; j++) {
            const double actual = *(loc_buf + i + j * XDIM);
            const double expected = (1.0 + peer);
            if (actual - expected > 1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }
    for (i = SUB_XDIM; i < XDIM; i++) {
        for (j = 0; j < SUB_YDIM; j++) {
            const double actual = *(loc_buf + i + j * XDIM);
            const double expected = 1.0 + rank;
            if (actual - expected > 1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }
    for (i = 0; i < XDIM; i++) {
        for (j = SUB_YDIM; j < YDIM; j++) {
            const double actual = *(loc_buf + i + j * XDIM);
            const double expected = 1.0 + rank;
            if (actual - expected > 1e-10) {
                printf("%d: Data validation failed at [%d, %d] expected=%f actual=%f\n",
                       rank, j, i, expected, actual);
                errors++;
                fflush(stdout);
            }
        }
    }

    MPI_Win_free(&buf_win);
    MPI_Free_mem(win_buf);
    MPI_Free_mem(loc_buf);

    MPI_Finalize();

    if (errors == 0) {
        if (rank == 0)
            printf(" No Errors\n");
        return 0;
    } else {
        printf("%d: Fail\n", rank);
        return 1;
    }
}
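/* Alternative sketch: the same SUB_XDIM x SUB_YDIM patch of a row-major
 * XDIM x YDIM array can be described with MPI_Type_create_subarray instead
 * of the hand-built indexed types above. Assumes the XDIM/YDIM/SUB_* macros
 * from the test; not part of the original program. */
static MPI_Datatype make_patch_type(void)
{
    int sizes[2] = { YDIM, XDIM };              /* full array: rows, then cols */
    int subsizes[2] = { SUB_YDIM, SUB_XDIM };
    int starts[2] = { 0, 0 };                   /* patch anchored at the origin */
    MPI_Datatype patch;

    MPI_Type_create_subarray(2, sizes, subsizes, starts,
                             MPI_ORDER_C, MPI_DOUBLE, &patch);
    MPI_Type_commit(&patch);
    return patch;               /* usable on either side of MPI_Get/MPI_Put */
}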
int main(int argc, char **argv)
{
    int vcount, vstride;
    int32_t counts[2];
    int packsize, i, position, errs = 0;
    double *outbuf, *outbuf2;
    double *vsource;
    MPI_Datatype vtype, stype;
    double t0, t1;
    double tspack, tvpack, tmanual;
    int ntry;
    int blocklengths[2];
    MPI_Aint displacements[2];
    MPI_Datatype typesArray[2];

    MPI_Init(&argc, &argv);

    /* Create a struct consisting of two 32-bit ints, followed by a
     * vector of stride 3 but count 128k (less than a few MB of data area) */
    vcount = 128000;
    vstride = 3;
    MPI_Type_vector(vcount, 1, vstride, MPI_DOUBLE, &vtype);

    vsource = (double *) malloc((vcount + 1) * (vstride + 1) * sizeof(double));
    if (!vsource) {
        fprintf(stderr, "Unable to allocate vsource\n");
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    for (i = 0; i < vcount * vstride; i++) {
        vsource[i] = i;
    }

    blocklengths[0] = 2;
    MPI_Get_address(&counts[0], &displacements[0]);
    blocklengths[1] = 1;
    MPI_Get_address(vsource, &displacements[1]);
    if (verbose) {
        printf("%p = %p?\n", vsource, (void *) displacements[1]);
    }
    typesArray[0] = MPI_INT32_T;
    typesArray[1] = vtype;
    MPI_Type_create_struct(2, blocklengths, displacements, typesArray, &stype);
    MPI_Type_commit(&stype);
    MPI_Type_commit(&vtype);

#if defined(MPICH) && defined(PRINT_DATATYPE_INTERNALS)
    /* To use MPIDU_Datatype_debug to print the datatype internals,
     * you must configure MPICH with --enable-g=log */
    if (verbose) {
        printf("Original struct datatype:\n");
        MPIDU_Datatype_debug(stype, 10);
    }
#endif

    MPI_Pack_size(1, stype, MPI_COMM_WORLD, &packsize);
    outbuf = (double *) malloc(packsize);
    outbuf2 = (double *) malloc(packsize);
    if (!outbuf) {
        fprintf(stderr, "Unable to allocate %ld for outbuf\n", (long) packsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    if (!outbuf2) {
        fprintf(stderr, "Unable to allocate %ld for outbuf2\n", (long) packsize);
        MPI_Abort(MPI_COMM_WORLD, 1);
    }
    position = 0;
    /* Warm up the code and data */
    MPI_Pack(MPI_BOTTOM, 1, stype, outbuf, packsize, &position, MPI_COMM_WORLD);

    tspack = 1e12;
    for (ntry = 0; ntry < 5; ntry++) {
        position = 0;
        t0 = MPI_Wtime();
        MPI_Pack(MPI_BOTTOM, 1, stype, outbuf, packsize, &position, MPI_COMM_WORLD);
        t1 = MPI_Wtime() - t0;
        if (t1 < tspack)
            tspack = t1;
    }
    MPI_Type_free(&stype);

    /* An equivalent packing, using the 2 ints and the vector separately */
    tvpack = 1e12;
    for (ntry = 0; ntry < 5; ntry++) {
        position = 0;
        t0 = MPI_Wtime();
        MPI_Pack(counts, 2, MPI_INT32_T, outbuf, packsize, &position, MPI_COMM_WORLD);
        MPI_Pack(vsource, 1, vtype, outbuf, packsize, &position, MPI_COMM_WORLD);
        t1 = MPI_Wtime() - t0;
        if (t1 < tvpack)
            tvpack = t1;
    }
    MPI_Type_free(&vtype);

    /* Note that we exploit the fact that the vector type contains vblock
     * instances of a contiguous type of size 24, or a single block of
     * 24*vblock bytes.
     */
    tmanual = 1e12;
    for (ntry = 0; ntry < 5; ntry++) {
        const double *restrict ppe = (const double *) vsource;
        double *restrict ppo = outbuf2;
        int j;
        t0 = MPI_Wtime();
        position = 0;
        *(int32_t *) ppo = counts[0];
        *(((int32_t *) ppo) + 1) = counts[1];
        ppo++;
        /* Some hand optimization because this file is not normally
         * compiled with optimization by the test suite */
        j = vcount;
        while (j) {
            *ppo++ = *ppe;
            ppe += vstride;
            *ppo++ = *ppe;
            ppe += vstride;
            *ppo++ = *ppe;
            ppe += vstride;
            *ppo++ = *ppe;
            ppe += vstride;
            j -= 4;
        }
        position += (1 + vcount);
        position *= sizeof(double);
        t1 = MPI_Wtime() - t0;
        if (t1 < tmanual)
            tmanual = t1;

        /* Check on correctness */
#ifdef PACK_IS_NATIVE
        if (memcmp(outbuf, outbuf2, position) != 0) {
            printf("Panic(manual) - pack buffers differ\n");
            for (j = 0; j < 8; j++) {
                printf("%d: %llx\t%llx\n", j,
                       (long long unsigned) outbuf[j],
                       (long long unsigned) outbuf2[j]);
            }
        }
#endif
    }

    if (verbose) {
        printf("Bytes packed = %d\n", position);
        printf("MPI_Pack time = %e (struct), = %e (vector), manual pack time = %e\n",
               tspack, tvpack, tmanual);
    }

    if (4 * tmanual < tspack) {
        errs++;
        printf("MPI_Pack time using struct with vector = %e, manual pack time = %e\n",
               tspack, tmanual);
        printf("MPI_Pack time should be less than 4 times the manual time\n");
        printf("For most informative results, be sure to compile this test with optimization\n");
    }
    if (4 * tmanual < tvpack) {
        errs++;
        printf("MPI_Pack using vector = %e, manual pack time = %e\n", tvpack, tmanual);
        printf("MPI_Pack time should be less than 4 times the manual time\n");
        printf("For most informative results, be sure to compile this test with optimization\n");
    }
    if (4 * tvpack < tspack) {
        errs++;
        printf("MPI_Pack using a vector = %e, using a struct with vector = %e\n",
               tvpack, tspack);
        printf("MPI_Pack time using vector should be about the same as the struct containing the vector\n");
        printf("For most informative results, be sure to compile this test with optimization\n");
    }

    if (errs) {
        printf(" Found %d errors\n", errs);
    } else {
        printf(" No Errors\n");
    }

    free(vsource);
    free(outbuf);
    free(outbuf2);
    MPI_Finalize();
    return 0;
}
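/* Companion sketch: a round-trip correctness check with MPI_Unpack, the
 * counterpart the timing test above does not need. The values and buffer
 * size here are illustrative only. */
#include <mpi.h>
#include <stdint.h>
#include <stdio.h>

int main(int argc, char **argv)
{
    int32_t in[2] = { 42, 7 }, out[2] = { 0, 0 };
    char buf[64];
    int packsize, position = 0;

    MPI_Init(&argc, &argv);

    /* MPI_Pack_size gives an upper bound on the packed size */
    MPI_Pack_size(2, MPI_INT32_T, MPI_COMM_WORLD, &packsize);
    MPI_Pack(in, 2, MPI_INT32_T, buf, sizeof(buf), &position, MPI_COMM_WORLD);

    /* rewind and unpack; the values must round-trip exactly */
    position = 0;
    MPI_Unpack(buf, sizeof(buf), &position, out, 2, MPI_INT32_T, MPI_COMM_WORLD);
    printf("unpacked %d %d (pack-size bound %d)\n", out[0], out[1], packsize);

    MPI_Finalize();
    return 0;
}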