double runBicomTest(int procs, int bufsize, int iters, int rank, int wsize,
                    int procsPerNode, char allocPattern, int useBarrier,
                    int useNearestRank, MPI_Comm *activeComm)
{
  int i, currtarg;
  double start, diff, max;
  double totalops;
  MPI_Status stat;
  char *comBuf;
  char *comBuf2;
  uintptr_t sbuf, rbuf, sbuf2, rbuf2;

  currtarg = getTargetRank(rank, wsize, procsPerNode, useNearestRank);
  diff = 0;

  if ( isActiveProc(rank, wsize, procsPerNode, procs, allocPattern, useNearestRank) )
  {
    comBuf  = (char*)memalign(LINESIZE, bufsize);
    comBuf2 = (char*)memalign(LINESIZE, bufsize);

    /* Allocate the sync flag shared with the target rank.  The sender's
       flag indicates copying is done; the receiver's flag indicates the
       transfer is complete and the next iteration may start. */
    req_array[currtarg].sync = malloc(sizeof(int));
    req_array[currtarg].sync[0] = 0;

    /* The offset drives the synergistic transfer toward the target;
       the sender's own offset is unused. */
    req_array[currtarg].offset = malloc(sizeof(int));
    req_array[currtarg].offset[0] = 0;

    if ( rank < currtarg )
    {
      /* set up sender */
      req_array[currtarg].buffer  = comBuf;
      req_array[currtarg].buffer2 = comBuf2;
    }
    else
    {
      /* set up receiver */
      req_array[currtarg].buffer  = comBuf;
      req_array[currtarg].buffer2 = comBuf2;
    }

    MPI_Barrier(*activeComm);
    MPI_Barrier(*activeComm);

    if ( rank < currtarg )
    {
      /* Sender - time operation loop */
      start = MPI_Wtime();

      volatile int *offsetptr = req_array[currtarg].offset;
      int offset = 0;

      sbuf  = (uintptr_t) comBuf;
      sbuf2 = (uintptr_t) req_array[rank].buffer2;
      rbuf  = (uintptr_t) req_array[rank].buffer;
      rbuf2 = (uintptr_t) comBuf2;

      for ( i = 0; i < iters; i++ )
      {
        /* Claim BLOCKSIZE chunks until the whole message has been copied;
           the length to copy is the minimum of the bytes left and BLOCKSIZE. */
        while (1)
        {
          offset = __sync_fetch_and_add((volatile int *) offsetptr, BLOCKSIZE);
          if ( (int)offset >= bufsize )
            break;

          int left = bufsize - (int)offset;
          memcpy((void*)(rbuf  + (int)offset), (void*)(sbuf  + (int)offset),
                 (left < BLOCKSIZE ? left : BLOCKSIZE));
          memcpy((void*)(rbuf2 + (int)offset), (void*)(sbuf2 + (int)offset),
                 (left < BLOCKSIZE ? left : BLOCKSIZE));
        }

        /* Signal the receiver that copying is done, then wait until the
           receiver releases us for the next iteration. */
        req_array[currtarg].sync[0] = 2;
        while ( !__sync_bool_compare_and_swap(req_array[rank].sync, 1, 0) )
          ;
      }
    }
    else
    {
      /* Receiver - time operation loop */
      start = MPI_Wtime();

      volatile int *offsetptr = req_array[rank].offset;
      int offset = 0;

      /* Receiver is now ready to set up the transfer for the synergistic
         protocol. */
      rbuf  = (uintptr_t) comBuf;
      rbuf2 = (uintptr_t) req_array[rank].buffer2;
      sbuf  = (uintptr_t) req_array[rank].buffer;
      sbuf2 = (uintptr_t) comBuf2;

      for ( i = 0; i < iters; i++ )
      {
        while (1)
        {
          offset = __sync_fetch_and_add((volatile int *) offsetptr, BLOCKSIZE);
          if ( (int)offset >= bufsize )
            break;

          int left = bufsize - (int)offset;
          memcpy((void*)(rbuf  + (int)offset), (void*)(sbuf  + (int)offset),
                 (left < BLOCKSIZE ? left : BLOCKSIZE));
          memcpy((void*)(rbuf2 + (int)offset), (void*)(sbuf2 + (int)offset),
                 (left < BLOCKSIZE ? left : BLOCKSIZE));
        }

        /* Wait for the sender to finish, reset the shared offset, and
           release the sender for the next iteration. */
        while ( !__sync_bool_compare_and_swap(req_array[rank].sync, 2, 0) )
          ;
        *offsetptr = 0;
        req_array[currtarg].sync[0] = 1;
      }
    }

    /* if ( useBarrier )
         MPI_Barrier(*activeComm); */

    diff = MPI_Wtime() - start;

    MPI_Barrier(*activeComm);
    MPI_Barrier(*activeComm);

    free(req_array[currtarg].sync);
    free(req_array[currtarg].offset);
    free(comBuf);
    free(comBuf2);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* Get maximum sample length */
  MPI_Reduce(&diff, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if ( rank == 0 )
  {
    totalops = (double)(iters * procs);

    /*
      Bandwidth is calculated as :

      (# of processes * operations per time sample * message size)
      ------------------------------------------------------------
                maximum sample length of all processes
    */

    if ( max > 0 )
    {
      printf(outputFormat, "Bidirectional", procs, bufsize,
             ((double)totalops * (double)bufsize / max) / 1000000,
             max / iters * 1000000);
    }
    else
    {
      printf("Invalid measurement. Increase number of operation per measurement.\n");
    }
  }

  return max;
}
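/*
  Illustrative sketch (not part of the benchmark): the loops above implement a
  "synergistic" copy in which both the sender and the receiver claim BLOCKSIZE
  chunks of the same message with an atomic fetch-and-add on a shared offset
  and copy their chunks concurrently.  The standalone program below shows the
  same chunk-claiming pattern with two pthreads standing in for the two ranks;
  the names (claim_worker, chunk_arg_t, DEMO_*, demo_synergistic_copy) are made
  up for the example and the sync-flag handshake is omitted.  Compile it
  separately, e.g. with gcc -pthread.
*/
#if 0   /* example only; excluded from the build */
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#define DEMO_BUFSIZE   (1 << 20)
#define DEMO_BLOCKSIZE (64 * 1024)

typedef struct {
  char *src;
  char *dst;
  volatile int *offsetptr;   /* shared chunk cursor */
  int bufsize;
} chunk_arg_t;

static void *claim_worker(void *p)
{
  chunk_arg_t *a = p;

  while (1) {
    /* Atomically claim the next chunk; both workers race on the same cursor,
       so each chunk is copied by exactly one of them. */
    int offset = __sync_fetch_and_add(a->offsetptr, DEMO_BLOCKSIZE);
    if (offset >= a->bufsize)
      break;
    int left = a->bufsize - offset;
    memcpy(a->dst + offset, a->src + offset,
           left < DEMO_BLOCKSIZE ? left : DEMO_BLOCKSIZE);
  }
  return NULL;
}

int demo_synergistic_copy(void)
{
  char *src = malloc(DEMO_BUFSIZE), *dst = malloc(DEMO_BUFSIZE);
  volatile int offset = 0;
  chunk_arg_t a = { src, dst, &offset, DEMO_BUFSIZE };
  pthread_t t;

  memset(src, 0xab, DEMO_BUFSIZE);
  pthread_create(&t, NULL, claim_worker, &a);  /* "receiver" side */
  claim_worker(&a);                            /* "sender" side copies too */
  pthread_join(t, NULL);

  printf("copy %s\n", memcmp(src, dst, DEMO_BUFSIZE) == 0 ? "ok" : "FAILED");
  free(src);
  free(dst);
  return 0;
}
#endif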
int main(int argc, char **argv)
{
  int rank, wsize, iters, i, procs, currtarg, dummy;
  double diff = 0.0;
  double start, max, mintime = 9999;
  MPI_Status stat;
  char comBuf;
  MPI_Comm activeComm;
  char *procFile = NULL;
  int *procList = NULL;
  int procListSize;
  int messStart, messStop, messFactor;
  int procsPerNode, procIdx, useBarrier, printPairs, useNearestRank;
  char allocPattern;

  command = argv[0];

  MPI_Init(&argc, &argv);
  MPI_Comm_size(MPI_COMM_WORLD, &wsize);
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);

  if ( !processArgs(argc, argv, rank, wsize, &iters, &dummy, &messStart,
                    &messStop, &messFactor, &procFile, &procsPerNode,
                    &allocPattern, &printPairs, &useBarrier, &useNearestRank) )
  {
    if ( rank == 0 )
      printUse();

    MPI_Finalize();
    exit(-1);
  }

  if ( !getProcList(procFile, wsize, &procList, &procListSize,
                    procsPerNode, allocPattern) )
  {
    if ( procFile )
      printf("Failed to get process list from file %s.\n", procFile);
    else
      printf("Failed to allocate process list.\n");

    exit(-1);
  }

  if ( rank == 0 )
    printReportHeader();

  currtarg = getTargetRank(rank, wsize, procsPerNode, useNearestRank);

  for ( procIdx = 0; procIdx < procListSize; procIdx++ )
  {
    procs = procList[procIdx];

    if ( printPairs )
      printActivePairs(procs, rank, wsize, procsPerNode, allocPattern, useNearestRank);

    /* Create communicator of all active processes */
    createActiveComm(procs, rank, wsize, procsPerNode, allocPattern,
                     printPairs, useNearestRank, &activeComm);

    if ( isActiveProc(rank, wsize, procsPerNode, procs, allocPattern, useNearestRank) )
    {
      if ( rank < currtarg )
      {
        /* Ensure pair communication has been initialized */
        MPI_Recv(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD, &stat);
        MPI_Send(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD);
      }
      else
      {
        /* Ensure pair communication has been initialized */
        MPI_Send(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD);
        MPI_Recv(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD, &stat);
      }

      MPI_Barrier(activeComm);
      MPI_Barrier(activeComm);

      if ( rank < currtarg )
      {
        /* Time operation loop */
        start = MPI_Wtime();

        for ( i = 0; i < iters; i++ )
        {
          MPI_Send(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD);
          MPI_Recv(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD, &stat);
        }
      }
      else
      {
        /* Time operation loop */
        start = MPI_Wtime();

        for ( i = 0; i < iters; i++ )
        {
          MPI_Recv(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD, &stat);
          MPI_Send(&comBuf, 0, MPI_INT, currtarg, 0, MPI_COMM_WORLD);
        }
      }

      if ( useBarrier )
        MPI_Barrier(activeComm);

      diff = MPI_Wtime() - start;
    }

    if ( activeComm != MPI_COMM_NULL )
      MPI_Comm_free(&activeComm);

    /* Get maximum sample length */
    MPI_Reduce(&diff, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

    if ( rank == 0 )
    {
      if ( max < mintime )
        mintime = max;

      printf(outputFormat, procs, max / iters / 2 * 1000000);
    }
  }

  if ( rank == 0 )
    printParameters(iters, procFile, procsPerNode, allocPattern, useBarrier);

  printReportFooter(mintime, rank, wsize, procsPerNode, useNearestRank);

  MPI_Finalize();

  exit(0);
}
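/*
  Note on the figure reported above: each iteration is one zero-byte
  send/receive round trip, so the value printed per process count is
  max / iters / 2 scaled to microseconds, i.e. the one-way latency.
  For example (illustrative numbers only), 10000 iterations that take
  max = 0.05 s give 0.05 / 10000 / 2 * 1e6 = 2.5 us per one-way message.
*/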
double runUnicomTest(int procs, int bufsize, int iters, int rank, int wsize,
                     int procsPerNode, char allocPattern, int useBarrier,
                     int useNearestRank, MPI_Comm *activeComm)
{
  int i, currtarg;
  double diff;
  double start, maxtime;
  MPI_Status stat;
  char *comBuf;
  double totalops;
  uintptr_t sbuf, rbuf;

  currtarg = getTargetRank(rank, wsize, procsPerNode, useNearestRank);
  diff = 0;
  maxtime = 0;

  int count = 0;

  if ( isActiveProc(rank, wsize, procsPerNode, procs, allocPattern, useNearestRank) )
  {
    if (0)
    {
      /* Disabled variant: keep the sync flag at the end of the data buffer. */
      comBuf = memalign(LINESIZE, bufsize + sizeof(int));
      req_array[currtarg].sync = (int *)(comBuf + bufsize);
      req_array[currtarg].sync[0] = 0;
    }
    else
    {
      comBuf = (char*)memalign(LINESIZE, bufsize);

      /* Allocate the sync flag shared with the target rank.  The sender's
         flag indicates copying is done; the receiver's flag indicates the
         transfer is complete and the next iteration may start. */
      req_array[currtarg].sync = malloc(sizeof(int));
      req_array[currtarg].sync[0] = 0;

      /* The offset drives the synergistic transfer toward the target;
         the sender's own offset is unused. */
      req_array[currtarg].offset = malloc(sizeof(int));
      req_array[currtarg].offset[0] = 0;

      if ( rank < currtarg )
      {
        /* set up sender */
        req_array[currtarg].buffer = comBuf;
      }
      else
      {
        /* set up receiver */
        req_array[currtarg].buffer = comBuf;
      }
    }

    MPI_Barrier(*activeComm);
    MPI_Barrier(*activeComm);

    #pragma omp parallel
    {
      #pragma omp master
      {
        int k = omp_get_num_threads();
        printf("Number of Threads requested = %i\n", k);
      }
    }

    if ( rank < currtarg )
    {
      /* Sender - time operation loop */
      start = MPI_Wtime();

      volatile int *offsetptr = req_array[currtarg].offset;
      int offset = 0;

      sbuf = (uintptr_t) comBuf;
      rbuf = (uintptr_t) req_array[rank].buffer;

      pthrequest *req = malloc(sizeof(pthrequest));
      req->sbuf = sbuf;
      req->rbuf = rbuf;
      req->offsetptr = offsetptr;
      req->bufsize = bufsize;
      req->s_sync = req_array[rank].sync;
      req->r_sync = req_array[currtarg].sync;

      for ( i = 0; i < iters; i++ )
      {
        //do_strasnfer(req);
      }
    }
    else
    {
      /* Receiver code */
      start = MPI_Wtime();

      volatile int *offsetptr = req_array[rank].offset;
      int offset = 0;

      /* Receiver is now ready to set up the transfer for the synergistic
         protocol. */
      rbuf = (uintptr_t) comBuf;
      sbuf = (uintptr_t) req_array[rank].buffer;

      pthrequest *req = malloc(sizeof(pthrequest));
      req->sbuf = sbuf;
      req->rbuf = rbuf;
      req->offsetptr = offsetptr;
      req->bufsize = bufsize;
      req->s_sync = req_array[currtarg].sync;
      req->r_sync = req_array[rank].sync;

      for ( i = 0; i < iters; i++ )
      {
        //do_rtrasnfer(req);
      }
    }

    /* if ( useBarrier )
         MPI_Barrier(*activeComm); */

    diff = MPI_Wtime() - start;

    MPI_Barrier(*activeComm);
    MPI_Barrier(*activeComm);

    free(req_array[currtarg].sync);
    free(req_array[currtarg].offset);
    free(comBuf);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* Get maximum sample length */
  MPI_Reduce(&diff, &maxtime, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  int count_sum = 0;
  MPI_Reduce(&count, &count_sum, 1, MPI_INT, MPI_SUM, 0, MPI_COMM_WORLD);

  if ( rank == 0 )
  {
    totalops = iters * (procs / 2);

    /*
      Bandwidth is calculated as :

      ((# of processes/2) * operations per time sample * message size)
      -----------------------------------------------------------------
                maximum sample length of all processes
    */

    if ( maxtime > 0 )
    {
      printf(outputFormat, "Unidirectional", procs, count_sum, bufsize,
             ((double)totalops * (double)bufsize / maxtime) / 1000000,
             maxtime / iters / 2 * 1000000);
    }
    else
    {
      printf("Invalid measurement. Increase number of operation per measurement.\n");
    }
  }

  return maxtime;
}
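/*
  Worked example for the bandwidth expression above (illustrative numbers
  only): with procs = 2 (one sender/receiver pair), iters = 1000,
  bufsize = 1048576 bytes and maxtime = 0.5 s, totalops = 1000 * (2/2) = 1000,
  so the reported rate is (1000 * 1048576 / 0.5) / 1e6 ≈ 2097 MB/s and the
  per-iteration figure maxtime / iters / 2 is 250 us.
*/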
double runLatencyTest(int bufsize, int iters, MPI_Comm *activeComm)
{
  int i, currtarg;
  double start, diff;
  MPI_Status stat;
  char *comBuf = NULL;

  if ( bufsize > 0 )
  {
    comBuf = (char *) malloc(bufsize);
    if ( comBuf == NULL )
      prestaAbort("Failed to allocate latency buffer.\n");
  }

  currtarg = getTargetRank(rank, argStruct.procsPerNode, argStruct.useNearestRank);
  diff = 0.0;

  if ( isActiveProc(activeComm) )
  {
    for ( i = 0; i < 1000; i++ )
    {
      if ( rank < currtarg )
      {
        /* Ensure pair communication has been initialized */
        MPI_Send(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD);
        MPI_Recv(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, &stat);
      }
      else
      {
        /* Ensure pair communication has been initialized */
        MPI_Recv(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, &stat);
        MPI_Send(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD);
      }
    }

    generic_barrier(*activeComm);
    generic_barrier(*activeComm);

    if ( rank < currtarg )
    {
      /* Time operation loop */
      start = MPI_Wtime();

      for ( i = 0; i < iters; i++ )
      {
        MPI_Send(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD);
        MPI_Recv(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, &stat);
      }
    }
    else
    {
      /* Time operation loop */
      start = MPI_Wtime();

      for ( i = 0; i < iters; i++ )
      {
        MPI_Recv(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, &stat);
        MPI_Send(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD);
      }
    }

    if ( argStruct.useBarrier )
      generic_barrier(*activeComm);

    diff = MPI_Wtime() - start;
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if ( comBuf != NULL )
    free(comBuf);

  return diff;
}
double runNonblockBicomTest(int bufsize, int iters, MPI_Comm *activeComm)
{
  int i, currtarg;
  double diff = 0.0;
  double start;
  MPI_Status stat;
  char *sendBuf, *recvBuf;
  MPI_Request *sendRequests, *recvRequests;
  MPI_Status *sendStatuses, *recvStatuses;

  currtarg = getTargetRank(rank, argStruct.procsPerNode, argStruct.useNearestRank);

  sendBuf = (char *) malloc(bufsize);
  recvBuf = (char *) malloc(bufsize);

  sendRequests = malloc(sizeof(MPI_Request) * argStruct.iters);
  recvRequests = malloc(sizeof(MPI_Request) * argStruct.iters);
  sendStatuses = malloc(sizeof(MPI_Status) * argStruct.iters);
  recvStatuses = malloc(sizeof(MPI_Status) * argStruct.iters);

  if ( sendBuf == NULL || recvBuf == NULL || sendRequests == NULL ||
       recvRequests == NULL || sendStatuses == NULL || recvStatuses == NULL )
    return 0;

  memset(sendBuf, 0, bufsize);
  memset(recvBuf, 0, bufsize);

  if ( isActiveProc(activeComm) )
  {
    /* Ensure communication paths have been initialized */
    MPI_Irecv(recvBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, recvRequests);
    MPI_Isend(sendBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, sendRequests);
    MPI_Wait(recvRequests, recvStatuses);
    MPI_Wait(sendRequests, sendStatuses);

    generic_barrier(*activeComm);
    generic_barrier(*activeComm);

    /* Time operation loop */
    start = MPI_Wtime();

#ifdef FINAL_WAITALL

    /* Post all receives and sends for the sample up front, then complete
       them with two MPI_Waitall calls. */
    for ( i = 0; i < iters; i++ )
    {
      MPI_Irecv(recvBuf, bufsize, MPI_BYTE, currtarg, MPI_ANY_TAG,
                MPI_COMM_WORLD, &recvRequests[i]);
    }

    for ( i = 0; i < iters; i++ )
    {
      MPI_Isend(sendBuf, bufsize, MPI_BYTE, currtarg, i,
                MPI_COMM_WORLD, &sendRequests[i]);
    }

    MPI_Waitall(argStruct.iters, sendRequests, sendStatuses);
    MPI_Waitall(argStruct.iters, recvRequests, recvStatuses);

#else

    /* Default variant: complete each send before the next iteration. */
    for ( i = 0; i < iters; i++ )
    {
      MPI_Isend(sendBuf, bufsize, MPI_BYTE, currtarg, i,
                MPI_COMM_WORLD, &sendRequests[0]);
      MPI_Recv(recvBuf, bufsize, MPI_BYTE, currtarg, MPI_ANY_TAG,
               MPI_COMM_WORLD, &stat);
      MPI_Wait(sendRequests, sendStatuses);
    }

#endif

    if ( argStruct.useBarrier )
      generic_barrier(*activeComm);

    diff = MPI_Wtime() - start;
  }

  free(sendBuf);
  free(recvBuf);
  free(sendRequests);
  free(recvRequests);
  free(sendStatuses);
  free(recvStatuses);

  MPI_Barrier(MPI_COMM_WORLD);

  if ( diff > 0 && argStruct.sumLocalBW == 1 )
    return ((double) bufsize * (double) iters) / diff;
  else
    return diff;
}
double runBicomTest(int bufsize, int iters, MPI_Comm *activeComm)
{
  int i, currtarg;
  double start, diff;
  char *sendbuf, *recvbuf, *validate_buf;
  MPI_Status stat;
  long long err_count = 0;

  currtarg = getTargetRank(rank, argStruct.procsPerNode, argStruct.useNearestRank);
  diff = 0.0;

  if ( currtarg != -1 && isActiveProc(activeComm) )
  {
    sendbuf = (char *) malloc(bufsize);
    recvbuf = (char *) malloc(bufsize);

    memset(sendbuf, 0, bufsize);
    memset(recvbuf, 0, bufsize);

    /* Ensure communication paths have been initialized */
    MPI_Sendrecv(sendbuf, bufsize, MPI_BYTE, currtarg, 0,
                 recvbuf, bufsize, MPI_BYTE, currtarg, 0,
                 MPI_COMM_WORLD, &stat);

    generic_barrier(*activeComm);
    generic_barrier(*activeComm);

    /* Time operation loop */
    start = MPI_Wtime();

    if ( presta_check_data == 1 )
      validate_buf = malloc(bufsize);

    for ( i = 0; i < iters; i++ )
    {
      if ( presta_check_data == 1 )
      {
        set_data_values(bufsize, sendbuf);
        memcpy(validate_buf, sendbuf, bufsize);
      }

      MPI_Sendrecv(sendbuf, bufsize, MPI_BYTE, currtarg, 0,
                   recvbuf, bufsize, MPI_BYTE, currtarg, 0,
                   MPI_COMM_WORLD, &stat);

      if ( presta_check_data == 1 )
      {
        err_count = check_data_values(bufsize, recvbuf, validate_buf,
                                      MPI_BYTE, PRESTA_OP_P2P);
        if ( err_count > 0 )
        {
          prestaWarn("Bidirectional receive data check failed with %d errors\n",
                     err_count);
          presta_data_err_total += err_count;
        }
      }
    }

    if ( presta_check_data == 1 )
      free(validate_buf);

    if ( argStruct.useBarrier )
      generic_barrier(*activeComm);

    diff = MPI_Wtime() - start;

    free(sendbuf);
    free(recvbuf);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if ( diff > 0 && argStruct.sumLocalBW == 1 )
    return ((double) bufsize * (double) iters) / diff;
  else
    return diff;
}
double runUnicomTest(int bufsize, int iters, MPI_Comm *activeComm)
{
  int i, currtarg;
  double diff = 0.0;
  double start;
  MPI_Status stat;
  char *comBuf;

  currtarg = getTargetRank(rank, argStruct.procsPerNode, argStruct.useNearestRank);
  diff = 0;

  if ( isActiveProc(activeComm) )
  {
    comBuf = (char *) malloc(bufsize);
    memset(comBuf, 0, bufsize);

    /* Ensure communication paths have been initialized */
    if ( rank < currtarg )
      MPI_Send(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD);
    else
      MPI_Recv(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, &stat);

    generic_barrier(*activeComm);
    generic_barrier(*activeComm);

    if ( rank < currtarg )
    {
      /* Time operation loop */
      start = MPI_Wtime();

      for ( i = 0; i < iters; i++ )
      {
        if ( presta_check_data == 1 )
          set_data_values(bufsize, comBuf);

        MPI_Send(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD);
      }
    }
    else
    {
      void *validate_buf = NULL;
      int err_count = 0;

      if ( presta_check_data == 1 )
        validate_buf = malloc(bufsize);

      start = MPI_Wtime();

      for ( i = 0; i < iters; i++ )
      {
        if ( presta_check_data == 1 )
          set_data_values(bufsize, validate_buf);

        MPI_Recv(comBuf, bufsize, MPI_BYTE, currtarg, 0, MPI_COMM_WORLD, &stat);

        if ( presta_check_data == 1 )
        {
          err_count = check_data_values(bufsize, comBuf, validate_buf,
                                        MPI_BYTE, PRESTA_OP_P2P);
          if ( err_count > 0 )
          {
            prestaWarn("Unidirectional receive data check failed with %d errors\n",
                       err_count);
            presta_data_err_total += err_count;
          }
        }
      }

      if ( presta_check_data == 1 )
        free(validate_buf);
    }

    if ( argStruct.useBarrier )
      generic_barrier(*activeComm);

    diff = MPI_Wtime() - start;

    free(comBuf);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  if ( diff > 0 && argStruct.sumLocalBW == 1 )
    return ((double) bufsize * (double) iters) / (diff * 2);
  else
    return diff;
}
double runBicomTest(int procs, int bufsize, int iters, int rank, int wsize,
                    int procsPerNode, char allocPattern, int useBarrier,
                    int useNearestRank, MPI_Comm *activeComm)
{
  int i, currtarg;
  double start, diff, max;
  char *sendbuf, *recvbuf;
  double totalops;
  MPI_Status stat;

  currtarg = getTargetRank(rank, wsize, procsPerNode, useNearestRank);
  diff = 0;

  if ( isActiveProc(rank, wsize, procsPerNode, procs, allocPattern, useNearestRank) )
  {
    sendbuf = (char*)malloc(bufsize);
    recvbuf = (char*)malloc(bufsize);

    memset(sendbuf, 0, bufsize);
    memset(recvbuf, 0, bufsize);

    /* Ensure communication paths have been initialized */
    MPI_Sendrecv(sendbuf, bufsize, MPI_CHAR, currtarg, 0,
                 recvbuf, bufsize, MPI_CHAR, currtarg, 0,
                 MPI_COMM_WORLD, &stat);

    MPI_Barrier(*activeComm);
    MPI_Barrier(*activeComm);

    /* Time operation loop */
    start = MPI_Wtime();

    for ( i = 0; i < iters; i++ )
    {
      MPI_Sendrecv(sendbuf, bufsize, MPI_CHAR, currtarg, 0,
                   recvbuf, bufsize, MPI_CHAR, currtarg, 0,
                   MPI_COMM_WORLD, &stat);
    }

    if ( useBarrier )
      MPI_Barrier(*activeComm);

    diff = MPI_Wtime() - start;

    free(sendbuf);
    free(recvbuf);
  }

  MPI_Barrier(MPI_COMM_WORLD);

  /* Get maximum sample length */
  MPI_Reduce(&diff, &max, 1, MPI_DOUBLE, MPI_MAX, 0, MPI_COMM_WORLD);

  if ( rank == 0 )
  {
    totalops = (double)(iters * procs);

    /*
      Bandwidth is calculated as :

      (# of processes * operations per time sample * message size)
      ------------------------------------------------------------
                maximum sample length of all processes
    */

    if ( max > 0 )
    {
      printf(outputFormat, "Bidirectional", procs, bufsize,
             ((double)totalops * (double)bufsize / max) / 1000000,
             max / iters * 1000000);
    }
    else
    {
      printf("Invalid measurement. Increase number of operation per measurement.\n");
    }
  }

  return max;
}
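/*
  Worked example for the bidirectional rate above (illustrative numbers only):
  with procs = 2, iters = 1000, bufsize = 1048576 bytes and max = 0.5 s,
  totalops = 1000 * 2 = 2000, since every active rank both sends and receives
  each iteration.  The reported rate is therefore (2000 * 1048576 / 0.5) / 1e6
  ≈ 4194 MB/s and the per-iteration time is max / iters * 1e6 = 500 us,
  compared with the unidirectional accounting that uses procs/2 pairs and
  halves the per-iteration time.
*/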