// Start a nonblocking broadcast of x (rooted at `root`) and hand back the
// request; the caller must complete it before reading or modifying x again.
static request ibroadcast(const communicator& comm, T& x, int root)
{
    request r;
    MPI_Ibcast(Datatype::address(x), Datatype::count(x), Datatype::datatype(),
               root, comm, &r.r);
    return r;
}
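/* Hedged sketch, not part of the wrapper above: the plain-C calls such a
 * wrapper reduces to, assuming Datatype::address/count/datatype resolve to
 * the buffer pointer, element count and MPI datatype of x, and request::r is
 * the underlying MPI_Request.  The helper name ibroadcast_ints and the int
 * buffer are illustrative only. */
#include <mpi.h>

static void ibroadcast_ints(MPI_Comm comm, int *x, int count, int root)
{
    MPI_Request r;
    MPI_Ibcast(x, count, MPI_INT, root, comm, &r);   /* start the broadcast */
    /* ... unrelated work may overlap with the broadcast here ... */
    MPI_Wait(&r, MPI_STATUS_IGNORE);                 /* complete it */
}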
int main(int argc, char **argv)
{
    int errs = 0;
    int i;
    int rank, size;
    int *sbuf = NULL;
    int *rbuf = NULL;
    int *scounts = NULL;
    int *rcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *types = NULL; /* MPI_Ialltoallw expects an MPI_Datatype array */
    MPI_Comm comm;
    MPI_Request req;

    /* intentionally not using MTest_Init/MTest_Finalize in order to make it
     * easy to take this test and use it as an NBC sanity test outside of the
     * MPICH test suite */
    MPI_Init(&argc, &argv);
    comm = MPI_COMM_WORLD;
    MPI_Comm_size(comm, &size);
    MPI_Comm_rank(comm, &rank);

    /* enough space for every process to contribute at least NUM_INTS ints to any
     * collective operation */
    sbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(sbuf);
    rbuf = malloc(NUM_INTS * size * sizeof(int));
    my_assert(rbuf);
    scounts = malloc(size * sizeof(int));
    my_assert(scounts);
    rcounts = malloc(size * sizeof(int));
    my_assert(rcounts);
    sdispls = malloc(size * sizeof(int));
    my_assert(sdispls);
    rdispls = malloc(size * sizeof(int));
    my_assert(rdispls);
    types = malloc(size * sizeof(MPI_Datatype));
    my_assert(types);

    /* the 2*i indexing below assumes NUM_INTS == 2 */
    for (i = 0; i < size; ++i) {
        sbuf[2 * i] = i;
        sbuf[2 * i + 1] = i;
        rbuf[2 * i] = i;
        rbuf[2 * i + 1] = i;
        scounts[i] = NUM_INTS;
        rcounts[i] = NUM_INTS;
        sdispls[i] = i * NUM_INTS;
        rdispls[i] = i * NUM_INTS;
        types[i] = MPI_INT;
    }

    MPI_Ibarrier(comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ibcast(sbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Igather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Igather(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    else
        MPI_Igather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Igatherv(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    else
        MPI_Igatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, 0, comm, &req);
    else
        MPI_Iscatter(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, 0, comm, &req);
    else
        MPI_Iscatterv(sbuf, scounts, sdispls, MPI_INT, rbuf, NUM_INTS, MPI_INT, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgather(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgather(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgatherv(sbuf, NUM_INTS, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallgatherv(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoall(sbuf, NUM_INTS, MPI_INT, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoall(MPI_IN_PLACE, -1, MPI_DATATYPE_NULL, rbuf, NUM_INTS, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallv(sbuf, scounts, sdispls, MPI_INT, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallv(MPI_IN_PLACE, NULL, NULL, MPI_DATATYPE_NULL, rbuf, rcounts, rdispls, MPI_INT, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallw(sbuf, scounts, sdispls, types, rbuf, rcounts, rdispls, types, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ialltoallw(MPI_IN_PLACE, NULL, NULL, NULL, rbuf, rcounts, rdispls, types, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (0 == rank)
        MPI_Ireduce(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req);
    else
        MPI_Ireduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, 0, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallreduce(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iallreduce(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter(sbuf, rbuf, rcounts, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter(MPI_IN_PLACE, rbuf, rcounts, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter_block(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Ireduce_scatter_block(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscan(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iscan(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iexscan(sbuf, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    MPI_Iexscan(MPI_IN_PLACE, rbuf, NUM_INTS, MPI_INT, MPI_SUM, comm, &req);
    MPI_Wait(&req, MPI_STATUS_IGNORE);

    if (sbuf) free(sbuf);
    if (rbuf) free(rbuf);
    if (scounts) free(scounts);
    if (rcounts) free(rcounts);
    if (sdispls) free(sdispls);
    if (rdispls) free(rdispls);
    if (types) free(types);

    if (rank == 0) {
        if (errs)
            fprintf(stderr, "Found %d errors\n", errs);
        else
            printf(" No errors\n");
    }

    MPI_Finalize();
    return 0;
}
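/* Hedged addition, not part of the test above: the test completes each
 * nonblocking collective with its own MPI_Wait.  MPI also allows several
 * nonblocking collectives to be outstanding on the same communicator at
 * once, provided every rank starts them in the same order; completion can
 * then be aggregated with MPI_Waitall.  Minimal sketch (the helper name and
 * buffers are illustrative only). */
#include <mpi.h>

static void overlap_three_collectives(MPI_Comm comm)
{
    int bval = 0;               /* broadcast buffer */
    int sval = 1, rval = 0;     /* allreduce send/recv buffers */
    MPI_Request reqs[3];

    /* all three operations are in flight at the same time */
    MPI_Ibarrier(comm, &reqs[0]);
    MPI_Ibcast(&bval, 1, MPI_INT, 0, comm, &reqs[1]);
    MPI_Iallreduce(&sval, &rval, 1, MPI_INT, MPI_SUM, comm, &reqs[2]);

    /* complete all of them together */
    MPI_Waitall(3, reqs, MPI_STATUSES_IGNORE);
}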
int main(int argc, char **argv)
{
    MPI_Comm cartComm;    //Cartesian communicator
    int tid;              //Rank of this process (ranks are called "threads" throughout)
    int nthreads;         //Number of processes (ranks)
    double time_initial;  //Start time
    double time_end;      //End time
    int n;                //N is the size of the matrix
    //Wrap around
    int wrapAround=1;
#if defined(USE_BROADCAST_ASYNC)
    //If asynchronous broadcast is enabled, keep a request so we can test when the
    //data can safely be modified again after being sent
    MPI_Request bcastRequest;
#endif

    //Initialize the MPI environment
    if(MPI_Init(NULL,NULL)!=MPI_SUCCESS) {
        //cerr<<"ERROR"<<endl;
    }
    //Get the number of processes
    if(MPI_Comm_size(MPI_COMM_WORLD, &nthreads)!=MPI_SUCCESS) {
        //cerr<<"ERROR"<<endl;
    }
    //Create a one-dimensional Cartesian grouping that IS ring-shaped (wrapAround=1),
    //so pivot data can be passed along the ring; forwarding stops before the message
    //would reach its original owner again
    if(MPI_Cart_create(MPI_COMM_WORLD,1,&nthreads,&wrapAround,1,&cartComm)!=MPI_SUCCESS) {
        //cerr<<"ERROR"<<endl;
    }
    //Get the number of processes in the Cartesian communicator
    if(MPI_Comm_size(cartComm, &nthreads)!=MPI_SUCCESS) {
        //cerr<<"ERROR"<<endl;
    }
    //Get this process's rank
    if(MPI_Comm_rank(cartComm, &tid)!=MPI_SUCCESS) {
        //cerr<<"ERROR"<<endl;
    }

    int destinationN;   //Next rank in the ring
    int destinationP;   //Previous rank in the ring
    MPI_Cart_shift(cartComm,0,1,&destinationP,&destinationN);

    //Set the size of the matrix
    n=kappa*nthreads;
    //Initialize rand
    srand(time(NULL)+tid);

    //Create the matrix and split it among the processes
    double ** matrPart;   //Holds this rank's part of the matrix
    int partSize;
    createMatrix(cartComm,tid,nthreads,n,&matrPart,&partSize);

#ifdef __DEBUG__MODE_EX1__
    if(tid==0) {
        ////cout<<"------------------------------------------"<<endl;
    }
    printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
    if(tid==0) {
        //cout<<"------------------------------------------"<<endl;
    }
#endif

    //Create a cache for optimization
    //This ensures that there is no difference due to how expensive the functions
    //used to find the right column or processor are
#ifdef __QuestionExtra__
    int colnum = n*2;
#else
    int colnum = n+1;
#endif
    int* thrForCol=malloc(sizeof(int)*colnum); //Tells us which rank each column belongs to
    for (int i=0;i<colnum;++i) {
        thrForCol[i]=threadForCollumn(nthreads,n,i);
    }
    bool* colValidForThr=malloc(sizeof(bool)*colnum); //Tells us if the selected column belongs to the current rank
    for (int i=0;i<colnum;++i) {
        colValidForThr[i]=(thrForCol[i]==tid);
    }
    int* glColToPartCol=malloc(sizeof(int)*colnum); //Holds the local (part) column for the given global column (-1 if not owned)
    for (int i=0;i<colnum;++i) {
        if(colValidForThr[i]) {
            glColToPartCol[i]=globColToPartCol(tid,nthreads,n,i);
        } else {
            glColToPartCol[i]=-1;
        }
    }
    int* ptColToGlobCol=malloc(sizeof(int)*partSize); //Holds the global column for the given local (part) column
    for (int i=0;i<partSize;++i) {
        ptColToGlobCol[i]=partColToGlobCol(tid,nthreads,n,i);
    }
    //If this is computing the inverse matrix
#ifdef __QuestionExtra__
    bool* inInverseMatrix=malloc(sizeof(bool)*partSize); //True if in the inverse matrix
    for (int i=0;i<partSize;++i) {
        inInverseMatrix[i]=(ptColToGlobCol[i]>=n);
    }
#endif

    //Set the active pivot index k to 0
    int k=0;
    int kapOwner;
    if(tid==0) {
        //Get the start time
        time_initial = MPI_Wtime();
    }

    //Start solving
    while(k<n) {
        kapOwner=thrForCol[k];
        //If this is the owner of kappa
        if(tid==kapOwner) {
            //Get the local column that holds the pivot
            int curCol=glColToPartCol[k];
            //For row k, divide it so the pivot becomes 1, and tell the other ranks what it was divided by
            //First send the pivot value [k,k] to the other ranks
            //(the data sent is the number to divide with; the receivers already know the current k and the sender)
#ifndef __SingleProc__
#ifdef USE_BROADCAST
            MPI_Bcast(&(matrPart[curCol][k]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
            MPI_Ibcast(&(matrPart[curCol][k]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
            MPI_Send(&(matrPart[curCol][k]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
            //Then divide with that number
            for(int jj=curCol+1;jj<partSize;++jj) {
                matrPart[jj][k]=matrPart[jj][k]/matrPart[curCol][k];
            }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
            //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
            MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
            matrPart[curCol][k]=1; //No need to do a real division for the first element

            //Then for all rows, subtract and send what we are multiplying to subtract to the other ranks
            for(int i=k+1;i<n;++i) {
                //First send
#ifndef __SingleProc__
#ifdef USE_BROADCAST
                MPI_Bcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
                MPI_Send(&(matrPart[curCol][i]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
                //For all local (part) columns, check to see if we can subtract anything
                //(their global column must be greater than k and the current column)
                for(int jj=curCol+1;jj<partSize;++jj) {
                    matrPart[jj][i]=matrPart[jj][i]-matrPart[jj][k]*matrPart[curCol][i];
                }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
                //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
                //Then subtract
                matrPart[curCol][i]=0; //No need to do a real subtraction for the first element
            }
        }
        //Else, if this is not the owner of kappa
        else {
            //Used for optimisation
            bool isValid=false;
            bool isValidArr[partSize];
            for(int j=0;j<partSize;++j) {
                if(ptColToGlobCol[j]>k) {
                    isValid=true;
                    isValidArr[j]=true;
                } else {
                    isValidArr[j]=false;
                }
            }
            //First receive the number row k must be divided by and forward it to the next rank
            //(unless the next rank is the sender)
            double recD;
#ifdef USE_BROADCAST
            MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
            MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
            MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
            MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
            if(destinationN!=kapOwner) {
                MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
            }
#endif
            //Then divide row k if necessary
            if(isValid) {
                for(int j=0;j<partSize;++j) {
                    if(isValidArr[j]) {
                        matrPart[j][k]=matrPart[j][k]/recD;
                    }
                }
            }
            //Then for all rows below row k, receive what we need to multiply the subtraction with
            //and do that if necessary
            for(int i=k+1;i<n;++i) {
#ifdef USE_BROADCAST
                MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
                MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
                if(destinationN!=kapOwner) {
                    MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
                }
#endif
                if(isValid) {
                    for(int j=0;j<partSize;++j) {
                        if(isValidArr[j]) {
                            matrPart[j][i]=matrPart[j][i]-recD*matrPart[j][k];
                        }
                    }
                }
            }
        }
        //Finally, increment k
        ++k;
#ifdef __DEBUG__MODE_EX1__
        printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
        if(tid==0) {
            //cout<<"------------------------------------------"<<endl;
        }
#endif
    }

    k=n-1;
#ifdef __QuestionExtra__
    //If this is computing the inverse matrix
    while(k>0) {
        kapOwner=thrForCol[k];
        //If this is the owner of kappa
        if(tid==kapOwner) {
            //Get the local column that holds the pivot
            int curCol=glColToPartCol[k];
            for(int i=k-1;i>=0;--i) {
#ifndef __SingleProc__
#ifdef USE_BROADCAST
                MPI_Bcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
                MPI_Send(&(matrPart[curCol][i]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
                for(int j=curCol+1;j<partSize;++j) {
                    //If this is in the inverse matrix
                    if(inInverseMatrix[j]) {
                        matrPart[j][i]=matrPart[j][i]-matrPart[j][k]*matrPart[curCol][i];
                    }
                }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
                //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
                matrPart[curCol][i]=0; //No need to do a real subtraction.
            }
        }
        //Else, if this is not the owner of kappa
        else {
            //For all rows above row k, receive what we need to multiply the subtraction with
            //and do that if necessary
            double recD;
            for(int i=k-1;i>=0;--i) {
#ifdef USE_BROADCAST
                MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
                MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
                if(destinationN!=kapOwner) {
                    //Pass it along to the next rank
                    MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
                }
#endif
                //For all local columns
                for(int j=0;j<partSize;++j) {
                    //If this is in the inverse matrix
                    if(inInverseMatrix[j]) {
                        matrPart[j][i]=matrPart[j][i]-recD*matrPart[j][k];
                    }
                }
            }
        }
        //Finally, decrement kappa
        --k;
#ifdef __DEBUG__MODE_EX1__
        printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
        if(tid==0) {
            //cout<<"------------------------------------------"<<endl;
        }
#endif
    }
#else
    //If this is not computing the inverse matrix but doing elimination
    while(k>0) {
        //Used for optimisation
        int endCol;
        bool isValid=colValidForThr[n];
        if(isValid) {
            endCol=glColToPartCol[n];
        }
        kapOwner=thrForCol[k];
        //If this is the owner of kappa
        if(tid==kapOwner) {
            //Get the local column that holds the pivot
            int curCol=glColToPartCol[k];
            for(int i=k-1;i>=0;--i) {
#ifndef __SingleProc__
#ifdef USE_BROADCAST
                MPI_Bcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&(matrPart[curCol][i]),1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
#else //if not defined USE_BROADCAST
                MPI_Send(&(matrPart[curCol][i]),1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
#endif
#endif
                if(isValid) {
                    matrPart[endCol][i]=matrPart[endCol][i]-matrPart[endCol][k]*matrPart[curCol][i];
                }
#if !defined(__SingleProc__) && defined(USE_BROADCAST_ASYNC)
                //Wait for the buffer to be read if sending asynchronously, to avoid race conditions
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#endif
                matrPart[curCol][i]=0; //No need to do a real subtraction.
            }
        }
        //Else, if this is not the owner of kappa
        else {
            //For all rows above row k, receive what we need to multiply the subtraction with
            //and do that if necessary
            double recD;
            for(int i=k-1;i>=0;--i) {
#ifdef USE_BROADCAST
                MPI_Bcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm);
#elif defined(USE_BROADCAST_ASYNC)
                MPI_Ibcast(&recD,1,MPI_DOUBLE,kapOwner,cartComm, &bcastRequest);
                MPI_Wait(&bcastRequest, MPI_STATUS_IGNORE);
#else //if not defined USE_BROADCAST
                MPI_Recv(&recD,1,MPI_DOUBLE,destinationP,MPI_ANY_TAG,cartComm,MPI_STATUS_IGNORE);
                if(destinationN!=kapOwner) {
                    MPI_Send(&recD,1,MPI_DOUBLE,destinationN,COL_TAG,cartComm);
                }
#endif
                if(isValid) {
                    matrPart[endCol][i]=matrPart[endCol][i]-recD*matrPart[endCol][k];
                }
            }
        }
        //Finally, decrement kappa
        --k;
#ifdef __DEBUG__MODE_EX1__
        printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
        if(tid==0) {
            //cout<<"------------------------------------------"<<endl;
        }
#endif
    }
#endif

    if(tid==0) {
        //Get the end time
        time_end = MPI_Wtime();
    }

#ifdef __DEBUG__MODE_EX1__
    //Print the solution
    printMatrix(cartComm,tid,nthreads,n,&matrPart,partSize);
#endif

    if(tid==0) {
#ifdef __DEBUG__MODE_EX1__
        //Write some info
        //cout<<"Solved in "<<(time_end-time_initial)<<" seconds in "<<nthreads<<" threads using configuration ";
#ifdef __Question1__
        //cout<<"1:\"serial\""<<endl;
#endif
#ifdef __Question2__
        //cout<<"2:\"shuffle\""<<endl;
#endif
#else
        /*if(isnan(matrPart[0][0])) {
            //cout<<"INVALID MATRIX: NAN"<<endl;
        } else {*/
        printf("%.20f\n",(time_end-time_initial));
        ////cout<<fixed<<setprecision(20)<<(time_end-time_initial)<<endl;
        //}
#endif
    }

    //Delete data
    for(int j=0;j<partSize;++j) {
        free(matrPart[j]);
    }
    free(matrPart);
    //Delete cache
    free(thrForCol);
    free(colValidForThr);
    free(glColToPartCol);
    free(ptColToGlobCol);
#ifdef __QuestionExtra__
    free(inInverseMatrix);
#endif

    //Finalize the MPI environment
    if(MPI_Finalize()!=MPI_SUCCESS) {
        ////cerr<<tid<<" ERROR"<<endl;
    }
    //Exit
    return EXIT_SUCCESS;
}
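/* Hedged sketch, not part of the program above: the ring-forwarding pattern
 * used when neither USE_BROADCAST nor USE_BROADCAST_ASYNC is defined, shown
 * in isolation.  The owner injects one double into the Cartesian ring; every
 * other rank receives it from its predecessor and passes it on, unless its
 * successor is the owner (which already has the value).  The function name
 * ring_propagate and its parameters are illustrative only; it assumes more
 * than one rank, as the program does outside of __SingleProc__ builds. */
#include <mpi.h>

static double ring_propagate(MPI_Comm cart, int owner, int prev, int next,
                             int tag, double value, int rank)
{
    if (rank == owner) {
        /* originate the value */
        MPI_Send(&value, 1, MPI_DOUBLE, next, tag, cart);
    } else {
        /* receive from the previous rank in the ring ... */
        MPI_Recv(&value, 1, MPI_DOUBLE, prev, MPI_ANY_TAG, cart,
                 MPI_STATUS_IGNORE);
        /* ... and forward it, unless the next rank is the originator */
        if (next != owner)
            MPI_Send(&value, 1, MPI_DOUBLE, next, tag, cart);
    }
    return value;
}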
/* Starts a "random" operation on "comm" corresponding to "rndnum" and returns
 * in (*req) a request handle corresponding to that operation.  This call should
 * be considered collective over comm (with a consistent value for "rndnum"),
 * even though the operation may only be a point-to-point request. */
static void start_random_nonblocking(MPI_Comm comm, unsigned int rndnum, MPI_Request *req, struct laundry *l)
{
    int i, j;
    int rank, size;
    int *buf = NULL;
    int *recvbuf = NULL;
    int *sendcounts = NULL;
    int *recvcounts = NULL;
    int *sdispls = NULL;
    int *rdispls = NULL;
    MPI_Datatype *sendtypes = NULL; /* MPI_Ialltoallw expects MPI_Datatype arrays */
    MPI_Datatype *recvtypes = NULL;
    signed char *buf_alias = NULL;

    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);

    *req = MPI_REQUEST_NULL;

    l->case_num = -1;
    l->comm = comm;

    l->buf = buf = malloc(COUNT*size*sizeof(int));
    l->recvbuf = recvbuf = malloc(COUNT*size*sizeof(int));
    l->sendcounts = sendcounts = malloc(size*sizeof(int));
    l->recvcounts = recvcounts = malloc(size*sizeof(int));
    l->sdispls = sdispls = malloc(size*sizeof(int));
    l->rdispls = rdispls = malloc(size*sizeof(int));
    l->sendtypes = sendtypes = malloc(size*sizeof(MPI_Datatype));
    l->recvtypes = recvtypes = malloc(size*sizeof(MPI_Datatype));

#define NUM_CASES (21)
    l->case_num = rand_range(rndnum, 0, NUM_CASES);
    switch (l->case_num) {
        case 0: /* MPI_Ibcast */
            for (i = 0; i < COUNT; ++i) {
                if (rank == 0) {
                    buf[i] = i;
                }
                else {
                    buf[i] = 0xdeadbeef;
                }
            }
            MPI_Ibcast(buf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 1: /* MPI_Ibcast (again, but designed to stress scatter/allgather impls) */
            /* FIXME fiddle with PRIME and buffer allocation s.t. PRIME is much larger (1021?) */
            buf_alias = (signed char *)buf;
            my_assert(COUNT*size*sizeof(int) > PRIME); /* sanity */
            for (i = 0; i < PRIME; ++i) {
                if (rank == 0)
                    buf_alias[i] = i;
                else
                    buf_alias[i] = 0xdb;
            }
            for (i = PRIME; i < COUNT * size * sizeof(int); ++i) {
                buf_alias[i] = 0xbf;
            }
            MPI_Ibcast(buf_alias, PRIME, MPI_SIGNED_CHAR, 0, comm, req);
            break;

        case 2: /* MPI_Ibarrier */
            MPI_Ibarrier(comm, req);
            break;

        case 3: /* MPI_Ireduce */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Ireduce(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, 0, comm, req);
            break;

        case 4: /* same again, use a user op and free it before the wait */
            {
                MPI_Op op = MPI_OP_NULL;
                MPI_Op_create(sum_fn, /*commute=*/1, &op);
                for (i = 0; i < COUNT; ++i) {
                    buf[i] = rank + i;
                    recvbuf[i] = 0xdeadbeef;
                }
                MPI_Ireduce(buf, recvbuf, COUNT, MPI_INT, op, 0, comm, req);
                MPI_Op_free(&op);
            }
            break;

        case 5: /* MPI_Iallreduce */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iallreduce(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 6: /* MPI_Ialltoallv (a weak test, neither irregular nor sparse) */
            for (i = 0; i < size; ++i) {
                sendcounts[i] = COUNT;
                recvcounts[i] = COUNT;
                sdispls[i] = COUNT * i;
                rdispls[i] = COUNT * i;
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + (i * j);
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ialltoallv(buf, sendcounts, sdispls, MPI_INT,
                           recvbuf, recvcounts, rdispls, MPI_INT, comm, req);
            break;

        case 7: /* MPI_Igather */
            for (i = 0; i < size*COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Igather(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 8: /* same test again, just use a dup'ed datatype and free it before the wait */
            {
                MPI_Datatype type = MPI_DATATYPE_NULL;
                MPI_Type_dup(MPI_INT, &type);
                for (i = 0; i < size*COUNT; ++i) {
                    buf[i] = rank + i;
                    recvbuf[i] = 0xdeadbeef;
                }
                MPI_Igather(buf, COUNT, MPI_INT, recvbuf, COUNT, type, 0, comm, req);
                MPI_Type_free(&type); /* should cause implementations that don't refcount
                                         correctly to blow up or hang in the wait */
            }
            break;

        case 9: /* MPI_Iscatter */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    if (rank == 0)
                        buf[i*COUNT+j] = i + j;
                    else
                        buf[i*COUNT+j] = 0xdeadbeef;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Iscatter(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 10: /* MPI_Iscatterv */
            for (i = 0; i < size; ++i) {
                /* weak test, just test the regular case where all counts are equal */
                sendcounts[i] = COUNT;
                sdispls[i] = i * COUNT;
                for (j = 0; j < COUNT; ++j) {
                    if (rank == 0)
                        buf[i*COUNT+j] = i + j;
                    else
                        buf[i*COUNT+j] = 0xdeadbeef;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Iscatterv(buf, sendcounts, sdispls, MPI_INT,
                          recvbuf, COUNT, MPI_INT, 0, comm, req);
            break;

        case 11: /* MPI_Ireduce_scatter */
            for (i = 0; i < size; ++i) {
                recvcounts[i] = COUNT;
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + i;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ireduce_scatter(buf, recvbuf, recvcounts, MPI_INT, MPI_SUM, comm, req);
            break;

        case 12: /* MPI_Ireduce_scatter_block */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + i;
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ireduce_scatter_block(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 13: /* MPI_Igatherv */
            for (i = 0; i < size*COUNT; ++i) {
                buf[i] = 0xdeadbeef;
                recvbuf[i] = 0xdeadbeef;
            }
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
            }
            for (i = 0; i < size; ++i) {
                recvcounts[i] = COUNT;
                rdispls[i] = i * COUNT;
            }
            MPI_Igatherv(buf, COUNT, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, 0, comm, req);
            break;

        case 14: /* MPI_Ialltoall */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + (i * j);
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ialltoall(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, comm, req);
            break;

        case 15: /* MPI_Iallgather */
            for (i = 0; i < size*COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iallgather(buf, COUNT, MPI_INT, recvbuf, COUNT, MPI_INT, comm, req);
            break;

        case 16: /* MPI_Iallgatherv */
            for (i = 0; i < size; ++i) {
                for (j = 0; j < COUNT; ++j) {
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
                recvcounts[i] = COUNT;
                rdispls[i] = i * COUNT;
            }
            for (i = 0; i < COUNT; ++i)
                buf[i] = rank + i;
            MPI_Iallgatherv(buf, COUNT, MPI_INT, recvbuf, recvcounts, rdispls, MPI_INT, comm, req);
            break;

        case 17: /* MPI_Iscan */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iscan(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 18: /* MPI_Iexscan */
            for (i = 0; i < COUNT; ++i) {
                buf[i] = rank + i;
                recvbuf[i] = 0xdeadbeef;
            }
            MPI_Iexscan(buf, recvbuf, COUNT, MPI_INT, MPI_SUM, comm, req);
            break;

        case 19: /* MPI_Ialltoallw (a weak test, neither irregular nor sparse) */
            for (i = 0; i < size; ++i) {
                sendcounts[i] = COUNT;
                recvcounts[i] = COUNT;
                sdispls[i] = COUNT * i * sizeof(int);
                rdispls[i] = COUNT * i * sizeof(int);
                sendtypes[i] = MPI_INT;
                recvtypes[i] = MPI_INT;
                for (j = 0; j < COUNT; ++j) {
                    buf[i*COUNT+j] = rank + (i * j);
                    recvbuf[i*COUNT+j] = 0xdeadbeef;
                }
            }
            MPI_Ialltoallw(buf, sendcounts, sdispls, sendtypes,
                           recvbuf, recvcounts, rdispls, recvtypes, comm, req);
            break;

        case 20: /* basic pt2pt MPI_Isend/MPI_Irecv pairing */
            /* even ranks send to odd ranks, but only if we have a full pair */
            if ((rank % 2 != 0) || (rank != size-1)) {
                for (j = 0; j < COUNT; ++j) {
                    buf[j] = j;
                    recvbuf[j] = 0xdeadbeef;
                }
                if (rank % 2 == 0)
                    MPI_Isend(buf, COUNT, MPI_INT, rank+1, 5, comm, req);
                else
                    MPI_Irecv(recvbuf, COUNT, MPI_INT, rank-1, 5, comm, req);
            }
            break;

        default:
            fprintf(stderr, "unexpected value for l->case_num=%d\n", (l->case_num));
            MPI_Abort(comm, 1);
            exit(1);
            break;
    }
}
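/* Hedged driver sketch (hypothetical, not part of the test above): because
 * start_random_nonblocking() must see the same "rndnum" on every rank of
 * "comm", one simple way to guarantee that is to draw the random number on
 * rank 0 and broadcast it before starting each operation.  The function name
 * run_random_ops and the iteration count are illustrative only; whatever
 * result verification the real test performs is omitted here, and only the
 * buffers recorded in the laundry struct are released. */
#include <mpi.h>
#include <stdlib.h>

static void run_random_ops(MPI_Comm comm, int iterations)
{
    int rank;
    unsigned int rndnum = 0;
    MPI_Request req;
    struct laundry l;

    MPI_Comm_rank(comm, &rank);
    for (int i = 0; i < iterations; ++i) {
        if (rank == 0)
            rndnum = (unsigned int)rand();
        /* every rank now agrees on rndnum, as the comment above requires */
        MPI_Bcast(&rndnum, 1, MPI_UNSIGNED, 0, comm);

        start_random_nonblocking(comm, rndnum, &req, &l);
        /* waiting on MPI_REQUEST_NULL (e.g. the unpaired rank in case 20)
         * is a no-op, so an unconditional wait is fine */
        MPI_Wait(&req, MPI_STATUS_IGNORE);

        /* release the buffers recorded in the laundry struct */
        free(l.buf);
        free(l.recvbuf);
        free(l.sendcounts);
        free(l.recvcounts);
        free(l.sdispls);
        free(l.rdispls);
        free(l.sendtypes);
        free(l.recvtypes);
    }
}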
void IMB_ibcast_pure(struct comm_info* c_info,
                     int size,
                     struct iter_schedule* ITERATIONS,
                     MODES RUN_MODE,
                     double* time)
/*

                      MPI-NBC benchmark kernel
                      Benchmarks MPI_Ibcast

Input variables:

-c_info               (type struct comm_info*)
                      Collection of all base data for MPI; see [1] for more information

-size                 (type int)
                      Basic message size in bytes

-ITERATIONS           (type struct iter_schedule *)
                      Repetition scheduling

-RUN_MODE             (type MODES)
                      (only MPI-2 case: see [1])

Output variables:

-time                 (type double*)
                      Timing result per sample

*/
{
    int i = 0, root = 0;
    Type_Size s_size;
    int s_num = 0;
    void* bc_buf = NULL;
    MPI_Request request;
    MPI_Status status;
    double t_pure = 0.;

#ifdef CHECK
    defect = 0.;
#endif
    ierr = 0;

    /* GET SIZE OF DATA TYPE */
    MPI_Type_size(c_info->s_data_type, &s_size);
    if (s_size != 0) {
        s_num = size / s_size;
    }

    if (c_info->rank != -1) {
        root = 0;
        for (i = 0; i < N_BARR; i++) {
            MPI_Barrier(c_info->communicator);
        }

        t_pure = MPI_Wtime();
        for (i = 0; i < ITERATIONS->n_sample; i++) {
            bc_buf = (root == c_info->rank) ? c_info->s_buffer : c_info->r_buffer;

            ierr = MPI_Ibcast((char*)bc_buf + i % ITERATIONS->s_cache_iter * ITERATIONS->s_offs,
                              s_num,
                              c_info->s_data_type,
                              root,
                              c_info->communicator,
                              &request);
            MPI_ERRHAND(ierr);
            MPI_Wait(&request, &status);
            CHK_DIFF("Ibcast_pure", c_info,
                     (char*)bc_buf + i % ITERATIONS->s_cache_iter * ITERATIONS->s_offs,
                     0, size, size, 1, put, 0, ITERATIONS->n_sample, i, root, &defect);
            /* rotate the root each iteration (avoids the unsequenced
             * modification in "root = (++root) % ...") */
            root = (root + 1) % c_info->num_procs;
        }
        t_pure = (MPI_Wtime() - t_pure) / ITERATIONS->n_sample;
    }

    time[0] = t_pure;
}
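/* Hedged standalone sketch of the pattern IMB_ibcast_pure times, stripped of
 * the IMB scaffolding (comm_info, iteration schedule, cache offsets,
 * CHK_DIFF): a "pure" MPI_Ibcast that is waited on immediately, with the root
 * rotated every iteration.  Buffer size and sample count below are arbitrary
 * choices, not IMB defaults. */
#include <mpi.h>
#include <stdio.h>
#include <stdlib.h>

int main(int argc, char **argv)
{
    const int count = 1 << 20;    /* ints per broadcast (arbitrary) */
    const int n_sample = 100;     /* repetitions (arbitrary) */
    int rank, size, root = 0;
    double t;
    int *buf;
    MPI_Request request;

    MPI_Init(&argc, &argv);
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    buf = calloc(count, sizeof(int));

    MPI_Barrier(MPI_COMM_WORLD);
    t = MPI_Wtime();
    for (int i = 0; i < n_sample; i++) {
        MPI_Ibcast(buf, count, MPI_INT, root, MPI_COMM_WORLD, &request);
        MPI_Wait(&request, MPI_STATUS_IGNORE);  /* "pure": no overlap */
        root = (root + 1) % size;               /* rotate the root */
    }
    t = (MPI_Wtime() - t) / n_sample;

    if (rank == 0)
        printf("pure MPI_Ibcast time: %f s per call\n", t);

    free(buf);
    MPI_Finalize();
    return 0;
}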