int main(int argc, char* argv[]) { // DQ: Modified code to add const. // Use of "const" makes the type a SgModifierType (so for now let's keep it simple). // const Point zero = getZeros(); Point zero = getZeros(); Point lo=zero; // Point hi=getOnes()*(DOMAINSIZE-1); Point hi=getOnes()*(6); Box bxdest(lo,hi); //box low and high corners for destination // This will grow the box by one ghost // along each face and become the box for // the source box. Box bxsrc = bxdest.grow(1); // source and destination data containers RectMDArray<double> Asrc(bxsrc); RectMDArray<double> Adest(bxdest); // all the coefficients I need for this operation const double ident = 1.0; const double C0 = -4.0; // An expression to recognize: // pair<Shift,double>(zero,C0); // This is a simpler interface to interpret (suggested by Anshu). // Stencil<double> laplace(pair<Shift,double>(zero,C0)); Stencil<double> laplace; // laplace = laplace + (pair<Shift,double>(zero,C0)); // laplace = laplace + (pair<Shift,double>(zero,C0)); Point xdir = getUnitv(0); xdir *= -1; laplace=laplace+(pair<Shift,double>(xdir,ident)); }
/// Builds a stencil in a_laplace and scales it by the grid spacing.
///
/// The stencil receives a center term with weight -2*DIM, then, for every
/// axis in [0, DIM), a weight-1 term at +1 and at -1 along that axis.
/// Finally the whole stencil is multiplied by 1/a_dx/a_dx.
///
/// @param a_laplace  Output; overwritten with the assembled stencil.
/// @param a_dx       Grid spacing used for the final scaling.
void setStencil(Stencil<double>& a_laplace, const double & a_dx)
{
  double centerWeight = -2.0*DIM;
  Point origin = getZeros();
  // allOnes / allMinusOnes are not read below; they are retained so that this
  // function performs the same calls as before.
  Point allOnes = getOnes();
  Point allMinusOnes = allOnes*(-1);
  double unitWeight = 1.0;
  array<Shift,DIM> shifts = getShiftVec();

  // Center term.
  a_laplace = centerWeight*(shifts^origin);

  // One forward and one backward neighbour per axis.
  int axis = 0;
  while (axis < DIM)
    {
      Point forward = getUnitv(axis);
      a_laplace = a_laplace + unitWeight*(shifts^forward);
      a_laplace = a_laplace + unitWeight*(shifts^(forward*(-1)));
      ++axis;
    }

  // Scale by 1/dx^2 (written as two successive divisions).
  a_laplace *= (1.0/a_dx/a_dx);
}
/// Applies a stencil to every box of a_phi and writes the result.
///
/// The stencil is assembled from a center term of weight -DIM*2/(a_dx*a_dx)
/// and, per axis, a -1 and a +1 neighbour term of weight 1/(a_dx*a_dx).
/// After a_phi.exchange(), the stencil is evaluated on each box of the
/// layout, merged into one array over the layout's domain with operator|=,
/// and handed to MDWrite together with a_str.
///
/// @param a_phi  Level data the stencil is applied to; exchange() is called on it.
/// @param a_dx   Grid spacing.
/// @param a_str  Passed through to MDWrite as its first argument.
void testlap(LevelData<double >& a_phi, double a_dx,char* a_str)
{
  double invDxSquared = 1./(a_dx*a_dx);

  // Center term first, neighbours are accumulated in the loop below.
  Stencil<double> lapStencil(make_pair(getZeros(),-DIM*2*invDxSquared));
  BoxLayout layout = a_phi.getBoxLayout();

  int axis = 0;
  while (axis < DIM)
    {
      Point unitDir = getUnitv(axis);
      Stencil<double> forwardTerm(make_pair(Shift(unitDir),invDxSquared));
      Stencil<double> backwardTerm(make_pair(Shift(unitDir*(-1)),invDxSquared));
      // Same summation order as before: backward term, then forward term.
      lapStencil = lapStencil + backwardTerm + forwardTerm;
      ++axis;
    }

  a_phi.exchange();

  // Single array over the whole domain that collects every box's result.
  RectMDArray<double> lapPhi(layout.getDomain());
  for (BLIterator boxIt(layout); boxIt != boxIt.end(); ++boxIt)
    {
      lapPhi |= lapStencil(a_phi[*boxIt],layout[*boxIt]);
    }

  MDWrite(a_str,lapPhi);
}
// Driver for the pragma-annotated variant of the stencil example.
//
// Steps, in order:
//   1. build the destination box (lo = zeros, hi = ones * 511) and the source
//      box obtained from it with grow(1);
//   2. create and initialize one RectMDArray over each box;
//   3. build a Stencil<double> object `laplace` (it is not referenced by the
//      loop nest below);
//   4. evaluate a hand-expanded 7-term expression (six neighbours with weight 1,
//      centre with weight -6.00000) over the destination index range, reading
//      and writing through raw pointers obtained from the two arrays.
//
// The `#pragma omp target ...` lines use non-standard clauses (device(mpi),
// begin/end, dist_data).
// NOTE(review): presumably consumed by an experimental OpenMP-to-MPI
// translator; a standard compiler would ignore or reject them -- confirm.
int main(int argc,char *argv[]) {
  // Indicate SPMD code segments, without changing scopes of variables
#pragma omp target device(mpi) begin
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  // negones is not read anywhere below.
  const class Point negones = ones * -1;
  const class Point lo(zero);
  // DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
  // Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 511;
  const class Point hi = getOnes() * adjustedBlockSize;
  //box low and high corners for destination
  const class Box bxdest(lo,hi);
  // This will grow the box by one ghost
  // along each face and become the box for
  // the source box.
  const class Box bxsrc = bxdest . grow (1);
#pragma omp target device(mpi) end
  // source and destination data containers
#pragma omp target data map(to:Asrc) map(from:Adest)
  class RectMDArray< double , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double , 1 , 1 , 1 > Adest(bxdest);
  // all the coefficients I need for this operation
  const double ident = 1.0;
  // DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
  // const double C0 = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
  // cout <<" The source Box" << endl;
  // Asrc.print();
  // cout << endl;
  // build the stencil, and the stencil operator
  // Stencil<double> laplace(wt,shft);
  const std::array< Shift , 3 > S = getShiftVec();
  // This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double > laplace = C0*((S)^(zero));
  // Add one +unit and one -unit term per axis. Note that `^` binds more
  // loosely than `*`, so `(S)^thishft * -1` is S ^ (thishft * -1).
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
    // DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
    // laplace = laplace + ident*(S^thishft);
    // laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
    laplace += ident*((S)^thishft * -1);
  }
  // laplace.stencilDump();
  // StencilOperator<double,double, double> op;

  // Destination box: inclusive loop bounds (lb*/ub*) and per-axis sizes.
  // Suffix 0/1/2 is the axis; arraySize_X/Y/Z are size(0)/size(1)/size(2).
  int lb2 = bxdest . getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest . getHighCorner ()[2];
  int arraySize_X = bxdest . size (0);
  int lb1 = bxdest . getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest . getHighCorner ()[1];
  int arraySize_Y = bxdest . size (1);
  int lb0 = bxdest . getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest . getHighCorner ()[0];
  // arraySize_Z is only referenced by the map(from:...) clause below.
  int arraySize_Z = bxdest . size (2);
  double *sourceDataPointer = Asrc . getPointer();
  double *destinationDataPointer = Adest . getPointer();
  // Source box: per-axis sizes and low corner, used to turn a destination
  // (k, j, i) into a zero-based offset inside the source buffer.
  int arraySize_X_src = bxsrc . size (0);
  int arraySize_Y_src = bxsrc . size (1);
  // arraySize_Z_src is only referenced by the map(to:...) clause below.
  int arraySize_Z_src = bxsrc . size (2);
  int lb2src = bxsrc . getLowCorner ()[2];
  int lb1src = bxsrc . getLowCorner ()[1];
  int lb0src = bxsrc . getLowCorner ()[0];
#pragma omp target device(mpi) map(to:sourceDataPointer[lb0src:arraySize_X_src][lb1src:arraySize_Y_src][lb2src:arraySize_Z_src] dist_data(DUPLICATE, DUPLICATE, BLOCK)) map(from:destinationDataPointer[lb0:arraySize_X][lb1:arraySize_Y][lb2:arraySize_Z])
#pragma omp target parallel for
  for (k = lb2; k <= ub2; ++k) {
    // loop to be distributed must match the dimension being distributed (3rd dimension).
    for (j = lb1; j <= ub1; ++j) {
      for (i = lb0; i <= ub0; ++i) {
        // Flattened indexing with axis 0 fastest: X * (Y * k + j) + i.
        // The destination is indexed with (k, j, i) as-is (no low-corner
        // subtraction); every source index first subtracts the source box's
        // low corner. Terms, in order: k-1, k+1, j-1, j+1, i-1, i+1, centre.
        destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] = sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
        // std::cout<< arraySize_X_src << " " << arraySize_Y_src << " " << i << ":" << j << ":" << k << " " << destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] << "=" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << std::endl;
      }
    }
  }
  // cout <<" The destination Box" << endl;
  // Adest.print();
}
int main(int argc,char *argv[]) { const class Point zero = getZeros(); const class Point ones = getOnes(); const class Point negones = ones * -1; const class Point lo(zero); // DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged). // Point hi = getOnes()*(BLOCKSIZE-1); const int adjustedBlockSize = 31; const class Point hi = getOnes() * adjustedBlockSize; //box low and high corners for destination const class Box bxdest(lo,hi); // This will grow the box by one ghost // along each face and become the box for // the source box. const class Box bxsrc = bxdest . grow (1); // source and destination data containers class RectMDArray< double , 1 , 1 , 1 > Asrc(bxsrc); class RectMDArray< double , 1 , 1 , 1 > Adest(bxdest); // all the coefficients I need for this operation const double ident = 1.0; // DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values. // const double C0 = -2.0 * DIM; const double C0 = -6.00000; initialize(Asrc); initialize(Adest); // cout <<" The source Box" << endl; // Asrc.print(); // cout << endl; // build the stencil, and the stencil operator // Stencil<double> laplace(wt,shft); const std::array< Shift , 3 > S = getShiftVec(); // This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift); class Stencil< double > laplace = C0*((S)^(zero)); for (int dir = 0; dir < 3; dir++) { const class Point thishft = getUnitv(dir); // DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis. // laplace = laplace + ident*(S^thishft); // laplace = laplace + ident*(S^(thishft*(-1))); laplace += ident*((S)^(thishft)); laplace += ident*((S)^thishft * -1); } int lb2 = bxdest . getLowCorner ()[2]; int k = 0; int ub2 = bxdest . getHighCorner ()[2]; int arraySize_X = bxdest . size (0); int lb1 = bxdest . getLowCorner ()[1]; int j = 0; int ub1 = bxdest . 
getHighCorner ()[1]; int arraySize_Y = bxdest . size (1); int lb0 = bxdest . getLowCorner ()[0]; int i = 0; int ub0 = bxdest . getHighCorner ()[0]; int arraySize_Z = bxdest . size (2); #pragma 0 double (*sourceDataPointer)[arraySize_Y][arraySize_X]; sourceDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); memcpy(sourceDataPointer,Asrc.getPointer(),sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); #pragma 0 double (*destinationDataPointer)[arraySize_Y][arraySize_X]; destinationDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); memcpy(destinationDataPointer,Adest.getPointer(),sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); #pragma 32 for (k = lb2; k < ub2; ++k) { for (j = lb1; j < ub1; ++j) { for (i = lb0; i < ub0; ++i) { destinationDataPointer[k][j][i] = sourceDataPointer[k + -1][j][i] + sourceDataPointer[k + 1][j][i] + sourceDataPointer[k][j + -1][i] + sourceDataPointer[k][j + 1][i] + sourceDataPointer[k][j][i + -1] + sourceDataPointer[k][j][i + 1] + sourceDataPointer[k][j][i] * -6.00000; } } } memcpy(Asrc.getPointer(),sourceDataPointer,sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); memcpy(Adest.getPointer(),destinationDataPointer,sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); free(sourceDataPointer); free(destinationDataPointer); // cout <<" The destination Box" << endl; // Adest.print(); }
int master(int rank, int nprocs) { const class Point zero = getZeros(); const class Point ones = getOnes(); const class Point negones = ones * -1; const class Point lo(zero); // DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged). // Point hi = getOnes()*(BLOCKSIZE-1); const int adjustedBlockSize = SIZE; const class Point hi = getOnes() * adjustedBlockSize; //box low and high corners for destination const class Box bxdest(lo,hi); // This will grow the box by one ghost // along each face and become the box for // the source box. const class Box bxsrc = bxdest . grow (1); // source and destination data containers class RectMDArray< double , 1 , 1 , 1 > Asrc(bxsrc); class RectMDArray< double , 1 , 1 , 1 > Adest(bxdest); // all the coefficients I need for this operation const double ident = 1.0; // DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values. // const double C0 = -2.0 * DIM; const double C0 = -6.00000; initialize(Asrc); initialize(Adest); #if debug cout <<" The source Box" << endl; Asrc.print(); cout << endl; #endif // build the stencil, and the stencil operator // Stencil<double> laplace(wt,shft); const std::array< Shift , 3 > S = getShiftVec(); // This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift); class Stencil< double > laplace = C0*((S)^(zero)); for (int dir = 0; dir < 3; dir++) { const class Point thishft = getUnitv(dir); // DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis. // laplace = laplace + ident*(S^thishft); // laplace = laplace + ident*(S^(thishft*(-1))); laplace += ident*((S)^(thishft)); laplace += ident*((S)^thishft * -1); } // laplace.stencilDump(); // sequential version ------------------------------ // StencilOperator<double,double, double> op; double begin = MPI_Wtime(); int lb2src = bxsrc . 
getLowCorner ()[2]; int k = 0; int ub2src = bxsrc . getHighCorner ()[2]; int arraySize_X_src = bxsrc . size (0); int lb1src = bxsrc . getLowCorner ()[1]; int j = 0; int ub1src = bxsrc . getHighCorner ()[1]; int arraySize_Y_src = bxsrc . size (1); int lb0src = bxsrc . getLowCorner ()[0]; int i = 0; int ub0src = bxsrc . getHighCorner ()[0]; int arraySize_Z_src = bxsrc . size (2); int lb2dest = bxdest . getLowCorner ()[2]; int ub2dest = bxdest . getHighCorner ()[2]; int arraySize_X_dest = bxdest . size (0); int lb1dest = bxdest . getLowCorner ()[1]; int ub1dest = bxdest . getHighCorner ()[1]; int arraySize_Y_dest = bxdest . size (1); int lb0dest = bxdest . getLowCorner ()[0]; int ub0dest = bxdest . getHighCorner ()[0]; int arraySize_Z_dest = bxdest . size (2); double *sourceDataPointer = Asrc . getPointer(); double *destinationDataPointer = Adest . getPointer(); for (k = lb2dest; k <= ub2dest; ++k) { for (j = lb1dest; j <= ub1dest; ++j) { for (i = lb0dest; i <= ub0dest; ++i) { destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000; #if debug cout << i << " " << j << " " << k << " " << destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + 
(j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl; #endif } } } double end = MPI_Wtime(); double elapsed_secs = (end - begin); cout << "Exec. time for serial code: " << elapsed_secs << endl; #if debug cout <<" The serial result" << endl; Adest.print(); cout << endl; #endif // real MPI in the following class RectMDArray< double , 1 , 1 , 1 > Adest_new(bxdest); initialize(Adest_new); double *destinationDataPointer_new = Adest_new . 
getPointer(); begin = MPI_Wtime(); MPI_Bcast( &lb0src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb1src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb2src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub0src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub1src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub2src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_X_src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Y_src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Z_src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_X_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Y_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Z_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); double *distsrc; double *distdest; int nghost = 1; int distsrcsize = arraySize_Z_src / nprocs; int offsetsrc = rank * distsrcsize; if(rank < arraySize_Z_src%nprocs) { distsrcsize++; } if(rank >= arraySize_Z_src%nprocs) offsetsrc += arraySize_Z_src%nprocs; else offsetsrc += rank; int distdestsize = arraySize_Z_dest / nprocs; int offsetdest = rank * distdestsize; if(rank < arraySize_Z_dest%nprocs) { distdestsize++; } if(rank >= arraySize_Z_dest%nprocs) offsetdest += arraySize_Z_dest%nprocs; else offsetdest += rank; distsrc = (double*)calloc(1,sizeof(double)*(distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src); distdest = (double*)calloc(1,sizeof(double)*(distdestsize)*arraySize_X_dest*arraySize_Y_dest); // team leader send data to all members int copyOffset = offsetdest * arraySize_X_src*arraySize_Y_src; int copySize = (distdestsize + 2 * nghost) * arraySize_X_src*arraySize_Y_src; if(nprocs > 1) { int dest, send_tag=1; MPI_Request 
send_reqs[nprocs-1]; MPI_Status send_status[nprocs-1]; for(dest = 1; dest < nprocs; ++dest) { int sendSize = arraySize_Z_dest / nprocs; int sendOffset = dest * sendSize; if(dest < arraySize_Z_dest%nprocs) { sendSize++; } sendSize = (sendSize+2)*arraySize_X_src*arraySize_Y_src; if(dest >= arraySize_Z_dest%nprocs) sendOffset += arraySize_Z_dest%nprocs; else sendOffset += dest; sendOffset = sendOffset*arraySize_X_src*arraySize_Y_src; #if debug cout << "Master send size " << sendSize<< " from offset " << sendOffset << " " << " to " << dest << endl; #endif MPI_Isend(sourceDataPointer+sendOffset, sendSize,MPI_DOUBLE, dest, send_tag, MPI_COMM_WORLD,&send_reqs[dest-1]); // int idx; // for(idx = 0; idx < sendSize; ++idx) // printf("Source send to dest:%d result %d: %f\n",dest, idx, sourceDataPointer[offsetsrc+sendOffset+idx]); } MPI_Waitall(nprocs-1,send_reqs,send_status); } // local copy (this is optional, but simplier for transformation) memcpy(distsrc,sourceDataPointer+copyOffset,copySize*sizeof(double)); // computation for (k = lb2dest; k < distdestsize; ++k) { for (j = lb1dest; j <= ub1dest; ++j) { for (i = lb0dest; i <= ub0dest; ++i) { distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000; #if debug cout << "rank0 " << i << " " << j << " " << k << " " << distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] 
<< "= " << \ distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl; #endif } } } // team leader receives data to all members int src, recv_tag=1; MPI_Request recv_reqs[nprocs-1]; MPI_Status recv_status[nprocs-1]; for(src = 1; src < nprocs; ++src) { int recvSize = arraySize_Z_dest / nprocs; int recvOffset = src * recvSize; if(src < arraySize_Z_dest%nprocs) { recvSize++; } recvSize *= arraySize_X_dest*arraySize_Y_dest; if(src >= arraySize_Z_dest%nprocs) recvOffset += arraySize_Z_dest%nprocs; else recvOffset += src; recvOffset = recvOffset*arraySize_X_dest*arraySize_Y_dest; MPI_Irecv(destinationDataPointer_new+recvOffset, recvSize, MPI_DOUBLE, src, recv_tag, MPI_COMM_WORLD,&recv_reqs[src-1]); } MPI_Waitall(nprocs-1,recv_reqs,recv_status); // local copy (this could be optional, but simpler for transformation) memcpy(destinationDataPointer_new+offsetdest,distdest,distdestsize*bxdest.size(0)*bxdest.size(1)*sizeof(double)); end = MPI_Wtime(); elapsed_secs = (end - begin); cout << "Exec. time for MPI code: " << elapsed_secs << endl; #if debug cout <<" MPI result " << endl; Adest_new.print(); cout << endl; #endif assert(checksum(Adest, Adest_new)==0); return 0; }