Example #1
0
int main(int argc, char* argv[])
   {
  // DQ: Modified code to add const.

  // Use of "const" makes the type a SgModifierType (so for now let's keep it simple).
  // const Point zero = getZeros();
     Point zero = getZeros();

     Point lo=zero;
  // Point hi=getOnes()*(DOMAINSIZE-1);
     Point hi=getOnes()*(6);

     Box bxdest(lo,hi); //box low and high corners for destination
  
  // This will grow the box by one ghost
  // along each face and become the box for
  // the source box. 
     Box bxsrc = bxdest.grow(1);

  // source and destination data containers
     RectMDArray<double> Asrc(bxsrc);
     RectMDArray<double> Adest(bxdest);

  // all the coefficients I need for this operation
     const double ident =  1.0;
     const double C0    = -4.0;

  // An expression to recognize: 
  // pair<Shift,double>(zero,C0);

  // This is a simpler interface to interpret (suggested by Anshu).
  // Stencil<double> laplace(pair<Shift,double>(zero,C0));

     Stencil<double> laplace;

  // laplace = laplace + (pair<Shift,double>(zero,C0));
  // laplace = laplace + (pair<Shift,double>(zero,C0));

     Point xdir = getUnitv(0);

     xdir *= -1;

     laplace=laplace+(pair<Shift,double>(xdir,ident));

   }
Example #2
0
/// Fills a_laplace with a (2*DIM+1)-term Laplacian stencil scaled by 1/dx^2.
///
/// The centre term gets weight -2*DIM and each of the +/- unit shifts along
/// every direction gets weight 1; the whole stencil is then multiplied by
/// 1/(a_dx*a_dx).
///
/// @param a_laplace  Output stencil; any previous contents are overwritten.
/// @param a_dx       Mesh spacing. Not checked here: a zero value makes the
///                   final scale factor a floating-point division by zero.
void setStencil(Stencil<double>& a_laplace,
                const double   & a_dx)
{
  double C0=-2.0*DIM;
  double ident=1.0;
  Point zero=getZeros();
  array<Shift,DIM> S=getShiftVec();

  // Centre term first, then one pair of neighbours per direction.
  a_laplace = C0*(S^zero);
  for (int dir=0;dir<DIM;dir++)
    {
      Point thishft=getUnitv(dir);
      a_laplace = a_laplace + ident*(S^thishft);
      a_laplace = a_laplace + ident*(S^(thishft*(-1)));
    }
  //cout  << "stencil unscaled by dx = " << endl;
  //a_laplace.stencilDump();

  // Same evaluation order as before (1/dx, then /dx) so the scale factor is
  // bit-for-bit unchanged.
  a_laplace *= (1.0/a_dx/a_dx);
}
Example #3
0
File: copyTest.cpp Project: 8l/rose
// Applies a (2*DIM+1)-term Laplacian stencil to every box of a_phi and writes
// the assembled result with MDWrite under the name a_str.
//
//   a_phi  level data the stencil is applied to; exchange() is called on it
//          before the stencil is evaluated
//   a_dx   mesh spacing; all stencil weights are scaled by 1/(a_dx*a_dx)
//   a_str  name handed to MDWrite
void testlap(LevelData<double >& a_phi, double a_dx,char* a_str)
{
  // Common weight of every off-centre term.
  double invDxSq = 1./(a_dx*a_dx);

  // Start from the centre term at the zero shift.
  Stencil<double> Laplacian(make_pair(getZeros(),-DIM*2*invDxSq));
  BoxLayout layout = a_phi.getBoxLayout();

  // Add the backward and forward neighbour along each direction.
  for (int axis = 0; axis < DIM; axis++)
    {
      Point unit = getUnitv(axis);
      Stencil<double> forward(make_pair(Shift(unit),invDxSq));
      Stencil<double> backward(make_pair(Shift(unit*(-1)),invDxSq));
      Laplacian = Laplacian + backward + forward;
    }

  // NOTE(review): exchange() presumably fills ghost cells -- confirm.
  a_phi.exchange();

  // One array over the whole domain collects the per-box results.
  RectMDArray<double> lapPhi(layout.getDomain());
  for (BLIterator it(layout); it != it.end(); ++it)
    {
      lapPhi |= Laplacian(a_phi[*it],layout[*it]);
    }
  MDWrite(a_str,lapPhi);
}
// Driver for a hand-expanded 7-term stencil (six neighbours with weight 1,
// centre with weight -6) annotated with "omp target device(mpi)" directives.
//
// The function builds the destination box, the source box grown from it, and
// the arrays over both; constructs the `laplace` stencil object; and then
// evaluates the stencil with an explicit triple loop over raw pointers.
// The triple loop uses literal coefficients and does not reference `laplace`.
int main(int argc,char *argv[])
{

// Indicate SPMD code segments, without changing scopes of variables
#pragma omp target device(mpi) begin  
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 511;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);
#pragma omp target device(mpi) end

// source and destination data containers
// Note: the data-mapping directive below names Asrc/Adest ahead of their
// declarations; keep it directly in front of them.
#pragma omp target data map(to:Asrc) map(from:Adest)
  class RectMDArray< double  , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double  , 1 , 1 , 1 > Adest(bxdest);

// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0    = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
// cout <<" The source Box" << endl;
// Asrc.print();
// cout << endl;
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift  , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double  > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
// '*' binds tighter than '^', so the operand below is S ^ (thishft * -1),
// matching the commented-out form above.
    laplace += ident*((S)^thishft * -1);
  }
// laplace.stencilDump();
// StencilOperator<double,double, double> op;

// Loop bounds (low/high corner per direction) and extents of the destination
// box. Index 0 is the innermost (i) direction, index 2 the outermost (k).
  int lb2 = bxdest .  getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest .  getHighCorner ()[2];
  int arraySize_X = bxdest .  size (0);
  int lb1 = bxdest .  getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest .  getHighCorner ()[1];
  int arraySize_Y = bxdest .  size (1);
  int lb0 = bxdest .  getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest .  getHighCorner ()[0];
  int arraySize_Z = bxdest .  size (2);

// Raw storage of the two arrays; indexed as flat 1-D buffers below.
  double *sourceDataPointer = Asrc . getPointer();
  double *destinationDataPointer = Adest . getPointer();

// Extents and low corner of the source box, used to turn a destination-space
// (i,j,k) into an offset inside the source buffer.
  int arraySize_X_src = bxsrc .  size (0);
  int arraySize_Y_src = bxsrc .  size (1);
  int arraySize_Z_src = bxsrc .  size (2);
  int lb2src = bxsrc .  getLowCorner ()[2];
  int lb1src = bxsrc .  getLowCorner ()[1];
  int lb0src = bxsrc .  getLowCorner ()[0];

// The destination index uses i, j, k directly (no low-corner offset), while
// every source index subtracts the source low corner. The summands appear in
// the order k-1, k+1, j-1, j+1, i-1, i+1, centre * -6.
#pragma omp target device(mpi) map(to:sourceDataPointer[lb0src:arraySize_X_src][lb1src:arraySize_Y_src][lb2src:arraySize_Z_src] dist_data(DUPLICATE, DUPLICATE, BLOCK)) map(from:destinationDataPointer[lb0:arraySize_X][lb1:arraySize_Y][lb2:arraySize_Z])
#pragma omp target parallel for 
  for (k = lb2; k <= ub2; ++k) { // loop to be distributed must match the dimension being distributed (3rd dimension).
    for (j = lb1; j <= ub1; ++j) {
      for (i = lb0; i <= ub0; ++i) {
        destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] = sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
//        std::cout<< arraySize_X_src << " " << arraySize_Y_src << " " << i << ":" << j << ":" << k << " " << destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] << "=" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << std::endl;
      }
    }
  }
// cout <<" The destination Box" << endl;
// Adest.print();
}
Example #5
0
// Driver for a hand-expanded 7-term stencil (six neighbours with weight 1,
// centre with weight -6) evaluated on heap staging copies of the source and
// destination arrays, which are copied back into Asrc/Adest afterwards.
//
// Fixes relative to the previous version:
//  * The source staging buffer is now sized, strided and copied with the
//    extents of bxsrc (the box Asrc is defined on). Previously it used the
//    destination extents, so rows/planes were mis-strided and
//    sourceDataPointer[k-1] at k == lb2 read in front of the allocation.
//  * Source indices are offset by the source low corner; destination indices
//    by the destination low corner.
//  * The loops cover the whole destination box (<= upper corner), matching the
//    other expanded versions of this stencil.
//  * malloc results are checked before use.
//
// Returns 1 if a staging buffer cannot be allocated, otherwise falls off the
// end of main (0).
int main(int argc,char *argv[])
{
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 31;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);
// source and destination data containers
  class RectMDArray< double  , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double  , 1 , 1 , 1 > Adest(bxdest);
// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0    = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
// cout <<" The source Box" << endl;
// Asrc.print();
// cout << endl;
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift  , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double  > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
// '*' binds tighter than '^': the operand is S ^ (thishft * -1).
    laplace += ident*((S)^thishft * -1);
  }
// Bounds and extents of the destination box (index 0 is innermost).
  int lb2 = bxdest .  getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest .  getHighCorner ()[2];
  int arraySize_X = bxdest .  size (0);
  int lb1 = bxdest .  getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest .  getHighCorner ()[1];
  int arraySize_Y = bxdest .  size (1);
  int lb0 = bxdest .  getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest .  getHighCorner ()[0];
  int arraySize_Z = bxdest .  size (2);
// Extents and low corner of the source box; Asrc's storage has this shape.
  int arraySize_X_src = bxsrc .  size (0);
  int arraySize_Y_src = bxsrc .  size (1);
  int arraySize_Z_src = bxsrc .  size (2);
  int lb2src = bxsrc .  getLowCorner ()[2];
  int lb1src = bxsrc .  getLowCorner ()[1];
  int lb0src = bxsrc .  getLowCorner ()[0];
// Byte counts; sizeof comes first so the product is formed in size_t.
  const auto srcBytes = sizeof(double)*arraySize_X_src*arraySize_Y_src*arraySize_Z_src;
  const auto destBytes = sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z;
#pragma 0
  double (*sourceDataPointer)[arraySize_Y_src][arraySize_X_src];
  sourceDataPointer = (double (*)[arraySize_Y_src][arraySize_X_src])malloc(srcBytes);
  if (!sourceDataPointer) {
    return 1;
  }
  memcpy(sourceDataPointer,Asrc.getPointer(),srcBytes);
#pragma 0
  double (*destinationDataPointer)[arraySize_Y][arraySize_X];
  destinationDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(destBytes);
  if (!destinationDataPointer) {
    free(sourceDataPointer);
    return 1;
  }
  memcpy(destinationDataPointer,Adest.getPointer(),destBytes);
#pragma 32
  for (k = lb2; k <= ub2; ++k) {
    for (j = lb1; j <= ub1; ++j) {
      for (i = lb0; i <= ub0; ++i) {
        // Position of (i,j,k) inside the source buffer. Every +/-1 neighbour
        // stays in range provided bxsrc extends one cell beyond bxdest on
        // each face, as the grow(1) comment above states.
        const int ks = k - lb2src;
        const int js = j - lb1src;
        const int is = i - lb0src;
        // Summation order kept: k-1, k+1, j-1, j+1, i-1, i+1, centre * -6.
        destinationDataPointer[k - lb2][j - lb1][i - lb0] = sourceDataPointer[ks + -1][js][is] + sourceDataPointer[ks + 1][js][is] + sourceDataPointer[ks][js + -1][is] + sourceDataPointer[ks][js + 1][is] + sourceDataPointer[ks][js][is + -1] + sourceDataPointer[ks][js][is + 1] + sourceDataPointer[ks][js][is] * -6.00000;
      }
    }
  }
// Copy both staging buffers back into the arrays, then release them.
  memcpy(Asrc.getPointer(),sourceDataPointer,srcBytes);
  memcpy(Adest.getPointer(),destinationDataPointer,destBytes);
  free(sourceDataPointer); 
  free(destinationDataPointer); 
// cout <<" The destination Box" << endl;
// Adest.print();
}
Example #6
0
int master(int rank, int nprocs)
{
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = SIZE;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);
// source and destination data containers
  class RectMDArray< double  , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double  , 1 , 1 , 1 > Adest(bxdest);
// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0    = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
#if debug
 cout <<" The source Box" << endl;
 Asrc.print();
 cout << endl;
#endif
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift  , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double  > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
    laplace += ident*((S)^thishft * -1);
  }
// laplace.stencilDump();
// sequential version ------------------------------
// StencilOperator<double,double, double> op;
  double begin = MPI_Wtime();
  int lb2src = bxsrc .  getLowCorner ()[2];
  int k = 0;
  int ub2src = bxsrc .  getHighCorner ()[2];
  int arraySize_X_src = bxsrc .  size (0);
  int lb1src = bxsrc .  getLowCorner ()[1];
  int j = 0;
  int ub1src = bxsrc .  getHighCorner ()[1];
  int arraySize_Y_src = bxsrc .  size (1);
  int lb0src = bxsrc .  getLowCorner ()[0];
  int i = 0;
  int ub0src = bxsrc .  getHighCorner ()[0];
  int arraySize_Z_src = bxsrc .  size (2);
  int lb2dest = bxdest .  getLowCorner ()[2];
  int ub2dest = bxdest .  getHighCorner ()[2];
  int arraySize_X_dest = bxdest .  size (0);
  int lb1dest = bxdest .  getLowCorner ()[1];
  int ub1dest = bxdest .  getHighCorner ()[1];
  int arraySize_Y_dest = bxdest .  size (1);
  int lb0dest = bxdest .  getLowCorner ()[0];
  int ub0dest = bxdest .  getHighCorner ()[0];
  int arraySize_Z_dest = bxdest .  size (2);
  double *sourceDataPointer = Asrc . getPointer();
  double *destinationDataPointer = Adest . getPointer();


  for (k = lb2dest; k <= ub2dest; ++k) {
    for (j = lb1dest; j <= ub1dest; ++j) {
      for (i = lb0dest; i <= ub0dest; ++i) {
        destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
#if debug
        cout << i << " " << j << " " << k << " " << destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl;
#endif
      }
    }
  }
  double end = MPI_Wtime();
  double elapsed_secs = (end - begin);
  cout << "Exec. time for serial code: " << elapsed_secs << endl;
#if debug
 cout <<" The serial result" << endl;
 Adest.print();
 cout << endl;
#endif

// real MPI in the following
  class RectMDArray< double  , 1 , 1 , 1 > Adest_new(bxdest);
  initialize(Adest_new);
  double *destinationDataPointer_new = Adest_new . getPointer();
  begin = MPI_Wtime();

  MPI_Bcast( &lb0src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb1src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb2src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub0src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub1src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub2src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_X_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Y_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Z_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_X_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Y_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Z_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 

  double *distsrc;
  double *distdest;
  int nghost = 1;
  int distsrcsize = arraySize_Z_src / nprocs;
  int offsetsrc = rank * distsrcsize;
  if(rank < arraySize_Z_src%nprocs)
  {
    distsrcsize++;
  }
  if(rank >= arraySize_Z_src%nprocs)
    offsetsrc += arraySize_Z_src%nprocs;
  else
    offsetsrc += rank;
  int distdestsize = arraySize_Z_dest / nprocs;
  int offsetdest = rank * distdestsize;
  if(rank < arraySize_Z_dest%nprocs)
  {
    distdestsize++;
  }
  if(rank >= arraySize_Z_dest%nprocs)
    offsetdest += arraySize_Z_dest%nprocs;
  else
    offsetdest += rank;

  distsrc = (double*)calloc(1,sizeof(double)*(distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src);
  distdest = (double*)calloc(1,sizeof(double)*(distdestsize)*arraySize_X_dest*arraySize_Y_dest);

   // team leader send data to all members
   int copyOffset = offsetdest * arraySize_X_src*arraySize_Y_src;
   int copySize = (distdestsize + 2 * nghost) * arraySize_X_src*arraySize_Y_src;
   if(nprocs > 1)    
   { 
     int dest, send_tag=1;
     MPI_Request send_reqs[nprocs-1];
     MPI_Status send_status[nprocs-1];
     for(dest = 1; dest < nprocs; ++dest)
     {
       int sendSize = arraySize_Z_dest / nprocs;
       int sendOffset = dest * sendSize;
       if(dest < arraySize_Z_dest%nprocs)
       {
         sendSize++;
       }
       sendSize = (sendSize+2)*arraySize_X_src*arraySize_Y_src;
       if(dest >= arraySize_Z_dest%nprocs)
         sendOffset += arraySize_Z_dest%nprocs;
       else
         sendOffset += dest;
       sendOffset = sendOffset*arraySize_X_src*arraySize_Y_src;
#if debug
cout << "Master send size " << sendSize<< " from offset " << sendOffset << " "  << " to " << dest << endl;
#endif
        MPI_Isend(sourceDataPointer+sendOffset, sendSize,MPI_DOUBLE, dest, send_tag, MPI_COMM_WORLD,&send_reqs[dest-1]);
//     int idx;
//     for(idx = 0; idx < sendSize; ++idx)
//      printf("Source send to dest:%d result %d: %f\n",dest, idx, sourceDataPointer[offsetsrc+sendOffset+idx]); 
     }  
     MPI_Waitall(nprocs-1,send_reqs,send_status);
   }
   // local copy (this is optional, but simplier for transformation)
   memcpy(distsrc,sourceDataPointer+copyOffset,copySize*sizeof(double));

// computation
  for (k = lb2dest; k < distdestsize; ++k) {
    for (j = lb1dest; j <= ub1dest; ++j) {
      for (i = lb0dest; i <= ub0dest; ++i) {
        distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = 
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)]  + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + 
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
#if debug
        cout << "rank0 " <<  i << " " << j << " " << k << " " <<  distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << \
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl;
#endif
      }
    }
  }

// team leader receives data to all members
    int src, recv_tag=1;
    MPI_Request recv_reqs[nprocs-1];
    MPI_Status recv_status[nprocs-1];
    for(src = 1; src < nprocs; ++src)
    {
      int recvSize = arraySize_Z_dest / nprocs;
      int recvOffset = src * recvSize;
      if(src < arraySize_Z_dest%nprocs)
      {
        recvSize++;
      }
      recvSize *= arraySize_X_dest*arraySize_Y_dest;
      if(src >= arraySize_Z_dest%nprocs)
        recvOffset += arraySize_Z_dest%nprocs;
      else
        recvOffset += src;
       recvOffset = recvOffset*arraySize_X_dest*arraySize_Y_dest;
       MPI_Irecv(destinationDataPointer_new+recvOffset, recvSize, MPI_DOUBLE, src, recv_tag, MPI_COMM_WORLD,&recv_reqs[src-1]);
    }  
    MPI_Waitall(nprocs-1,recv_reqs,recv_status);
    // local copy (this could be optional, but simpler for transformation)
    memcpy(destinationDataPointer_new+offsetdest,distdest,distdestsize*bxdest.size(0)*bxdest.size(1)*sizeof(double));

  end = MPI_Wtime();
  elapsed_secs = (end - begin);
  cout << "Exec. time for MPI code: " << elapsed_secs << endl;

#if debug
 cout <<" MPI result " << endl;
 Adest_new.print();
 cout << endl;
#endif
  assert(checksum(Adest, Adest_new)==0);
  return 0; 
}