Example #1
ComplexArr shiftFreqs(ComplexArr arr, float constant){
	ComplexArr retVal = getZeros(arr.size());
	for (size_t i = 0; i < (arr.size()/2); i++) {
		retVal[i] = arr[size_t(i/constant)];
	}
	return retVal;
}
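shiftFreqs reads input bin i/constant for each output bin i in the lower half of the array, so the frequency content is stretched by a factor of constant and every other bin stays zero. The following is a minimal self-contained sketch of how it might be exercised; ComplexArr and getZeros are not defined in this snippet, so the aliases used here are assumptions (Example #9 suggests ComplexArr behaves like a std::valarray of complex values), not the library's actual declarations.

#include <complex>
#include <valarray>
#include <cstddef>
#include <iostream>

// Assumed stand-ins for the types/helpers used by the example above.
using ComplexArr = std::valarray<std::complex<double>>;
static ComplexArr getZeros(std::size_t n) { return ComplexArr(std::complex<double>(0.0, 0.0), n); }

ComplexArr shiftFreqs(ComplexArr arr, float constant) {
	ComplexArr retVal = getZeros(arr.size());
	for (std::size_t i = 0; i < (arr.size() / 2); i++) {
		retVal[i] = arr[std::size_t(i / constant)];
	}
	return retVal;
}

int main() {
	ComplexArr spectrum = getZeros(8);
	for (std::size_t i = 0; i < spectrum.size(); ++i)
		spectrum[i] = std::complex<double>(double(i), 0.0);
	ComplexArr stretched = shiftFreqs(spectrum, 2.0f);   // bin i now holds old bin i/2
	for (std::size_t i = 0; i < stretched.size(); ++i)
		std::cout << stretched[i].real() << " ";
	std::cout << "\n";                                   // prints 0 0 1 1, then zeros
}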
Example #2
int main(int argc, char* argv[])
   {
  // DQ: Modified code to add const.

  // Use of "const" makes the type a SgModifierType (so for now let's keep it simple).
  // const Point zero = getZeros();
     Point zero = getZeros();

     Point lo=zero;
  // Point hi=getOnes()*(DOMAINSIZE-1);
     Point hi=getOnes()*(6);

     Box bxdest(lo,hi); //box low and high corners for destination
  
  // This will grow the box by one ghost
  // along each face and become the box for
  // the source box. 
     Box bxsrc = bxdest.grow(1);

  // source and destination data containers
     RectMDArray<double> Asrc(bxsrc);
     RectMDArray<double> Adest(bxdest);

  // all the coefficients I need for this operation
     const double ident =  1.0;
     const double C0    = -4.0;

  // An expression to recognize: 
  // pair<Shift,double>(zero,C0);

  // This is a simpler interface to interpret (suggested by Anshu).
  // Stencil<double> laplace(pair<Shift,double>(zero,C0));

     Stencil<double> laplace;

  // laplace = laplace + (pair<Shift,double>(zero,C0));
  // laplace = laplace + (pair<Shift,double>(zero,C0));

     Point xdir = getUnitv(0);

     xdir *= -1;

     laplace=laplace+(pair<Shift,double>(xdir,ident));

   }
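The commented-out lines above show the intended interface: a Stencil<double> is assembled by adding pair<Shift,double> terms, each pairing an index-space shift with a coefficient. As a rough mental model only (the toy types below are stand-ins, not the library's actual Point/Shift/Stencil classes), the accumulation amounts to collecting offset/weight pairs, here for the full 7-point Laplacian that the later examples build:

#include <array>
#include <iostream>
#include <utility>
#include <vector>

// Toy stand-ins for the library's Point/Shift/Stencil types (illustration only).
using ToyPoint = std::array<int, 3>;                        // integer offset in index space

struct ToyStencil {
    std::vector<std::pair<ToyPoint, double>> terms;         // accumulated (offset, weight) pairs
    ToyStencil& operator+=(std::pair<ToyPoint, double> t) { terms.push_back(t); return *this; }
};

int main() {
    ToyStencil laplace;
    ToyPoint zero{0, 0, 0};
    laplace += std::make_pair(zero, -6.0);                  // center weight, C0 = -2*DIM for DIM = 3
    for (int dir = 0; dir < 3; ++dir) {                     // +1 and -1 unit shift in each direction
        ToyPoint plus{0, 0, 0}, minus{0, 0, 0};
        plus[dir] = 1;
        minus[dir] = -1;
        laplace += std::make_pair(plus, 1.0);
        laplace += std::make_pair(minus, 1.0);
    }
    std::cout << "terms in the stencil: " << laplace.terms.size() << "\n";   // 7
}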
Example #3
void setStencil(Stencil<double>& a_laplace,
                const double   & a_dx)
{
  double C0=-2.0*DIM;
  Point zero=getZeros();
  Point ones=getOnes();
  Point negones=ones*(-1);
  double ident=1.0;
  array<Shift,DIM> S=getShiftVec();

  a_laplace = C0*(S^zero);
  for (int dir=0;dir<DIM;dir++)
    {
      Point thishft=getUnitv(dir);
      a_laplace = a_laplace + ident*(S^thishft);
      a_laplace = a_laplace + ident*(S^(thishft*(-1)));
    }
  //cout  << "stencil unscaled by dx = " << endl;
  //a_laplace.stencilDump();
  
  a_laplace *= (1.0/a_dx/a_dx);
}
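The operator assembled above is the standard second-order (2*DIM+1)-point Laplacian: weight -2*DIM/dx^2 at the center and 1/dx^2 at each of the 2*DIM face neighbors. As a minimal stand-alone check of what those weights compute, here is the same operator written with plain arrays in one dimension (no Stencil library), applied to u(x) = x^2, whose exact second derivative is 2:

#include <cstdio>
#include <vector>

// Apply the 3-point 1D Laplacian (u[i-1] - 2*u[i] + u[i+1]) / dx^2 to u(x) = x^2.
int main() {
    const int n = 8;
    const double dx = 0.1;
    std::vector<double> u(n), lap(n, 0.0);
    for (int i = 0; i < n; ++i) {
        double x = i * dx;
        u[i] = x * x;
    }
    for (int i = 1; i < n - 1; ++i)
        lap[i] = (u[i - 1] - 2.0 * u[i] + u[i + 1]) / (dx * dx);
    for (int i = 1; i < n - 1; ++i)
        std::printf("lap[%d] = %f\n", i, lap[i]);   // every interior value is 2 (up to rounding)
}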
Example #4
void testlap(LevelData<double >& a_phi, double a_dx,char* a_str)
{
  double coef = 1./(a_dx*a_dx);
  Stencil<double> Laplacian(make_pair(getZeros(),-DIM*2*coef));
  BoxLayout bl = a_phi.getBoxLayout();
  
  for (int dir = 0; dir < DIM ; dir++)
    {
      Point edir = getUnitv(dir);
      Stencil<double> plus(make_pair(Shift(edir),coef));
      Stencil<double> minus(make_pair(Shift(edir*(-1)),coef));
      Laplacian = Laplacian + minus + plus;
    }
  
  a_phi.exchange();
  RectMDArray<double> LPhi00(bl.getDomain());
  for (BLIterator blit(bl); blit != blit.end(); ++blit)
    {
      LPhi00 |= Laplacian(a_phi[*blit],bl[*blit]);
    }
   MDWrite(a_str,LPhi00);
};
int main(int argc,char *argv[])
{

// Indicate SPMD code segments, without changing scopes of variables
#pragma omp target device(mpi) begin  
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 511;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);
#pragma omp target device(mpi) end

// source and destination data containers
#pragma omp target data map(to:Asrc) map(from:Adest)
  class RectMDArray< double  , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double  , 1 , 1 , 1 > Adest(bxdest);

// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0    = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
// cout <<" The source Box" << endl;
// Asrc.print();
// cout << endl;
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift  , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double  > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
    laplace += ident*((S)^thishft * -1);
  }
// laplace.stencilDump();
// StencilOperator<double,double, double> op;

  int lb2 = bxdest .  getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest .  getHighCorner ()[2];
  int arraySize_X = bxdest .  size (0);
  int lb1 = bxdest .  getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest .  getHighCorner ()[1];
  int arraySize_Y = bxdest .  size (1);
  int lb0 = bxdest .  getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest .  getHighCorner ()[0];
  int arraySize_Z = bxdest .  size (2);

  double *sourceDataPointer = Asrc . getPointer();
  double *destinationDataPointer = Adest . getPointer();

  int arraySize_X_src = bxsrc .  size (0);
  int arraySize_Y_src = bxsrc .  size (1);
  int arraySize_Z_src = bxsrc .  size (2);
  int lb2src = bxsrc .  getLowCorner ()[2];
  int lb1src = bxsrc .  getLowCorner ()[1];
  int lb0src = bxsrc .  getLowCorner ()[0];

#pragma omp target device(mpi) map(to:sourceDataPointer[lb0src:arraySize_X_src][lb1src:arraySize_Y_src][lb2src:arraySize_Z_src] dist_data(DUPLICATE, DUPLICATE, BLOCK)) map(from:destinationDataPointer[lb0:arraySize_X][lb1:arraySize_Y][lb2:arraySize_Z])
#pragma omp target parallel for 
  for (k = lb2; k <= ub2; ++k) { // loop to be distributed must match the dimension being distributed (3rd dimension).
    for (j = lb1; j <= ub1; ++j) {
      for (i = lb0; i <= ub0; ++i) {
        destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] =
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
//        std::cout<< arraySize_X_src << " " << arraySize_Y_src << " " << i << ":" << j << ":" << k << " " << destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] << "=" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << std::endl;
      }
    }
  }
// cout <<" The destination Box" << endl;
// Adest.print();
}
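The generated loop nest above addresses flat buffers, so the triple index (i, j, k) is linearized row-major as arraySize_X * (arraySize_Y * k + j) + i, and the source reads subtract the lower corners (lb0src, lb1src, lb2src), which are -1 here because the source box was grown by one ghost cell per face. Below is a small self-contained sketch of that addressing with made-up sizes, not code from the original program:

#include <cstdio>
#include <vector>

// Row-major flattening used by the generated loops above: x varies fastest, then y, then z.
inline int flat(int nx, int ny, int i, int j, int k) { return nx * (ny * k + j) + i; }

int main() {
    const int nxd = 4, nyd = 4, nzd = 4;                    // destination box (illustrative sizes)
    const int nxs = nxd + 2, nys = nyd + 2, nzs = nzd + 2;  // source box grown by one ghost layer
    std::vector<double> src(nxs * nys * nzs, 1.0), dst(nxd * nyd * nzd, 0.0);

    // Destination point (i,j,k) reads source point (i+1, j+1, k+1) and its six face neighbours,
    // which is what the (i - lb0src) style offsets achieve when lb0src = lb1src = lb2src = -1.
    for (int k = 0; k < nzd; ++k)
      for (int j = 0; j < nyd; ++j)
        for (int i = 0; i < nxd; ++i)
          dst[flat(nxd, nyd, i, j, k)] =
              src[flat(nxs, nys, i + 1, j + 1, k)] + src[flat(nxs, nys, i + 1, j + 1, k + 2)] +
              src[flat(nxs, nys, i + 1, j, k + 1)] + src[flat(nxs, nys, i + 1, j + 2, k + 1)] +
              src[flat(nxs, nys, i, j + 1, k + 1)] + src[flat(nxs, nys, i + 2, j + 1, k + 1)] +
              src[flat(nxs, nys, i + 1, j + 1, k + 1)] * -6.0;

    std::printf("dst[0] = %f\n", dst[flat(nxd, nyd, 0, 0, 0)]);  // 6*1 - 6*1 = 0 for constant input
}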
Example #6
int main(int argc,char *argv[])
{
// MPI setup
// ------------------------------------------------------------------done 
  int rank, nprocs;

#if USE_XOMP_MPI
  xomp_init_mpi (&argc, &argv, &rank, &nprocs);
#else
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
#endif

  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = SIZE;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);

// MPI-specific code: boundary information for each array dimension: [lower : size]
// upper bounds can be omitted
// ---------------------------------------------------------------------
// No need to handle them specially; can we reuse the existing variables?
  int lb0src, lb1src, lb2src; 
//  int ub0src, ub1src, ub2src;

  int arraySize_X_src, arraySize_Y_src, arraySize_Z_src;

  int lb0dest, lb1dest, lb2dest, ub0dest, ub1dest, ub2dest;
  int arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest;

  int i,j,k;

// MPI specific code: for mapped arrays, 
//          to direction: one copy
//          from direction: one copy also

  double *sourceDataPointer;
  double *destinationDataPointer;
  double *destinationDataPointer_ref;

// Why were these changed to pointer types? They must stay visible outside the if (rank == 0) block below.
  class RectMDArray< double  , 1 , 1 , 1 > *Asrc;
  class RectMDArray< double  , 1 , 1 , 1 > *Adest;

  class RectMDArray< double  , 1 , 1 , 1 > *Adest_ref;
  
  double begin, end, elapsed_secs; 

  if(rank == 0)
  {

// Only the master process allocates all data
    Asrc = new RectMDArray< double  , 1 , 1 , 1 >(bxsrc);
    Adest= new RectMDArray< double  , 1 , 1 , 1 >(bxdest);
    Adest_ref = new RectMDArray< double  , 1 , 1 , 1 >(bxdest);

    initialize(*Asrc);
    initialize(*Adest);
    initialize(*Adest_ref);

// Could these initializations happen in every process, or not?
// No: the code segfaults if they are moved into all processes.
    destinationDataPointer= Adest-> getPointer();

    lb2src = bxsrc .  getLowCorner ()[2];
    lb1src = bxsrc .  getLowCorner ()[1];

    arraySize_X_src = bxsrc .  size (0);
    arraySize_Y_src = bxsrc .  size (1);
    arraySize_Z_src = bxsrc .  size (2);

    lb0src = bxsrc .  getLowCorner ()[0];

//    ub2src = bxsrc .  getHighCorner ()[2];
//    ub1src = bxsrc .  getHighCorner ()[1];
//    ub0src = bxsrc .  getHighCorner ()[0];


    lb2dest = bxdest .  getLowCorner ()[2];
    ub2dest = bxdest .  getHighCorner ()[2];
    arraySize_X_dest = bxdest .  size (0);
    assert (lb2dest == 0);

    lb1dest = bxdest .  getLowCorner ()[1];
    ub1dest = bxdest .  getHighCorner ()[1];
    arraySize_Y_dest = bxdest .  size (1);
    assert (lb1dest == 0);

    lb0dest = bxdest .  getLowCorner ()[0];
    ub0dest = bxdest .  getHighCorner ()[0];
    arraySize_Z_dest = bxdest .  size (2);
    assert (lb0dest == 0);
    // They are different: the source box has a halo, the destination box does not.
    assert (arraySize_Z_src == arraySize_Z_dest +2 );
    assert (arraySize_X_src == arraySize_X_dest +2 );
    assert (arraySize_Y_src == arraySize_Y_dest +2 );

    sourceDataPointer = Asrc->getPointer();
    destinationDataPointer_ref = Adest_ref->getPointer();

    begin = MPI_Wtime();
//----------------------------------------------------------    
//   reference sequential version
    for (k = lb2dest; k <= ub2dest; ++k) {
      for (j = lb1dest; j <= ub1dest; ++j) {
        for (i = lb0dest; i <= ub0dest; ++i) {
          destinationDataPointer_ref[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + 
                 sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;

#if debug
        cout << i << " " << j << " " << k << " " << destinationDataPointer_ref[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl;
#endif
       }
     }
   }
   end = MPI_Wtime();
   elapsed_secs = (end - begin);
   cout << "Exec. time for serial code: " << elapsed_secs << endl;
#if debug
   cout <<" The serial result" << endl;
   Adest_ref->print();
   cout << endl;
#endif
//------------------end sequential reference execution----------------------------------------    
  }
//
// A barrier is needed here to avoid a race condition.
  MPI_Barrier(MPI_COMM_WORLD);

  if(rank == 0)
  {
    std::cout << "I am rank " << rank << " of " <<  nprocs << " processes, I am calling master subroutine" << std::endl;
#if 0
    Adest= new RectMDArray< double  , 1 , 1 , 1 >(bxdest);
    initialize(*Adest);
    destinationDataPointer= Adest-> getPointer();
#endif
    begin = MPI_Wtime();
  }
  else
  {
    std::cout << "I am rank " << rank << " of " <<  nprocs << " processes, I am calling member subroutine" << std::endl;
  }


// -------------------TODO translate this 2015-10-26 ------------------
// Translate mapped scalar data, communicate to all processes
// Alternatively, could each process calculate them itself?

  MPI_Bcast( &lb0src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb1src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb2src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
// upper bounds can be omitted; they can be recomputed as lower + size
//  MPI_Bcast( &ub0src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
//  MPI_Bcast( &ub1src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
//  MPI_Bcast( &ub2src, 1, MPI_INT, 0, MPI_COMM_WORLD); 

  MPI_Bcast( &arraySize_X_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Y_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Z_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 

  MPI_Bcast( &lb0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 

// upper bounds can be omitted; they can be recomputed as lower + size
  MPI_Bcast( &ub0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 

  MPI_Bcast( &arraySize_X_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Y_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Z_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 


// Translate mapped arrays
// calculate offset
// declare temp arrays for storing mapped arrays
  double *distsrc;
  double *distdest;
  int nghost = 1; // halo region size for source partitions

  // calculate strip size for the distributed dimension Z

  // a runtime function call for distributing data
  int offsetdest; // _mpi_destinationDataPointer_offset_2
  int distdestsize; // _mpi_destinationDataPointer_size_2
  // _xomp_nprocs , _xomp_rank, 
  // for source array,no need?
  // xomp_static_even_divide_start_size(lb2src,arraySize_Z_src,_xomp_nprocs,_xomp_rank,&_mpi_sourceDataPointer_offset_2,&_mpi_sourceDataPointer_size_2);
  // for destination array which needs being sent back

  // xomp_static_even_divide_start_size(lb2,arraySize_Z_dest,_xomp_nprocs,_xomp_rank,&_mpi_destinationDataPointer_offset_2,&_mpi_destinationDataPointer_size_2);
  xomp_static_even_divide_start_size (0, arraySize_Z_dest, nprocs, rank, & offsetdest, & distdestsize);

  //allocate source data partitions for each process
  // source partitions contain halo region elements
  // TODO a runtime allocation function call? too trivial
  //distsrc = (double*)calloc(1,sizeof(double)*(distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src);
  distsrc = (double*)calloc(sizeof(double), (distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src);

  // allocate destination data, no need to initialize
  //distdest = (double*)calloc(1,sizeof(double)*(distdestsize)*arraySize_X_dest*arraySize_Y_dest);
  distdest = (double*)calloc(sizeof(double),(distdestsize)*arraySize_X_dest*arraySize_Y_dest);

  // copy source data from source master to each process's local buffer
  // wrap into a runtime function call
  // Parameters: 
  //
  // INPUT
  //    sourceDataPointer: source array address
  //    source array dimension info:  size_x, size_y, size_z
  //    distribution policy: block on z dimension
  //    total processes: nprocs
  //    rank ID, 
  //    halo region size: 
  //    offsetdest: 0, + chunk , +2chunks, ... etc , can be calculated internally
  //    distdestsize: this can be calculated internally 
  //
  // OUTPUT: modified things
  //      distsrc: local copy of master/slave process
// extern void xomp_divide_scatter_array_to_all (void * sourceDataPointer, int element_type_id, int x_dim_size, int y_dim_size, int z_dim_size,
// int distributed_dimension_id, int halo_size, int rank_id, int process_count, int** distsrc);
  xomp_divide_scatter_array_to_all (sourceDataPointer, arraySize_X_src, arraySize_Y_src, arraySize_Z_src, 2, 1, rank, nprocs, &distsrc);
//       arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest);


// TODO: is a barrier needed here??

// computation, only k loop is distributed
// Also matches the k loop distribution of the nested loop
// another runtime function call for distributing loops 
 // int _lower, _upper; 
  //void xomp_static_even_divide_lower_upper (int start, int end, int thread_count, int thread_id, int* n_lower, int* n_upper);
  // This is wrong, since those bounds are relative to the original global data buffer.
  // What is needed are bounds local to each portion: the lower bound always starts at 0,
  // and the extent is the size of the local portion.
  // So the loop bounds should match the size of a distributed array's local portion.
  //xomp_static_even_divide_lower_upper (lb2dest, ub2dest, nprocs, rank, &_lower, &_upper);

  assert (lb2dest ==0);
  //assert (_lower ==0);
  //assert (_upper == distdestsize -1 );
  for (k = 0; k < distdestsize; ++k) { // is this correct? should the local lower bound start at 0?
  // for (k = _lower; k <= _upper; ++k) { // bounds obtained from runtime calls, inclusive bounds
    for (j = lb1dest; j <= ub1dest; ++j) {
      for (i = lb0dest; i <= ub0dest; ++i) { // the loop's arrays are replaced with the distributed versions; the rest is intact
        distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = \
             distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + \
             distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + \
             distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + \
             distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + \
             distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + \
             distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + \
             distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
      }
//cout << rank<<": end of j = " << j << endl;
    }
//cout << "end of k = " << k << endl;
  }

// A runtime function to collect data
// void xomp_collect_scattered_array_from_all ()
// Parameters:
// destinationDataPointer // aggregated data on the master process
// distdest // distributed portions on each process
//  arraySize_X_dest
//  arraySize_Y_dest
//  arraySize_Z_dest
//  halo_size // not used for now
//
//  distribution_dimension_id
//  nprocs
//  rank_id
// Calculated one:
//    offsetdest, distdestsize
  // distribution ID is Z (2 of 0,1,2), halo region is size 0 in this case
  xomp_collect_scattered_array_from_all (distdest, arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest, 2, 0, rank, nprocs, &destinationDataPointer);

  // TODO: Is this barrier necessary? Yes, it must come before the deletions below.
  MPI_Barrier(MPI_COMM_WORLD);

  if(rank == 0)
  {
#if debug
   cout <<" MPI result " << endl;
   Adest->print();
   cout << endl;
#endif
  assert(checksum(*Adest_ref, *Adest)==0);
  delete Adest_ref;
  delete Asrc;
  delete Adest;
  }

  MPI_Finalize();
}
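xomp_static_even_divide_start_size(0, arraySize_Z_dest, nprocs, rank, &offsetdest, &distdestsize) is the runtime call that block-distributes the Z planes; its implementation is not shown here, but the hand-written distribution in Example #8 spells out the same policy: each rank gets size/nprocs planes, the first size % nprocs ranks get one extra, and the offsets follow accordingly. A small sketch of that policy (the function name below is a stand-in for illustration, not the actual runtime symbol):

#include <cstdio>

// Block-distribute `size` items over `nprocs` ranks, spreading the remainder
// over the first ranks (mirrors the explicit logic in Example #8).
static void even_divide_start_size(int start, int size, int nprocs, int rank,
                                   int* offset, int* chunk) {
    int base = size / nprocs;
    int rem  = size % nprocs;
    *chunk  = base + (rank < rem ? 1 : 0);
    *offset = start + rank * base + (rank < rem ? rank : rem);
}

int main() {
    int offset, chunk;
    for (int r = 0; r < 4; ++r) {
        even_divide_start_size(0, 10, 4, r, &offset, &chunk);
        std::printf("rank %d: offset %d, size %d\n", r, offset, chunk);   // 0/3, 3/3, 6/2, 8/2
    }
}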
Example #7
int main(int argc,char *argv[])
{
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 31;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);
// source and destination data containers
  class RectMDArray< double  , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double  , 1 , 1 , 1 > Adest(bxdest);
// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0    = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
// cout <<" The source Box" << endl;
// Asrc.print();
// cout << endl;
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift  , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double  > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
    laplace += ident*((S)^thishft * -1);
  }
  int lb2 = bxdest .  getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest .  getHighCorner ()[2];
  int arraySize_X = bxdest .  size (0);
  int lb1 = bxdest .  getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest .  getHighCorner ()[1];
  int arraySize_Y = bxdest .  size (1);
  int lb0 = bxdest .  getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest .  getHighCorner ()[0];
  int arraySize_Z = bxdest .  size (2);
#pragma 0
  double (*sourceDataPointer)[arraySize_Y][arraySize_X];
  sourceDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);    
  memcpy(sourceDataPointer,Asrc.getPointer(),sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); 
#pragma 0
  double (*destinationDataPointer)[arraySize_Y][arraySize_X];
  destinationDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);    
  memcpy(destinationDataPointer,Adest.getPointer(),sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); 
#pragma 32
  for (k = lb2; k < ub2; ++k) {
    for (j = lb1; j < ub1; ++j) {
      for (i = lb0; i < ub0; ++i) {
        destinationDataPointer[k][j][i] = sourceDataPointer[k + -1][j][i] + sourceDataPointer[k + 1][j][i] + sourceDataPointer[k][j + -1][i] + sourceDataPointer[k][j + 1][i] + sourceDataPointer[k][j][i + -1] + sourceDataPointer[k][j][i + 1] + sourceDataPointer[k][j][i] * -6.00000;
      }
    }
  }
  memcpy(Asrc.getPointer(),sourceDataPointer,sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z); 
  memcpy(Adest.getPointer(),destinationDataPointer,sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
  free(sourceDataPointer); 
  free(destinationDataPointer); 
// cout <<" The destination Box" << endl;
// Adest.print();
}
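This variant copies the RectMDArray data into plain buffers and views them through pointers to variable-length array types, so destinationDataPointer[k][j][i] performs the row-major index arithmetic implicitly. A minimal sketch of that trick in isolation; VLA types are C99, and in C++ (as used here) they rely on a compiler extension such as GCC's or Clang's:

#include <cstdio>
#include <cstdlib>

int main() {
    int nx = 3, ny = 3, nz = 2;                         // runtime sizes, as in the example above
    // View a flat malloc'ed buffer as a 3D array (C99 VLA pointer; GNU extension in C++).
    double (*a)[ny][nx] = (double (*)[ny][nx])std::malloc(sizeof(double) * nx * ny * nz);
    for (int k = 0; k < nz; ++k)
        for (int j = 0; j < ny; ++j)
            for (int i = 0; i < nx; ++i)
                a[k][j][i] = 100 * k + 10 * j + i;      // a[k][j][i] == flat[nx*(ny*k + j) + i]
    double* flat = &a[0][0][0];
    std::printf("%g %g\n", a[1][2][1], flat[nx * (ny * 1 + 2) + 1]);   // both print 121
    std::free(a);
}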
Example #8
int master(int rank, int nprocs)
{
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = SIZE;
  const class Point hi = getOnes() * adjustedBlockSize;
//box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box. 
  const class Box bxsrc = bxdest .  grow (1);
// source and destination data containers
  class RectMDArray< double  , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double  , 1 , 1 , 1 > Adest(bxdest);
// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0    = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
#if debug
 cout <<" The source Box" << endl;
 Asrc.print();
 cout << endl;
#endif
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift  , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double  > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
    laplace += ident*((S)^thishft * -1);
  }
// laplace.stencilDump();
// sequential version ------------------------------
// StencilOperator<double,double, double> op;
  double begin = MPI_Wtime();
  int lb2src = bxsrc .  getLowCorner ()[2];
  int k = 0;
  int ub2src = bxsrc .  getHighCorner ()[2];
  int arraySize_X_src = bxsrc .  size (0);
  int lb1src = bxsrc .  getLowCorner ()[1];
  int j = 0;
  int ub1src = bxsrc .  getHighCorner ()[1];
  int arraySize_Y_src = bxsrc .  size (1);
  int lb0src = bxsrc .  getLowCorner ()[0];
  int i = 0;
  int ub0src = bxsrc .  getHighCorner ()[0];
  int arraySize_Z_src = bxsrc .  size (2);
  int lb2dest = bxdest .  getLowCorner ()[2];
  int ub2dest = bxdest .  getHighCorner ()[2];
  int arraySize_X_dest = bxdest .  size (0);
  int lb1dest = bxdest .  getLowCorner ()[1];
  int ub1dest = bxdest .  getHighCorner ()[1];
  int arraySize_Y_dest = bxdest .  size (1);
  int lb0dest = bxdest .  getLowCorner ()[0];
  int ub0dest = bxdest .  getHighCorner ()[0];
  int arraySize_Z_dest = bxdest .  size (2);
  double *sourceDataPointer = Asrc . getPointer();
  double *destinationDataPointer = Adest . getPointer();


  for (k = lb2dest; k <= ub2dest; ++k) {
    for (j = lb1dest; j <= ub1dest; ++j) {
      for (i = lb0dest; i <= ub0dest; ++i) {
        destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] =
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] +
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
#if debug
        cout << i << " " << j << " " << k << " " << destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl;
#endif
      }
    }
  }
  double end = MPI_Wtime();
  double elapsed_secs = (end - begin);
  cout << "Exec. time for serial code: " << elapsed_secs << endl;
#if debug
 cout <<" The serial result" << endl;
 Adest.print();
 cout << endl;
#endif

// real MPI in the following
  class RectMDArray< double  , 1 , 1 , 1 > Adest_new(bxdest);
  initialize(Adest_new);
  double *destinationDataPointer_new = Adest_new . getPointer();
  begin = MPI_Wtime();

  MPI_Bcast( &lb0src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb1src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb2src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub0src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub1src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub2src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_X_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Y_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Z_src, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &lb2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &ub2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_X_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Y_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 
  MPI_Bcast( &arraySize_Z_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); 

  double *distsrc;
  double *distdest;
  int nghost = 1;
  int distsrcsize = arraySize_Z_src / nprocs;
  int offsetsrc = rank * distsrcsize;
  if(rank < arraySize_Z_src%nprocs)
  {
    distsrcsize++;
  }
  if(rank >= arraySize_Z_src%nprocs)
    offsetsrc += arraySize_Z_src%nprocs;
  else
    offsetsrc += rank;
  int distdestsize = arraySize_Z_dest / nprocs;
  int offsetdest = rank * distdestsize;
  if(rank < arraySize_Z_dest%nprocs)
  {
    distdestsize++;
  }
  if(rank >= arraySize_Z_dest%nprocs)
    offsetdest += arraySize_Z_dest%nprocs;
  else
    offsetdest += rank;

  distsrc = (double*)calloc(1,sizeof(double)*(distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src);
  distdest = (double*)calloc(1,sizeof(double)*(distdestsize)*arraySize_X_dest*arraySize_Y_dest);

   // team leader sends data to all members
   int copyOffset = offsetdest * arraySize_X_src*arraySize_Y_src;
   int copySize = (distdestsize + 2 * nghost) * arraySize_X_src*arraySize_Y_src;
   if(nprocs > 1)    
   { 
     int dest, send_tag=1;
     MPI_Request send_reqs[nprocs-1];
     MPI_Status send_status[nprocs-1];
     for(dest = 1; dest < nprocs; ++dest)
     {
       int sendSize = arraySize_Z_dest / nprocs;
       int sendOffset = dest * sendSize;
       if(dest < arraySize_Z_dest%nprocs)
       {
         sendSize++;
       }
       sendSize = (sendSize+2)*arraySize_X_src*arraySize_Y_src;
       if(dest >= arraySize_Z_dest%nprocs)
         sendOffset += arraySize_Z_dest%nprocs;
       else
         sendOffset += dest;
       sendOffset = sendOffset*arraySize_X_src*arraySize_Y_src;
#if debug
cout << "Master send size " << sendSize<< " from offset " << sendOffset << " "  << " to " << dest << endl;
#endif
        MPI_Isend(sourceDataPointer+sendOffset, sendSize,MPI_DOUBLE, dest, send_tag, MPI_COMM_WORLD,&send_reqs[dest-1]);
//     int idx;
//     for(idx = 0; idx < sendSize; ++idx)
//      printf("Source send to dest:%d result %d: %f\n",dest, idx, sourceDataPointer[offsetsrc+sendOffset+idx]); 
     }  
     MPI_Waitall(nprocs-1,send_reqs,send_status);
   }
   // local copy (this is optional, but simpler for the transformation)
   memcpy(distsrc,sourceDataPointer+copyOffset,copySize*sizeof(double));

// computation
  for (k = lb2dest; k < distdestsize; ++k) {
    for (j = lb1dest; j <= ub1dest; ++j) {
      for (i = lb0dest; i <= ub0dest; ++i) {
        distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = 
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)]  + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + 
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
#if debug
        cout << "rank0 " <<  i << " " << j << " " << k << " " <<  distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << \
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << \
distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl;
#endif
      }
    }
  }

// team leader receives data from all members
    int src, recv_tag=1;
    MPI_Request recv_reqs[nprocs-1];
    MPI_Status recv_status[nprocs-1];
    for(src = 1; src < nprocs; ++src)
    {
      int recvSize = arraySize_Z_dest / nprocs;
      int recvOffset = src * recvSize;
      if(src < arraySize_Z_dest%nprocs)
      {
        recvSize++;
      }
      recvSize *= arraySize_X_dest*arraySize_Y_dest;
      if(src >= arraySize_Z_dest%nprocs)
        recvOffset += arraySize_Z_dest%nprocs;
      else
        recvOffset += src;
       recvOffset = recvOffset*arraySize_X_dest*arraySize_Y_dest;
       MPI_Irecv(destinationDataPointer_new+recvOffset, recvSize, MPI_DOUBLE, src, recv_tag, MPI_COMM_WORLD,&recv_reqs[src-1]);
    }  
    MPI_Waitall(nprocs-1,recv_reqs,recv_status);
    // local copy (this could be optional, but simpler for transformation)
    memcpy(destinationDataPointer_new+offsetdest,distdest,distdestsize*bxdest.size(0)*bxdest.size(1)*sizeof(double));

  end = MPI_Wtime();
  elapsed_secs = (end - begin);
  cout << "Exec. time for MPI code: " << elapsed_secs << endl;

#if debug
 cout <<" MPI result " << endl;
 Adest_new.print();
 cout << endl;
#endif
  assert(checksum(Adest, Adest_new)==0);
  return 0; 
}
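Only the master's side of the exchange is shown in this example; the matching worker routine is not included. From the Isend/Irecv pattern above, each worker would have to receive its chunk of source planes plus 2*nghost halo planes, run the same triple loop over its local chunk, and send its destination planes back with the same tag. The sketch below is a hypothetical counterpart written to match that pattern, not code from the original program:

#include <mpi.h>
#include <cstdlib>

// Hypothetical worker-side counterpart to master() above.
// Assumes the geometry scalars have already arrived via the MPI_Bcast calls.
void worker(int rank, int nprocs,
            int nxs, int nys,            // arraySize_X_src, arraySize_Y_src
            int nxd, int nyd, int nzd,   // arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest
            int lb0src, int lb1src, int lb2src,
            int lb0dest, int ub0dest, int lb1dest, int ub1dest) {
    int nghost = 1;
    int chunk = nzd / nprocs + (rank < nzd % nprocs ? 1 : 0);   // local number of z-planes

    double* distsrc  = (double*)std::calloc((chunk + 2 * nghost) * nxs * nys, sizeof(double));
    double* distdest = (double*)std::calloc(chunk * nxd * nyd, sizeof(double));

    MPI_Status st;
    MPI_Recv(distsrc, (chunk + 2 * nghost) * nxs * nys, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD, &st);

    // Same stencil as the master's local loop, over the local chunk only.
    for (int k = 0; k < chunk; ++k)
      for (int j = lb1dest; j <= ub1dest; ++j)
        for (int i = lb0dest; i <= ub0dest; ++i)
          distdest[nxd * (nyd * k + j) + i] =
              distsrc[nxs * (nys * ((k - lb2src) - 1) + (j - lb1src)) + (i - lb0src)] +
              distsrc[nxs * (nys * ((k - lb2src) + 1) + (j - lb1src)) + (i - lb0src)] +
              distsrc[nxs * (nys * (k - lb2src) + ((j - lb1src) - 1)) + (i - lb0src)] +
              distsrc[nxs * (nys * (k - lb2src) + ((j - lb1src) + 1)) + (i - lb0src)] +
              distsrc[nxs * (nys * (k - lb2src) + (j - lb1src)) + ((i - lb0src) - 1)] +
              distsrc[nxs * (nys * (k - lb2src) + (j - lb1src)) + ((i - lb0src) + 1)] +
              distsrc[nxs * (nys * (k - lb2src) + (j - lb1src)) + (i - lb0src)] * -6.0;

    MPI_Send(distdest, chunk * nxd * nyd, MPI_DOUBLE, 0, 1, MPI_COMM_WORLD);
    std::free(distsrc);
    std::free(distdest);
}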
Example #9
ComplexArr shiftArray(ComplexArr arr, size_t n){
	ComplexArr retVal = getZeros(arr.size());
	ComplexArr inter = arr[std::slice(0, arr.size()-n, 1)];
	retVal[std::slice(n, arr.size()-n, 1)] = inter;
	return retVal;
}
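shiftArray uses std::slice(start, size, stride) views on a valarray to move the first size-n elements up by n positions, leaving the front zero-filled. A minimal self-contained sketch, again assuming ComplexArr is a std::valarray<std::complex<double>> and getZeros(n) returns an n-element zero array (both are assumptions, not the library's actual declarations):

#include <complex>
#include <valarray>
#include <cstddef>
#include <iostream>

// Assumed stand-ins for the types/helpers used by the example above.
using ComplexArr = std::valarray<std::complex<double>>;
static ComplexArr getZeros(std::size_t n) { return ComplexArr(std::complex<double>(0.0, 0.0), n); }

ComplexArr shiftArray(ComplexArr arr, std::size_t n) {
	ComplexArr retVal = getZeros(arr.size());
	ComplexArr inter = arr[std::slice(0, arr.size() - n, 1)];   // first size-n elements
	retVal[std::slice(n, arr.size() - n, 1)] = inter;           // written n positions later
	return retVal;
}

int main() {
	ComplexArr a = getZeros(6);
	for (std::size_t i = 0; i < a.size(); ++i)
		a[i] = std::complex<double>(double(i + 1), 0.0);
	ComplexArr shifted = shiftArray(a, 2);
	for (std::size_t i = 0; i < shifted.size(); ++i)
		std::cout << shifted[i].real() << " ";
	std::cout << "\n";   // prints 0 0 1 2 3 4
}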