/**
 * Builds a copy of the first half of `arr` with its indices rescaled.
 *
 * For each output index i in [0, arr.size()/2) the value is read from input
 * index size_t(i / constant). All other output elements keep whatever
 * getZeros() produced.
 *
 * Source indices that fall outside `arr` are skipped, so the matching output
 * element keeps its getZeros() value. This covers constant < 1 and a zero or
 * negative constant, which would otherwise read past the array or make the
 * float-to-integer conversion undefined.
 *
 * @param arr       input array
 * @param constant  factor the output index is divided by to find the source index
 * @return array with the same number of elements as `arr`
 */
ComplexArr shiftFreqs(ComplexArr arr, float constant){
    const size_t count = arr.size();
    ComplexArr retVal = getZeros(count);

    for (size_t i = 0; i < (count/2); i++)
    {
        // Same arithmetic as before: i is converted to float and divided.
        const float srcPos = i/constant;

        // Reject NaN (0/0), negative and too-large positions before the
        // float -> size_t conversion, which is undefined for such values.
        if (!(srcPos >= 0.0f) || static_cast<double>(srcPos) >= static_cast<double>(count))
        {
            continue;
        }

        const size_t src = size_t(srcPos);
        if (src < count)
        {
            retVal[i] = arr[src];
        }
    }
    return retVal;
}
// Small driver: builds the destination box [0,6]^DIM and a source box grown by
// one ghost cell, allocates one array on each, and adds a single shifted
// coefficient (unit shift in -x, weight `ident`) to an initially empty stencil.
// Asrc, Adest and C0 are declared but not used further in this function.
int main(int argc, char* argv[]) {
    // DQ: Modified code to add const.
    // Use of "const" makes the type a SgModifierType (so for now let's keep it simple).
    // const Point zero = getZeros();
    Point zero = getZeros();
    Point lo=zero;
    // Point hi=getOnes()*(DOMAINSIZE-1);
    Point hi=getOnes()*(6);
    Box bxdest(lo,hi); // box low and high corners for destination

    // This will grow the box by one ghost
    // along each face and become the box for
    // the source box.
    Box bxsrc = bxdest.grow(1);

    // source and destination data containers
    RectMDArray<double> Asrc(bxsrc);
    RectMDArray<double> Adest(bxdest);

    // all the coefficients I need for this operation
    const double ident = 1.0;
    const double C0 = -4.0;

    // An expression to recognize:
    // pair<Shift,double>(zero,C0);

    // This is a simpler interface to interpret (suggested by Anshu).
    // Stencil<double> laplace(pair<Shift,double>(zero,C0));
    Stencil<double> laplace;
    // laplace = laplace + (pair<Shift,double>(zero,C0));
    // laplace = laplace + (pair<Shift,double>(zero,C0));

    // Unit vector along dimension 0, negated in place to point in -x.
    Point xdir = getUnitv(0);
    xdir *= -1;
    laplace=laplace+(pair<Shift,double>(xdir,ident));
}
// Fills a_laplace with a (2*DIM+1)-point stencil: weight -2*DIM on the zero
// shift and weight 1 on the +/- unit shift of every dimension, then scales the
// whole stencil by 1/(a_dx*a_dx).
//
// a_laplace : stencil to overwrite (output)
// a_dx      : grid spacing used for the final scaling
void setStencil(Stencil<double>& a_laplace, const double & a_dx)
{
    double centerWeight = -2.0*DIM;
    double neighborWeight = 1.0;

    Point origin = getZeros();
    Point allOnes = getOnes();
    Point allNegOnes = allOnes*(-1);   // carried over from the original; not used below

    array<Shift,DIM> shifts = getShiftVec();

    // Start from the centre term; this replaces whatever a_laplace held.
    a_laplace = centerWeight*(shifts^origin);

    // Add the forward and then the backward neighbour of each dimension.
    int axis = 0;
    while (axis < DIM)
    {
        Point unit = getUnitv(axis);
        a_laplace = a_laplace + neighborWeight*(shifts^unit);
        a_laplace = a_laplace + neighborWeight*(shifts^(unit*(-1)));
        axis++;
    }

    //cout << "stencil unscaled by dx = " << endl;
    //a_laplace.stencilDump();

    a_laplace *= (1.0/a_dx/a_dx);
}
// Applies a (2*DIM+1)-point Laplacian stencil to every box of a_phi and writes
// the assembled result with MDWrite.
//
// a_phi : level data the stencil is applied to; exchange() is called on it first
// a_dx  : grid spacing; every coefficient is scaled by 1/(a_dx*a_dx)
// a_str : passed to MDWrite together with the result
void testlap(LevelData<double >& a_phi,
             double a_dx,char* a_str)
{
    double coef = 1./(a_dx*a_dx);

    // Centre term first, then one backward and one forward neighbour per dimension.
    Stencil<double> Laplacian(make_pair(getZeros(),-DIM*2*coef));
    BoxLayout layout = a_phi.getBoxLayout();

    int axis = 0;
    while (axis < DIM)
    {
        Point unit = getUnitv(axis);
        Stencil<double> forward(make_pair(Shift(unit),coef));
        Stencil<double> backward(make_pair(Shift(unit*(-1)),coef));
        Laplacian = Laplacian + backward + forward;
        axis++;
    }

    a_phi.exchange();

    // Accumulate the per-box results into one array over the whole domain.
    RectMDArray<double> result(layout.getDomain());
    BLIterator it(layout);
    while (it != it.end())
    {
        result |= Laplacian(a_phi[*it],layout[*it]);
        ++it;
    }

    MDWrite(a_str,result);
}
// Stencil test driver annotated with "omp target device(mpi)" pragmas.
//
// Builds a destination box [0,511]^3 and a source box grown by one ghost cell,
// initializes one array on each, builds a 7-point stencil object, and then
// evaluates the same 7-point formula with an explicit triple loop over raw
// pointers. The `laplace` object is constructed but the loop nest does not
// reference it.
int main(int argc,char *argv[])
{
// Indicate SPMD code segments, without changing scopes of variables
#pragma omp target device(mpi) begin
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 511;
  const class Point hi = getOnes() * adjustedBlockSize;
// box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box.
  const class Box bxsrc = bxdest . grow (1);
#pragma omp target device(mpi) end
// source and destination data containers
#pragma omp target data map(to:Asrc) map(from:Adest)
  class RectMDArray< double , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double , 1 , 1 , 1 > Adest(bxdest);
// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0 = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
// cout <<" The source Box" << endl;
// Asrc.print();
// cout << endl;
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
// Binary '*' binds tighter than '^', so the next line is S ^ (thishft * -1),
// matching the commented-out form above.
    laplace += ident*((S)^thishft * -1);
  }
// laplace.stencilDump();
// StencilOperator<double,double, double> op;

// Loop bounds and extents of the destination box. Suffix 0/1/2 is the
// dimension index; i/j/k iterate dimensions 0/1/2 respectively.
  int lb2 = bxdest . getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest . getHighCorner ()[2];
  int arraySize_X = bxdest . size (0);
  int lb1 = bxdest . getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest . getHighCorner ()[1];
  int arraySize_Y = bxdest . size (1);
  int lb0 = bxdest . getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest . getHighCorner ()[0];
  int arraySize_Z = bxdest . size (2);
  double *sourceDataPointer = Asrc . getPointer();
  double *destinationDataPointer = Adest . getPointer();
// Extents and low corners of the (ghosted) source box.
  int arraySize_X_src = bxsrc . size (0);
  int arraySize_Y_src = bxsrc . size (1);
  int arraySize_Z_src = bxsrc . size (2);
  int lb2src = bxsrc . getLowCorner ()[2];
  int lb1src = bxsrc . getLowCorner ()[1];
  int lb0src = bxsrc . getLowCorner ()[0];
// NOTE(review): the destination index below uses k, j, i directly (no
// subtraction of lb2/lb1/lb0), so it presumably relies on the destination low
// corner being zero - confirm.
#pragma omp target device(mpi) map(to:sourceDataPointer[lb0src:arraySize_X_src][lb1src:arraySize_Y_src][lb2src:arraySize_Z_src] dist_data(DUPLICATE, DUPLICATE, BLOCK)) map(from:destinationDataPointer[lb0:arraySize_X][lb1:arraySize_Y][lb2:arraySize_Z])
#pragma omp target parallel for
  for (k = lb2; k <= ub2; ++k) { // loop to be distributed must match the dimension being distributed (3rd dimension).
    for (j = lb1; j <= ub1; ++j) {
      for (i = lb0; i <= ub0; ++i) {
// Sum of the six face neighbours (k-1, k+1, j-1, j+1, i-1, i+1) plus -6 times
// the centre value; source indices are shifted by the source low corner.
        destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] =
            sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)]
          + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
// std::cout<< arraySize_X_src << " " << arraySize_Y_src << " " << i << ":" << j << ":" << k << " " << destinationDataPointer[arraySize_X * (arraySize_Y * k + j) + i] << "=" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << std::endl;
      }
    }
  }
// cout <<" The destination Box" << endl;
// Adest.print();
}
// Hand-translated MPI version of the 7-point stencil test.
//
// Outline, as visible in this function:
//   1. Initialize MPI (directly, or through xomp_init_mpi when USE_XOMP_MPI is set).
//   2. Rank 0 allocates source/destination/reference arrays, fills in the box
//      bounds and extents, and computes a sequential reference result.
//   3. The bounds and extents are broadcast from rank 0 to all ranks.
//   4. The source array is scattered along dimension 2 via the xomp runtime,
//      each rank runs the stencil on its local portion, and the results are
//      collected back on rank 0 via the xomp runtime.
//   5. Rank 0 asserts that checksum(reference, MPI result) == 0 and frees its arrays.
int main(int argc,char *argv[])
{
  // MPI setup
  // ------------------------------------------------------------------done
  int rank, nprocs;
#if USE_XOMP_MPI
  xomp_init_mpi (&argc, &argv, &rank, &nprocs);
#else
  MPI_Init (&argc, &argv);
  MPI_Comm_rank (MPI_COMM_WORLD, &rank);
  MPI_Comm_size (MPI_COMM_WORLD, &nprocs);
#endif

  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
  // DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
  // Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = SIZE;
  const class Point hi = getOnes() * adjustedBlockSize;
  // box low and high corners for destination
  const class Box bxdest(lo,hi);
  // This will grow the box by one ghost
  // along each face and become the box for
  // the source box.
  const class Box bxsrc = bxdest . grow (1);

  // MPI specific code: each array dimension boundary information : [lower: size]
  // upper bounds can be omitted
  // ---------------------------------------------------------------------
  // no need to handle them specially, Can we reuse existing variables ??
  int lb0src, lb1src, lb2src;
  // int ub0src, ub1src, ub2src;
  int arraySize_X_src, arraySize_Y_src, arraySize_Z_src;
  int lb0dest, lb1dest, lb2dest, ub0dest, ub1dest, ub2dest;
  int arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest;
  int i,j,k;

  // MPI specific code: for mapped arrays,
  //   to direction: one copy
  //   from direction: one copy also
  // NOTE(review): these three pointers are assigned only inside the
  // rank == 0 branch below; on other ranks they are still unassigned when
  // passed to the xomp_* calls further down - confirm the runtime ignores
  // them on non-root ranks.
  double *sourceDataPointer;
  double *destinationDataPointer;
  double *destinationDataPointer_ref;

  // WHY changed to pointer types?? necessary to have global scope, not limited by if(rank0)
  class RectMDArray< double , 1 , 1 , 1 > *Asrc;
  class RectMDArray< double , 1 , 1 , 1 > *Adest;
  class RectMDArray< double , 1 , 1 , 1 > *Adest_ref;

  double begin, end, elapsed_secs;

  if(rank == 0)
  {
    // Only master process allocate all data
    Asrc = new RectMDArray< double , 1 , 1 , 1 >(bxsrc);
    Adest= new RectMDArray< double , 1 , 1 , 1 >(bxdest);
    Adest_ref = new RectMDArray< double , 1 , 1 , 1 >(bxdest);

    initialize(*Asrc);
    initialize(*Adest);
    initialize(*Adest_ref);

    // these initialization can happen in every process, or not?
    // No, seg fault if move into all processes
    destinationDataPointer= Adest-> getPointer();

    // Source box (with halo): low corners and extents.
    lb2src = bxsrc . getLowCorner ()[2];
    lb1src = bxsrc . getLowCorner ()[1];
    arraySize_X_src = bxsrc . size (0);
    arraySize_Y_src = bxsrc . size (1);
    arraySize_Z_src = bxsrc . size (2);
    lb0src = bxsrc . getLowCorner ()[0];
    // ub2src = bxsrc . getHighCorner ()[2];
    // ub1src = bxsrc . getHighCorner ()[1];
    // ub0src = bxsrc . getHighCorner ()[0];

    // Destination box: bounds and extents. The destination low corner is
    // asserted to be zero in every dimension.
    lb2dest = bxdest . getLowCorner ()[2];
    ub2dest = bxdest . getHighCorner ()[2];
    arraySize_X_dest = bxdest . size (0);
    assert (lb2dest == 0);
    lb1dest = bxdest . getLowCorner ()[1];
    ub1dest = bxdest . getHighCorner ()[1];
    arraySize_Y_dest = bxdest . size (1);
    assert (lb1dest == 0);
    lb0dest = bxdest . getLowCorner ()[0];
    ub0dest = bxdest . getHighCorner ()[0];
    arraySize_Z_dest = bxdest . size (2);
    assert (lb0dest == 0);

    // They are different! one with halo, the other does not.
    assert (arraySize_Z_src == arraySize_Z_dest +2 );
    assert (arraySize_X_src == arraySize_X_dest +2 );
    assert (arraySize_Y_src == arraySize_Y_dest +2 );

    sourceDataPointer = Asrc->getPointer();
    destinationDataPointer_ref = Adest_ref->getPointer();

    begin = MPI_Wtime();
    //----------------------------------------------------------
    // reference sequential version
    for (k = lb2dest; k <= ub2dest; ++k) {
      for (j = lb1dest; j <= ub1dest; ++j) {
        for (i = lb0dest; i <= ub0dest; ++i) {
          // Six face neighbours plus -6 times the centre value.
          destinationDataPointer_ref[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] =
              sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)]
            + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)]
            + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)]
            + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)]
            + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)]
            + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)]
            + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
#if debug
          cout << i << " " << j << " " << k << " " << destinationDataPointer_ref[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= "
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+"
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+"
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+"
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+"
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+"
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+"
               << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl;
#endif
        }
      }
    }
    end = MPI_Wtime();
    elapsed_secs = (end - begin);
    cout << "Exec. time for serial code: " << elapsed_secs << endl;
#if debug
    cout <<" The serail result" << endl;
    Adest_ref->print();
    cout << endl;
#endif
    //------------------end sequential reference execution----------------------------------------
  }

  //
  // Be careful for a needed barrier!! avoid race condition!!
  MPI_Barrier(MPI_COMM_WORLD);

  if(rank == 0)
  {
    std::cout << "I am rank " << rank << " of " << nprocs << " processes, I am calling master subroutine" << std::endl;
#if 0
    Adest= new RectMDArray< double , 1 , 1 , 1 >(bxdest);
    initialize(*Adest);
    destinationDataPointer= Adest-> getPointer();
#endif
    // Restart the timer for the MPI section (rank 0 only).
    begin = MPI_Wtime();
  }
  else
  {
    std::cout << "I am rank " << rank << " of " << nprocs << " processes, I am calling member subroutine" << std::endl;
  }

  // -------------------TODO translate this 2015-10-26 ------------------
  // Translate mapped scalar data, communicate to all processes
  // Alternatively, each process calculate them??
  MPI_Bcast( &lb0src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &lb1src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &lb2src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  // upper bounds can be omitted, calculated by add lower+size
  // MPI_Bcast( &ub0src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  // MPI_Bcast( &ub1src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  // MPI_Bcast( &ub2src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &arraySize_X_src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &arraySize_Y_src, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &arraySize_Z_src, 1, MPI_INT, 0, MPI_COMM_WORLD);

  MPI_Bcast( &lb0dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &lb1dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &lb2dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  // upper bounds can be omitted, calculated by add lower+size
  MPI_Bcast( &ub0dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &ub1dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &ub2dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &arraySize_X_dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &arraySize_Y_dest, 1, MPI_INT, 0, MPI_COMM_WORLD);
  MPI_Bcast( &arraySize_Z_dest, 1, MPI_INT, 0, MPI_COMM_WORLD);

  // Translate mapped arrays
  // calculate offset
  // declare temp arrays for storing mapped arrays
  double *distsrc;
  double *distdest;
  int nghost = 1; // halo region size for source partitions

  // calculate strip size for the distributed dimension Z
  // a runtime function call for distributing data
  int offsetdest;   // _mpi_destinationDataPointer_offset_2
  int distdestsize; // _mpi_destinationDataPointer_size_2
  // _xomp_nprocs , _xomp_rank,
  // for source array,no need?
  // xomp_static_even_divide_start_size(lb2src,arraySize_Z_src,_xomp_nprocs,_xomp_rank,&_mpi_sourceDataPointer_offset_2,&_mpi_sourceDataPointer_size_2);
  // for destination array which needs being sent back
  // xomp_static_even_divide_start_size(lb2,arraySize_Z_dest,_xomp_nprocs,_xomp_rank,&_mpi_destinationDataPointer_offset_2,&_mpi_destinationDataPointer_size_2);
  xomp_static_even_divide_start_size (0, arraySize_Z_dest, nprocs, rank, & offsetdest, & distdestsize);

  // allocate source data partitions for each process
  // source partitions contain halo region elements
  // TODO a runtime allocation function call? too trivial
  // NOTE(review): calloc is called as (element size, element count), the
  // reverse of the usual (count, size) order; the total byte count is the same.
  // NOTE(review): neither distsrc nor distdest is freed in this function, and
  // &distsrc is later handed to xomp_divide_scatter_array_to_all - confirm
  // whether that call reuses or replaces this buffer.
  //distsrc = (double*)calloc(1,sizeof(double)*(distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src);
  distsrc = (double*)calloc(sizeof(double), (distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src);
  // allocate destination data, no need to initialize
  //distdest = (double*)calloc(1,sizeof(double)*(distdestsize)*arraySize_X_dest*arraySize_Y_dest);
  distdest = (double*)calloc(sizeof(double),(distdestsize)*arraySize_X_dest*arraySize_Y_dest);

  // copy source data from source master to each process's local buffer
  // wrap into a runtime function call
  // Parameters:
  //
  // INPUT
  //   sourceDataPointer: source array address
  //   source array dimension info: size_x, size_y, size_z
  //   distribution policy: block on z dimension
  //   total processes: nprocs
  //   rank ID,
  //   halo region size:
  //   offsetdest: 0, + chunk , +2chunks, ... etc , can be calculated internally
  //   distdestsize: this can be calculated internally
  //
  // OUTPUT: modified things
  //   distsrc: local copy of master/slave process
  // extern void xomp_divide_scatter_array_to_all (void * sourceDataPointer, int element_type_id, int x_dim_size, int y_dim_size, int z_dim_size,
  //        int distributed_dimension_id, int halo_size, int rank_id, int process_count, int** distsrc);
  // The literals below are the distributed dimension (2) and the halo size (1);
  // the halo is passed as a literal rather than via nghost.
  xomp_divide_scatter_array_to_all (sourceDataPointer, arraySize_X_src, arraySize_Y_src, arraySize_Z_src, 2, 1, rank, nprocs, &distsrc);
  // arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest);

  // TODO: is a barrier needed here??
  // computation, only k loop is distributed
  // Also matches the k loop distribution of the nested loop
  // another runtime function call for distributing loops
  // int _lower, _upper;
  //void xomp_static_even_divide_lower_upper (int start, int end, int thread_count, int thread_id, int* n_lower, int* n_upper);
  // This is wrong since the bounds are relative to the original global data buffer
  // What is needed is the bounds local to local portions. The lower bounds always start from 0. The size is the size of local portion!
  // So the bounds should match the size of a distributed array's local portion!!
  //xomp_static_even_divide_lower_upper (lb2dest, ub2dest, nprocs, rank, &_lower, &_upper);
  assert (lb2dest ==0);
  //assert (_lower ==0);
  //assert (_upper == distdestsize -1 );
  for (k = 0; k < distdestsize; ++k) { // is this correct??, should lower starts with 0??
  // for (k = _lower; k <= _upper; ++k) { // bounds obtained from runtime calls, inclusive bounds
    for (j = lb1dest; j <= ub1dest; ++j) {
      for (i = lb0dest; i <= ub0dest; ++i) {
        // the loop's arrays are replaced with distributed version, the rest is intact
        distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = \
            distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + \
            distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + \
            distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + \
            distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + \
            distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + \
            distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + \
            distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000;
      }
      //cout << rank<<": end of j = " << j << endl;
    }
    //cout << "end of k = " << k << endl;
  }

  // A runtime function to collect data
  // void xomp_collect_scattered_array_from_all ()
  // Parameters:
  //   destinationDataPointer // aggregated data on the master process
  //   distdest               // distributed portions on each process
  //   arraySize_X_dest
  //   arraySize_Y_dest
  //   arraySize_Z_dest
  //   halo_size              // not used for now
  //
  //   distribution_dimension_id
  //   nprocs
  //   rank_id
  // Calculated one:
  //   offsetdest, distdestsize
  // distribution ID is Z (2 of 0,1,2), halo region is size 0 in this case
  xomp_collect_scattered_array_from_all (distdest, arraySize_X_dest, arraySize_Y_dest, arraySize_Z_dest, 2, 0, rank, nprocs, &destinationDataPointer);

  //TODO Is this barrier necessary? Yes before the deletion later
  MPI_Barrier(MPI_COMM_WORLD);

  if(rank == 0)
  {
#if debug
    cout <<" MPI result " << endl;
    Adest->print();
    cout << endl;
#endif
    // The distributed result must match the sequential reference.
    assert(checksum(*Adest_ref, *Adest)==0);
    delete Adest_ref;
    delete Asrc;
    delete Adest;
  }

  MPI_Finalize();
}
// Stencil test driver in which the data arrays are copied into malloc'ed
// buffers addressed as 3-D variable-length-array pointers, the 7-point formula
// is evaluated on those buffers, and both buffers are copied back.
//
// The destination box is [0,31]^3 and the source box is that box grown by one
// ghost cell. The `laplace` stencil object is built but the loop nest does not
// reference it.
int main(int argc,char *argv[])
{
  const class Point zero = getZeros();
  const class Point ones = getOnes();
  const class Point negones = ones * -1;
  const class Point lo(zero);
// DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged).
// Point hi = getOnes()*(BLOCKSIZE-1);
  const int adjustedBlockSize = 31;
  const class Point hi = getOnes() * adjustedBlockSize;
// box low and high corners for destination
  const class Box bxdest(lo,hi);
// This will grow the box by one ghost
// along each face and become the box for
// the source box.
  const class Box bxsrc = bxdest . grow (1);
// source and destination data containers
  class RectMDArray< double , 1 , 1 , 1 > Asrc(bxsrc);
  class RectMDArray< double , 1 , 1 , 1 > Adest(bxdest);
// all the coefficients I need for this operation
  const double ident = 1.0;
// DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values.
// const double C0 = -2.0 * DIM;
  const double C0 = -6.00000;
  initialize(Asrc);
  initialize(Adest);
// cout <<" The source Box" << endl;
// Asrc.print();
// cout << endl;
// build the stencil, and the stencil operator
// Stencil<double> laplace(wt,shft);
  const std::array< Shift , 3 > S = getShiftVec();
// This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift);
  class Stencil< double > laplace = C0*((S)^(zero));
  for (int dir = 0; dir < 3; dir++) {
    const class Point thishft = getUnitv(dir);
// DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis.
// laplace = laplace + ident*(S^thishft);
// laplace = laplace + ident*(S^(thishft*(-1)));
    laplace += ident*((S)^(thishft));
// Binary '*' binds tighter than '^', so this is S ^ (thishft * -1).
    laplace += ident*((S)^thishft * -1);
  }
// Bounds and extents of the destination box; suffix 0/1/2 is the dimension.
  int lb2 = bxdest . getLowCorner ()[2];
  int k = 0;
  int ub2 = bxdest . getHighCorner ()[2];
  int arraySize_X = bxdest . size (0);
  int lb1 = bxdest . getLowCorner ()[1];
  int j = 0;
  int ub1 = bxdest . getHighCorner ()[1];
  int arraySize_Y = bxdest . size (1);
  int lb0 = bxdest . getLowCorner ()[0];
  int i = 0;
  int ub0 = bxdest . getHighCorner ()[0];
  int arraySize_Z = bxdest . size (2);
// NOTE(review): the source buffer below is sized and shaped with the
// destination extents, while Asrc was constructed on the larger box bxsrc.
// Only the first arraySize_X*arraySize_Y*arraySize_Z elements of Asrc are
// copied - confirm this is intended.
#pragma 0
  double (*sourceDataPointer)[arraySize_Y][arraySize_X];
  sourceDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
  memcpy(sourceDataPointer,Asrc.getPointer(),sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
#pragma 0
  double (*destinationDataPointer)[arraySize_Y][arraySize_X];
  destinationDataPointer = (double (*)[arraySize_Y][arraySize_X])malloc(sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
  memcpy(destinationDataPointer,Adest.getPointer(),sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
// NOTE(review): the loops start at the low corner and read index - 1 in each
// dimension (k + -1, j + -1, i + -1); if the low corner is 0, as lo = zero
// suggests, the first iteration indexes before the start of the buffer -
// verify. The upper bounds are exclusive (<), so the planes at ub2/ub1/ub0
// are not written.
#pragma 32
  for (k = lb2; k < ub2; ++k) {
    for (j = lb1; j < ub1; ++j) {
      for (i = lb0; i < ub0; ++i) {
// Six face neighbours plus -6 times the centre value.
        destinationDataPointer[k][j][i] =
            sourceDataPointer[k + -1][j][i]
          + sourceDataPointer[k + 1][j][i]
          + sourceDataPointer[k][j + -1][i]
          + sourceDataPointer[k][j + 1][i]
          + sourceDataPointer[k][j][i + -1]
          + sourceDataPointer[k][j][i + 1]
          + sourceDataPointer[k][j][i] * -6.00000;
      }
    }
  }
// Copy both buffers back into the arrays and release them.
  memcpy(Asrc.getPointer(),sourceDataPointer,sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
  memcpy(Adest.getPointer(),destinationDataPointer,sizeof(double)*arraySize_X*arraySize_Y*arraySize_Z);
  free(sourceDataPointer);
  free(destinationDataPointer);
// cout <<" The destination Box" << endl;
// Adest.print();
}
int master(int rank, int nprocs) { const class Point zero = getZeros(); const class Point ones = getOnes(); const class Point negones = ones * -1; const class Point lo(zero); // DQ (2/7/2015): Fixup for error yet to be fixed in ROSE (or fixed on alternative branch not yet merged). // Point hi = getOnes()*(BLOCKSIZE-1); const int adjustedBlockSize = SIZE; const class Point hi = getOnes() * adjustedBlockSize; //box low and high corners for destination const class Box bxdest(lo,hi); // This will grow the box by one ghost // along each face and become the box for // the source box. const class Box bxsrc = bxdest . grow (1); // source and destination data containers class RectMDArray< double , 1 , 1 , 1 > Asrc(bxsrc); class RectMDArray< double , 1 , 1 , 1 > Adest(bxdest); // all the coefficients I need for this operation const double ident = 1.0; // DQ (2/18/2015): I need the simpler version because the current constant folding does not operate on floating point values. // const double C0 = -2.0 * DIM; const double C0 = -6.00000; initialize(Asrc); initialize(Adest); #if debug cout <<" The source Box" << endl; Asrc.print(); cout << endl; #endif // build the stencil, and the stencil operator // Stencil<double> laplace(wt,shft); const std::array< Shift , 3 > S = getShiftVec(); // This calls: template <class T> Stencil<T> operator*(T a_coef, Shift a_shift); class Stencil< double > laplace = C0*((S)^(zero)); for (int dir = 0; dir < 3; dir++) { const class Point thishft = getUnitv(dir); // DQ (2/15/2015): Added operator+=() to support clearer updates of an existing object for compile-time analysis. // laplace = laplace + ident*(S^thishft); // laplace = laplace + ident*(S^(thishft*(-1))); laplace += ident*((S)^(thishft)); laplace += ident*((S)^thishft * -1); } // laplace.stencilDump(); // sequential version ------------------------------ // StencilOperator<double,double, double> op; double begin = MPI_Wtime(); int lb2src = bxsrc . 
getLowCorner ()[2]; int k = 0; int ub2src = bxsrc . getHighCorner ()[2]; int arraySize_X_src = bxsrc . size (0); int lb1src = bxsrc . getLowCorner ()[1]; int j = 0; int ub1src = bxsrc . getHighCorner ()[1]; int arraySize_Y_src = bxsrc . size (1); int lb0src = bxsrc . getLowCorner ()[0]; int i = 0; int ub0src = bxsrc . getHighCorner ()[0]; int arraySize_Z_src = bxsrc . size (2); int lb2dest = bxdest . getLowCorner ()[2]; int ub2dest = bxdest . getHighCorner ()[2]; int arraySize_X_dest = bxdest . size (0); int lb1dest = bxdest . getLowCorner ()[1]; int ub1dest = bxdest . getHighCorner ()[1]; int arraySize_Y_dest = bxdest . size (1); int lb0dest = bxdest . getLowCorner ()[0]; int ub0dest = bxdest . getHighCorner ()[0]; int arraySize_Z_dest = bxdest . size (2); double *sourceDataPointer = Asrc . getPointer(); double *destinationDataPointer = Adest . getPointer(); for (k = lb2dest; k <= ub2dest; ++k) { for (j = lb1dest; j <= ub1dest; ++j) { for (i = lb0dest; i <= ub0dest; ++i) { destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000; #if debug cout << i << " " << j << " " << k << " " << destinationDataPointer[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] << "= " << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + 
(j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << sourceDataPointer[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl; #endif } } } double end = MPI_Wtime(); double elapsed_secs = (end - begin); cout << "Exec. time for serial code: " << elapsed_secs << endl; #if debug cout <<" The serial result" << endl; Adest.print(); cout << endl; #endif // real MPI in the following class RectMDArray< double , 1 , 1 , 1 > Adest_new(bxdest); initialize(Adest_new); double *destinationDataPointer_new = Adest_new . 
getPointer(); begin = MPI_Wtime(); MPI_Bcast( &lb0src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb1src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb2src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub0src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub1src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub2src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_X_src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Y_src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Z_src, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &lb2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub0dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub1dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &ub2dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_X_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Y_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); MPI_Bcast( &arraySize_Z_dest, 1, MPI_INT, 0, MPI_COMM_WORLD); double *distsrc; double *distdest; int nghost = 1; int distsrcsize = arraySize_Z_src / nprocs; int offsetsrc = rank * distsrcsize; if(rank < arraySize_Z_src%nprocs) { distsrcsize++; } if(rank >= arraySize_Z_src%nprocs) offsetsrc += arraySize_Z_src%nprocs; else offsetsrc += rank; int distdestsize = arraySize_Z_dest / nprocs; int offsetdest = rank * distdestsize; if(rank < arraySize_Z_dest%nprocs) { distdestsize++; } if(rank >= arraySize_Z_dest%nprocs) offsetdest += arraySize_Z_dest%nprocs; else offsetdest += rank; distsrc = (double*)calloc(1,sizeof(double)*(distdestsize+2*nghost)*arraySize_X_src*arraySize_Y_src); distdest = (double*)calloc(1,sizeof(double)*(distdestsize)*arraySize_X_dest*arraySize_Y_dest); // team leader send data to all members int copyOffset = offsetdest * arraySize_X_src*arraySize_Y_src; int copySize = (distdestsize + 2 * nghost) * arraySize_X_src*arraySize_Y_src; if(nprocs > 1) { int dest, send_tag=1; MPI_Request 
send_reqs[nprocs-1]; MPI_Status send_status[nprocs-1]; for(dest = 1; dest < nprocs; ++dest) { int sendSize = arraySize_Z_dest / nprocs; int sendOffset = dest * sendSize; if(dest < arraySize_Z_dest%nprocs) { sendSize++; } sendSize = (sendSize+2)*arraySize_X_src*arraySize_Y_src; if(dest >= arraySize_Z_dest%nprocs) sendOffset += arraySize_Z_dest%nprocs; else sendOffset += dest; sendOffset = sendOffset*arraySize_X_src*arraySize_Y_src; #if debug cout << "Master send size " << sendSize<< " from offset " << sendOffset << " " << " to " << dest << endl; #endif MPI_Isend(sourceDataPointer+sendOffset, sendSize,MPI_DOUBLE, dest, send_tag, MPI_COMM_WORLD,&send_reqs[dest-1]); // int idx; // for(idx = 0; idx < sendSize; ++idx) // printf("Source send to dest:%d result %d: %f\n",dest, idx, sourceDataPointer[offsetsrc+sendOffset+idx]); } MPI_Waitall(nprocs-1,send_reqs,send_status); } // local copy (this is optional, but simplier for transformation) memcpy(distsrc,sourceDataPointer+copyOffset,copySize*sizeof(double)); // computation for (k = lb2dest; k < distdestsize; ++k) { for (j = lb1dest; j <= ub1dest; ++j) { for (i = lb0dest; i <= ub0dest; ++i) { distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] = distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] + \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] + distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] * -6.00000; #if debug cout << "rank0 " << i << " " << j << " " << k << " " << distdest[arraySize_X_dest * (arraySize_Y_dest * k + j) + i] 
<< "= " << \ distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + -1) + (j-lb1src)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * ((k-lb2src) + 1) + (j-lb1src)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + -1)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + ((j-lb1src) + 1)) + (i-lb0src)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + -1)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + ((i-lb0src) + 1)] << "+" << \ distsrc[arraySize_X_src * (arraySize_Y_src * (k-lb2src) + (j-lb1src)) + (i-lb0src)] << "* -6.00000" << endl; #endif } } } // team leader receives data to all members int src, recv_tag=1; MPI_Request recv_reqs[nprocs-1]; MPI_Status recv_status[nprocs-1]; for(src = 1; src < nprocs; ++src) { int recvSize = arraySize_Z_dest / nprocs; int recvOffset = src * recvSize; if(src < arraySize_Z_dest%nprocs) { recvSize++; } recvSize *= arraySize_X_dest*arraySize_Y_dest; if(src >= arraySize_Z_dest%nprocs) recvOffset += arraySize_Z_dest%nprocs; else recvOffset += src; recvOffset = recvOffset*arraySize_X_dest*arraySize_Y_dest; MPI_Irecv(destinationDataPointer_new+recvOffset, recvSize, MPI_DOUBLE, src, recv_tag, MPI_COMM_WORLD,&recv_reqs[src-1]); } MPI_Waitall(nprocs-1,recv_reqs,recv_status); // local copy (this could be optional, but simpler for transformation) memcpy(destinationDataPointer_new+offsetdest,distdest,distdestsize*bxdest.size(0)*bxdest.size(1)*sizeof(double)); end = MPI_Wtime(); elapsed_secs = (end - begin); cout << "Exec. time for MPI code: " << elapsed_secs << endl; #if debug cout <<" MPI result " << endl; Adest_new.print(); cout << endl; #endif assert(checksum(Adest, Adest_new)==0); return 0; }
/**
 * Shifts the contents of `arr` towards higher indices by `n` positions.
 *
 * Output element i + n receives input element i for i in [0, arr.size() - n).
 * The first n output elements keep whatever getZeros() produced, and the last
 * n input elements are dropped.
 *
 * When n >= arr.size() nothing can be copied and the getZeros() array is
 * returned unchanged. Previously n > arr.size() made `arr.size() - n` wrap
 * around to a huge slice length.
 *
 * @param arr  input array
 * @param n    number of positions to shift by
 * @return array with the same number of elements as `arr`
 */
ComplexArr shiftArray(ComplexArr arr, size_t n){
    const size_t total = arr.size();
    ComplexArr retVal = getZeros(total);

    if (n >= total)
    {
        return retVal;
    }

    const size_t kept = total - n;
    ComplexArr inter = arr[std::slice(0, kept, 1)];
    retVal[std::slice(n, kept, 1)] = inter;
    return retVal;
}