void MICValidate(const Matrix2D<T>& s, const Matrix2D<T>& t,double valErrThreshold,unsigned int nValErrsToPrint) { assert( (s.GetNumRows() == t.GetNumRows()) && (s.GetNumColumns() == t.GetNumColumns()) ); #if 1 for( unsigned int i = 0; i < s.GetNumRows(); i++ ) { for( unsigned int j = 0; j < s.GetNumColumns(); j++ ) { T expVal = s.GetConstData()[i][j]; T actualVal = t.GetConstData()[i][j]; T delta = fabsf( actualVal - expVal ); T relError = (expVal != 0.0f) ? delta / expVal : 0.0f; if( relError > valErrThreshold ) { std::cout<<"Failed\n"; return; } } } std::cout<<"Passed\n"; #endif #if 0 std::cout<<"Expected Value \n"; for( unsigned int i = 0; i < s.GetNumRows(); i++ ) { for( unsigned int j = 0; j < s.GetNumColumns(); j++ ) { T expVal = s.GetConstData()[i][j]; std::cout<<expVal<<" "; } std::cout<<endl; } std::cout<<"Calculated vaue \n"; for( unsigned int i = 0; i < s.GetNumRows(); i++ ) { for( unsigned int j = 0; j < s.GetNumColumns(); j++ ) { T expVal = t.GetConstData()[i][j]; std::cout<<expVal<<" "; } std::cout<<endl; } #endif }
void Initialize<T>::operator()( Matrix2D<T>& mtx ) { srand48( seed ); int nTileRows = mtx.GetNumRows() - 2 * haloWidth; if( (rowPeriod != -1) && (rowPeriod < nTileRows) ) { nTileRows = rowPeriod; } int nTileCols = mtx.GetNumColumns() - 2 * haloWidth; if( (colPeriod != -1) && (colPeriod < nTileCols) ) { nTileCols = colPeriod; } // initialize first tile for( unsigned int i = 0; i < nTileRows; i++ ) { for( unsigned int j = 0; j < nTileCols; j++ ) { #ifndef READY mtx.GetData()[i+haloWidth][j+haloWidth] = i * j; #else mtx.GetData()[i+haloWidth][j+haloWidth] = (T)drand48(); #endif // READY } } // initialize any remaining tiles // first we fill along rows a tile at a time, // then fill out along columns a row at a time if( colPeriod != -1 ) { int nTiles = (mtx.GetNumColumns() - 2*haloWidth) / colPeriod; if( (mtx.GetNumColumns() - 2*haloWidth) % colPeriod != 0 ) { nTiles += 1; } for( unsigned int t = 1; t < nTiles; t++ ) { for( unsigned int i = 0; i < nTileRows; i++ ) { memcpy( &(mtx.GetData()[haloWidth + i][haloWidth + t*nTileCols]), &(mtx.GetData()[haloWidth + i][haloWidth]), nTileCols * sizeof(T) ); } } } if( rowPeriod != -1 ) { int nTiles = (mtx.GetNumRows() - 2*haloWidth) / rowPeriod; if( (mtx.GetNumRows() - 2*haloWidth) % rowPeriod != 0 ) { nTiles += 1; } for( unsigned int t = 1; t < nTiles; t++ ) { for( unsigned int i = 0; i < nTileRows; i++ ) { memcpy( &(mtx.GetData()[haloWidth + t*nTileRows + i][haloWidth]), &(mtx.GetData()[haloWidth + i][haloWidth]), (mtx.GetNumColumns() - 2*haloWidth) * sizeof(T) ); } } } // initialize halo for( unsigned int i = 0; i < mtx.GetNumRows(); i++ ) { for( unsigned int j = 0; j < mtx.GetNumColumns(); j++ ) { bool inHalo = false; if( (i < haloWidth) || (i > mtx.GetNumRows() - 1 - haloWidth) ) { inHalo = true; } else if( (j < haloWidth) || (j > mtx.GetNumColumns() - 1 - haloWidth) ) { inHalo = true; } if( inHalo ) { mtx.GetData()[i][j] = haloVal; } } } }
void MPICUDAStencil<T>::DoPreIterationWork( T* currBuf, // in device global memory T* altBuf, // in device global memory Matrix2D<T>& mtx, unsigned int iter ) { // do the halo exchange at desired frequency // note that we *do not* do the halo exchange here before the // first iteration, because we did it already (before we first // pushed the data onto the device) in our operator() method. unsigned int haloWidth = this->GetNumberIterationsPerHaloExchange(); if( (iter > 0) && (iter % haloWidth) == 0 ) { unsigned int nRows = mtx.GetNumRows(); unsigned int nCols = mtx.GetNumColumns(); unsigned int nPaddedCols = mtx.GetNumPaddedColumns(); T* flatData = mtx.GetFlatData(); size_t nsDataItemCount = haloWidth * nPaddedCols; size_t ewDataItemCount = haloWidth * nRows; size_t nsDataSize = nsDataItemCount * sizeof(T); size_t ewDataSize = ewDataItemCount * sizeof(T); // // read current data off device // we only read halo, and only for sides where we have a neighbor // if( this->HaveNorthNeighbor() ) { // north data is contiguous - copy directly into matrix cudaMemcpy( flatData + (haloWidth * nPaddedCols), // dest currBuf + (haloWidth * nPaddedCols), // src nsDataSize, // amount to transfer cudaMemcpyDeviceToHost ); // direction } if( this->HaveSouthNeighbor() ) { // south data is contiguous - copy directly into matrix cudaMemcpy( flatData + ((nRows - 2*haloWidth)*nPaddedCols), // dest currBuf + ((nRows - 2*haloWidth)*nPaddedCols), // src nsDataSize, // amount to transfer cudaMemcpyDeviceToHost ); // direction } if( this->HaveEastNeighbor() ) { // east data is non-contiguous - but CUDA has a strided read cudaMemcpy2D( flatData + (nCols - 2*haloWidth), // dest nPaddedCols * sizeof(T), // dest pitch currBuf + (nCols - 2*haloWidth), // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyDeviceToHost ); // transfer direction } if( this->HaveWestNeighbor() ) { // west data is non-contiguous - but CUDA has a strided read cudaMemcpy2D( flatData + haloWidth, // dest nPaddedCols * sizeof(T), // dest pitch currBuf + haloWidth, // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyDeviceToHost ); // transfer direction } // // do the actual halo exchange // if( dumpData ) { DumpData( ofs, mtx, "before halo exchange" ); } DoHaloExchange( mtx ); if( dumpData ) { DumpData( ofs, mtx, "after halo exchange" ); } // // push updated data back onto device // we only write halo, and only for sides where we have a neighbor // if( this->HaveNorthNeighbor() ) { // north data is contiguous - copy directly from matrix cudaMemcpy( currBuf, // dest flatData, // src nsDataSize, // amount to transfer cudaMemcpyHostToDevice ); // direction } if( this->HaveSouthNeighbor() ) { // south data is contiguous - copy directly from matrix cudaMemcpy( currBuf + ((nRows - haloWidth)*nPaddedCols), // dest flatData + ((nRows - haloWidth)*nPaddedCols), // src nsDataSize, // amount to transfer cudaMemcpyHostToDevice ); // direction } if( this->HaveEastNeighbor() ) { // east data is non-contiguous - but CUDA has a strided write cudaMemcpy2D( currBuf + (nCols - haloWidth), // dest nPaddedCols * sizeof(T), // dest pitch flatData + (nCols - haloWidth), // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyHostToDevice ); // transfer direction } if( this->HaveWestNeighbor() ) { // west data is non-contiguous - but CUDA has a strided write cudaMemcpy2D( currBuf, // dest nPaddedCols * sizeof(T), // dest pitch flatData, // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyHostToDevice ); // transfer direction } // we have changed the local halo values on the device // we need to update the local 1-wide halo in the alt buffer // note we only need to update the 1-wide halo here, even if // our real halo width is larger size_t rowExtent = mtx.GetNumPaddedColumns() * sizeof(T); cudaMemcpy2D( altBuf, // destination rowExtent, // destination pitch currBuf, // source rowExtent, // source pitch rowExtent, // width of data to transfer (bytes) 1, // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); cudaMemcpy2D( altBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(), // destination rowExtent, // destination pitch currBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(), // source rowExtent, // source pitch rowExtent, // width of data to transfer (bytes) 1, // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); // copy the non-contiguous data cudaMemcpy2D( altBuf, // destination rowExtent, // destination pitch currBuf, // source rowExtent, // source pitch sizeof(T), // width of data to transfer (bytes) mtx.GetNumRows(), // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); cudaMemcpy2D( altBuf + (mtx.GetNumColumns() - 1), // destination rowExtent, // destination pitch currBuf + (mtx.GetNumColumns() - 1), // source rowExtent, // source pitch sizeof(T), // width of data to transfer (bytes) mtx.GetNumRows(), // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); } }