void MICValidate(const Matrix2D<T>& s, const Matrix2D<T>& t,double valErrThreshold,unsigned int nValErrsToPrint) { assert( (s.GetNumRows() == t.GetNumRows()) && (s.GetNumColumns() == t.GetNumColumns()) ); #if 1 for( unsigned int i = 0; i < s.GetNumRows(); i++ ) { for( unsigned int j = 0; j < s.GetNumColumns(); j++ ) { T expVal = s.GetConstData()[i][j]; T actualVal = t.GetConstData()[i][j]; T delta = fabsf( actualVal - expVal ); T relError = (expVal != 0.0f) ? delta / expVal : 0.0f; if( relError > valErrThreshold ) { std::cout<<"Failed\n"; return; } } } std::cout<<"Passed\n"; #endif #if 0 std::cout<<"Expected Value \n"; for( unsigned int i = 0; i < s.GetNumRows(); i++ ) { for( unsigned int j = 0; j < s.GetNumColumns(); j++ ) { T expVal = s.GetConstData()[i][j]; std::cout<<expVal<<" "; } std::cout<<endl; } std::cout<<"Calculated vaue \n"; for( unsigned int i = 0; i < s.GetNumRows(); i++ ) { for( unsigned int j = 0; j < s.GetNumColumns(); j++ ) { T expVal = t.GetConstData()[i][j]; std::cout<<expVal<<" "; } std::cout<<endl; } #endif }
void Initialize<T>::operator()( Matrix2D<T>& mtx ) { srand48( seed ); int nTileRows = mtx.GetNumRows() - 2 * haloWidth; if( (rowPeriod != -1) && (rowPeriod < nTileRows) ) { nTileRows = rowPeriod; } int nTileCols = mtx.GetNumColumns() - 2 * haloWidth; if( (colPeriod != -1) && (colPeriod < nTileCols) ) { nTileCols = colPeriod; } // initialize first tile for( unsigned int i = 0; i < nTileRows; i++ ) { for( unsigned int j = 0; j < nTileCols; j++ ) { #ifndef READY mtx.GetData()[i+haloWidth][j+haloWidth] = i * j; #else mtx.GetData()[i+haloWidth][j+haloWidth] = (T)drand48(); #endif // READY } } // initialize any remaining tiles // first we fill along rows a tile at a time, // then fill out along columns a row at a time if( colPeriod != -1 ) { int nTiles = (mtx.GetNumColumns() - 2*haloWidth) / colPeriod; if( (mtx.GetNumColumns() - 2*haloWidth) % colPeriod != 0 ) { nTiles += 1; } for( unsigned int t = 1; t < nTiles; t++ ) { for( unsigned int i = 0; i < nTileRows; i++ ) { memcpy( &(mtx.GetData()[haloWidth + i][haloWidth + t*nTileCols]), &(mtx.GetData()[haloWidth + i][haloWidth]), nTileCols * sizeof(T) ); } } } if( rowPeriod != -1 ) { int nTiles = (mtx.GetNumRows() - 2*haloWidth) / rowPeriod; if( (mtx.GetNumRows() - 2*haloWidth) % rowPeriod != 0 ) { nTiles += 1; } for( unsigned int t = 1; t < nTiles; t++ ) { for( unsigned int i = 0; i < nTileRows; i++ ) { memcpy( &(mtx.GetData()[haloWidth + t*nTileRows + i][haloWidth]), &(mtx.GetData()[haloWidth + i][haloWidth]), (mtx.GetNumColumns() - 2*haloWidth) * sizeof(T) ); } } } // initialize halo for( unsigned int i = 0; i < mtx.GetNumRows(); i++ ) { for( unsigned int j = 0; j < mtx.GetNumColumns(); j++ ) { bool inHalo = false; if( (i < haloWidth) || (i > mtx.GetNumRows() - 1 - haloWidth) ) { inHalo = true; } else if( (j < haloWidth) || (j > mtx.GetNumColumns() - 1 - haloWidth) ) { inHalo = true; } if( inHalo ) { mtx.GetData()[i][j] = haloVal; } } } }
template <class T> void MICStencil<T>::operator()( Matrix2D<T>& mtx, unsigned int nIters ) { unsigned int uDimWithHalo = mtx.GetNumRows(); unsigned int uHaloWidth = LINESIZE / sizeof(T); unsigned int uImgElements = uDimWithHalo * uDimWithHalo; __declspec(target(mic), align(LINESIZE)) T* pIn = mtx.GetFlatData(); __declspec(target(mic), align(sizeof(T))) T wcenter = this->wCenter; __declspec(target(mic), align(sizeof(T))) T wdiag = this->wDiagonal; __declspec(target(mic), align(sizeof(T))) T wcardinal = this->wCardinal; #pragma offload target(mic) in(pIn:length(uImgElements) ALLOC RETAIN) { // Just copy pIn to compute the copy transfer time } #pragma offload target(mic) in(pIn:length(uImgElements) REUSE RETAIN) \ in(uImgElements) in(uDimWithHalo) \ in(wcenter) in(wdiag) in(wcardinal) { unsigned int uRowPartitions = sysconf(_SC_NPROCESSORS_ONLN) / 4 - 1; unsigned int uColPartitions = 4; // Threads per core for KNC unsigned int uRowTileSize = (uDimWithHalo - 2 * uHaloWidth) / uRowPartitions; unsigned int uColTileSize = (uDimWithHalo - 2 * uHaloWidth) / uColPartitions; uRowTileSize = ((uDimWithHalo - 2 * uHaloWidth) % uRowPartitions > 0) ? (uRowTileSize + 1) : (uRowTileSize); // Should use the "Halo Val" when filling the memory space T *pTmp = (T*)pIn; T *pCrnt = (T*)memset((T*)_mm_malloc(uImgElements * sizeof(T), LINESIZE), 0, uImgElements * sizeof(T)); #pragma omp parallel firstprivate(pTmp, pCrnt, uRowTileSize, uColTileSize, uHaloWidth, uDimWithHalo) { unsigned int uThreadId = omp_get_thread_num(); unsigned int uRowTileId = uThreadId / uColPartitions; unsigned int uColTileId = uThreadId % uColPartitions; unsigned int uStartLine = uRowTileId * uRowTileSize + uHaloWidth; unsigned int uStartCol = uColTileId * uColTileSize + uHaloWidth; unsigned int uEndLine = uStartLine + uRowTileSize; uEndLine = (uEndLine > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndLine; unsigned int uEndCol = uStartCol + uColTileSize; uEndCol = (uEndCol > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndCol; T cardinal0 = 0.0; T diagonal0 = 0.0; T center0 = 0.0; unsigned int cntIterations, i, j; for (cntIterations = 0; cntIterations < nIters; cntIterations ++) { // Do Stencil Operation for (i = uStartLine; i < uEndLine; i++) { T * pCenter = &pTmp [ i * uDimWithHalo]; T * pTop = pCenter - uDimWithHalo; T * pBottom = pCenter + uDimWithHalo; T * pOut = &pCrnt[ i * uDimWithHalo]; __assume_aligned(pCenter, 64); __assume_aligned(pTop, 64); __assume_aligned(pBottom, 64); __assume_aligned(pOut, 64); #pragma simd vectorlengthfor(float) for (j = uStartCol; j < uEndCol; j++) { cardinal0 = pCenter[j - 1] + pCenter[j + 1] + pTop[j] + pBottom[j]; diagonal0 = pTop[j - 1] + pTop[j + 1] + pBottom[j - 1] + pBottom[j + 1]; center0 = pCenter[j]; pOut[j] = wcardinal * cardinal0 + wdiag * diagonal0 + wcenter * center0; } } #pragma omp barrier ; // Switch pointers T* pAux = pTmp; pTmp = pCrnt; pCrnt = pAux; } // End For } // End Parallel _mm_free(pCrnt); } // End Offload #pragma offload target(mic) out(pIn:length(uImgElements) REUSE FREE) { // Just copy back pIn } }
void MPICUDAStencil<T>::DoPreIterationWork( T* currBuf, // in device global memory T* altBuf, // in device global memory Matrix2D<T>& mtx, unsigned int iter ) { // do the halo exchange at desired frequency // note that we *do not* do the halo exchange here before the // first iteration, because we did it already (before we first // pushed the data onto the device) in our operator() method. unsigned int haloWidth = this->GetNumberIterationsPerHaloExchange(); if( (iter > 0) && (iter % haloWidth) == 0 ) { unsigned int nRows = mtx.GetNumRows(); unsigned int nCols = mtx.GetNumColumns(); unsigned int nPaddedCols = mtx.GetNumPaddedColumns(); T* flatData = mtx.GetFlatData(); size_t nsDataItemCount = haloWidth * nPaddedCols; size_t ewDataItemCount = haloWidth * nRows; size_t nsDataSize = nsDataItemCount * sizeof(T); size_t ewDataSize = ewDataItemCount * sizeof(T); // // read current data off device // we only read halo, and only for sides where we have a neighbor // if( this->HaveNorthNeighbor() ) { // north data is contiguous - copy directly into matrix cudaMemcpy( flatData + (haloWidth * nPaddedCols), // dest currBuf + (haloWidth * nPaddedCols), // src nsDataSize, // amount to transfer cudaMemcpyDeviceToHost ); // direction } if( this->HaveSouthNeighbor() ) { // south data is contiguous - copy directly into matrix cudaMemcpy( flatData + ((nRows - 2*haloWidth)*nPaddedCols), // dest currBuf + ((nRows - 2*haloWidth)*nPaddedCols), // src nsDataSize, // amount to transfer cudaMemcpyDeviceToHost ); // direction } if( this->HaveEastNeighbor() ) { // east data is non-contiguous - but CUDA has a strided read cudaMemcpy2D( flatData + (nCols - 2*haloWidth), // dest nPaddedCols * sizeof(T), // dest pitch currBuf + (nCols - 2*haloWidth), // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyDeviceToHost ); // transfer direction } if( this->HaveWestNeighbor() ) { // west data is non-contiguous - but CUDA has a strided read cudaMemcpy2D( flatData + haloWidth, // dest nPaddedCols * sizeof(T), // dest pitch currBuf + haloWidth, // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyDeviceToHost ); // transfer direction } // // do the actual halo exchange // if( dumpData ) { DumpData( ofs, mtx, "before halo exchange" ); } DoHaloExchange( mtx ); if( dumpData ) { DumpData( ofs, mtx, "after halo exchange" ); } // // push updated data back onto device // we only write halo, and only for sides where we have a neighbor // if( this->HaveNorthNeighbor() ) { // north data is contiguous - copy directly from matrix cudaMemcpy( currBuf, // dest flatData, // src nsDataSize, // amount to transfer cudaMemcpyHostToDevice ); // direction } if( this->HaveSouthNeighbor() ) { // south data is contiguous - copy directly from matrix cudaMemcpy( currBuf + ((nRows - haloWidth)*nPaddedCols), // dest flatData + ((nRows - haloWidth)*nPaddedCols), // src nsDataSize, // amount to transfer cudaMemcpyHostToDevice ); // direction } if( this->HaveEastNeighbor() ) { // east data is non-contiguous - but CUDA has a strided write cudaMemcpy2D( currBuf + (nCols - haloWidth), // dest nPaddedCols * sizeof(T), // dest pitch flatData + (nCols - haloWidth), // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyHostToDevice ); // transfer direction } if( this->HaveWestNeighbor() ) { // west data is non-contiguous - but CUDA has a strided write cudaMemcpy2D( currBuf, // dest nPaddedCols * sizeof(T), // dest pitch flatData, // src nPaddedCols * sizeof(T), // src pitch haloWidth * sizeof(T), // width of data to transfer (bytes) nRows, // height of data to transfer (rows) cudaMemcpyHostToDevice ); // transfer direction } // we have changed the local halo values on the device // we need to update the local 1-wide halo in the alt buffer // note we only need to update the 1-wide halo here, even if // our real halo width is larger size_t rowExtent = mtx.GetNumPaddedColumns() * sizeof(T); cudaMemcpy2D( altBuf, // destination rowExtent, // destination pitch currBuf, // source rowExtent, // source pitch rowExtent, // width of data to transfer (bytes) 1, // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); cudaMemcpy2D( altBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(), // destination rowExtent, // destination pitch currBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(), // source rowExtent, // source pitch rowExtent, // width of data to transfer (bytes) 1, // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); // copy the non-contiguous data cudaMemcpy2D( altBuf, // destination rowExtent, // destination pitch currBuf, // source rowExtent, // source pitch sizeof(T), // width of data to transfer (bytes) mtx.GetNumRows(), // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); cudaMemcpy2D( altBuf + (mtx.GetNumColumns() - 1), // destination rowExtent, // destination pitch currBuf + (mtx.GetNumColumns() - 1), // source rowExtent, // source pitch sizeof(T), // width of data to transfer (bytes) mtx.GetNumRows(), // height of data to transfer (rows) cudaMemcpyDeviceToDevice ); } }