コード例 #1
0
void
MICValidate(const Matrix2D<T>& s, const Matrix2D<T>& t,double valErrThreshold,unsigned int nValErrsToPrint)
{

    assert( (s.GetNumRows() == t.GetNumRows()) && (s.GetNumColumns() == t.GetNumColumns()) );
	#if 1
    for( unsigned int i = 0; i < s.GetNumRows(); i++ )
    {
        for( unsigned int j = 0; j < s.GetNumColumns(); j++ )
        {
            T expVal = s.GetConstData()[i][j];
            T actualVal = t.GetConstData()[i][j];
            T delta = fabsf( actualVal - expVal );
            T relError = (expVal != 0.0f) ? delta / expVal : 0.0f;

            if( relError > valErrThreshold )
            {
		std::cout<<"Failed\n";
		return;
            }
        }
    }
    std::cout<<"Passed\n";
	#endif
	#if 0
	std::cout<<"Expected Value \n";
for( unsigned int i = 0; i < s.GetNumRows(); i++ )
    {
        for( unsigned int j = 0; j < s.GetNumColumns(); j++ )
        {
            T expVal = s.GetConstData()[i][j];
            std::cout<<expVal<<" ";
        }
	std::cout<<endl;
    }

	std::cout<<"Calculated vaue \n";
for( unsigned int i = 0; i < s.GetNumRows(); i++ )
    {   
        for( unsigned int j = 0; j < s.GetNumColumns(); j++ )
        {
            T expVal = t.GetConstData()[i][j];
            std::cout<<expVal<<" ";
        }   
        std::cout<<endl;
    }   
	#endif
}
コード例 #2
0
void
Initialize<T>::operator()( Matrix2D<T>& mtx )
{
    srand48( seed );

    int nTileRows = mtx.GetNumRows() - 2 * haloWidth;
    if( (rowPeriod != -1) && (rowPeriod < nTileRows) )
    {
        nTileRows = rowPeriod;
    }

    int nTileCols = mtx.GetNumColumns() - 2 * haloWidth;
    if( (colPeriod != -1) && (colPeriod < nTileCols) )
    {
        nTileCols = colPeriod;
    }


    // initialize first tile
    for( unsigned int i = 0; i < nTileRows; i++ )
    {
        for( unsigned int j = 0; j < nTileCols; j++ )
        {
#ifndef READY
            mtx.GetData()[i+haloWidth][j+haloWidth] = i * j;
#else
            mtx.GetData()[i+haloWidth][j+haloWidth] = (T)drand48();
#endif // READY
        }
    }

    // initialize any remaining tiles
    // first we fill along rows a tile at a time,
    // then fill out along columns a row at a time
    if( colPeriod != -1 )
    {
        int nTiles = (mtx.GetNumColumns() - 2*haloWidth) / colPeriod;
        if( (mtx.GetNumColumns() - 2*haloWidth) % colPeriod != 0 )
        {
            nTiles += 1;
        }

        for( unsigned int t = 1; t < nTiles; t++ )
        {
            for( unsigned int i = 0; i < nTileRows; i++ )
            {
                memcpy( &(mtx.GetData()[haloWidth + i][haloWidth + t*nTileCols]),
                        &(mtx.GetData()[haloWidth + i][haloWidth]),
                        nTileCols * sizeof(T) );
            }
        }
    }
    if( rowPeriod != -1 )
    {
        int nTiles = (mtx.GetNumRows() - 2*haloWidth) / rowPeriod;
        if( (mtx.GetNumRows() - 2*haloWidth) % rowPeriod != 0 )
        {
            nTiles += 1;
        }

        for( unsigned int t = 1; t < nTiles; t++ )
        {
            for( unsigned int i = 0; i < nTileRows; i++ )
            {
                memcpy( &(mtx.GetData()[haloWidth + t*nTileRows + i][haloWidth]),
                        &(mtx.GetData()[haloWidth + i][haloWidth]),
                        (mtx.GetNumColumns() - 2*haloWidth) * sizeof(T) );
            }
        }
    }

    // initialize halo
    for( unsigned int i = 0; i < mtx.GetNumRows(); i++ )
    {
        for( unsigned int j = 0; j < mtx.GetNumColumns(); j++ )
        {
            bool inHalo = false;

            if( (i < haloWidth) || (i > mtx.GetNumRows() - 1 - haloWidth) )
            {
                inHalo = true;
            }
            else if( (j < haloWidth) || (j > mtx.GetNumColumns() - 1 - haloWidth) )
            {
                inHalo = true;
            }

            if( inHalo )
            {
                mtx.GetData()[i][j] = haloVal;
            }
        }
    }
}
コード例 #3
0
template <class T> void
MICStencil<T>::operator()( Matrix2D<T>& mtx, unsigned int nIters )
{
    unsigned int uDimWithHalo    = mtx.GetNumRows();
    unsigned int uHaloWidth      = LINESIZE / sizeof(T);
    unsigned int uImgElements    = uDimWithHalo * uDimWithHalo;

    __declspec(target(mic), align(LINESIZE)) T* pIn = mtx.GetFlatData();

    __declspec(target(mic), align(sizeof(T)))    T wcenter      = this->wCenter;
    __declspec(target(mic), align(sizeof(T)))    T wdiag        = this->wDiagonal;
    __declspec(target(mic), align(sizeof(T)))    T wcardinal    = this->wCardinal;

    #pragma offload target(mic) in(pIn:length(uImgElements) ALLOC RETAIN)
    {
        // Just copy pIn to compute the copy transfer time
    }

    #pragma offload target(mic) in(pIn:length(uImgElements) REUSE RETAIN)    \
                                in(uImgElements) in(uDimWithHalo)            \
                                in(wcenter) in(wdiag) in(wcardinal)
    {
        unsigned int uRowPartitions = sysconf(_SC_NPROCESSORS_ONLN) / 4 - 1;
        unsigned int uColPartitions = 4;    // Threads per core for KNC

        unsigned int uRowTileSize    = (uDimWithHalo - 2 * uHaloWidth) / uRowPartitions;
        unsigned int uColTileSize    = (uDimWithHalo - 2 * uHaloWidth) / uColPartitions;

        uRowTileSize = ((uDimWithHalo - 2 * uHaloWidth) % uRowPartitions > 0) ? (uRowTileSize + 1) : (uRowTileSize);

        // Should use the "Halo Val" when filling the memory space
        T *pTmp     = (T*)pIn;
        T *pCrnt = (T*)memset((T*)_mm_malloc(uImgElements * sizeof(T), LINESIZE), 0, uImgElements * sizeof(T));

        #pragma omp parallel firstprivate(pTmp, pCrnt, uRowTileSize, uColTileSize, uHaloWidth, uDimWithHalo)
        {
            unsigned int uThreadId = omp_get_thread_num();

            unsigned int uRowTileId = uThreadId / uColPartitions;
            unsigned int uColTileId = uThreadId % uColPartitions;

            unsigned int uStartLine = uRowTileId * uRowTileSize + uHaloWidth;
            unsigned int uStartCol  = uColTileId * uColTileSize + uHaloWidth;

            unsigned int uEndLine = uStartLine + uRowTileSize;
            uEndLine = (uEndLine > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndLine;

            unsigned int uEndCol    = uStartCol  + uColTileSize;
            uEndCol  = (uEndCol  > (uDimWithHalo - uHaloWidth)) ? uDimWithHalo - uHaloWidth : uEndCol;

            T    cardinal0 = 0.0;
            T    diagonal0 = 0.0;
            T    center0   = 0.0;

            unsigned int cntIterations, i, j;

            for (cntIterations = 0; cntIterations < nIters; cntIterations ++)
            {
                // Do Stencil Operation
                for (i = uStartLine; i < uEndLine; i++)
                {
                    T * pCenter      = &pTmp [ i * uDimWithHalo];
                    T * pTop         = pCenter - uDimWithHalo;
                    T * pBottom      = pCenter + uDimWithHalo;
                    T * pOut         = &pCrnt[ i * uDimWithHalo];

                    __assume_aligned(pCenter, 64);
                    __assume_aligned(pTop,    64);
                    __assume_aligned(pBottom, 64);
                    __assume_aligned(pOut,    64);

                    #pragma simd vectorlengthfor(float)
                    for (j = uStartCol; j < uEndCol; j++)
                    {
                        cardinal0   = pCenter[j - 1] + pCenter[j + 1] + pTop[j] + pBottom[j];
                        diagonal0   = pTop[j - 1] + pTop[j + 1] + pBottom[j - 1] + pBottom[j + 1];
                        center0     = pCenter[j];

                        pOut[j]     = wcardinal * cardinal0 + wdiag * diagonal0 + wcenter * center0;
                    }
                }

                #pragma omp barrier
                ;

                // Switch pointers
                T* pAux    = pTmp;
                pTmp     = pCrnt;
                pCrnt    = pAux;
            } // End For

        } // End Parallel

        _mm_free(pCrnt);
    } // End Offload

    #pragma offload target(mic) out(pIn:length(uImgElements) REUSE FREE)
    {
        // Just copy back pIn
    }
}
コード例 #4
0
ファイル: MPICUDAStencil.cpp プロジェクト: tositrino/shoc
void
MPICUDAStencil<T>::DoPreIterationWork( T* currBuf,  // in device global memory
                                       T* altBuf,  // in device global memory
                                       Matrix2D<T>& mtx,
                                       unsigned int iter )
{
    // do the halo exchange at desired frequency
    // note that we *do not* do the halo exchange here before the
    // first iteration, because we did it already (before we first
    // pushed the data onto the device) in our operator() method.
    unsigned int haloWidth = this->GetNumberIterationsPerHaloExchange();
    if( (iter > 0) && (iter % haloWidth) == 0 )
    {
        unsigned int nRows = mtx.GetNumRows();
        unsigned int nCols = mtx.GetNumColumns();
        unsigned int nPaddedCols = mtx.GetNumPaddedColumns();
        T* flatData = mtx.GetFlatData();

        size_t nsDataItemCount = haloWidth * nPaddedCols;
        size_t ewDataItemCount = haloWidth * nRows;
        size_t nsDataSize = nsDataItemCount * sizeof(T);
        size_t ewDataSize = ewDataItemCount * sizeof(T);

        //
        // read current data off device
        // we only read halo, and only for sides where we have a neighbor
        //
        if( this->HaveNorthNeighbor() )
        {
            // north data is contiguous - copy directly into matrix
            cudaMemcpy( flatData + (haloWidth * nPaddedCols),   // dest
                        currBuf + (haloWidth * nPaddedCols),     // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyDeviceToHost );   // direction
        }

        if( this->HaveSouthNeighbor() )
        {
            // south data is contiguous - copy directly into matrix
            cudaMemcpy( flatData + ((nRows - 2*haloWidth)*nPaddedCols),   // dest
                        currBuf + ((nRows - 2*haloWidth)*nPaddedCols),    // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyDeviceToHost );   // direction
        }

        if( this->HaveEastNeighbor() )
        {
            // east data is non-contiguous - but CUDA has a strided read
            cudaMemcpy2D( flatData + (nCols - 2*haloWidth), // dest
                          nPaddedCols * sizeof(T),        // dest pitch
                          currBuf + (nCols - 2*haloWidth),    // src
                          nPaddedCols * sizeof(T),        // src pitch
                          haloWidth * sizeof(T),          // width of data to transfer (bytes)
                          nRows,                          // height of data to transfer (rows)
                          cudaMemcpyDeviceToHost );       // transfer direction
        }

        if( this->HaveWestNeighbor() )
        {
            // west data is non-contiguous - but CUDA has a strided read
            cudaMemcpy2D( flatData + haloWidth,         // dest
                          nPaddedCols * sizeof(T),    // dest pitch
                          currBuf + haloWidth,        // src
                          nPaddedCols * sizeof(T),    // src pitch
                          haloWidth * sizeof(T),      // width of data to transfer (bytes)
                          nRows,          // height of data to transfer (rows)
                          cudaMemcpyDeviceToHost );   // transfer direction

        }


        //
        // do the actual halo exchange
        //
        if( dumpData )
        {
            DumpData( ofs, mtx, "before halo exchange" );
        }
        DoHaloExchange( mtx );
        if( dumpData )
        {
            DumpData( ofs, mtx, "after halo exchange" );
        }

        //
        // push updated data back onto device
        // we only write halo, and only for sides where we have a neighbor
        //
        if( this->HaveNorthNeighbor() )
        {
            // north data is contiguous - copy directly from matrix
            cudaMemcpy( currBuf,                    // dest
                        flatData,                   // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyHostToDevice );   // direction
        }

        if( this->HaveSouthNeighbor() )
        {
            // south data is contiguous - copy directly from matrix
            cudaMemcpy( currBuf + ((nRows - haloWidth)*nPaddedCols),    // dest
                        flatData + ((nRows - haloWidth)*nPaddedCols),   // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyHostToDevice );   // direction
        }

        if( this->HaveEastNeighbor() )
        {
            // east data is non-contiguous - but CUDA has a strided write
            cudaMemcpy2D( currBuf + (nCols - haloWidth),  // dest
                          nPaddedCols * sizeof(T),              // dest pitch
                          flatData + (nCols - haloWidth), // src
                          nPaddedCols * sizeof(T),              // src pitch
                          haloWidth * sizeof(T),          // width of data to transfer (bytes)
                          nRows,                          // height of data to transfer (rows)
                          cudaMemcpyHostToDevice );       // transfer direction
        }

        if( this->HaveWestNeighbor() )
        {
            // west data is non-contiguous - but CUDA has a strided write
            cudaMemcpy2D( currBuf,                      // dest
                          nPaddedCols * sizeof(T),          // dest pitch
                          flatData,                   // src
                          nPaddedCols * sizeof(T),          // src pitch
                          haloWidth * sizeof(T),      // width of data to transfer (bytes)
                          nRows,          // height of data to transfer (rows)
                          cudaMemcpyHostToDevice );   // transfer direction

        }


        // we have changed the local halo values on the device
        // we need to update the local 1-wide halo in the alt buffer
        // note we only need to update the 1-wide halo here, even if
        // our real halo width is larger
        size_t rowExtent = mtx.GetNumPaddedColumns() * sizeof(T);
        cudaMemcpy2D( altBuf,      // destination
                      rowExtent,  // destination pitch
                      currBuf,   // source
                      rowExtent,  // source pitch
                      rowExtent,  // width of data to transfer (bytes)
                      1,          // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );
        cudaMemcpy2D( altBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(),      // destination
                      rowExtent,  // destination pitch
                      currBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(),   // source
                      rowExtent,  // source pitch
                      rowExtent,  // width of data to transfer (bytes)
                      1,          // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );

        // copy the non-contiguous data
        cudaMemcpy2D( altBuf,      // destination
                      rowExtent,  // destination pitch
                      currBuf,   // source
                      rowExtent,  // source pitch
                      sizeof(T),  // width of data to transfer (bytes)
                      mtx.GetNumRows(),      // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );
        cudaMemcpy2D( altBuf + (mtx.GetNumColumns() - 1),      // destination
                      rowExtent,  // destination pitch
                      currBuf + (mtx.GetNumColumns() - 1),   // source
                      rowExtent,  // source pitch
                      sizeof(T),  // width of data to transfer (bytes)
                      mtx.GetNumRows(),      // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );
    }
}