コード例 #1
0
void
MICValidate(const Matrix2D<T>& s, const Matrix2D<T>& t,double valErrThreshold,unsigned int nValErrsToPrint)
{

    assert( (s.GetNumRows() == t.GetNumRows()) && (s.GetNumColumns() == t.GetNumColumns()) );
	#if 1
    for( unsigned int i = 0; i < s.GetNumRows(); i++ )
    {
        for( unsigned int j = 0; j < s.GetNumColumns(); j++ )
        {
            T expVal = s.GetConstData()[i][j];
            T actualVal = t.GetConstData()[i][j];
            T delta = fabsf( actualVal - expVal );
            T relError = (expVal != 0.0f) ? delta / expVal : 0.0f;

            if( relError > valErrThreshold )
            {
		std::cout<<"Failed\n";
		return;
            }
        }
    }
    std::cout<<"Passed\n";
	#endif
	#if 0
	std::cout<<"Expected Value \n";
for( unsigned int i = 0; i < s.GetNumRows(); i++ )
    {
        for( unsigned int j = 0; j < s.GetNumColumns(); j++ )
        {
            T expVal = s.GetConstData()[i][j];
            std::cout<<expVal<<" ";
        }
	std::cout<<endl;
    }

	std::cout<<"Calculated vaue \n";
for( unsigned int i = 0; i < s.GetNumRows(); i++ )
    {   
        for( unsigned int j = 0; j < s.GetNumColumns(); j++ )
        {
            T expVal = t.GetConstData()[i][j];
            std::cout<<expVal<<" ";
        }   
        std::cout<<endl;
    }   
	#endif
}
コード例 #2
0
void
Initialize<T>::operator()( Matrix2D<T>& mtx )
{
    srand48( seed );

    int nTileRows = mtx.GetNumRows() - 2 * haloWidth;
    if( (rowPeriod != -1) && (rowPeriod < nTileRows) )
    {
        nTileRows = rowPeriod;
    }

    int nTileCols = mtx.GetNumColumns() - 2 * haloWidth;
    if( (colPeriod != -1) && (colPeriod < nTileCols) )
    {
        nTileCols = colPeriod;
    }


    // initialize first tile
    for( unsigned int i = 0; i < nTileRows; i++ )
    {
        for( unsigned int j = 0; j < nTileCols; j++ )
        {
#ifndef READY
            mtx.GetData()[i+haloWidth][j+haloWidth] = i * j;
#else
            mtx.GetData()[i+haloWidth][j+haloWidth] = (T)drand48();
#endif // READY
        }
    }

    // initialize any remaining tiles
    // first we fill along rows a tile at a time,
    // then fill out along columns a row at a time
    if( colPeriod != -1 )
    {
        int nTiles = (mtx.GetNumColumns() - 2*haloWidth) / colPeriod;
        if( (mtx.GetNumColumns() - 2*haloWidth) % colPeriod != 0 )
        {
            nTiles += 1;
        }

        for( unsigned int t = 1; t < nTiles; t++ )
        {
            for( unsigned int i = 0; i < nTileRows; i++ )
            {
                memcpy( &(mtx.GetData()[haloWidth + i][haloWidth + t*nTileCols]),
                        &(mtx.GetData()[haloWidth + i][haloWidth]),
                        nTileCols * sizeof(T) );
            }
        }
    }
    if( rowPeriod != -1 )
    {
        int nTiles = (mtx.GetNumRows() - 2*haloWidth) / rowPeriod;
        if( (mtx.GetNumRows() - 2*haloWidth) % rowPeriod != 0 )
        {
            nTiles += 1;
        }

        for( unsigned int t = 1; t < nTiles; t++ )
        {
            for( unsigned int i = 0; i < nTileRows; i++ )
            {
                memcpy( &(mtx.GetData()[haloWidth + t*nTileRows + i][haloWidth]),
                        &(mtx.GetData()[haloWidth + i][haloWidth]),
                        (mtx.GetNumColumns() - 2*haloWidth) * sizeof(T) );
            }
        }
    }

    // initialize halo
    for( unsigned int i = 0; i < mtx.GetNumRows(); i++ )
    {
        for( unsigned int j = 0; j < mtx.GetNumColumns(); j++ )
        {
            bool inHalo = false;

            if( (i < haloWidth) || (i > mtx.GetNumRows() - 1 - haloWidth) )
            {
                inHalo = true;
            }
            else if( (j < haloWidth) || (j > mtx.GetNumColumns() - 1 - haloWidth) )
            {
                inHalo = true;
            }

            if( inHalo )
            {
                mtx.GetData()[i][j] = haloVal;
            }
        }
    }
}
コード例 #3
0
ファイル: MPICUDAStencil.cpp プロジェクト: tositrino/shoc
void
MPICUDAStencil<T>::DoPreIterationWork( T* currBuf,  // in device global memory
                                       T* altBuf,  // in device global memory
                                       Matrix2D<T>& mtx,
                                       unsigned int iter )
{
    // do the halo exchange at desired frequency
    // note that we *do not* do the halo exchange here before the
    // first iteration, because we did it already (before we first
    // pushed the data onto the device) in our operator() method.
    unsigned int haloWidth = this->GetNumberIterationsPerHaloExchange();
    if( (iter > 0) && (iter % haloWidth) == 0 )
    {
        unsigned int nRows = mtx.GetNumRows();
        unsigned int nCols = mtx.GetNumColumns();
        unsigned int nPaddedCols = mtx.GetNumPaddedColumns();
        T* flatData = mtx.GetFlatData();

        size_t nsDataItemCount = haloWidth * nPaddedCols;
        size_t ewDataItemCount = haloWidth * nRows;
        size_t nsDataSize = nsDataItemCount * sizeof(T);
        size_t ewDataSize = ewDataItemCount * sizeof(T);

        //
        // read current data off device
        // we only read halo, and only for sides where we have a neighbor
        //
        if( this->HaveNorthNeighbor() )
        {
            // north data is contiguous - copy directly into matrix
            cudaMemcpy( flatData + (haloWidth * nPaddedCols),   // dest
                        currBuf + (haloWidth * nPaddedCols),     // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyDeviceToHost );   // direction
        }

        if( this->HaveSouthNeighbor() )
        {
            // south data is contiguous - copy directly into matrix
            cudaMemcpy( flatData + ((nRows - 2*haloWidth)*nPaddedCols),   // dest
                        currBuf + ((nRows - 2*haloWidth)*nPaddedCols),    // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyDeviceToHost );   // direction
        }

        if( this->HaveEastNeighbor() )
        {
            // east data is non-contiguous - but CUDA has a strided read
            cudaMemcpy2D( flatData + (nCols - 2*haloWidth), // dest
                          nPaddedCols * sizeof(T),        // dest pitch
                          currBuf + (nCols - 2*haloWidth),    // src
                          nPaddedCols * sizeof(T),        // src pitch
                          haloWidth * sizeof(T),          // width of data to transfer (bytes)
                          nRows,                          // height of data to transfer (rows)
                          cudaMemcpyDeviceToHost );       // transfer direction
        }

        if( this->HaveWestNeighbor() )
        {
            // west data is non-contiguous - but CUDA has a strided read
            cudaMemcpy2D( flatData + haloWidth,         // dest
                          nPaddedCols * sizeof(T),    // dest pitch
                          currBuf + haloWidth,        // src
                          nPaddedCols * sizeof(T),    // src pitch
                          haloWidth * sizeof(T),      // width of data to transfer (bytes)
                          nRows,          // height of data to transfer (rows)
                          cudaMemcpyDeviceToHost );   // transfer direction

        }


        //
        // do the actual halo exchange
        //
        if( dumpData )
        {
            DumpData( ofs, mtx, "before halo exchange" );
        }
        DoHaloExchange( mtx );
        if( dumpData )
        {
            DumpData( ofs, mtx, "after halo exchange" );
        }

        //
        // push updated data back onto device
        // we only write halo, and only for sides where we have a neighbor
        //
        if( this->HaveNorthNeighbor() )
        {
            // north data is contiguous - copy directly from matrix
            cudaMemcpy( currBuf,                    // dest
                        flatData,                   // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyHostToDevice );   // direction
        }

        if( this->HaveSouthNeighbor() )
        {
            // south data is contiguous - copy directly from matrix
            cudaMemcpy( currBuf + ((nRows - haloWidth)*nPaddedCols),    // dest
                        flatData + ((nRows - haloWidth)*nPaddedCols),   // src
                        nsDataSize,                 // amount to transfer
                        cudaMemcpyHostToDevice );   // direction
        }

        if( this->HaveEastNeighbor() )
        {
            // east data is non-contiguous - but CUDA has a strided write
            cudaMemcpy2D( currBuf + (nCols - haloWidth),  // dest
                          nPaddedCols * sizeof(T),              // dest pitch
                          flatData + (nCols - haloWidth), // src
                          nPaddedCols * sizeof(T),              // src pitch
                          haloWidth * sizeof(T),          // width of data to transfer (bytes)
                          nRows,                          // height of data to transfer (rows)
                          cudaMemcpyHostToDevice );       // transfer direction
        }

        if( this->HaveWestNeighbor() )
        {
            // west data is non-contiguous - but CUDA has a strided write
            cudaMemcpy2D( currBuf,                      // dest
                          nPaddedCols * sizeof(T),          // dest pitch
                          flatData,                   // src
                          nPaddedCols * sizeof(T),          // src pitch
                          haloWidth * sizeof(T),      // width of data to transfer (bytes)
                          nRows,          // height of data to transfer (rows)
                          cudaMemcpyHostToDevice );   // transfer direction

        }


        // we have changed the local halo values on the device
        // we need to update the local 1-wide halo in the alt buffer
        // note we only need to update the 1-wide halo here, even if
        // our real halo width is larger
        size_t rowExtent = mtx.GetNumPaddedColumns() * sizeof(T);
        cudaMemcpy2D( altBuf,      // destination
                      rowExtent,  // destination pitch
                      currBuf,   // source
                      rowExtent,  // source pitch
                      rowExtent,  // width of data to transfer (bytes)
                      1,          // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );
        cudaMemcpy2D( altBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(),      // destination
                      rowExtent,  // destination pitch
                      currBuf + (mtx.GetNumRows() - 1) * mtx.GetNumPaddedColumns(),   // source
                      rowExtent,  // source pitch
                      rowExtent,  // width of data to transfer (bytes)
                      1,          // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );

        // copy the non-contiguous data
        cudaMemcpy2D( altBuf,      // destination
                      rowExtent,  // destination pitch
                      currBuf,   // source
                      rowExtent,  // source pitch
                      sizeof(T),  // width of data to transfer (bytes)
                      mtx.GetNumRows(),      // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );
        cudaMemcpy2D( altBuf + (mtx.GetNumColumns() - 1),      // destination
                      rowExtent,  // destination pitch
                      currBuf + (mtx.GetNumColumns() - 1),   // source
                      rowExtent,  // source pitch
                      sizeof(T),  // width of data to transfer (bytes)
                      mtx.GetNumRows(),      // height of data to transfer (rows)
                      cudaMemcpyDeviceToDevice );
    }
}