示例#1
0
ReducePlan::~ReducePlan()
{
    cudaFree(m_blockSums);
    m_blockSums = 0;

    cudaCheckMsg("freeReduceStorage");
}
示例#2
0
void ScanPlan::allocate(size_t elemSizeBytes, size_t numElements, size_t numRows, size_t rowPitch)
{
    const size_t blockSize = SCAN_ELTS_PER_THREAD * SCAN_CTA_SIZE;

    m_numElements = numElements;
    m_numRows = numRows;
    m_elemSizeBytes = elemSizeBytes;

    // find required number of levels
    size_t level = 0;
    size_t numElts = m_numElements;
    do
    {
        size_t numBlocks = (numElts + blockSize - 1) / blockSize;
        if (numBlocks > 1)
        {
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    m_numLevels = level;

    m_blockSums = (void**) malloc(m_numLevels * sizeof(void*));

    if (m_numRows > 1)
    {
        m_rowPitches = (size_t*) malloc((m_numLevels + 1) * sizeof(size_t));
        m_rowPitches[0] = rowPitch;
    }

    // allocate storage for block sums
    numElts = m_numElements;
    level = 0;
    do
    {
        size_t numBlocks = (numElts + blockSize - 1) / blockSize;
        if (numBlocks > 1) 
        {
            // Use cudaMallocPitch for multi-row block sums to ensure alignment
            if (m_numRows > 1)
            {
                size_t dpitch;
                cudaSafeCall(cudaMallocPitch((void**)&(m_blockSums[level]), &dpitch, numBlocks * m_elemSizeBytes, numRows));
                m_rowPitches[level+1] = dpitch / m_elemSizeBytes;
            }
            else
            {
                cudaSafeCall(cudaMalloc((void**)&(m_blockSums[level]), numBlocks * m_elemSizeBytes));
            }
            level++;
        }
        numElts = numBlocks;
    } while (numElts > 1);

    cudaCheckMsg("ScanPlan::allocate");
}
示例#3
0
ReducePlan::ReducePlan(size_t elemSizeBytes, size_t numElements)
    : m_numElements(numElements),
      m_elemSizeBytes(elemSizeBytes),
      m_threadsPerBlock(REDUCE_CTA_SIZE),
      m_maxBlocks(64),
      m_blockSums(0)
{
    uint blocks = min(m_maxBlocks, (uint(m_numElements) + m_threadsPerBlock - 1) / m_threadsPerBlock);
    cudaMalloc(&m_blockSums, blocks * m_elemSizeBytes);

    cudaCheckMsg("allocReduceStorage");
}
示例#4
0
ScanPlan::~ScanPlan()
{
    for (unsigned int i = 0; i < m_numLevels; i++)
    {
        cudaFree(m_blockSums[i]);
    }

    cudaCheckMsg("ScanPlan::~ScanPlan");

    free(m_blockSums);
    m_blockSums = nullptr;
    if(m_numRows > 1)
    {
        free((void*)m_rowPitches);
        m_rowPitches = nullptr;
    }
    m_numElements = 0;
    m_numLevels = 0;
}