Example #1
clsparseStatus
clsparseDCooMatrixfromFile( clsparseCooMatrix* cooMatx, const char* filePath, clsparseControl control, cl_bool read_explicit_zeroes )
{
    clsparseCooMatrixPrivate* pCooMatx = static_cast<clsparseCooMatrixPrivate*>( cooMatx );

    // Check that the file format is Matrix Market, the only format we can read right now.
    // This extension check is not a complete solution and can be fooled by unusual paths.
    // TODO: Should we use boost::filesystem?
    std::string strPath( filePath );
    if( strPath.find_last_of( '.' ) != std::string::npos )
    {
        std::string ext = strPath.substr( strPath.find_last_of( '.' ) + 1 );
        if( ext != "mtx" )
            return clsparseInvalidFileFormat;
    }
    else
        return clsparseInvalidFileFormat;

    MatrixMarketReader< cl_double > mm_reader;
    if( mm_reader.MMReadFormat( filePath, read_explicit_zeroes ) )
        return clsparseInvalidFile;

    pCooMatx->num_rows = mm_reader.GetNumRows( );
    pCooMatx->num_cols = mm_reader.GetNumCols( );
    pCooMatx->num_nonzeros = mm_reader.GetNumNonZeroes( );

    // Transfers data from CPU buffer to GPU buffers
    clMemRAII< cl_double > rCooValues( control->queue( ), pCooMatx->values );
    clMemRAII< clsparseIdx_t > rCoocol_indices( control->queue( ), pCooMatx->col_indices );
    clMemRAII< clsparseIdx_t > rCoorow_indices( control->queue( ), pCooMatx->row_indices );

    cl_double* fCooValues = rCooValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCooMatx->valOffset( ), pCooMatx->num_nonzeros );
    clsparseIdx_t* iCoocol_indices = rCoocol_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCooMatx->colIndOffset( ), pCooMatx->num_nonzeros );
    clsparseIdx_t* iCoorow_indices = rCoorow_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCooMatx->rowOffOffset( ), pCooMatx->num_nonzeros );

    Coordinate< cl_double >* coords = mm_reader.GetUnsymCoordinates( );
    // The COO matrix also needs to be sorted, because we require entries ordered
    // by row and then by column; mtx files usually store them in the opposite order.
    std::sort( coords, coords + pCooMatx->num_nonzeros, CoordinateCompare< cl_double > );

    for( clsparseIdx_t c = 0; c < pCooMatx->num_nonzeros; ++c )
    {
        iCoorow_indices[ c ] = coords[ c ].x;
        iCoocol_indices[ c ] = coords[ c ].y;
        fCooValues[ c ] = coords[ c ].val;
    }

    return clsparseSuccess;
}
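For orientation, a minimal host-side sketch of how this routine is typically driven. It is hedged: clsparseHeaderfromFile is used to size the buffers (it ships in the same library version that uses clsparseIdx_t), setup/teardown and error handling are elided, and loadCooExample is a hypothetical helper name.

clsparseStatus loadCooExample( cl_context context, clsparseControl control, const char* path )
{
    clsparseIdx_t nnz, rows, cols;
    // Query the matrix dimensions so the device buffers can be sized before reading
    clsparseStatus status = clsparseHeaderfromFile( &nnz, &rows, &cols, path );
    if( status != clsparseSuccess )
        return status;

    clsparseCooMatrix cooMatx;
    clsparseInitCooMatrix( &cooMatx );
    cooMatx.num_nonzeros = nnz;
    cooMatx.num_rows = rows;
    cooMatx.num_cols = cols;

    cooMatx.values = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                     cooMatx.num_nonzeros * sizeof( cl_double ), NULL, NULL );
    cooMatx.col_indices = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                          cooMatx.num_nonzeros * sizeof( clsparseIdx_t ), NULL, NULL );
    cooMatx.row_indices = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                          cooMatx.num_nonzeros * sizeof( clsparseIdx_t ), NULL, NULL );

    return clsparseDCooMatrixfromFile( &cooMatx, path, control, CL_TRUE );
}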
Example #2
template <typename T>
clsparseStatus dot(clsparseScalarPrivate* pR,
                   const cldenseVectorPrivate* pX,
                   const cldenseVectorPrivate* pY,
                   const clsparseControl control)
{

    clsparseStatus status;

    init_scalar(pR, (T)0, control);

    // With REDUCE_BLOCKS_NUMBER = 256 the final reduction can be performed
    // within one work-group.
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    // work-groups per compute unit;
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX->num_values - pX->offset();
    cl_ulong ySize = pY->num_values - pY->offset();

    assert (xSize == ySize);

    cl_ulong size = xSize;


    if (size > 0)
    {
        cl::Context context = control->getContext();

        //partial result
        cldenseVectorPrivate partial;
        clsparseInitVector(&partial);
        partial.num_values = REDUCE_BLOCKS_NUMBER;

        clMemRAII<T> rPartial (control->queue(), &partial.values, partial.num_values);

        status = inner_product<T>(&partial, pX, pY, size,  REDUCE_BLOCKS_NUMBER,
                               REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, &partial, REDUCE_BLOCK_SIZE,
                                  control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
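The device kernels behind inner_product and atomic_reduce are defined elsewhere, but the two-stage shape of the reduction is easy to state in plain C++. This host-side sketch only illustrates that shape; the real kernels distribute work across work-groups differently and reduce within local memory.

#include <cstddef>
#include <vector>

template <typename T>
T dot_two_stage( const std::vector<T>& x, const std::vector<T>& y )
{
    const std::size_t BLOCKS = 256;           // plays the role of REDUCE_BLOCKS_NUMBER
    std::vector<T> partial( BLOCKS, T( 0 ) ); // one partial result per "block"

    // Stage 1: each block accumulates a strided share of the products x[i]*y[i]
    for( std::size_t i = 0; i < x.size( ); ++i )
        partial[ i % BLOCKS ] += x[ i ] * y[ i ];

    // Stage 2: fold the partials into one scalar (atomic_reduce on the device)
    T result = T( 0 );
    for( std::size_t b = 0; b < BLOCKS; ++b )
        result += partial[ b ];
    return result;
}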
Example #3
clsparseStatus
clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control )
{
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );

    // Check to ensure nRows can fit in 32 bits
    if( static_cast<cl_ulong>( pCsrMatx->num_rows ) > static_cast<cl_ulong>( pow( 2, ( 64 - ROW_BITS ) ) ) )
    {
        printf( "Number of rows in the sparse matrix is greater than what is currently supported ((64-ROW_BITS) bits)!\n" );
        return clsparseOutOfResources;
    }
    }

    clMemRAII< cl_int > rCsrRowOffsets( control->queue( ), pCsrMatx->rowOffsets );
    cl_int* rowDelimiters = rCsrRowOffsets.clMapMem( CL_TRUE, CL_MAP_READ, pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1 );

    clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks );
    cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize );

    ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE );

    return clsparseSuccess;
}
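A typical call sequence for the routine above, as used before adaptive SpMV: let the library report how many row blocks are needed, then allocate the rowBlocks buffer and fill it. This is a hedged sketch; csrMetaExample is a hypothetical helper, clsparseCsrMetaSize is the companion helper from the same library version, and error handling is elided.

clsparseStatus csrMetaExample( cl_context context, clsparseControl control,
                               clsparseCsrMatrix* csrMatx )
{
    // Ask the library how large the rowBlocks meta buffer must be;
    // clsparseCsrMetaSize stores the result in csrMatx->rowBlockSize
    clsparseCsrMetaSize( csrMatx, control );

    csrMatx->rowBlocks = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                         csrMatx->rowBlockSize * sizeof( cl_ulong ), NULL, NULL );

    return clsparseCsrMetaCompute( csrMatx, control );
}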
Example #4
// Converts a sparse matrix in COO compressed format into CSR compressed format
// Pre-condition: The CL device memory for the CSR values, colIndices, and rowOffsets
// has to be allocated prior to entering this routine, and the offset variables
// (for cl 1.2) have to be set
clsparseStatus
clsparseScoo2csr_host( clsparseCsrMatrix* csrMatx, const clsparseCooMatrix* cooMatx, clsparseControl control )
{
    if( !clsparseInitialized )
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if( control == nullptr )
    {
        return clsparseInvalidControlObject;
    }

    const clsparseCooMatrixPrivate* pCooMatx = static_cast<const clsparseCooMatrixPrivate*>( cooMatx );
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );
    pCsrMatx->num_rows = pCooMatx->num_rows;
    pCsrMatx->num_cols = pCooMatx->num_cols;
    pCsrMatx->num_nonzeros = pCooMatx->num_nonzeros;

    clMemRAII< cl_float > rCooValues( control->queue( ), pCooMatx->values );
    clMemRAII< cl_int > rCooColIndices( control->queue( ), pCooMatx->colIndices );
    clMemRAII< cl_int > rCooRowIndices( control->queue( ), pCooMatx->rowIndices );
    clMemRAII< cl_float > rCsrValues( control->queue( ), pCsrMatx->values );
    clMemRAII< cl_int > rCsrColIndices( control->queue( ), pCsrMatx->colIndices );
    clMemRAII< cl_int > rCsrRowOffsets( control->queue( ), pCsrMatx->rowOffsets );

    cl_float* fCooValues = rCooValues.clMapMem( CL_TRUE, CL_MAP_READ, pCooMatx->valOffset( ), pCooMatx->num_nonzeros );
    cl_int* iCooColIndices = rCooColIndices.clMapMem( CL_TRUE, CL_MAP_READ, pCooMatx->colIndOffset( ), pCooMatx->num_nonzeros );
    cl_int* iCooRowIndices = rCooRowIndices.clMapMem( CL_TRUE, CL_MAP_READ, pCooMatx->rowOffOffset( ), pCooMatx->num_nonzeros );

    cl_float* fCsrValues = rCsrValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->valOffset( ), pCsrMatx->num_nonzeros );
    cl_int* iCsrColIndices = rCsrColIndices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->colIndOffset( ), pCsrMatx->num_nonzeros );
    cl_int* iCsrRowOffsets = rCsrRowOffsets.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1 );

    coo2csr_transform( fCooValues, iCooColIndices, iCooRowIndices, pCooMatx->num_nonzeros, fCsrValues, iCsrColIndices, iCsrRowOffsets );

    return clsparseSuccess;
}
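The pre-condition in the header comment translates to a few allocations on the caller's side. A hedged sketch (cooToCsrExample is a hypothetical helper; the COO matrix is assumed to be already populated, and error handling is elided):

clsparseStatus cooToCsrExample( cl_context context, clsparseControl control,
                                const clsparseCooMatrix* cooMatx )
{
    clsparseCsrMatrix csrMatx;
    clsparseInitCsrMatrix( &csrMatx );

    // Allocate the CSR buffers from the COO dimensions, per the pre-condition
    csrMatx.values = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                     cooMatx->num_nonzeros * sizeof( cl_float ), NULL, NULL );
    csrMatx.colIndices = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                         cooMatx->num_nonzeros * sizeof( cl_int ), NULL, NULL );
    csrMatx.rowOffsets = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                         ( cooMatx->num_rows + 1 ) * sizeof( cl_int ), NULL, NULL );

    return clsparseScoo2csr_host( &csrMatx, cooMatx, control );
}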
Example #5
clsparseStatus
clsparseScsr2coo(const clsparseCsrMatrix* csr,
                 clsparseCooMatrix* coo,
                 const clsparseControl control)
{

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    const clsparseCsrMatrixPrivate* pCsr = static_cast<const clsparseCsrMatrixPrivate*>(csr);
    clsparseCooMatrixPrivate* pCoo = static_cast<clsparseCooMatrixPrivate*>(coo);

    pCoo->num_rows = pCsr->num_rows;
    pCoo->num_cols = pCsr->num_cols;
    pCoo->num_nonzeros = pCsr->num_nonzeros;

    clsparseStatus status;

    //validate cl_mem objects
    status = validateMemObject(pCoo->rowIndices, sizeof(cl_int)* pCoo->num_nonzeros);
    if(status != clsparseSuccess)
        return status;

    status = validateMemObject(pCoo->colIndices, sizeof(cl_int)* pCoo->num_nonzeros);
    if(status != clsparseSuccess)
        return status;

    status = validateMemObject(pCoo->values, sizeof(cl_float)* pCoo->num_nonzeros);
    if(status != clsparseSuccess)
        return status;

    //validate cl_mem sizes
    //TODO: ask about validateMemObjectSize
    cl_uint nnz_per_row = pCoo->num_nonzeros / pCoo->num_rows; //average num_nonzeros per row
    cl_uint wave_size = control->wavefront_size;
    cl_uint group_size = 256; //wave_size * 8;    // 256 gives best performance!
    cl_uint subwave_size = wave_size;

    // adjust subwave_size according to nnz_per_row;
    // each wavefront will be assigned to a row of the csr matrix
    if(wave_size > 32)
    {
        // this applies only to devices with a wavefront > 32, like AMD (64)
        if (nnz_per_row < 64) {  subwave_size = 32;  }
    }
    if (nnz_per_row < 32) {  subwave_size = 16;  }
    if (nnz_per_row < 16) {  subwave_size = 8;  }
    if (nnz_per_row < 8)  {  subwave_size = 4;  }
    if (nnz_per_row < 4)  {  subwave_size = 2;  }
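    // Worked example: nnz_per_row == 10 on a 64-wide wavefront passes the first
    // test (subwave_size = 32), then the "< 32" and "< 16" tests lower it to 8;
    // the remaining tests do not fire.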


    const std::string params = std::string() +
            "-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<cl_float>::type
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DWG_SIZE=" + std::to_string(group_size)
            + " -DWAVE_SIZE=" + std::to_string(wave_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);



    //TODO add error handling
    //copy indices
    clEnqueueCopyBuffer(control->queue(),
                        pCsr->colIndices,
                        pCoo->colIndices,
                        0,
                        0,
                        sizeof(cl_int) * pCoo->num_nonzeros,
                        0,
                        NULL,
                        NULL);

    //copy values
    clEnqueueCopyBuffer(control->queue(),
                        pCsr->values,
                        pCoo->values,
                        0,
                        0,
                        sizeof(cl_float) * pCoo->num_nonzeros,
                        0,
                        NULL,
                        NULL);


    return csr2coo_transform( pCoo->num_rows, pCoo->num_cols,
                             pCsr->rowOffsets,
                             pCoo->rowIndices,
                             params,
                             group_size,
                             subwave_size,
                             control);

}
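The reverse direction follows the same allocation pattern; the buffer sizes simply have to match what validateMemObject checks above. A hedged sketch (csrToCooExample is a hypothetical helper, error handling elided):

clsparseStatus csrToCooExample( cl_context context, clsparseControl control,
                                const clsparseCsrMatrix* csr )
{
    clsparseCooMatrix coo;
    clsparseInitCooMatrix( &coo );

    // Sizes must match the validateMemObject checks above
    coo.rowIndices = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                     csr->num_nonzeros * sizeof( cl_int ), NULL, NULL );
    coo.colIndices = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                     csr->num_nonzeros * sizeof( cl_int ), NULL, NULL );
    coo.values = clCreateBuffer( context, CL_MEM_READ_WRITE,
                                 csr->num_nonzeros * sizeof( cl_float ), NULL, NULL );

    return clsparseScsr2coo( csr, &coo, control );
}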
Example #6
template <typename T>
clsparseStatus
indices_to_offsets(clsparse::vector<T>& offsets,
                   const clsparse::vector<T>& indices,
                   const clsparseControl control)
{
    typedef typename clsparse::vector<T> IndicesArray;
    typedef typename clsparse::vector<T>::size_type SizeType;

    //if (std::is_integral<T>)

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    SizeType size = indices.size() > offsets.size() ? indices.size() : offsets.size();

    // keys = row indices; values = all ones, so reduce_by_key yields per-row counts
    IndicesArray values (control, indices.size(), 1, CL_MEM_READ_WRITE, true);
    IndicesArray keys_output (control, indices.size(), 0, CL_MEM_READ_WRITE, false);
    IndicesArray values_output (control, size, 0, CL_MEM_READ_WRITE, false);

    clsparseStatus status =
            internal::reduce_by_key(keys_output, values_output,
                                    indices, values, control);

    CLSPARSE_V(status, "Error: reduce_by_key");

    if (status != clsparseSuccess)
        return status;

    assert(values_output.size() >= offsets.size());

    cl_event clEvent;
    cl_int cl_status  = clEnqueueCopyBuffer(control->queue(),
                                            values_output.data()(),
                                            offsets.data()(),
                                            0,
                                            0,
                                            offsets.size() * sizeof(T),
                                            0,
                                            nullptr,
                                            &clEvent);

    CLSPARSE_V(cl_status, "Error: Enqueue copy buffer values to offsets");

    cl_status = clWaitForEvents(1, &clEvent);

    CLSPARSE_V(cl_status, "Error: clWaitForEvents");

    cl_status = clReleaseEvent(clEvent);

    CLSPARSE_V(cl_status, "Error: clReleaseEvent");

    // Unclear why, but this throws a CL_INVALID_CONTEXT error:
    //    cl::Event event;
    //    cl::enqueueCopyBuffer(values_output.data(), offsets.data(),
    //                                          0, 0, offsets.size(),
    //                                          nullptr, &event);

    //    CLSPARSE_V(cl_status, "Error: enqueueCopyBuffer");
    //    CLSPARSE_V(event.wait(), "Error: event wait");

    status = exclusive_scan<EW_PLUS>(offsets, offsets, control);

    return status;
}
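The routine above is the standard counts-then-scan construction: reduce_by_key over the sorted row indices with all-one values yields a count per row, and the exclusive scan turns counts into offsets. A host-side C++ reference of the same idea, as a hedged sketch assuming sorted, non-negative indices (it also produces the trailing total, a slight variant of the device path):

#include <cstddef>
#include <vector>

// Host reference: sorted row indices -> offsets, via per-row counts and a scan.
std::vector<int> indices_to_offsets_host( const std::vector<int>& indices, int nRows )
{
    std::vector<int> offsets( nRows + 1, 0 );

    // reduce_by_key with all-one values amounts to counting entries per row
    for( int row : indices )
        offsets[ row + 1 ] += 1;

    // prefix sum: offsets[r] = number of entries in rows 0..r-1
    for( std::size_t r = 1; r < offsets.size( ); ++r )
        offsets[ r ] += offsets[ r - 1 ];

    return offsets;   // e.g. {0,0,1,3,3,3} with nRows = 4 -> {0,2,3,3,6}
}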
Example #7
clsparseStatus
clsparseDCsrMatrixfromFile( clsparseCsrMatrix* csrMatx, const char* filePath, clsparseControl control, cl_bool read_explicit_zeroes )
{
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );

    // Check that the file format is Matrix Market, the only format we can read right now.
    // This extension check is not a complete solution and can be fooled by unusual paths.
    // TODO: Should we use boost::filesystem?
    std::string strPath( filePath );
    if( strPath.find_last_of( '.' ) != std::string::npos )
    {
        std::string ext = strPath.substr( strPath.find_last_of( '.' ) + 1 );
        if( ext != "mtx" )
            return clsparseInvalidFileFormat;
    }
    else
        return clsparseInvalidFileFormat;

    // Read data from a file on disk into CPU buffers
    // Data is read natively as COO format with the reader
    MatrixMarketReader< cl_double > mm_reader;
    if( mm_reader.MMReadFormat( filePath, read_explicit_zeroes ) )
        return clsparseInvalidFile;

    // BUG: We need to check whether OpenCL buffers already exist and deallocate them first!
    // FIX: The code below checks whether the buffers were allocated in the first place.
    {
        clsparseStatus validationStatus = validateMemObject(pCsrMatx->values,
                                                            mm_reader.GetNumNonZeroes() * sizeof(cl_double));

        // We do not reallocate the buffer here because that would override the user's
        // buffer memory flags; it is the user's responsibility to provide a valid buffer.
        if (validationStatus != clsparseSuccess)
            return validationStatus;

        validationStatus = validateMemObject(pCsrMatx->col_indices,
                                             mm_reader.GetNumNonZeroes() * sizeof(clsparseIdx_t));
        if (validationStatus != clsparseSuccess)
            return validationStatus;

        validationStatus = validateMemObject(pCsrMatx->row_pointer,
                                             (mm_reader.GetNumRows() + 1) * sizeof(clsparseIdx_t));
        if (validationStatus != clsparseSuccess)
            return validationStatus;
    }



    pCsrMatx->num_rows = mm_reader.GetNumRows( );
    pCsrMatx->num_cols = mm_reader.GetNumCols( );
    pCsrMatx->num_nonzeros = mm_reader.GetNumNonZeroes( );

    // Transfers data from CPU buffer to GPU buffers
    cl_int mapStatus = 0;
    clMemRAII< cl_double > rCsrValues( control->queue( ), pCsrMatx->values);
    clMemRAII< clsparseIdx_t > rCsrcol_indices( control->queue( ), pCsrMatx->col_indices );
    clMemRAII< clsparseIdx_t > rCsrrow_pointer( control->queue( ), pCsrMatx->row_pointer );

    cl_double* fCsrValues =
            rCsrValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                 pCsrMatx->valOffset( ), pCsrMatx->num_nonzeros, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrValues failed");
        return clsparseInvalidMemObj;
    }

    clsparseIdx_t* iCsrcol_indices =
            rCsrcol_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                     pCsrMatx->colIndOffset( ), pCsrMatx->num_nonzeros, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrcol_indices failed");
        return clsparseInvalidMemObj;
    }

    clsparseIdx_t* iCsrrow_pointer =
            rCsrrow_pointer.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                     pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrrow_pointer failed");
        return clsparseInvalidMemObj;
    }

    //  The following section of code converts the sparse format from COO to CSR
    Coordinate< cl_double >* coords = mm_reader.GetUnsymCoordinates( );
    std::sort( coords, coords + pCsrMatx->num_nonzeros, CoordinateCompare< cl_double > );

    clsparseIdx_t current_row = 1;
    iCsrrow_pointer[ 0 ] = 0;
    for (clsparseIdx_t i = 0; i < pCsrMatx->num_nonzeros; i++)
    {
        iCsrcol_indices[ i ] = coords[ i ].y;
        fCsrValues[ i ] = coords[ i ].val;

        while( coords[ i ].x >= current_row )
            iCsrrow_pointer[ current_row++ ] = i;
    }
    iCsrrow_pointer[ current_row ] = pCsrMatx->num_nonzeros;
    while( current_row <= pCsrMatx->num_rows )
        iCsrrow_pointer[ current_row++ ] = pCsrMatx->num_nonzeros;

    return clsparseSuccess;
}
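A worked trace of the row-pointer construction above: for a 3x3 matrix whose sorted coordinates are (0,1), (1,0), (1,2), (2,2), the loop leaves iCsrrow_pointer[0] = 0, writes iCsrrow_pointer[1] = 1 when the first row-1 entry is seen at i = 1, writes iCsrrow_pointer[2] = 3 at i = 3, and the trailing code sets iCsrrow_pointer[3] = 4, the nonzero count. Empty rows are handled by the two while loops, which repeat the current position for every skipped row.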
Example #8
clsparseStatus
reduce_by_key(
    int keys_first,
    int keys_last,
    int values_first,
    cl_mem keys_input,
    cl_mem values_input,
    cl_mem keys_output,
    cl_mem values_output,
    int *count,
    clsparseControl control
)
{

    cl_int l_Error;

    /**********************************************************************************
     * Compile Options
     *********************************************************************************/
    const int kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    const int kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    const int kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    //const std::string params = std::string() +
    //          " -DKERNEL0WORKGROUPSIZE=" + std::to_string(kernel0_WgSize)
    //        + " -DKERNEL1WORKGROUPSIZE=" + std::to_string(kernel1_WgSize)
    //        + " -DKERNEL2WORKGROUPSIZE=" + std::to_string(kernel2_WgSize);
    const std::string params;

    cl::Context context = control->getContext();
    std::vector<cl::Device> dev = context.getInfo<CL_CONTEXT_DEVICES>();
    int computeUnits  = dev[0].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >( );
    // note: despite its name, this holds the device's maximum work-group size
    int wgPerComputeUnit = dev[0].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >( );


    int resultCnt = computeUnits * wgPerComputeUnit;
    cl_uint numElements = keys_last - keys_first + 1;

    size_t sizeInputBuff = numElements;
    int modWgSize = (sizeInputBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeInputBuff &= ~modWgSize;
        sizeInputBuff += kernel0_WgSize;
    }
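    // e.g. numElements = 1000 with kernel0_WgSize = 256: modWgSize = 232, the
    // low bits are cleared to give 768, and adding 256 rounds up to 1024,
    // the next multiple of the work-group size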
    cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize );

    size_t sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += kernel0_WgSize;
    }

    cl_mem tempArrayVec = clCreateBuffer(context(),CL_MEM_READ_WRITE, (numElements)*sizeof(int), NULL, NULL );

    /**********************************************************************************
     *  Kernel 0
     *********************************************************************************/

    cl::Kernel kernel0 = KernelCache::get(control->queue,"reduce_by_key", "OffsetCalculation", params);

    KernelWrap kWrapper0(kernel0);

    kWrapper0 << keys_input << tempArrayVec
              << numElements;

    cl::NDRange local0(kernel0_WgSize);
    cl::NDRange global0(sizeInputBuff);

    cl_int status = kWrapper0.run(control, global0, local0);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    // in-place scan of the per-element flags produced by OffsetCalculation
    scan(0,
         numElements - 1,
         tempArrayVec,
         tempArrayVec,
         0,
         0,
         control
         );

    int pattern = 0;
    cl_mem keySumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem preSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem postSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE,(sizeScanBuff)*sizeof(int), NULL, NULL );
    clEnqueueFillBuffer(control->queue(), keySumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), preSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);


    /**********************************************************************************
     *  Kernel 1
     *********************************************************************************/

    cl::Kernel kernel1 = KernelCache::get(control->queue,"reduce_by_key", "perBlockScanByKey", params);

    KernelWrap kWrapper1(kernel1);

    kWrapper1 << tempArrayVec
              << values_input
              << numElements
              << keySumArray
              << preSumArray;

    cl::NDRange local1(kernel0_WgSize);
    cl::NDRange global1(sizeInputBuff);

    status = kWrapper1.run(control, global1, local1);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 2
     *********************************************************************************/
    cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize );

    cl::Kernel kernel2 = KernelCache::get(control->queue,"reduce_by_key", "intraBlockInclusiveScanByKey", params);

    KernelWrap kWrapper2(kernel2);

    kWrapper2 << keySumArray << preSumArray
              << postSumArray << numWorkGroupsK0 << workPerThread;

    cl::NDRange local2(kernel1_WgSize);
    cl::NDRange global2(kernel1_WgSize);

    status = kWrapper2.run(control, global2, local2);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 3
     *********************************************************************************/

    cl::Kernel kernel3 = KernelCache::get(control->queue,"reduce_by_key", "keyValueMapping", params);

    KernelWrap kWrapper3(kernel3);

    kWrapper3 << keys_input << keys_output
              << values_input << values_output << tempArrayVec
              << keySumArray << postSumArray << numElements;

    cl::NDRange local3(kernel0_WgSize);
    cl::NDRange global3(sizeInputBuff);

    status = kWrapper3.run(control, global3, local3);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    int h_result = 0;

    // Blocking read of the last element of the scanned offset array,
    // which is used as the output count
    clEnqueueReadBuffer(control->queue(),
                        tempArrayVec,
                        CL_TRUE,
                        (numElements - 1) * sizeof(int),
                        sizeof(int),
                        &h_result,
                        0,
                        NULL,
                        NULL);

    *count = h_result;
    //printf("h_result = %d\n", *count );

    //release buffers
    clReleaseMemObject(tempArrayVec);
    clReleaseMemObject(preSumArray);
    clReleaseMemObject(postSumArray);
    clReleaseMemObject(keySumArray);

    return clsparseSuccess;
}   //end of reduce_by_key
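For reference, the semantics that the four-kernel pipeline above implements collapse to a few lines on the host: runs of equal adjacent keys are folded into one (key, sum) pair. This sketch only mirrors the intended result, not the device's scan-based derivation of the count.

#include <cstddef>
#include <utility>
#include <vector>

// Host reference for reduce_by_key: consecutive equal keys are reduced into a
// single (key, sum) pair; 'count' receives the number of runs.
std::vector<std::pair<int, int>>
reduce_by_key_host( const std::vector<int>& keys, const std::vector<int>& values, int* count )
{
    std::vector<std::pair<int, int>> out;
    for( std::size_t i = 0; i < keys.size( ); ++i )
    {
        if( out.empty( ) || out.back( ).first != keys[ i ] )
            out.push_back( { keys[ i ], values[ i ] } );   // a new run starts
        else
            out.back( ).second += values[ i ];             // extend the current run
    }
    *count = static_cast<int>( out.size( ) );
    return out;
}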