Beispiel #1
0
clsparseStatus dot(clsparseScalarPrivate* pR,
                   const cldenseVectorPrivate* pX,
                   const cldenseVectorPrivate* pY,
                   const clsparseControl control)
{

    cl_int status;

    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 final reduction can be performed
    // within one block;
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //workgroups per compute units;
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX->num_values - pX->offset();
    cl_ulong ySize = pY->num_values - pY->offset();

    assert (xSize == ySize);

    cl_ulong size = xSize;


    if (size > 0)
    {
        cl::Context context = control->getContext();

        //partial result
        cldenseVectorPrivate partial;
        clsparseInitVector(&partial);
        partial.num_values = REDUCE_BLOCKS_NUMBER;

        clMemRAII<T> rPartial (control->queue(), &partial.values, partial.num_values);

        status = inner_product<T>(&partial, pX, pY, size,  REDUCE_BLOCKS_NUMBER,
                               REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

       status = atomic_reduce<T>(pR, &partial, REDUCE_BLOCK_SIZE,
                                     control);

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
Beispiel #2
0
clsparseStatus
clsparseDCooMatrixfromFile( clsparseCooMatrix* cooMatx, const char* filePath, clsparseControl control, cl_bool read_explicit_zeroes )
{
    clsparseCooMatrixPrivate* pCooMatx = static_cast<clsparseCooMatrixPrivate*>( cooMatx );

    // Check that the file format is matrix market; the only format we can read right now
    // This is not a complete solution, and fails for directories with file names etc...
    // TODO: Should we use boost filesystem?
    std::string strPath( filePath );
    if( strPath.find_last_of( '.' ) != std::string::npos )
    {
        std::string ext = strPath.substr( strPath.find_last_of( '.' ) + 1 );
        if( ext != "mtx" )
            return clsparseInvalidFileFormat;
    }
    else
        return clsparseInvalidFileFormat;

    MatrixMarketReader< cl_double > mm_reader;
    if( mm_reader.MMReadFormat( filePath, read_explicit_zeroes ) )
        return clsparseInvalidFile;

    pCooMatx->num_rows = mm_reader.GetNumRows( );
    pCooMatx->num_cols = mm_reader.GetNumCols( );
    pCooMatx->num_nonzeros = mm_reader.GetNumNonZeroes( );

    // Transfers data from CPU buffer to GPU buffers
    clMemRAII< cl_double > rCooValues( control->queue( ), pCooMatx->values );
    clMemRAII< clsparseIdx_t > rCoocol_indices( control->queue( ), pCooMatx->col_indices );
    clMemRAII< clsparseIdx_t > rCoorow_indices( control->queue( ), pCooMatx->row_indices );

    cl_double* fCooValues = rCooValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCooMatx->valOffset( ), pCooMatx->num_nonzeros );
    clsparseIdx_t* iCoocol_indices = rCoocol_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCooMatx->colIndOffset( ), pCooMatx->num_nonzeros );
    clsparseIdx_t* iCoorow_indices = rCoorow_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCooMatx->rowOffOffset( ), pCooMatx->num_nonzeros );

    Coordinate< cl_double >* coords = mm_reader.GetUnsymCoordinates( );
    //JPA:: Coo matrix is need to be sorted as well because we need to have matrix
    // which is sorted by row and then column, in the mtx files usually is opposite.
    std::sort( coords, coords + pCooMatx->num_nonzeros, CoordinateCompare< cl_double > );

    for( clsparseIdx_t c = 0; c < pCooMatx->num_nonzeros; ++c )
    {
        iCoorow_indices[ c ] = coords[ c ].x;
        iCoocol_indices[ c ] = coords[ c ].y;
        fCooValues[ c ] = coords[ c ].val;
    }

    return clsparseSuccess;
}
Beispiel #3
0
clsparseStatus
clsparseGetEvent( clsparseControl control, cl_event *event )
{
    if( control == NULL )
    {
        return clsparseInvalidControlObject;
    }

    //keeps the event valid on the user side
    ::clRetainEvent( control->event( ) );

    *event = control->event( );

    return clsparseSuccess;

}
Beispiel #4
0
clsparseStatus dot(clsparse::array_base<T>& pR,
                   const clsparse::array_base<T>& pX,
                   const clsparse::array_base<T>& pY,
                   const clsparseControl control)
{

    cl_int status;

    //not necessary to have it, but remember to init the pR with the proper value
    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 final reduction can be performed
    // within one block;
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //workgroups per compute units;
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX.size();
    cl_ulong ySize = pY.size();

    assert (xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        //partial result
        clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                   CL_MEM_READ_WRITE, false);

        status = inner_product<T>(partial, pX, pY, size,  REDUCE_BLOCKS_NUMBER,
                               REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

       status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE,
                                     control);

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
Beispiel #5
0
clsparseStatus
clsparseCsrMetaCompute( clsparseCsrMatrix* csrMatx, clsparseControl control )
{
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );

    // Check to ensure nRows can fit in 32 bits
    if( static_cast<cl_ulong>( pCsrMatx->num_rows ) > static_cast<cl_ulong>( pow( 2, ( 64 - ROW_BITS ) ) ) )
    {
        printf( "Number of Rows in the Sparse Matrix is greater than what is supported at present ((64-WG_BITS) bits) !" );
        return clsparseOutOfResources;
    }

    clMemRAII< cl_int > rCsrRowOffsets( control->queue( ), pCsrMatx->rowOffsets );
    cl_int* rowDelimiters = rCsrRowOffsets.clMapMem( CL_TRUE, CL_MAP_READ, pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1 );

    clMemRAII< cl_ulong > rRowBlocks( control->queue( ), pCsrMatx->rowBlocks );
    cl_ulong* ulCsrRowBlocks = rRowBlocks.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowBlocksOffset( ), pCsrMatx->rowBlockSize );

    ComputeRowBlocks( ulCsrRowBlocks, pCsrMatx->rowBlockSize, rowDelimiters, pCsrMatx->num_rows, BLKSIZE );

    return clsparseSuccess;
}
Beispiel #6
0
// Converts a sparse matrix in COO compressed format into CSR compressed format
// Pre-condition: The CL device memory for CSR values, colIndices, rowOffsets has to be allocated prior to entering this routine
// and the offset variables for cl1.2 set
clsparseStatus
clsparseScoo2csr_host( clsparseCsrMatrix* csrMatx, const clsparseCooMatrix* cooMatx, clsparseControl control )
{
    if( !clsparseInitialized )
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if( control == nullptr )
    {
        return clsparseInvalidControlObject;
    }

    const clsparseCooMatrixPrivate* pCooMatx = static_cast<const clsparseCooMatrixPrivate*>( cooMatx );
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );
    pCsrMatx->num_rows = pCooMatx->num_rows;
    pCsrMatx->num_cols = pCooMatx->num_cols;
    pCsrMatx->num_nonzeros = pCooMatx->num_nonzeros;

    clMemRAII< cl_float > rCooValues( control->queue( ), pCooMatx->values );
    clMemRAII< cl_int > rCooColIndices( control->queue( ), pCooMatx->colIndices );
    clMemRAII< cl_int > rCooRowIndices( control->queue( ), pCooMatx->rowIndices );
    clMemRAII< cl_float > rCsrValues( control->queue( ), pCsrMatx->values );
    clMemRAII< cl_int > rCsrColIndices( control->queue( ), pCsrMatx->colIndices );
    clMemRAII< cl_int > rCsrRowOffsets( control->queue( ), pCsrMatx->rowOffsets );

    cl_float* fCooValues = rCooValues.clMapMem( CL_TRUE, CL_MAP_READ, pCooMatx->valOffset( ), pCooMatx->num_nonzeros );
    cl_int* iCooColIndices = rCooColIndices.clMapMem( CL_TRUE, CL_MAP_READ, pCooMatx->colIndOffset( ), pCooMatx->num_nonzeros );
    cl_int* iCooRowIndices = rCooRowIndices.clMapMem( CL_TRUE, CL_MAP_READ, pCooMatx->rowOffOffset( ), pCooMatx->num_nonzeros );

    cl_float* fCsrValues = rCsrValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->valOffset( ), pCsrMatx->num_nonzeros );
    cl_int* iCsrColIndices = rCsrColIndices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->colIndOffset( ), pCsrMatx->num_nonzeros );
    cl_int* iCsrRowOffsets = rCsrRowOffsets.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1 );

    coo2csr_transform( fCooValues, iCooColIndices, iCooRowIndices, pCooMatx->num_nonzeros, fCsrValues, iCsrColIndices, iCsrRowOffsets );

    return clsparseSuccess;
}
Beispiel #7
0
clsparseStatus
clsparseScsr2coo(const clsparseCsrMatrix* csr,
                 clsparseCooMatrix* coo,
                 const clsparseControl control)
{

    const clsparseCsrMatrixPrivate* pCsr = static_cast<const clsparseCsrMatrixPrivate*>(csr);
    clsparseCooMatrixPrivate* pCoo = static_cast<clsparseCooMatrixPrivate*>(coo);

    pCoo->num_rows = pCsr->num_rows;
    pCoo->num_cols = pCsr->num_cols;
    pCoo->num_nonzeros = pCsr->num_nonzeros;  

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    clsparseStatus status;

    //validate cl_mem objects
    status = validateMemObject(pCoo->rowIndices, sizeof(cl_int)* pCoo->num_nonzeros);
    if(status != clsparseSuccess)
        return status;

    status = validateMemObject(pCoo->colIndices, sizeof(cl_int)* pCoo->num_nonzeros);
    if(status != clsparseSuccess)
        return status;

    status = validateMemObject(pCoo->values, sizeof(cl_float)* pCoo->num_nonzeros);
    if(status != clsparseSuccess)
        return status;

    //validate cl_mem sizes
    //TODO: ask about validateMemObjectSize
    cl_uint nnz_per_row = pCoo->num_nonzeros / pCoo->num_rows; //average num_nonzeros per row
    cl_uint wave_size = control->wavefront_size;
    cl_uint group_size = 256; //wave_size * 8;    // 256 gives best performance!
    cl_uint subwave_size = wave_size;

    // adjust subwave_size according to nnz_per_row;
    // each wavefron will be assigned to the row of the csr matrix
    if(wave_size > 32)
    {
        //this apply only for devices with wavefront > 32 like AMD(64)
        if (nnz_per_row < 64) {  subwave_size = 32;  }
    }
    if (nnz_per_row < 32) {  subwave_size = 16;  }
    if (nnz_per_row < 16) {  subwave_size = 8;  }
    if (nnz_per_row < 8)  {  subwave_size = 4;  }
    if (nnz_per_row < 4)  {  subwave_size = 2;  }


    const std::string params = std::string() +
            "-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<cl_float>::type
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DWG_SIZE=" + std::to_string(group_size)
            + " -DWAVE_SIZE=" + std::to_string(wave_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);



    //TODO add error handling
    //copy indices
    clEnqueueCopyBuffer(control->queue(),
                        pCsr-> colIndices,
                        pCoo-> colIndices,
                        0,
                        0,
                        sizeof(cl_int) * pCoo->num_nonzeros,
                        0,
                        NULL,
                        NULL);

    //copy values
    clEnqueueCopyBuffer(control->queue(),
                        pCsr-> values,
                        pCoo-> values,
                        0,
                        0,
                        sizeof(cl_float) * pCoo->num_nonzeros,
                        0,
                        NULL,
                        NULL);


    return csr2coo_transform( pCoo->num_rows, pCoo->num_cols,
                             pCsr->rowOffsets,
                             pCoo->rowIndices,
                             params,
                             group_size,
                             subwave_size,
                             control);

}
Beispiel #8
0
clsparseStatus
dense_to_coo(clsparseCooMatrix* coo,
             const clsparse::vector<V>& A,
             const clsparse::vector<I>& nnz_locations,
             const clsparse::vector<I>& coo_indexes,
             const clsparseControl control)
{
    typedef typename clsparse::vector<V>::size_type SizeType;

    assert(coo->num_nonzeros > 0);
    assert(coo->num_cols > 0);
    assert(coo->num_rows > 0);

    assert(A.size() > 0);
    assert(nnz_locations.size() > 0);
    assert(coo_indexes.size() > 0);

    SizeType dense_size = A.size();

    cl_int cl_status;

    coo->values = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                  coo->num_nonzeros * sizeof(V), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo values buffer");

    coo->colIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                      coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo col indices buffer");

    coo->rowIndices = clCreateBuffer(control->getContext()(), CL_MEM_READ_WRITE,
                                     coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo row indices buffer");



    SizeType workgroup_size   = 256;
    SizeType global_work_size = 0;

    if (dense_size % workgroup_size == 0)
        global_work_size = dense_size;
    else
        global_work_size = dense_size / workgroup_size * workgroup_size + workgroup_size;

    if (dense_size < workgroup_size) global_work_size = workgroup_size;

    const std::string params = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<I>::type
            + " -DSIZE_TYPE="  + OclTypeTraits<SizeType>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<V>::type
            + " -DWG_SIZE=" + std::to_string(workgroup_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program;

    //cl::Kernel kernel = KernelCache::get(control->queue,"dense2csr", "spread_value", params);
    cl::Kernel kernel = KernelCache::get(control->queue,"conversion_utils",
                                         "scatter_coo_locations", params);

    KernelWrap kWrapper(kernel);

    kWrapper << (SizeType) coo->num_rows
             << (SizeType) coo->num_cols
             << (SizeType) dense_size
             << A.data()
             << nnz_locations.data()
             << coo_indexes.data()
             << coo->rowIndices
             << coo->colIndices
             << coo->values;

    cl::NDRange local(workgroup_size);
    cl::NDRange global(global_work_size);

    cl_status = kWrapper.run(control, global, local);

    CLSPARSE_V(cl_status, "Error process scaninput");

    if (cl_status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    return clsparseSuccess;
}
Beispiel #9
0
clsparseStatus
indices_to_offsets(clsparse::vector<T>& offsets,
                   const clsparse::vector<T>& indices,
                   const clsparseControl control)
{
    typedef typename clsparse::vector<T> IndicesArray;
    typedef typename clsparse::vector<T>::size_type SizeType;

    //if (std::is_integral<T>)

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    SizeType size = indices.size() > offsets.size() ? indices.size() : offsets.size();

    IndicesArray values (control, indices.size(), 1, CL_MEM_READ_WRITE, true);
    IndicesArray keys_output (control, indices.size(), 0, CL_MEM_READ_WRITE, false);
    IndicesArray values_output (control, size, 0, CL_MEM_READ_WRITE, false);

    clsparseStatus status =
            internal::reduce_by_key(keys_output, values_output,
                                    indices, values, control);

    CLSPARSE_V(status, "Error: reduce_by_key");

    if (status != clsparseSuccess)
        return status;

    assert(values_output.size() >= offsets.size());

    cl_event clEvent;
    cl_int cl_status  = clEnqueueCopyBuffer(control->queue(),
                                            values_output.data()(),
                                            offsets.data()(),
                                            0,
                                            0,
                                            offsets.size() * sizeof(T),
                                            0,
                                            nullptr,
                                            &clEvent);

    CLSPARSE_V(cl_status, "Error: Enqueue copy buffer values to offsets");

    cl_status = clWaitForEvents(1, &clEvent);

    CLSPARSE_V(cl_status, "Error: clWaitForEvents");

    cl_status = clReleaseEvent(clEvent);

    CLSPARSE_V(cl_status, "Error: clReleaseEvent");

    // Dunno why but this throws CL_INVALID_CONTEXT erro;
    //    cl::Event event;
    //    cl::enqueueCopyBuffer(values_output.data(), offsets.data(),
    //                                          0, 0, offsets.size(),
    //                                          nullptr, &event);

    //    CLSPARSE_V(cl_status, "Error: enqueueCopyBuffer");
    //    CLSPARSE_V(event.wait(), "Error: event wait");

    status = exclusive_scan<EW_PLUS>(offsets, offsets, control);

    return status;
}
Beispiel #10
0
clsparseStatus
clsparseDCsrMatrixfromFile( clsparseCsrMatrix* csrMatx, const char* filePath, clsparseControl control, cl_bool read_explicit_zeroes )
{
    clsparseCsrMatrixPrivate* pCsrMatx = static_cast<clsparseCsrMatrixPrivate*>( csrMatx );

    // Check that the file format is matrix market; the only format we can read right now
    // This is not a complete solution, and fails for directories with file names etc...
    // TODO: Should we use boost filesystem?
    std::string strPath( filePath );
    if( strPath.find_last_of( '.' ) != std::string::npos )
    {
        std::string ext = strPath.substr( strPath.find_last_of( '.' ) + 1 );
        if( ext != "mtx" )
            return clsparseInvalidFileFormat;
    }
    else
        return clsparseInvalidFileFormat;

    // Read data from a file on disk into CPU buffers
    // Data is read natively as COO format with the reader
    MatrixMarketReader< cl_double > mm_reader;
    if( mm_reader.MMReadFormat( filePath, read_explicit_zeroes ) )
        return clsparseInvalidFile;

    // BUG: We need to check to see if openCL buffers currently exist and deallocate them first!
    // FIX: Below code will check whether the buffers were allocated in the first place;
    {
        clsparseStatus validationStatus = validateMemObject(pCsrMatx->values,
                                                            mm_reader.GetNumNonZeroes() * sizeof(cl_double));

        // I dont want to reallocate buffer because I suppress the users buffer memory flags;
        // It is users responsibility to provide good buffer;
        if (validationStatus != clsparseSuccess)
            return validationStatus;

        validationStatus = validateMemObject(pCsrMatx->col_indices,
                                             mm_reader.GetNumNonZeroes() * sizeof(clsparseIdx_t));
        if (validationStatus != clsparseSuccess)
            return validationStatus;

        validationStatus = validateMemObject(pCsrMatx->row_pointer,
                                             (mm_reader.GetNumRows() + 1) * sizeof(clsparseIdx_t));
        if (validationStatus != clsparseSuccess)
            return validationStatus;
    }



    pCsrMatx->num_rows = mm_reader.GetNumRows( );
    pCsrMatx->num_cols = mm_reader.GetNumCols( );
    pCsrMatx->num_nonzeros = mm_reader.GetNumNonZeroes( );

    // Transfers data from CPU buffer to GPU buffers
    cl_int mapStatus = 0;
    clMemRAII< cl_double > rCsrValues( control->queue( ), pCsrMatx->values);
    clMemRAII< clsparseIdx_t > rCsrcol_indices( control->queue( ), pCsrMatx->col_indices );
    clMemRAII< clsparseIdx_t > rCsrrow_pointer( control->queue( ), pCsrMatx->row_pointer );

    cl_double* fCsrValues =
            rCsrValues.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                 pCsrMatx->valOffset( ), pCsrMatx->num_nonzeros, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrValues failed");
        return clsparseInvalidMemObj;
    }

    clsparseIdx_t* iCsrcol_indices =
            rCsrcol_indices.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                     pCsrMatx->colIndOffset( ), pCsrMatx->num_nonzeros, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrcol_indices failed");
        return clsparseInvalidMemObj;
    }

    clsparseIdx_t* iCsrrow_pointer =
            rCsrrow_pointer.clMapMem( CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION,
                                     pCsrMatx->rowOffOffset( ), pCsrMatx->num_rows + 1, &mapStatus );
    if (mapStatus != CL_SUCCESS)
    {
        CLSPARSE_V(mapStatus, "Error: Mapping rCsrrow_pointer failed");
        return clsparseInvalidMemObj;
    }

    //  The following section of code converts the sparse format from COO to CSR
    Coordinate< cl_double >* coords = mm_reader.GetUnsymCoordinates( );
    std::sort( coords, coords + pCsrMatx->num_nonzeros, CoordinateCompare< cl_double > );

    clsparseIdx_t current_row = 1;
    iCsrrow_pointer[ 0 ] = 0;
    for (clsparseIdx_t i = 0; i < pCsrMatx->num_nonzeros; i++)
    {
        iCsrcol_indices[ i ] = coords[ i ].y;
        fCsrValues[ i ] = coords[ i ].val;

        while( coords[ i ].x >= current_row )
            iCsrrow_pointer[ current_row++ ] = i;
    }
    iCsrrow_pointer[ current_row ] = pCsrMatx->num_nonzeros;
    while( current_row <= pCsrMatx->num_rows )
        iCsrrow_pointer[ current_row++ ] = pCsrMatx->num_nonzeros;

    return clsparseSuccess;
}
Beispiel #11
0
clsparseStatus
scan(VectorType& output, const VectorType& input,
     clsparseControl control, bool exclusive)
{
    typedef typename VectorType::size_type SizeType; //check for cl_ulong
    typedef typename VectorType::value_type T;

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    assert (input.size() == output.size());

    SizeType num_elements = input.size();

    //std::cout << "num_elements = " << num_elements << std::endl;

    SizeType KERNEL02WAVES = 4;
    SizeType KERNEL1WAVES = 4;
    SizeType WAVESIZE = control->wavefront_size;

    SizeType kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    SizeType kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    SizeType kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    SizeType numElementsRUP = num_elements;
    SizeType modWgSize = (numElementsRUP & ((kernel0_WgSize*2)-1));

    if( modWgSize )
    {
        numElementsRUP &= ~modWgSize;
        numElementsRUP += (kernel0_WgSize*2);
    }

    //2 element per work item
    SizeType numWorkGroupsK0 = numElementsRUP / (kernel0_WgSize*2);

    SizeType sizeScanBuff = numWorkGroupsK0;

    modWgSize = (sizeScanBuff & ((kernel0_WgSize*2)-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += (kernel0_WgSize*2);
    }

    cl::Context ctx = control->getContext();

    clsparse::vector<T> preSumArray(control, sizeScanBuff,
                                    0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> preSumArray1(control, sizeScanBuff,
                                     0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> postSumArray(control, sizeScanBuff,
                                     0, CL_MEM_READ_WRITE, false);

    T operator_identity = 0;

    //std::cout << "operator_identity = " << operator_identity << std::endl;
    //scan in blocks
    {
        //local mem size
        std::size_t lds = kernel0_WgSize * 2 * sizeof(T);

        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(kernel0_WgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "per_block_inclusive_scan", params);

        KernelWrap kWrapper(kernel);


        kWrapper << input.data()
                 << operator_identity
                 << (SizeType)input.size()
                 << cl::Local(lds)
                 << preSumArray.data()
                 << preSumArray1.data()
                 << (int) exclusive;

        cl::NDRange global(numElementsRUP/2);
        cl::NDRange local (kernel0_WgSize);

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: per_block_inclusive_scan");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }

    }


    {
        //local mem size
        std::size_t lds = kernel0_WgSize * sizeof(T);

        SizeType workPerThread = sizeScanBuff / kernel1_WgSize;

        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(kernel1_WgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "intra_block_inclusive_scan", params);

        KernelWrap kWrapper(kernel);

        kWrapper << postSumArray.data()
                 << preSumArray.data()
                 << operator_identity
                 << numWorkGroupsK0
                 << cl::Local(lds)
                 << workPerThread;

        cl::NDRange global ( kernel1_WgSize );
        cl::NDRange local  ( kernel1_WgSize );

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: intra_block_inclusive_scan");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    {
        std::size_t lds = kernel0_WgSize * sizeof(T); //local mem size

        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(kernel1_WgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "per_block_addition", params);

        KernelWrap kWrapper(kernel);

        kWrapper << output.data()
                 << input.data()
                 << postSumArray.data()
                 << preSumArray1.data()
                 << cl::Local(lds)
                 << num_elements
                 << (int)exclusive
                 << operator_identity;

        cl::NDRange global ( numElementsRUP );
        cl::NDRange local  ( kernel2_WgSize );

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: per_block_addition");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;

}
Beispiel #12
0
clsparseStatus
reduce_by_key(
    int keys_first,
    int keys_last,
    int values_first,
    cl_mem keys_input,
    cl_mem values_input,
    cl_mem keys_output,
    cl_mem values_output,
    int *count,
    clsparseControl control
)
{

    cl_int l_Error;

    /**********************************************************************************
     * Compile Options
     *********************************************************************************/
    const int kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    const int kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    const int kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    //const std::string params = std::string() +
    //          " -DKERNEL0WORKGROUPSIZE=" + std::to_string(kernel0_WgSize)
    //        + " -DKERNEL1WORKGROUPSIZE=" + std::to_string(kernel1_WgSize)
    //        + " -DKERNEL2WORKGROUPSIZE=" + std::to_string(kernel2_WgSize);
    const std::string params;

    cl::Context context = control->getContext();
    std::vector<cl::Device> dev = context.getInfo<CL_CONTEXT_DEVICES>();
    int computeUnits  = dev[0].getInfo< CL_DEVICE_MAX_COMPUTE_UNITS >( );
    int wgPerComputeUnit = dev[0].getInfo< CL_DEVICE_MAX_WORK_GROUP_SIZE >( );


    int resultCnt = computeUnits * wgPerComputeUnit;
    cl_uint numElements = keys_last - keys_first + 1;

    size_t sizeInputBuff = numElements;
    int modWgSize = (sizeInputBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeInputBuff &= ~modWgSize;
        sizeInputBuff += kernel0_WgSize;
    }
    cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize );

    size_t sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += kernel0_WgSize;
    }

    cl_mem tempArrayVec = clCreateBuffer(context(),CL_MEM_READ_WRITE, (numElements)*sizeof(int), NULL, NULL );

    /**********************************************************************************
     *  Kernel 0
     *********************************************************************************/

    cl::Kernel kernel0 = KernelCache::get(control->queue,"reduce_by_key", "OffsetCalculation", params);

    KernelWrap kWrapper0(kernel0);

    kWrapper0 << keys_input << tempArrayVec
              << numElements;

    cl::NDRange local0(kernel0_WgSize);
    cl::NDRange global0(sizeInputBuff);

    cl_int status = kWrapper0.run(control, global0, local0);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    int init = 0;

    scan(0,
	 numElements - 1,
         tempArrayVec,
         tempArrayVec,
         0,
         0,
         control
         );

    int pattern = 0;
    cl_mem keySumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem preSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem postSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE,(sizeScanBuff)*sizeof(int), NULL, NULL );
    clEnqueueFillBuffer(control->queue(), keySumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), preSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);


    /**********************************************************************************
     *  Kernel 1
     *********************************************************************************/

    cl::Kernel kernel1 = KernelCache::get(control->queue,"reduce_by_key", "perBlockScanByKey", params);

    KernelWrap kWrapper1(kernel1);

    kWrapper1 << tempArrayVec
	      << values_input
              << numElements
	      << keySumArray
	      << preSumArray;

    cl::NDRange local1(kernel0_WgSize);
    cl::NDRange global1(sizeInputBuff);

    status = kWrapper1.run(control, global1, local1);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 2
     *********************************************************************************/
    cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize );

    cl::Kernel kernel2 = KernelCache::get(control->queue,"reduce_by_key", "intraBlockInclusiveScanByKey", params);

    KernelWrap kWrapper2(kernel2);

    kWrapper2 << keySumArray << preSumArray
              << postSumArray << numWorkGroupsK0 << workPerThread;

    cl::NDRange local2(kernel1_WgSize);
    cl::NDRange global2(kernel1_WgSize);

    status = kWrapper2.run(control, global2, local2);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 3
     *********************************************************************************/

    cl::Kernel kernel3 = KernelCache::get(control->queue,"reduce_by_key", "keyValueMapping", params);

    KernelWrap kWrapper3(kernel3);

    kWrapper3 << keys_input << keys_output
              << values_input << values_output << tempArrayVec
              << keySumArray << postSumArray << numElements;

    cl::NDRange local3(kernel0_WgSize);
    cl::NDRange global3(sizeInputBuff);

    status = kWrapper3.run(control, global3, local3);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    int *h_result = (int *) malloc (sizeof(int));

    clEnqueueReadBuffer(control->queue(),
                        tempArrayVec,
                        1,
                       (numElements-1)*sizeof(int),
                        sizeof(int),
                        h_result,
                        0,
                        0,
                        0);

    *count = *(h_result);
    //printf("h_result = %d\n", *count );

    //release buffers
    clReleaseMemObject(tempArrayVec);
    clReleaseMemObject(preSumArray);
    clReleaseMemObject(postSumArray);
    clReleaseMemObject(keySumArray);

    return clsparseSuccess;
}   //end of reduce_by_key