Example #1
clsparseStatus
atomic_reduce(clsparse::array_base<T>& pR,
              const clsparse::array_base<T>& pX,
              const cl_ulong wg_size,
              const clsparseControl control)
{
    assert(wg_size == pX.size());

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(wg_size)
            + " -D" + ReduceOperatorTrait<OP>::operation;

    if (typeid(cl_float) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_FLOAT";
        params.append(options);
    }
    else if (typeid(cl_double) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_DOUBLE";
        params.append(options);
    }
    else if (typeid(cl_int) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_INT";
        params.append(options);
    }
    else
    {
        return clsparseInvalidType;
    }

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "atomic_reduce", "reduce_block",
                                         params);

    KernelWrap kWrapper(kernel);

    kWrapper << pR.data();
    kWrapper << pX.data();

    int blocksNum = (pX.size() + wg_size - 1) / wg_size;
    int globalSize = blocksNum * wg_size;

    cl::NDRange local(wg_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
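
A pattern repeated throughout these wrappers is rounding the global work size up to the nearest multiple of the work-group size via ceiling division. Note that in this particular example the assert pins wg_size to pX.size(), so blocksNum is always 1 and the reduction runs in a single work-group. A minimal standalone sketch of the round-up arithmetic (the helper name round_up_global is ours, not part of clSPARSE):

#include <cstddef>
#include <iostream>

// Hypothetical helper showing the round-up used by these wrappers:
// ceil(n / wg) work-groups of wg work-items each.
std::size_t round_up_global(std::size_t n, std::size_t wg)
{
    std::size_t blocks = (n + wg - 1) / wg; // ceiling division
    return blocks * wg;                     // smallest multiple of wg >= n
}

int main()
{
    std::cout << round_up_global(1000, 256) << '\n'; // 1024 (4 blocks)
    std::cout << round_up_global(256, 256)  << '\n'; // 256  (1 block)
}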
Example #2
clsparseStatus
csrmv_adaptive( const clsparseScalarPrivate* pAlpha,
                const clsparseCsrMatrixPrivate* pCsrMatx,
                const cldenseVectorPrivate* pX,
                const clsparseScalarPrivate* pBeta,
                cldenseVectorPrivate* pY,
                clsparseControl control )
{


    const cl_uint group_size = 256;

    std::string params = std::string( )
    + " -DROWBITS=" + std::to_string( ROW_BITS )
    + " -DWGBITS=" + std::to_string( WG_BITS )
    + " -DBLOCKSIZE=" + std::to_string( BLKSIZE );
#ifdef DOUBLE
    params += " -DDOUBLE";
#endif

    if(typeid(T) == typeid(cl_double))
    {
        std::string options = std::string() + " -DDOUBLE";
        params.append(options);
    }


    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmv_adaptive",
                                          "csrmv_adaptive",
                                          params );

    KernelWrap kWrapper( kernel );

    kWrapper << pCsrMatx->values
        << pCsrMatx->colIndices << pCsrMatx->rowOffsets
        << pX->values << pY->values
        << pCsrMatx->rowBlocks
        << pAlpha->value << pBeta->value;
        //<< h_alpha << h_beta;

    // NVIDIA does not allow a global work size that is not a multiple of
    // the work-group size; it is unclear whether padding the global size
    // up to a multiple has any impact on performance.
    cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
    cl::NDRange local( group_size );
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
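
For reference, csrmv computes the standard sparse matrix-vector product y = alpha * A * x + beta * y with A stored in CSR format. A serial sketch of those semantics in plain C++ (the adaptive kernel's row-block partitioning and reduction strategy are not modeled here):

#include <vector>

// Serial reference for y = alpha * A * x + beta * y, A in CSR format.
void csrmv_reference(float alpha,
                     const std::vector<int>& rowOffsets,  // size num_rows + 1
                     const std::vector<int>& colIndices,  // size nnz
                     const std::vector<float>& values,    // size nnz
                     const std::vector<float>& x,
                     float beta,
                     std::vector<float>& y)
{
    for (std::size_t row = 0; row + 1 < rowOffsets.size(); ++row)
    {
        float sum = 0.0f;
        for (int j = rowOffsets[row]; j < rowOffsets[row + 1]; ++j)
            sum += values[j] * x[colIndices[j]];
        y[row] = alpha * sum + beta * y[row];
    }
}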
Example #3
clsparseStatus
axpby(clsparse::array_base<T>& pY,
      const clsparse::array_base<T>& pAlpha,
      const clsparse::array_base<T>& pX,
      const clsparse::array_base<T>& pBeta,
      const clsparse::array_base<T>& pZ,
      const clsparseControl control)
{

    const int group_size = 256; // this or higher? control->max_wg_size?

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string( group_size )
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpby",
                                         params);

    KernelWrap kWrapper(kernel);

    cl_ulong size = pY.size();

    // clsparse does not support offsets;
    cl_ulong offset = 0;

    kWrapper << size
             << pY.data()
             << offset
             << pAlpha.data()
             << offset
             << pX.data()
             << offset
             << pBeta.data()
             << offset
             << pZ.data()
             << offset;

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
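
Judging from the operand order pushed into the wrapper (y, alpha, x, beta, z), this axpby variant computes y[i] = alpha * x[i] + beta * z[i] element-wise; this is our reading of the host code, not taken from the kernel source. A serial sketch of that semantics:

#include <vector>

// Presumed element-wise semantics of the blas1 axpby kernel:
// y[i] = alpha * x[i] + beta * z[i].
void axpby_reference(std::vector<float>& y, float alpha,
                     const std::vector<float>& x, float beta,
                     const std::vector<float>& z)
{
    for (std::size_t i = 0; i < y.size(); ++i)
        y[i] = alpha * x[i] + beta * z[i];
}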
Example #4
clsparseStatus
axpby(cl_ulong size,
      cldenseVectorPrivate* pY,
      const clsparseScalarPrivate* pAlpha,
      const cldenseVectorPrivate* pX,
      const clsparseScalarPrivate* pBeta,
      const clsparseControl control)
{

    const int group_size = 256; // this or higher? control->max_wg_size?

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string( group_size )
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpby",
                                         params);

    KernelWrap kWrapper(kernel);

    kWrapper << size
             << pY->values
             << pY->offset()
             << pAlpha->value
             << pAlpha->offset()
             << pX->values
             << pX->offset()
             << pBeta->value
             << pBeta->offset()
             << pY->values    // pY is bound again as the final operand,
             << pY->offset(); // so the update happens in place

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #5
clsparseStatus
inner_product (cldenseVectorPrivate* partial,
     const cldenseVectorPrivate* pX,
     const cldenseVectorPrivate* pY,
     const cl_ulong size,
     const cl_ulong REDUCE_BLOCKS_NUMBER,
     const cl_ulong REDUCE_BLOCK_SIZE,
     const clsparseControl control)
{

    cl_ulong nthreads = REDUCE_BLOCK_SIZE * REDUCE_BLOCKS_NUMBER;

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE)
            + " -DREDUCE_BLOCK_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE)
            + " -DN_THREADS=" + std::to_string(nthreads);

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "dot", "inner_product", params);

    KernelWrap kWrapper(kernel);

    kWrapper << size
             << partial->values
             << pX->values
             << pY->values;

    cl::NDRange local(REDUCE_BLOCK_SIZE);
    cl::NDRange global(REDUCE_BLOCKS_NUMBER * REDUCE_BLOCK_SIZE);


    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
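
This launch leaves one partial sum per work-group in 'partial'; a second reduction pass (for example the atomic_reduce of Example #1) is still needed to obtain the final scalar. A serial sketch of the two-stage shape, with illustrative names:

#include <algorithm>
#include <vector>

// Stage 1: each block reduces its slice of x[i] * y[i] to one partial sum.
// Stage 2: the partial sums are reduced to the final dot product.
float inner_product_reference(const std::vector<float>& x,
                              const std::vector<float>& y,
                              std::size_t block_size)
{
    std::size_t n = x.size();
    std::size_t blocks = (n + block_size - 1) / block_size;
    std::vector<float> partial(blocks, 0.0f);

    for (std::size_t b = 0; b < blocks; ++b)           // stage 1
        for (std::size_t i = b * block_size;
             i < std::min(n, (b + 1) * block_size); ++i)
            partial[b] += x[i] * y[i];

    float dot = 0.0f;
    for (float p : partial) dot += p;                  // stage 2
    return dot;
}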
Example #6
clsparseStatus
scale(clsparse::array_base<T>& pVector,
      const clsparse::array_base<T>& pAlpha,
      clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE="+ OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "blas1", "scale",
                                         params);
    KernelWrap kWrapper(kernel);

    cl_ulong size = pVector.size();
    cl_ulong offset = 0;

    kWrapper << size
             << pVector.data()
             << offset
             << pAlpha.data()
             << offset;

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #7
clsparseStatus
csr2coo_transform(const int m, const int n,
                    cl_mem csr_row_offsets,
                    cl_mem coo_row_indices,
                    const std::string& params,
                    const cl_uint group_size,
                    const cl_uint subwave_size,
                    clsparseControl control)
{
    cl::Kernel kernel = KernelCache::get(control->queue,"csr2coo", "csr2coo", params);

    KernelWrap kWrapper(kernel);

    kWrapper << m << n
             << csr_row_offsets
             << coo_row_indices;

    // each subwave handles one row of the matrix;
    // 'predicted' is the estimated total number of work-items;
    cl_uint predicted = subwave_size * m;

    //cl::NDRange local(group_size);
    //cl::NDRange global(predicted > local[0] ? predicted : local[0]);

    cl_uint global_work_size =
            group_size* ((predicted + group_size - 1 ) / group_size);
    cl::NDRange local(group_size);
    cl::NDRange global(global_work_size > local[0] ? global_work_size : local[0]);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;

}
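
The kernel expands the CSR row-offset array into one explicit row index per nonzero, which is exactly the COO row array; column indices and values carry over unchanged. A serial sketch of the expansion:

#include <vector>

// Serial reference: expand CSR row offsets (size m + 1) into one
// row index per nonzero, i.e. the COO row index array.
std::vector<int> csr_offsets_to_coo_rows(const std::vector<int>& rowOffsets)
{
    int m = static_cast<int>(rowOffsets.size()) - 1;
    std::vector<int> cooRows(rowOffsets[m]); // rowOffsets[m] == nnz
    for (int row = 0; row < m; ++row)
        for (int j = rowOffsets[row]; j < rowOffsets[row + 1]; ++j)
            cooRows[j] = row;
    return cooRows;
}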
Example #8
clsparseStatus
scale( clsparse::array_base<T>& pResult,
       const clsparse::array_base<T>& pAlpha,
       const clsparse::array_base<T>& pVector,
       clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    std::string params = std::string()
            + " -DVALUE_TYPE="+ OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    if (sizeof(clsparseIdx_t) == 8)
    {
        std::string options = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append(options);
    }
    else
    {
        std::string options = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append(options);
    }

    if(typeid(T) == typeid(cl_double))
    {
        params.append(" -DDOUBLE");
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "blas1", "scale",
                                         params);
    KernelWrap kWrapper(kernel);

    clsparseIdx_t size = pResult.size();
    clsparseIdx_t offset = 0;

    kWrapper << size
             << pResult.data()
             << offset
             << pVector.data()
             << offset
             << pAlpha.data()
             << offset;

    clsparseIdx_t blocksNum = (size + group_size - 1) / group_size;
    clsparseIdx_t globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #9
clsparseStatus
csrmm( const clsparseScalarPrivate& pAlpha,
const clsparseCsrMatrixPrivate& pSparseCsrA,
const cldenseMatrixPrivate& pDenseB,
const clsparseScalarPrivate& pBeta,
cldenseMatrixPrivate& pDenseC,
const clsparseControl control )
{
    cl_uint nnz_per_row = pSparseCsrA.nnz_per_row( ); //average nnz per row
    cl_uint wave_size = control->wavefront_size;
    cl_uint group_size = 256;    // 256 gives best performance!
    cl_uint subwave_size = wave_size;

    // adjust subwave_size according to nnz_per_row;
    // each wavefront will be assigned to one row of the CSR matrix
    if( wave_size > 32 )
    {
        // this applies only to devices with a wavefront > 32, like AMD (64)
        if( nnz_per_row < 64 ) { subwave_size = 32; }
    }
    if( nnz_per_row < 32 ) { subwave_size = 16; }
    if( nnz_per_row < 16 ) { subwave_size = 8; }
    if( nnz_per_row < 8 )  { subwave_size = 4; }
    if( nnz_per_row < 4 )  { subwave_size = 2; }

    std::string params = std::string( ) +
        "-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
        + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
        + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
        + " -DWG_SIZE=" + std::to_string( group_size )
        + " -DWAVE_SIZE=" + std::to_string( wave_size )
        + " -DSUBWAVE_SIZE=" + std::to_string( subwave_size );

    if( typeid( T ) == typeid( cl_double ) )
    {
        params += " -DDOUBLE";
    }

    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmm_general",
                                          "csrmv_batched",
                                          params );
    KernelWrap kWrapper( kernel );

    kWrapper << pSparseCsrA.num_rows
        << pAlpha.value << pAlpha.offset( )
        << pSparseCsrA.rowOffsets << pSparseCsrA.colIndices << pSparseCsrA.values
        << pDenseB.values << pDenseB.lead_dim << pDenseB.offset( )
        << pBeta.value << pBeta.offset( )
        << pDenseC.values << pDenseC.num_rows << pDenseC.num_cols << pDenseC.lead_dim << pDenseC.offset( );

    // each subwave handles one row of the matrix;
    // 'predicted' is the estimated total number of work-items;
    cl_uint predicted = subwave_size * pSparseCsrA.num_rows;

    // NVIDIA does not allow a global work size that is not a multiple of
    // the work-group size; it is unclear whether padding the global size
    // up to a multiple has any impact on performance.
    cl_uint global_work_size =
        group_size* ( ( predicted + group_size - 1 ) / group_size );
    cl::NDRange local( group_size );
    //cl::NDRange global(predicted > local[0] ? predicted : local[0]);
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
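
For reference, csrmm computes C = alpha * A * B + beta * C with sparse A in CSR and dense B and C. A serial sketch of the math only; column-major storage with leading dimensions ldb/ldc is assumed here for concreteness and may not match the kernel's actual layout:

#include <vector>

// Serial reference: C = alpha * A * B + beta * C, A sparse (CSR),
// B and C dense, assumed column-major with leading dimensions.
void csrmm_reference(float alpha,
                     const std::vector<int>& rowOffsets,
                     const std::vector<int>& colIndices,
                     const std::vector<float>& values,
                     const std::vector<float>& B, std::size_t ldb,
                     float beta,
                     std::vector<float>& C, std::size_t ldc,
                     std::size_t num_rows, std::size_t num_cols_C)
{
    for (std::size_t col = 0; col < num_cols_C; ++col)
        for (std::size_t row = 0; row < num_rows; ++row)
        {
            float sum = 0.0f;
            for (int j = rowOffsets[row]; j < rowOffsets[row + 1]; ++j)
                sum += values[j] * B[col * ldb + colIndices[j]];
            C[col * ldc + row] = alpha * sum + beta * C[col * ldc + row];
        }
}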
Example #10
clsparseStatus
dense_to_coo(clsparseCooMatrix* coo,
             const clsparse::vector<V>& A,
             const clsparse::vector<I>& nnz_locations,
             const clsparse::vector<I>& coo_indexes,
             const clsparseControl control)
{
    typedef typename clsparse::vector<V>::size_type SizeType;

    assert(coo->num_nonzeros > 0);
    assert(coo->num_cols > 0);
    assert(coo->num_rows > 0);

    assert(A.size() > 0);
    assert(nnz_locations.size() > 0);
    assert(coo_indexes.size() > 0);

    SizeType dense_size = A.size();

    cl_int cl_status;

    coo->values = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                  coo->num_nonzeros * sizeof(V), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo values buffer");

    coo->colIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                      coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo col indices buffer");

    coo->rowIndices = clCreateBuffer(control->getContext()(), CL_MEM_READ_WRITE,
                                     coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo row indices buffer");



    SizeType workgroup_size   = 256;
    SizeType global_work_size = 0;

    if (dense_size % workgroup_size == 0)
        global_work_size = dense_size;
    else
        global_work_size = dense_size / workgroup_size * workgroup_size + workgroup_size;

    if (dense_size < workgroup_size) global_work_size = workgroup_size;

    const std::string params = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<I>::type
            + " -DSIZE_TYPE="  + OclTypeTraits<SizeType>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<V>::type
            + " -DWG_SIZE=" + std::to_string(workgroup_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program;

    //cl::Kernel kernel = KernelCache::get(control->queue,"dense2csr", "spread_value", params);
    cl::Kernel kernel = KernelCache::get(control->queue,"conversion_utils",
                                         "scatter_coo_locations", params);

    KernelWrap kWrapper(kernel);

    kWrapper << (SizeType) coo->num_rows
             << (SizeType) coo->num_cols
             << (SizeType) dense_size
             << A.data()
             << nnz_locations.data()
             << coo_indexes.data()
             << coo->rowIndices
             << coo->colIndices
             << coo->values;

    cl::NDRange local(workgroup_size);
    cl::NDRange global(global_work_size);

    cl_status = kWrapper.run(control, global, local);

    CLSPARSE_V(cl_status, "Error: scatter_coo_locations");

    if (cl_status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    return clsparseSuccess;
}
Example #11
clsparseStatus
calculate_num_nonzeros(/*dense matrix*/
                       const clsparse::vector<V>& A,
                       clsparse::vector<I>& nnz_locations,
                       I& num_nonzeros,
                       const clsparseControl control)
{
    typedef typename clsparse::vector<I>::size_type SizeType;

    SizeType dense_size = A.size();

    SizeType workgroup_size   = 256;
    SizeType global_work_size = 0;

    if (dense_size % workgroup_size == 0)
        global_work_size = dense_size;
    else
        global_work_size = dense_size / workgroup_size * workgroup_size + workgroup_size;

    if (dense_size < workgroup_size) global_work_size = workgroup_size;

    const std::string params = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<I>::type
            + " -DSIZE_TYPE="  + OclTypeTraits<SizeType>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<V>::type
            + " -DWG_SIZE=" + std::to_string(workgroup_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program;

    //cl::Kernel kernel = KernelCache::get(control->queue,"dense2csr", "process_scaninput", params);
    cl::Kernel kernel = KernelCache::get(control->queue,"conversion_utils", "scan_nonzero_locations", params);

    KernelWrap kWrapper(kernel);

    kWrapper << dense_size
             << A.data()
             << nnz_locations.data();

    cl::NDRange local(workgroup_size);
    cl::NDRange global(global_work_size);

    cl_int cl_status = kWrapper.run(control, global, local);

    CLSPARSE_V(cl_status, "Error process scaninput");

    if (cl_status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    //TODO: is it just write_only?
    clsparse::vector<I> nnz (control, 1, 0, CL_MEM_READ_WRITE, false);

    // due to this definition, nnz and nnz_locations must be of the same type;
    clsparseStatus status = reduce<I, RO_PLUS>(nnz, nnz_locations, control);

    CLSPARSE_V(status, "Error: reduce");

    if (status!= clsparseSuccess)
        return clsparseInvalidKernelExecution;

    num_nonzeros = nnz[0];
    //std::cout << "NNZ: " << num_nonzeros << std::endl;
    return status;
}
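
The routine is a map followed by a reduce: flag every nonzero entry of the dense input, then sum the flags. A serial sketch of the pair:

#include <vector>

// Serial reference for the map + reduce above:
// nnz_locations[i] = 1 where A[i] != 0, then num_nonzeros = sum of flags.
int count_nonzeros_reference(const std::vector<float>& A,
                             std::vector<int>& nnz_locations)
{
    nnz_locations.assign(A.size(), 0);
    int nnz = 0;
    for (std::size_t i = 0; i < A.size(); ++i)
    {
        nnz_locations[i] = (A[i] != 0.0f) ? 1 : 0;
        nnz += nnz_locations[i];
    }
    return nnz;
}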
Example #12
clsparseStatus
csrmv_adaptive( const clsparseScalarPrivate* pAlpha,
                const clsparseCsrMatrixPrivate* pCsrMatx,
                const cldenseVectorPrivate* pX,
                const clsparseScalarPrivate* pBeta,
                cldenseVectorPrivate* pY,
                clsparseControl control )
{


    const cl_uint group_size = 256;

    std::string params = std::string( )
    + " -DROWBITS=" + std::to_string( ROW_BITS )
    + " -DWGBITS=" + std::to_string( WG_BITS )
    + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
    + " -DWG_SIZE=" + std::to_string( group_size )
    + " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
    + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
    + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

    if( sizeof( clsparseIdx_t ) == 8 )
    {
        std::string options = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append(options);
    }
    else
    {
        std::string options = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append(options);
    }

    std::string options;
    if(typeid(T) == typeid(cl_double))
    {
        options = std::string() + " -DDOUBLE";
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }
    else if(typeid(T) == typeid(cl_ulong))
        options = std::string() + " -DLONG";
    else if(typeid(T) == typeid(cl_long))
        options = std::string() + " -DLONG";

    if(control->extended_precision)
        options += " -DEXTENDED_PRECISION";
    params.append(options);

    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmv_adaptive",
                                          "csrmv_adaptive",
                                          params );

    KernelWrap kWrapper( kernel );

    const matrix_meta* meta_ptr = static_cast< const matrix_meta* >( pCsrMatx->meta );

    kWrapper << pCsrMatx->values
        << pCsrMatx->col_indices << pCsrMatx->row_pointer
        << pX->values << pY->values
        << meta_ptr->rowBlocks
        << pAlpha->value << pBeta->value;
        //<< h_alpha << h_beta;

    // NVIDIA does not allow a global work size that is not a multiple of
    // the work-group size; it is unclear whether padding the global size
    // up to a multiple has any impact on performance.
    // Setting global work size to half the row block size because we are only
    // using half the row blocks buffer for actual work.
    // The other half is used for the extended precision reduction.
    clsparseIdx_t global_work_size = ( ( meta_ptr->rowBlockSize/2) - 1 ) * group_size;
    cl::NDRange local( group_size );
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #13
File: scan.hpp, Project: 10imaging/clSPARSE
clsparseStatus
scan(VectorType& output, const VectorType& input,
     clsparseControl control, bool exclusive)
{
    typedef typename VectorType::size_type SizeType; //check for cl_ulong
    typedef typename VectorType::value_type T;

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    assert (input.size() == output.size());

    SizeType num_elements = input.size();

    //std::cout << "num_elements = " << num_elements << std::endl;

    SizeType KERNEL02WAVES = 4;
    SizeType KERNEL1WAVES = 4;
    SizeType WAVESIZE = control->wavefront_size;

    SizeType kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    SizeType kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    SizeType kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    SizeType numElementsRUP = num_elements;
    SizeType modWgSize = (numElementsRUP & ((kernel0_WgSize*2)-1));

    if( modWgSize )
    {
        numElementsRUP &= ~modWgSize;
        numElementsRUP += (kernel0_WgSize*2);
    }

    // 2 elements per work-item
    SizeType numWorkGroupsK0 = numElementsRUP / (kernel0_WgSize*2);

    SizeType sizeScanBuff = numWorkGroupsK0;

    modWgSize = (sizeScanBuff & ((kernel0_WgSize*2)-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += (kernel0_WgSize*2);
    }

    cl::Context ctx = control->getContext();

    clsparse::vector<T> preSumArray(control, sizeScanBuff,
                                    0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> preSumArray1(control, sizeScanBuff,
                                     0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> postSumArray(control, sizeScanBuff,
                                     0, CL_MEM_READ_WRITE, false);

    T operator_identity = 0;

    //std::cout << "operator_identity = " << operator_identity << std::endl;
    //scan in blocks
    {
        //local mem size
        std::size_t lds = kernel0_WgSize * 2 * sizeof(T);

        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(kernel0_WgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "per_block_inclusive_scan", params);

        KernelWrap kWrapper(kernel);


        kWrapper << input.data()
                 << operator_identity
                 << (SizeType)input.size()
                 << cl::Local(lds)
                 << preSumArray.data()
                 << preSumArray1.data()
                 << (int) exclusive;

        cl::NDRange global(numElementsRUP/2);
        cl::NDRange local (kernel0_WgSize);

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: per_block_inclusive_scan");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }

    }


    {
        //local mem size
        std::size_t lds = kernel0_WgSize * sizeof(T);

        SizeType workPerThread = sizeScanBuff / kernel1_WgSize;

        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(kernel1_WgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "intra_block_inclusive_scan", params);

        KernelWrap kWrapper(kernel);

        kWrapper << postSumArray.data()
                 << preSumArray.data()
                 << operator_identity
                 << numWorkGroupsK0
                 << cl::Local(lds)
                 << workPerThread;

        cl::NDRange global ( kernel1_WgSize );
        cl::NDRange local  ( kernel1_WgSize );

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: intra_block_inclusive_scan");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    {
        std::size_t lds = kernel0_WgSize * sizeof(T); //local mem size

        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(kernel1_WgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
            params.append(options);
        }
        else
        {
            std::string options = std::string()
                + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
            params.append(options);
        }

        if(typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
            if (!control->dpfp_support)
            {
#ifndef NDEBUG
                std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
                return clsparseInvalidDevice;
            }
        }

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "per_block_addition", params);

        KernelWrap kWrapper(kernel);

        kWrapper << output.data()
                 << input.data()
                 << postSumArray.data()
                 << preSumArray1.data()
                 << cl::Local(lds)
                 << num_elements
                 << (int)exclusive
                 << operator_identity;

        cl::NDRange global ( numElementsRUP );
        cl::NDRange local  ( kernel2_WgSize );

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: per_block_addition");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;

}
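
The three kernels above implement the classic block-wise scan: (1) scan each block locally and record its block sum, (2) scan the block sums, (3) add each block's scanned offset back into its elements. A serial sketch of that structure for an inclusive plus-scan (block size and types simplified; the exclusive case and the preSumArray1 bookkeeping are omitted):

#include <algorithm>
#include <vector>

// Serial sketch of the three-phase block-wise inclusive plus-scan.
std::vector<int> scan_reference(const std::vector<int>& in, std::size_t block)
{
    std::size_t n = in.size();
    std::size_t nblocks = (n + block - 1) / block;
    std::vector<int> out(n), blockSums(nblocks, 0);

    for (std::size_t b = 0; b < nblocks; ++b)   // phase 1: per-block scan
    {
        int running = 0;
        for (std::size_t i = b * block; i < std::min(n, (b + 1) * block); ++i)
            out[i] = (running += in[i]);
        blockSums[b] = running;
    }

    int offset = 0;                             // phase 2: scan the block sums
    for (std::size_t b = 0; b < nblocks; ++b)
    {
        int s = blockSums[b];
        blockSums[b] = offset;                  // exclusive offset per block
        offset += s;
    }

    for (std::size_t b = 1; b < nblocks; ++b)   // phase 3: add offsets back
        for (std::size_t i = b * block; i < std::min(n, (b + 1) * block); ++i)
            out[i] += blockSums[b];
    return out;
}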
Example #14
clsparseStatus
extract_diagonal(cldenseVectorPrivate* pDiag,
                 const clsparseCsrMatrixPrivate* pA,
                 clsparseControl control)
{
    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    assert (pA->num_rows > 0);
    assert (pA->num_cols > 0);
    assert (pA->num_nonzeros > 0);

    assert (pDiag->num_values == std::min(pA->num_rows, pA->num_cols));

    cl_ulong wg_size = 256;
    cl_ulong size = pA->num_rows;

    cl_ulong nnz_per_row = pA->nnz_per_row();
    cl_ulong wave_size = control->wavefront_size;
    cl_ulong subwave_size = wave_size;

    // adjust subwave_size according to nnz_per_row;
    // each wavefront will be assigned to one row of the CSR matrix
    if(wave_size > 32)
    {
        // this applies only to devices with a wavefront > 32, like AMD (64)
        if (nnz_per_row < 64) {  subwave_size = 32;  }
    }
    if (nnz_per_row < 32) {  subwave_size = 16;  }
    if (nnz_per_row < 16) {  subwave_size = 8;  }
    if (nnz_per_row < 8)  {  subwave_size = 4;  }
    if (nnz_per_row < 4)  {  subwave_size = 2;  }


    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(wg_size)
            + " -DWAVE_SIZE=" + std::to_string(wave_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);

    if (inverse)
        params.append(" -DOP_DIAG_INVERSE");

    if(typeid(T) == typeid(cl_double))
    {
        params.append(" -DDOUBLE");
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }

    cl::Kernel kernel = KernelCache::get(control->queue, "matrix_utils",
                                         "extract_diagonal", params);

    KernelWrap kWrapper(kernel);

    kWrapper << size
             << pDiag->values
             << pA->rowOffsets
             << pA->colIndices
             << pA->values;

    cl_uint predicted = subwave_size * size;

    cl_uint global_work_size =
            wg_size * ((predicted + wg_size - 1 ) / wg_size);
    cl::NDRange local(wg_size);
    //cl::NDRange global(predicted > local[0] ? predicted : local[0]);
    cl::NDRange global(global_work_size > local[0] ? global_work_size : local[0]);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
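
For reference, the kernel gathers diag[i] = A[i][i] from the CSR structure; with -DOP_DIAG_INVERSE it presumably stores the reciprocal instead (useful for Jacobi-style preconditioners). A serial sketch:

#include <vector>

// Serial reference: scan row i's nonzeros for the entry whose column
// index equals i; that entry is the diagonal element.
void extract_diagonal_reference(std::vector<float>& diag,
                                const std::vector<int>& rowOffsets,
                                const std::vector<int>& colIndices,
                                const std::vector<float>& values)
{
    for (std::size_t row = 0; row + 1 < rowOffsets.size(); ++row)
        for (int j = rowOffsets[row]; j < rowOffsets[row + 1]; ++j)
            if (colIndices[j] == static_cast<int>(row))
                diag[row] = values[j];
}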