Example #1
clsparseStatus
atomic_reduce(clsparse::array_base<T>& pR,
              const clsparse::array_base<T>& pX,
              const cl_ulong wg_size,
              const clsparseControl control)
{
    assert(wg_size == pX.size());

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(wg_size)
            + " -D" + ReduceOperatorTrait<OP>::operation;

    if (typeid(cl_float) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_FLOAT";
        params.append(options);
    }
    else if (typeid(cl_double) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_DOUBLE";
        params.append(options);
    }
    else if (typeid(cl_int) == typeid(T))
    {
        std::string options = std::string() + " -DATOMIC_INT";
        params.append(options);
    }
    else
    {
        return clsparseInvalidType;
    }

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "atomic_reduce", "reduce_block",
                                         params);

    KernelWrap kWrapper(kernel);

    kWrapper << pR.data();
    kWrapper << pX.data();

    // Given the assert above (wg_size == pX.size()), blocksNum is 1 and the
    // kernel runs as a single work-group.
    int blocksNum = (pX.size() + wg_size - 1) / wg_size;
    int globalSize = blocksNum * wg_size;

    cl::NDRange local(wg_size);
    cl::NDRange global(globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #2
clsparseStatus dot(clsparse::array_base<T>& pR,
                   const clsparse::array_base<T>& pX,
                   const clsparse::array_base<T>& pY,
                   const clsparseControl control)
{

    clsparseStatus status;

    //not strictly necessary, but remember to initialize pR with the proper value
    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 the final reduction can be performed
    // within one block;
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //work-groups per compute unit;
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX.size();
    cl_ulong ySize = pY.size();

    assert (xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        cl::Context context = control->getContext();

        //partial result
        clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                   CL_MEM_READ_WRITE, false);

        status = inner_product<T>(partial, pX, pY, size, REDUCE_BLOCKS_NUMBER,
                                  REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE,
                                  control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
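The two-stage pattern above is easiest to see in plain host code: stage one produces one partial result per block, stage two reduces the 256 partials in a single step. The sketch below is only an illustrative analogue of what the inner_product and atomic_reduce stages compute, not the library's implementation; the i % blocks assignment in particular is a simplification of how work is actually distributed.

#include <cstddef>
#include <numeric>
#include <vector>

// Illustrative host analogue of dot()'s two-stage reduction.
template <typename T>
T two_stage_dot(const std::vector<T>& x, const std::vector<T>& y,
                std::size_t blocks = 256)
{
    std::vector<T> partial(blocks, T(0));
    for (std::size_t i = 0; i < x.size(); ++i)
        partial[i % blocks] += x[i] * y[i];                       // stage 1
    return std::accumulate(partial.begin(), partial.end(), T(0)); // stage 2
}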
Example #3
clsparseStatus
csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
                const clsparseCsrMatrixPrivate* pCsrMatx,
                const clsparse::array_base<T>& pX,
                const clsparse::array_base<T>& pBeta,
                clsparse::array_base<T>& pY,
                clsparseControl control )
{

    const cl_uint group_size = 256;

    std::string params = std::string( )
            + " -DROWBITS=" + std::to_string( ROW_BITS )
            + " -DWGBITS=" + std::to_string( WG_BITS )
            + " -DBLOCKSIZE=" + std::to_string( BLKSIZE );

    if(typeid(T) == typeid(cl_double))
    {
        std::string options = std::string() + " -DDOUBLE";
        params.append(options);
    }

    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmv_adaptive",
                                          "csrmv_adaptive",
                                          params );

    KernelWrap kWrapper( kernel );

    kWrapper << pCsrMatx->values
        << pCsrMatx->colIndices << pCsrMatx->rowOffsets
        << pX.data() << pY.data()
        << pCsrMatx->rowBlocks
        << pAlpha.data() << pBeta.data();
        //<< h_alpha << h_beta;

    // NVIDIA devices do not allow a global work size that is not a
    // multiple of the group size. It is unclear whether this has an
    // impact on performance.
    cl_uint global_work_size = ( pCsrMatx->rowBlockSize - 1 ) * group_size;
    cl::NDRange local( group_size );
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
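The launch size here is one work-group per row block. A minimal sketch of that arithmetic, assuming rowBlocks stores rowBlockSize boundary entries and therefore describes rowBlockSize - 1 blocks (that interpretation is an assumption, not stated in the snippet):

#include <cstddef>

// Sketch of the NDRange arithmetic in csrmv_adaptive above.
std::size_t adaptive_global_size(std::size_t rowBlockSize, std::size_t group_size)
{
    std::size_t gws = (rowBlockSize - 1) * group_size; // one group per row block
    return gws > group_size ? gws : group_size;        // at least one work-group
}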
Example #4
clsparseStatus
axpy(clsparse::array_base<T>& pY,
     const clsparse::array_base<T>& pAlpha,
     const clsparse::array_base<T>& pX,
     const clsparse::array_base<T>& pZ,
     const clsparseControl control)
{
    const int group_size = 256; // this or higher? control->max_wg_size?

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string( group_size )
            + " -D" + ElementWiseOperatorTrait<OP>::operation;

    cl::Kernel kernel = KernelCache::get(control->queue, "blas1", "axpy",
                                         params);

    KernelWrap kWrapper(kernel);

    cl_ulong size = pY.size();
    cl_ulong offset = 0;

    kWrapper << size
             << pY.data()
             << offset
             << pAlpha.data()
             << offset
             << pX.data()
             << offset
             << pZ.data()
             << offset;

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
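Several of these wrappers (Examples #1, #4, #6, #7) size the grid the same way: round the element count up to a whole number of work-groups and let the kernel guard against out-of-range ids. The arithmetic as a standalone helper, with a worked case:

#include <cassert>
#include <cstdint>

// Round the global work size up to a whole multiple of wg_size.
std::uint64_t rounded_global_size(std::uint64_t size, std::uint64_t wg_size)
{
    assert(wg_size > 0);
    std::uint64_t blocks = (size + wg_size - 1) / wg_size; // ceil(size / wg_size)
    return blocks * wg_size;
}

// e.g. rounded_global_size(1000, 256) == 1024: four work-groups of 256.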
Example #5
clsparseStatus
inner_product (clsparse::array_base<T>& partial,
               const clsparse::array_base<T>& pX,
               const clsparse::array_base<T>& pY,
               const cl_ulong size,
               const cl_ulong REDUCE_BLOCKS_NUMBER,
               const cl_ulong REDUCE_BLOCK_SIZE,
               const clsparseControl control)
{

    cl_ulong nthreads = REDUCE_BLOCK_SIZE * REDUCE_BLOCKS_NUMBER;

    std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE)
            + " -DREDUCE_BLOCK_SIZE=" + std::to_string(REDUCE_BLOCK_SIZE)
            + " -DN_THREADS=" + std::to_string(nthreads);

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "dot", "inner_product", params);

    KernelWrap kWrapper(kernel);

    kWrapper << size
             << partial.data()
             << pX.data()
             << pY.data();

    cl::NDRange local(REDUCE_BLOCK_SIZE);
    cl::NDRange global(REDUCE_BLOCKS_NUMBER * REDUCE_BLOCK_SIZE);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #6
clsparseStatus
scale(clsparse::array_base<T>& pVector,
      const clsparse::array_base<T>& pAlpha,
      clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    const std::string params = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DVALUE_TYPE="+ OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "blas1", "scale",
                                         params);
    KernelWrap kWrapper(kernel);

    cl_ulong size = pVector.size();
    cl_ulong offset = 0;

    kWrapper << size
             << pVector.data()
             << offset
             << pAlpha.data()
             << offset;

    int blocksNum = (size + group_size - 1) / group_size;
    int globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #7
clsparseStatus
scale( clsparse::array_base<T>& pResult,
       const clsparse::array_base<T>& pAlpha,
       const clsparse::array_base<T>& pVector,
       clsparseControl control)
{
    const int group_size = 256;
    //const int group_size = control->max_wg_size;

    std::string params = std::string()
            + " -DVALUE_TYPE="+ OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string(group_size);

    if (sizeof(clsparseIdx_t) == 8)
    {
        std::string options = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append(options);
    }
    else
    {
        std::string options = std::string()
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append(options);
    }

    if(typeid(T) == typeid(cl_double))
    {
        params.append(" -DDOUBLE");
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }

    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "blas1", "scale",
                                         params);
    KernelWrap kWrapper(kernel);

    clsparseIdx_t size = pResult.size();
    clsparseIdx_t offset = 0;

    kWrapper << size
             << pResult.data()
             << offset
             << pVector.data()
             << offset
             << pAlpha.data()
             << offset;

    clsparseIdx_t blocksNum = (size + group_size - 1) / group_size;
    clsparseIdx_t globalSize = blocksNum * group_size;

    cl::NDRange local(group_size);
    cl::NDRange global (globalSize);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
Example #8
clsparseStatus
csrmv_vector(const clsparse::array_base<T>& pAlpha,
       const clsparseCsrMatrixPrivate* pMatx,
       const clsparse::array_base<T>& pX,
       const clsparse::array_base<T>& pBeta,
       clsparse::array_base<T>& pY,
       clsparseControl control)
{
    cl_uint nnz_per_row = pMatx->nnz_per_row(); //average nnz per row
    cl_uint wave_size = control->wavefront_size;
    cl_uint group_size = 256;    // 256 gives best performance!
    cl_uint subwave_size = wave_size;

    // adjust subwave_size according to nnz_per_row;
    // each subwave will be assigned to one row of the CSR matrix
    if(wave_size > 32)
    {
        //this applies only to devices with a wavefront > 32, like AMD (64)
        if (nnz_per_row < 64) {  subwave_size = 32;  }
    }
    if (nnz_per_row < 32) {  subwave_size = 16;  }
    if (nnz_per_row < 16) {  subwave_size = 8;  }
    if (nnz_per_row < 8)  {  subwave_size = 4;  }
    if (nnz_per_row < 4)  {  subwave_size = 2;  }

    const std::string params = std::string() +
            "-DINDEX_TYPE=" + OclTypeTraits<cl_int>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DSIZE_TYPE=" + OclTypeTraits<cl_ulong>::type
            + " -DWG_SIZE=" + std::to_string(group_size)
            + " -DWAVE_SIZE=" + std::to_string(wave_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(subwave_size);


    cl::Kernel kernel = KernelCache::get(control->queue,
                                         "csrmv_general",
                                         "csrmv_general",
                                         params);
    KernelWrap kWrapper(kernel);

    cl_ulong offset  = 0;

    kWrapper << pMatx->num_rows
             << pAlpha.data() << offset
             << pMatx->rowOffsets
             << pMatx->colIndices
             << pMatx->values
             << pX.data() << offset
             << pBeta.data() << offset
             << pY.data() << offset;

    // one subwave takes care of each row of the matrix;
    // predicted number of subwaves to be executed;
    cl_uint predicted = subwave_size * pMatx->num_rows;

    // NVIDIA devices do not allow a global work size that is not a
    // multiple of the group size. It is unclear whether this has an
    // impact on performance.
    cl_uint global_work_size =
            group_size* ((predicted + group_size - 1 ) / group_size);
    cl::NDRange local(group_size);
    //cl::NDRange global(predicted > local[0] ? predicted : local[0]);
    cl::NDRange global(global_work_size > local[0] ? global_work_size : local[0]);

    cl_int status = kWrapper.run(control, global, local);

    if (status != CL_SUCCESS)
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
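The cascade above halves the subwave while the average row is shorter than twice the subwave, so that short rows share a wavefront. Restated as a standalone helper (same thresholds as the code above):

#include <cstdint>

// Same subwave selection as in csrmv_vector above.
std::uint32_t pick_subwave_size(std::uint32_t wave_size, std::uint32_t nnz_per_row)
{
    std::uint32_t subwave = wave_size;
    if (wave_size > 32 && nnz_per_row < 64) subwave = 32; // 64-wide (AMD) waves only
    if (nnz_per_row < 32) subwave = 16;
    if (nnz_per_row < 16) subwave = 8;
    if (nnz_per_row < 8)  subwave = 4;
    if (nnz_per_row < 4)  subwave = 2;
    return subwave;
}

// e.g. wave_size = 64, nnz_per_row = 10 gives subwave = 8,
// i.e. eight rows are processed per 64-lane wavefront.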
Example #9
clsparseStatus
csrmv_adaptive( const clsparse::array_base<T>& pAlpha,
                const clsparseCsrMatrixPrivate* pCsrMatx,
                const clsparse::array_base<T>& pX,
                const clsparse::array_base<T>& pBeta,
                clsparse::array_base<T>& pY,
                clsparseControl control )
{

    const cl_uint group_size = 256;

    std::string params = std::string( )
            + " -DROWBITS=" + std::to_string( ROW_BITS )
            + " -DWGBITS=" + std::to_string( WG_BITS )
            + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
            + " -DWG_SIZE=" + std::to_string( group_size )
            + " -DBLOCKSIZE=" + std::to_string( BLKSIZE )
            + " -DBLOCK_MULTIPLIER=" + std::to_string( BLOCK_MULTIPLIER )
            + " -DROWS_FOR_VECTOR=" + std::to_string( ROWS_FOR_VECTOR );

    if( sizeof( clsparseIdx_t ) == 8 )
    {
        std::string options = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<cl_ulong>::type;
        params.append(options);
    }
    else
    {
        std::string options = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<cl_uint>::type;
        params.append(options);
    }

    std::string options;
    if(typeid(T) == typeid(cl_double))
    {
        options = std::string() + " -DDOUBLE";
        if (!control->dpfp_support)
        {
#ifndef NDEBUG
            std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
            return clsparseInvalidDevice;
        }
    }
    else if(typeid(T) == typeid(cl_ulong))
        options = std::string() + " -DLONG";
    else if(typeid(T) == typeid(cl_long))
        options = std::string() + " -DLONG";

    if(control->extended_precision)
        options += " -DEXTENDED_PRECISION";
    params.append(options);

    cl::Kernel kernel = KernelCache::get( control->queue,
                                          "csrmv_adaptive",
                                          "csrmv_adaptive",
                                          params );

    KernelWrap kWrapper( kernel );

    const matrix_meta* meta_ptr = static_cast< const matrix_meta* >( pCsrMatx->meta );

    kWrapper << pCsrMatx->values
        << pCsrMatx->col_indices << pCsrMatx->row_pointer
        << pX.data() << pY.data()
        << meta_ptr->rowBlocks
        << pAlpha.data() << pBeta.data();
        //<< h_alpha << h_beta;

    // NVIDIA devices do not allow a global work size that is not a
    // multiple of the group size. It is unclear whether this has an
    // impact on performance.
    // Setting global work size to half the row block size because we are only
    // using half the row blocks buffer for actual work.
    // The other half is used for the extended precision reduction.
    clsparseIdx_t global_work_size = ( ( meta_ptr->rowBlockSize/2) - 1 ) * group_size;
    cl::NDRange local( group_size );
    cl::NDRange global( global_work_size > local[ 0 ] ? global_work_size : local[ 0 ] );

    cl_int status = kWrapper.run( control, global, local );

    if( status != CL_SUCCESS )
    {
        return clsparseInvalidKernelExecution;
    }

    return clsparseSuccess;
}
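None of these snippets show how a caller chooses between the adaptive and the general (vector) csrmv path. Below is a hedged sketch of a plausible dispatch, assuming the adaptive kernel requires the precomputed row-blocks meta; the meta check and the explicit <T> call syntax are assumptions, not clSPARSE's actual dispatch code.

// Hypothetical dispatch between the two csrmv variants shown above.
template <typename T>
clsparseStatus
csrmv(const clsparse::array_base<T>& pAlpha,
      const clsparseCsrMatrixPrivate* pCsrMatx,
      const clsparse::array_base<T>& pX,
      const clsparse::array_base<T>& pBeta,
      clsparse::array_base<T>& pY,
      clsparseControl control)
{
    if (pCsrMatx->meta != nullptr)  // row-blocks meta was precomputed
        return csrmv_adaptive<T>(pAlpha, pCsrMatx, pX, pBeta, pY, control);
    return csrmv_vector<T>(pAlpha, pCsrMatx, pX, pBeta, pY, control);
}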