Example #1
0
/// @brief Dot product pR = <pX, pY> of two dense vectors.
/// Computes per-block partial products on the device, then reduces the
/// partial results into the scalar pR with an atomic reduction kernel.
/// @param pR      output scalar; initialized to 0 before the reduction
/// @param pX      first input vector
/// @param pY      second input vector (must have the same effective size as pX)
/// @param control clSPARSE control object providing queue/context
/// @return clsparseSuccess, or clsparseInvalidKernelExecution on kernel failure
clsparseStatus dot(clsparseScalarPrivate* pR,
                   const cldenseVectorPrivate* pX,
                   const cldenseVectorPrivate* pY,
                   const clsparseControl control)
{
    // Initialize the result to 0 so an early-out on empty input still
    // leaves pR in a well-defined state.
    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 final reduction can be performed
    // within one block;
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //workgroups per compute units;
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    // Effective number of elements of each vector past its offset.
    cl_ulong xSize = pX->num_values - pX->offset();
    cl_ulong ySize = pY->num_values - pY->offset();

    assert (xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        // Partial result: one entry per reduction block.
        cldenseVectorPrivate partial;
        clsparseInitVector(&partial);
        partial.num_values = REDUCE_BLOCKS_NUMBER;

        // RAII wrapper releases the temporary buffer when leaving this scope.
        clMemRAII<T> rPartial (control->queue(), &partial.values, partial.num_values);

        // BUGFIX: status was declared cl_int and compared against a mix of
        // clsparseSuccess and CL_SUCCESS; use clsparseStatus consistently.
        clsparseStatus status =
            inner_product<T>(&partial, pX, pY, size, REDUCE_BLOCKS_NUMBER,
                             REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, &partial, REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
Example #2
0
/// @brief Dot product pR = <pX, pY> for clsparse::array_base containers.
/// Computes per-block partial products on the device, then reduces the
/// partial results into pR with an atomic reduction kernel.
/// @param pR      output scalar container; initialized to 0 up front
/// @param pX      first input vector
/// @param pY      second input vector (must have the same size as pX)
/// @param control clSPARSE control object providing queue/context
/// @return clsparseSuccess, or clsparseInvalidKernelExecution on kernel failure
clsparseStatus dot(clsparse::array_base<T>& pR,
                   const clsparse::array_base<T>& pX,
                   const clsparse::array_base<T>& pY,
                   const clsparseControl control)
{
    //not necessary to have it, but remember to init the pR with the proper value
    init_scalar(pR, (T)0, control);

    // with REDUCE_BLOCKS_NUMBER = 256 final reduction can be performed
    // within one block;
    const cl_ulong REDUCE_BLOCKS_NUMBER = 256;

    /* For future optimisation
    //workgroups per compute units;
    const cl_uint  WG_PER_CU = 64;
    const cl_ulong REDUCE_BLOCKS_NUMBER = control->max_compute_units * WG_PER_CU;
    */
    const cl_ulong REDUCE_BLOCK_SIZE = 256;

    cl_ulong xSize = pX.size();
    cl_ulong ySize = pY.size();

    assert (xSize == ySize);

    cl_ulong size = xSize;

    if (size > 0)
    {
        // Partial result: one entry per reduction block.
        clsparse::vector<T> partial(control, REDUCE_BLOCKS_NUMBER, 0,
                                   CL_MEM_READ_WRITE, false);

        // BUGFIX: status was declared cl_int and compared against a mix of
        // clsparseSuccess and CL_SUCCESS; use clsparseStatus consistently.
        clsparseStatus status =
            inner_product<T>(partial, pX, pY, size, REDUCE_BLOCKS_NUMBER,
                             REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }

        status = atomic_reduce<T>(pR, partial, REDUCE_BLOCK_SIZE, control);

        if (status != clsparseSuccess)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
Example #3
0
File: scan.hpp  Project: 10imaging/clSPARSE
/// @brief Device-side (exclusive or inclusive) scan of `input` into `output`.
/// Three-phase algorithm: (1) per-block inclusive scan, (2) scan of the
/// per-block sums, (3) per-block addition of the propagated sums.
/// @param output    destination vector, same size as input
/// @param input     source vector
/// @param control   clSPARSE control object (queue, wavefront size, DPFP flag)
/// @param exclusive true for an exclusive scan, false for inclusive
/// @return clsparseSuccess, or an error status on bad state/kernel failure
clsparseStatus
scan(VectorType& output, const VectorType& input,
     clsparseControl control, bool exclusive)
{
    typedef typename VectorType::size_type SizeType; //check for cl_ulong
    typedef typename VectorType::value_type T;

    if (!clsparseInitialized)
    {
        return clsparseNotInitialized;
    }

    //check opencl elements
    if (control == nullptr)
    {
        return clsparseInvalidControlObject;
    }

    assert (input.size() == output.size());

    SizeType num_elements = input.size();

    SizeType KERNEL02WAVES = 4;
    SizeType KERNEL1WAVES = 4;
    SizeType WAVESIZE = control->wavefront_size;

    SizeType kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    SizeType kernel1_WgSize = WAVESIZE*KERNEL1WAVES;
    SizeType kernel2_WgSize = WAVESIZE*KERNEL02WAVES;

    // Round the element count up to a multiple of 2*WG size; kernel 0
    // processes two elements per work item.
    SizeType numElementsRUP = num_elements;
    SizeType modWgSize = (numElementsRUP & ((kernel0_WgSize*2)-1));

    if( modWgSize )
    {
        numElementsRUP &= ~modWgSize;
        numElementsRUP += (kernel0_WgSize*2);
    }

    //2 element per work item
    SizeType numWorkGroupsK0 = numElementsRUP / (kernel0_WgSize*2);

    // Scratch buffer for per-block sums, also rounded up.
    SizeType sizeScanBuff = numWorkGroupsK0;

    modWgSize = (sizeScanBuff & ((kernel0_WgSize*2)-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += (kernel0_WgSize*2);
    }

    // Fail early if a double-precision kernel is requested on a device
    // without DPFP support (the original duplicated this check in all
    // three kernel sections; the outcome is identical).
    if (typeid(T) == typeid(cl_double) && !control->dpfp_support)
    {
#ifndef NDEBUG
        std::cerr << "Failure attempting to run double precision kernel on device without DPFP support." << std::endl;
#endif
        return clsparseInvalidDevice;
    }

    // Common kernel compile options; previously duplicated verbatim in all
    // three kernel sections with only WG_SIZE differing.
    auto build_params = [] (SizeType wgSize) -> std::string
    {
        std::string params = std::string()
                + " -DVALUE_TYPE=" + OclTypeTraits<T>::type
                + " -DWG_SIZE="    + std::to_string(wgSize)
                + " -D" + ElementWiseOperatorTrait<OP>::operation;

        if (sizeof(clsparseIdx_t) == 8)
        {
            params += std::string(" -DSIZE_TYPE=") + OclTypeTraits<cl_ulong>::type;
        }
        else
        {
            params += std::string(" -DSIZE_TYPE=") + OclTypeTraits<cl_uint>::type;
        }

        if (typeid(T) == typeid(cl_double))
        {
            params.append(" -DDOUBLE");
        }

        return params;
    };

    clsparse::vector<T> preSumArray(control, sizeScanBuff,
                                    0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> preSumArray1(control, sizeScanBuff,
                                     0, CL_MEM_READ_WRITE, false);
    clsparse::vector<T> postSumArray(control, sizeScanBuff,
                                     0, CL_MEM_READ_WRITE, false);

    T operator_identity = 0;

    //scan in blocks
    {
        //local mem size: two elements per work item
        std::size_t lds = kernel0_WgSize * 2 * sizeof(T);

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "per_block_inclusive_scan",
                                             build_params(kernel0_WgSize));

        KernelWrap kWrapper(kernel);

        kWrapper << input.data()
                 << operator_identity
                 << (SizeType)input.size()
                 << cl::Local(lds)
                 << preSumArray.data()
                 << preSumArray1.data()
                 << (int) exclusive;

        cl::NDRange global(numElementsRUP/2);
        cl::NDRange local (kernel0_WgSize);

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: per_block_inclusive_scan");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    // scan the per-block sums within a single work group
    {
        //local mem size
        std::size_t lds = kernel0_WgSize * sizeof(T);

        SizeType workPerThread = sizeScanBuff / kernel1_WgSize;

        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "intra_block_inclusive_scan",
                                             build_params(kernel1_WgSize));

        KernelWrap kWrapper(kernel);

        kWrapper << postSumArray.data()
                 << preSumArray.data()
                 << operator_identity
                 << numWorkGroupsK0
                 << cl::Local(lds)
                 << workPerThread;

        cl::NDRange global ( kernel1_WgSize );
        cl::NDRange local  ( kernel1_WgSize );

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: intra_block_inclusive_scan");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    // add the propagated block sums back into each block
    {
        std::size_t lds = kernel0_WgSize * sizeof(T); //local mem size

        // FIX: compile with the same WG size the kernel is launched with
        // (kernel2_WgSize); the original used kernel1_WgSize, which is only
        // correct because both happen to equal WAVESIZE*KERNEL02WAVES here.
        cl::Kernel kernel = KernelCache::get(control->queue, "scan",
                                             "per_block_addition",
                                             build_params(kernel2_WgSize));

        KernelWrap kWrapper(kernel);

        kWrapper << output.data()
                 << input.data()
                 << postSumArray.data()
                 << preSumArray1.data()
                 << cl::Local(lds)
                 << num_elements
                 << (int)exclusive
                 << operator_identity;

        cl::NDRange global ( numElementsRUP );
        cl::NDRange local  ( kernel2_WgSize );

        cl_int status = kWrapper.run(control, global, local);

        CLSPARSE_V(status, "Error: per_block_addition");

        if (status != CL_SUCCESS)
        {
            return clsparseInvalidKernelExecution;
        }
    }

    return clsparseSuccess;
}
Example #4
0
/// @brief Scatter a dense matrix A into freshly allocated COO buffers.
/// Allocates coo->values / colIndices / rowIndices and launches the
/// scatter_coo_locations kernel over every dense element.
/// @param coo           output COO matrix; num_rows/num_cols/num_nonzeros must be set
/// @param A             dense input values
/// @param nnz_locations per-element nonzero flags/locations
/// @param coo_indexes   destination index for each nonzero
/// @param control       clSPARSE control object
/// @return clsparseSuccess, or clsparseInvalidKernelExecution on kernel failure
clsparseStatus
dense_to_coo(clsparseCooMatrix* coo,
             const clsparse::vector<V>& A,
             const clsparse::vector<I>& nnz_locations,
             const clsparse::vector<I>& coo_indexes,
             const clsparseControl control)
{
    typedef typename clsparse::vector<V>::size_type SizeType;

    // Preconditions: a non-empty matrix and non-empty work buffers.
    assert(coo->num_nonzeros > 0);
    assert(coo->num_cols > 0);
    assert(coo->num_rows > 0);
    assert(A.size() > 0);
    assert(nnz_locations.size() > 0);
    assert(coo_indexes.size() > 0);

    const SizeType dense_size = A.size();

    cl_int cl_status;

    // Allocate the three output buffers sized by the nonzero count.
    coo->values = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                  coo->num_nonzeros * sizeof(V), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo values buffer");

    coo->colIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                      coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo col indices buffer");

    coo->rowIndices = clCreateBuffer( control->getContext()(), CL_MEM_READ_WRITE,
                                      coo->num_nonzeros * sizeof(I), NULL, &cl_status );
    CLSPARSE_V(cl_status, "Create coo row indices buffer");

    // Global size: dense_size rounded up to a multiple of the work-group
    // size, and never smaller than one full work group.
    const SizeType wg_size = 256;
    SizeType global_size = (dense_size % wg_size == 0)
            ? dense_size
            : (dense_size / wg_size + 1) * wg_size;
    if (dense_size < wg_size)
    {
        global_size = wg_size;
    }

    const std::string params = std::string()
            + " -DINDEX_TYPE=" + OclTypeTraits<I>::type
            + " -DSIZE_TYPE="  + OclTypeTraits<SizeType>::type
            + " -DVALUE_TYPE=" + OclTypeTraits<V>::type
            + " -DWG_SIZE=" + std::to_string(wg_size)
            + " -DSUBWAVE_SIZE=" + std::to_string(2); //required by program;

    cl::Kernel kernel = KernelCache::get(control->queue, "conversion_utils",
                                         "scatter_coo_locations", params);

    KernelWrap kWrapper(kernel);

    kWrapper << (SizeType) coo->num_rows
             << (SizeType) coo->num_cols
             << (SizeType) dense_size
             << A.data()
             << nnz_locations.data()
             << coo_indexes.data()
             << coo->rowIndices
             << coo->colIndices
             << coo->values;

    cl_status = kWrapper.run(control,
                             cl::NDRange(global_size),
                             cl::NDRange(wg_size));

    CLSPARSE_V(cl_status, "Error process scaninput");

    if (cl_status != CL_SUCCESS)
        return clsparseInvalidKernelExecution;

    return clsparseSuccess;
}
Example #5
0
/// @brief Reduce-by-key over the inclusive key range [keys_first, keys_last].
/// Four kernel passes: offset calculation, per-block scan-by-key, intra-block
/// scan of block sums, and the final key/value mapping. The reduced element
/// count is returned through `count`.
/// @param keys_first    index of the first key (inclusive)
/// @param keys_last     index of the last key (inclusive)
/// @param values_first  unused here; kept for interface compatibility
/// @param keys_input    device buffer of input keys
/// @param values_input  device buffer of input values
/// @param keys_output   device buffer receiving the reduced keys
/// @param values_output device buffer receiving the reduced values
/// @param count         out: number of reduced (key, value) pairs
/// @param control       clSPARSE control object
/// @return clsparseSuccess, or clsparseInvalidKernelExecution on kernel failure
clsparseStatus
reduce_by_key(
    int keys_first,
    int keys_last,
    int values_first,
    cl_mem keys_input,
    cl_mem values_input,
    cl_mem keys_output,
    cl_mem values_output,
    int *count,
    clsparseControl control
)
{
    /**********************************************************************************
     * Compile Options
     *********************************************************************************/
    const int kernel0_WgSize = WAVESIZE*KERNEL02WAVES;
    const int kernel1_WgSize = WAVESIZE*KERNEL1WAVES;

    //const std::string params = std::string() +
    //          " -DKERNEL0WORKGROUPSIZE=" + std::to_string(kernel0_WgSize)
    //        + " -DKERNEL1WORKGROUPSIZE=" + std::to_string(kernel1_WgSize)
    //        + " -DKERNEL2WORKGROUPSIZE=" + std::to_string(kernel2_WgSize);
    const std::string params;

    cl::Context context = control->getContext();

    // Number of keys in the inclusive range [keys_first, keys_last].
    cl_uint numElements = keys_last - keys_first + 1;

    // Round the input size up to a multiple of the kernel-0 work-group size.
    size_t sizeInputBuff = numElements;
    int modWgSize = (sizeInputBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeInputBuff &= ~modWgSize;
        sizeInputBuff += kernel0_WgSize;
    }
    cl_uint numWorkGroupsK0 = static_cast< cl_uint >( sizeInputBuff / kernel0_WgSize );

    // Scratch size for the per-block sums, also rounded up.
    size_t sizeScanBuff = numWorkGroupsK0;
    modWgSize = (sizeScanBuff & (kernel0_WgSize-1));
    if( modWgSize )
    {
        sizeScanBuff &= ~modWgSize;
        sizeScanBuff += kernel0_WgSize;
    }

    cl_mem tempArrayVec = clCreateBuffer(context(),CL_MEM_READ_WRITE, (numElements)*sizeof(int), NULL, NULL );

    /**********************************************************************************
     *  Kernel 0
     *********************************************************************************/

    cl::Kernel kernel0 = KernelCache::get(control->queue,"reduce_by_key", "OffsetCalculation", params);

    KernelWrap kWrapper0(kernel0);

    kWrapper0 << keys_input << tempArrayVec
              << numElements;

    cl::NDRange local0(kernel0_WgSize);
    cl::NDRange global0(sizeInputBuff);

    cl_int status = kWrapper0.run(control, global0, local0);

    if (status != CL_SUCCESS)
    {
        // BUGFIX: the original leaked tempArrayVec on this error path.
        clReleaseMemObject(tempArrayVec);
        return clsparseInvalidKernelExecution;
    }

    // In-place scan of the offsets computed by kernel 0.
    scan(0,
	 numElements - 1,
         tempArrayVec,
         tempArrayVec,
         0,
         0,
         control
         );

    int pattern = 0;
    cl_mem keySumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem preSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE, (sizeScanBuff)*sizeof(int), NULL, NULL );
    cl_mem postSumArray = clCreateBuffer(context(),CL_MEM_READ_WRITE,(sizeScanBuff)*sizeof(int), NULL, NULL );
    clEnqueueFillBuffer(control->queue(), keySumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), preSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);
    clEnqueueFillBuffer(control->queue(), postSumArray, &pattern, sizeof(int), 0,
                        (sizeScanBuff)*sizeof(int), 0, NULL, NULL);

    // BUGFIX: the original leaked all four buffers on every error path below;
    // release them through a single helper.
    auto release_buffers = [&]()
    {
        clReleaseMemObject(tempArrayVec);
        clReleaseMemObject(preSumArray);
        clReleaseMemObject(postSumArray);
        clReleaseMemObject(keySumArray);
    };

    /**********************************************************************************
     *  Kernel 1
     *********************************************************************************/

    cl::Kernel kernel1 = KernelCache::get(control->queue,"reduce_by_key", "perBlockScanByKey", params);

    KernelWrap kWrapper1(kernel1);

    kWrapper1 << tempArrayVec
              << values_input
              << numElements
              << keySumArray
              << preSumArray;

    cl::NDRange local1(kernel0_WgSize);
    cl::NDRange global1(sizeInputBuff);

    status = kWrapper1.run(control, global1, local1);

    if (status != CL_SUCCESS)
    {
        release_buffers();
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 2
     *********************************************************************************/
    cl_uint workPerThread = static_cast< cl_uint >( sizeScanBuff / kernel1_WgSize );

    cl::Kernel kernel2 = KernelCache::get(control->queue,"reduce_by_key", "intraBlockInclusiveScanByKey", params);

    KernelWrap kWrapper2(kernel2);

    kWrapper2 << keySumArray << preSumArray
              << postSumArray << numWorkGroupsK0 << workPerThread;

    cl::NDRange local2(kernel1_WgSize);
    cl::NDRange global2(kernel1_WgSize);

    status = kWrapper2.run(control, global2, local2);

    if (status != CL_SUCCESS)
    {
        release_buffers();
        return clsparseInvalidKernelExecution;
    }

    /**********************************************************************************
     *  Kernel 3
     *********************************************************************************/

    cl::Kernel kernel3 = KernelCache::get(control->queue,"reduce_by_key", "keyValueMapping", params);

    KernelWrap kWrapper3(kernel3);

    kWrapper3 << keys_input << keys_output
              << values_input << values_output << tempArrayVec
              << keySumArray << postSumArray << numElements;

    cl::NDRange local3(kernel0_WgSize);
    cl::NDRange global3(sizeInputBuff);

    status = kWrapper3.run(control, global3, local3);

    if (status != CL_SUCCESS)
    {
        release_buffers();
        return clsparseInvalidKernelExecution;
    }

    // BUGFIX: the original malloc'ed a single int for the result and never
    // freed it; a blocking read into a stack variable is sufficient.
    int h_result = 0;
    clEnqueueReadBuffer(control->queue(),
                        tempArrayVec,
                        CL_TRUE,                        // blocking read
                        (numElements-1)*sizeof(int),
                        sizeof(int),
                        &h_result,
                        0,
                        NULL,
                        NULL);

    *count = h_result;

    //release buffers
    release_buffers();

    return clsparseSuccess;
}   //end of reduce_by_key