Ejemplo n.º 1
0
/* Splits one ND-range launch between a GPU queue (command_queue[0]) and a
 * CPU queue (command_queue[1]) according to the file-scope `cpu_offset`
 * percentage: 0 = GPU only, 100 = CPU only, anything else runs a slice of
 * dimension 0 on each device.
 * NOTE(review): `cpu_offset` and `errcode` are globals not declared in this
 * chunk — confirm their definitions. `global_work_offset`,
 * `num_events_in_wait_list`, `event_wait_list` and `event` are accepted but
 * never used. Returns the status of the LAST OpenCL call made, so an
 * earlier failure can be masked by a later success. */
cl_int clEnqueueNDRangeKernel_fusion ( cl_command_queue* command_queue,
    cl_kernel kernel,
    cl_uint work_dim,
    const size_t *global_work_offset,
    const size_t *global_work_size,
    const size_t *local_work_size,
    cl_uint num_events_in_wait_list,
    const cl_event *event_wait_list,
    cl_event *event){

  cl_event eventList[2];
  int cpu_run=0, gpu_run=0;
 size_t global_offset[2];
 size_t global_offset_start[2];
 size_t remain_global_work_size[2];
 int i;

  /* Decide which devices participate based on the CPU share. */
  if(cpu_offset == 0){
    gpu_run=1;
  }
  else if(cpu_offset == 100){
    cpu_run=1;
  }
  else{
    gpu_run=1;
    cpu_run=1;
  }

  /* Default: CPU covers the full range, GPU remainder equals full range. */
    for(i=0; i<work_dim; i++){
    global_offset[i]=global_work_size[i];
    remain_global_work_size[i]=global_work_size[i];
  }

  /* CPU share of dimension 0, rounded UP to a whole multiple of the
   * work-group size so both slices stay work-group aligned. */
  global_offset[0]=((double)cpu_offset/100)*global_work_size[0];
  int t1=global_offset[0], t2=local_work_size[0];
  global_offset[0]=(t1/t2+(int)(t1%t2!=0))*t2;
  /* NOTE(review): if rounding pushes global_offset[0] past
   * global_work_size[0], this size_t subtraction wraps around — confirm
   * cpu_offset/local-size combinations cannot trigger that. */
  remain_global_work_size[0] = global_work_size[0]-global_offset[0];
  if(remain_global_work_size[0] == 0)
    gpu_run = 0;
  global_offset_start[0]=global_offset[0];
  global_offset_start[1]=0;
  if(gpu_run){
    /* GPU runs the tail [global_offset[0], global_work_size[0]). */
    errcode = clEnqueueNDRangeKernel(command_queue[0], kernel, work_dim, global_offset_start, remain_global_work_size, local_work_size, 0, NULL, &(eventList[0]));
    if(errcode != CL_SUCCESS) printf("Error in gpu clEnqueueNDRangeKernel\n");
  }
//  clFinish(command_queue[0]);
  if(cpu_run){
    /* CPU runs the head [0, global_offset[0]). */
    errcode = clEnqueueNDRangeKernel(command_queue[1], kernel, work_dim, NULL, global_offset, local_work_size, 0, NULL, &(eventList[1]));
    if(errcode != CL_SUCCESS) printf("Error in cpu clEnqueueNDRangeKernel\n");
  }

  /* Kick off both queues, then block until each launch completes. */
  if(gpu_run) errcode = clFlush(command_queue[0]);
  if(cpu_run) errcode = clFlush(command_queue[1]);
  if(gpu_run) errcode = clWaitForEvents(1,&eventList[0]);
  if(cpu_run) errcode = clWaitForEvents(1,&eventList[1]);

  return errcode;
}
Ejemplo n.º 2
0
// Hand a shared CL/GL image back to OpenGL: release the GL object on the
// CL side, drain the command queue, and tag the image as GL-owned.
void vglClToGl(VglImage* img)
{
    if (!vglIsInContext(img, VGL_CL_CONTEXT))
    {
        // Image is not currently owned by OpenCL; nothing to release.
        return;
    }

    cl_int status = clEnqueueReleaseGLObjects(cl.commandQueue, 1, (cl_mem*) &img->oclPtr, 0, NULL, NULL);
    vglClCheckError(status, (char*) "clEnqueueReleaseGLObjects");

    status = clFlush(cl.commandQueue);
    vglClCheckError(status, (char*) "clFlush");

    status = clFinish(cl.commandQueue);
    vglClCheckError(status, (char*) "clFinish");

    vglSetContext(img, VGL_GL_CONTEXT);
}
Ejemplo n.º 3
0
/* Look up a kernel descriptor by name, launch it through the lower-level
 * runKernel overload, then copy `out_len` doubles from the device "out"
 * buffer back into `out`.
 * NOTE(review): `kern` stays UNINITIALIZED if no entry matches
 * `kernel_name`, and the search starts at index 1 so slot 0 is never
 * considered — confirm both are intentional.
 * NOTE(review): `start_index` is forwarded to clEnqueueReadBuffer as a
 * BYTE offset, not an element index — verify callers scale accordingly. */
void runKernel(cl_runtime_env env,
               std::string kernel_name,
               double* vars,
               double* out,
               int start_index,
               int out_len)

{
    kernel kern;
    for (int i = 1; i < env.num_kerns; i++)
    {
        if (env.kernels[i].name == kernel_name)
            kern = env.kernels[i];
    }
    runKernel(env.cv, env.cl_kernels[kernel_name],
              kern,
              env.gpu_data,
              vars);
    cl_int err;

    /* Blocking read: `out` is valid as soon as this returns. */
    err = clEnqueueReadBuffer(env.cv.commands, env.gpu_data["out"].array,
                              true,
                              start_index,
                              sizeof(double)*out_len,
                              out,
                              0, NULL, NULL);
    CHK_ERR(err);

    err = clFlush(env.cv.commands);
    CHK_ERR(err);
}
Ejemplo n.º 4
0
/* Enqueue a device-to-device buffer copy of `size` bytes from
 * src+srcOffset to dest+destOffset on either the CPU or GPU queue, as
 * chosen by cl_copyBufferscheduler() from the copy size and current load.
 * `index`/`eventList` form a 2-slot event ring: every copy after the first
 * waits on the previous copy's event (index modulo 2).
 * NOTE(review): `deschedule()` is skipped on the very first copy
 * (*index == 0) — confirm that asymmetry is intended. */
void cl_copyBuffer(cl_mem dest, int destOffset, cl_mem src, int srcOffset, size_t size,int *index,cl_event *eventList,int *Flag_CPU_GPU,double * burden, int _CPU_GPU)
{
	int preFlag=(*Flag_CPU_GPU);
	double preBurden=(*burden);
	int CPU_GPU=0;
	/* Pick a device (index into CommandQueue) and update shared load state. */
	CPU_GPU=cl_copyBufferscheduler(size,Flag_CPU_GPU,burden,_CPU_GPU);
	cl_int ciErr1;
	(*Flag_CPU_GPU)=CPU_GPU;
	if(*index!=0)
	{
		/* Chain onto the previous copy's event; record our own event in
		 * the other slot of the 2-entry ring. */
		ciErr1 = clEnqueueCopyBuffer(CommandQueue[CPU_GPU], src, dest, srcOffset, destOffset, size, 1, &eventList[((*index)-1)%2], &eventList[(*index)%2]);
		deschedule(preFlag,preBurden);
	}
	else
	/* First copy in the chain: nothing to wait on. */
	ciErr1 = clEnqueueCopyBuffer(CommandQueue[CPU_GPU], src, dest, srcOffset, destOffset, size, 0, NULL, &eventList[*index]);


	(*index)++;
		//clEnqueueWriteBuffer(CommandQueue[CPU_GPU], to, CL_FALSE, 0, size, from, 0, NULL, NULL);
    if (ciErr1 != CL_SUCCESS)
    {
        printf("Error %d in cl_copyBuffer, Line %u in file %s !!!\n\n", ciErr1,__LINE__, __FILE__);
        cl_clean(EXIT_FAILURE);
    }
	clFlush(CommandQueue[CPU_GPU]);
}
Ejemplo n.º 5
0
/** Perform Hermitian matrix-vector product, \f$ y = \alpha A x + \beta y \f$.

    @param[in]
    uplo    Whether the upper or lower triangle of A is referenced.

    @param[in]
    n       Number of rows and columns of A. n >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array of dimension (ldda,n), ldda >= max(1,n).
            The n-by-n matrix A, on GPU device.

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dx      COMPLEX array on GPU device.
            The m element vector x of dimension (1 + (m-1)*incx).

    @param[in]
    incx    Stride between consecutive elements of dx. incx != 0.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dy      COMPLEX array on GPU device.
            The n element vector y of dimension (1 + (n-1)*incy).

    @param[in]
    incy    Stride between consecutive elements of dy. incy != 0.

    @ingroup magma_cblas2
*/
extern "C" void
magma_chemv(
    magma_uplo_t uplo,
    magma_int_t n,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx,
    magmaFloatComplex beta,
    magmaFloatComplex_ptr       dy, size_t dy_offset, magma_int_t incy,
    magma_queue_t queue )
{
    // Quick return for an empty matrix.
    if ( n <= 0 )
        return;

    // Hand the Hermitian matrix-vector product to clBLAS on this queue;
    // g_event receives the launch event.
    cl_int status = clblasChemv(
        clblasColumnMajor, clblas_uplo_const( uplo ), n,
        alpha,
        dA, dA_offset, ldda,
        dx, dx_offset, incx,
        beta,
        dy, dy_offset, incy,
        1, &queue, 0, NULL, g_event );
    clFlush( queue );
    check_error( status );
}
Ejemplo n.º 6
0
/** Returns index of element of vector x having max. absolute value;
    i.e., max (infinity) norm.
    
    @param[in]
    n       Number of elements in vector x. n >= 0.
            
    @param[in]
    dx      COMPLEX array on GPU device.
            The n element vector x of dimension (1 + (n-1)*incx).
            
    @param[in]
    incx    Stride between consecutive elements of dx. incx > 0.
            
    @ingroup magma_cblas1
*/
/* Implementation of icamax: allocates a device-side result slot plus the
 * scratch buffer clblasiCamax requires, runs the reduction, and reads the
 * index back to the host.
 * NOTE(review): clBLAS iAmax conventionally yields a 1-based index —
 * confirm callers expect that. */
extern "C" magma_int_t
magma_icamax(
    magma_int_t n,
    magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx,
    magma_queue_t queue )
{
    magma_ptr dimax, scratchBuff;
    magma_malloc( &dimax,       sizeof(unsigned int) );
    magma_malloc( &scratchBuff, (2*n+1)*sizeof(magmaFloatComplex) );
    
    cl_int err = clblasiCamax(
        n, dimax, 0,
        dx, dx_offset, incx,
        scratchBuff,
        1, &queue, 0, NULL, g_event);
    // FIX: the clBLAS status was previously computed but never checked,
    // unlike every other wrapper in this file.
    check_error( err );
    
    // Blocking read of the result index.
    unsigned int imax_cpu;
    magma_getvector( 1, sizeof(unsigned int), dimax, 0, 1, &imax_cpu, 1, queue );
    clFlush(queue);
    
    magma_free( dimax );
    magma_free( scratchBuff );
    
    return imax_cpu;
}
Ejemplo n.º 7
0
/** Perform Hermitian rank-2k update.
        \f$ C = \alpha A B^T + \alpha B A^T + \beta C \f$ (trans == MagmaNoTrans), or \n
        \f$ C = \alpha A^T B + \alpha B^T A + \beta C \f$ (trans == MagmaTrans),      \n
        where \f$ C \f$ is Hermitian.

    @param[in]
    uplo    Whether the upper or lower triangle of C is referenced.

    @param[in]
    trans   Operation to perform on A and B.

    @param[in]
    n       Number of rows and columns of C. n >= 0.

    @param[in]
    k       Number of columns of A and B (for MagmaNoTrans) or rows of A and B (for MagmaTrans). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array on GPU device.
            If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); \n
            otherwise,                the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      COMPLEX array on GPU device.
            If trans == MagmaNoTrans, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n); \n
            otherwise,                the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      COMPLEX array on GPU device.
            The n-by-n Hermitian matrix C of dimension (lddc,n), lddc >= max(1,n).

    @param[in]
    lddc    Leading dimension of dC.

    @ingroup magma_cblas3
*/
extern "C" void
magma_cher2k(
    magma_uplo_t uplo, magma_trans_t trans,
    magma_int_t n, magma_int_t k,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb,
    float beta,
    magmaFloatComplex_ptr dC, size_t dC_offset, magma_int_t lddc,
    magma_queue_t queue )
{
    // Nothing to do for empty dimensions.
    if (n <= 0 || k <= 0)
        return;

    // Hand the Hermitian rank-2k update to clBLAS on the supplied queue.
    cl_int status = clblasCher2k(
        clblasColumnMajor,
        clblas_uplo_const( uplo ),
        clblas_trans_const( trans ),
        n, k,
        alpha,
        dA, dA_offset, ldda,
        dB, dB_offset, lddb,
        beta,
        dC, dC_offset, lddc,
        1, &queue, 0, NULL, g_event );
    clFlush(queue);
    check_error( status );
}
Ejemplo n.º 8
0
/** Perform matrix-matrix product, \f$ C = \alpha op(A) op(B) + \beta C \f$.

    @param[in]
    transA  Operation op(A) to perform on matrix A.

    @param[in]
    transB  Operation op(B) to perform on matrix B.

    @param[in]
    m       Number of rows of C and op(A). m >= 0.

    @param[in]
    n       Number of columns of C and op(B). n >= 0.

    @param[in]
    k       Number of columns of op(A) and rows of op(B). k >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array on GPU device.
            If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n
            otherwise,                 the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      COMPLEX array on GPU device.
            If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n
            otherwise,                 the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      COMPLEX array on GPU device.
            The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).

    @param[in]
    lddc    Leading dimension of dC.

    @ingroup magma_cblas3
*/
extern "C" void
magma_cgemm(
    magma_trans_t transA, magma_trans_t transB,
    magma_int_t m, magma_int_t n, magma_int_t k,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb,
    magmaFloatComplex beta,
    magmaFloatComplex_ptr       dC, size_t dC_offset, magma_int_t lddc,
    magma_queue_t queue )
{
    // Quick return for empty dimensions.
    if ( m <= 0 || n <= 0 || k <= 0 )
        return;

    // Dispatch the general matrix-matrix product to clBLAS.
    cl_int status = clblasCgemm(
        clblasColumnMajor,
        clblas_trans_const( transA ),
        clblas_trans_const( transB ),
        m, n, k,
        alpha,
        dA, dA_offset, ldda,
        dB, dB_offset, lddb,
        beta,
        dC, dC_offset, lddc,
        1, &queue, 0, NULL, g_event );
    clFlush(queue);
    check_error( status );
}
Ejemplo n.º 9
0
/* Let's see if this is any different from local memory.
 * Outcome: much slower than private memory, slower than naive method. */
/* Matrix multiply C = A * B (n x n) with one work item per row and a
 * local-memory row buffer. */
void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    cl_uint ncl;
    size_t global_work_size, local_work_size, mat_sizeof;

    /* Setup variables.
     * local_work_size cannot be larger than 1 on this example, otherwise
     * memory conflicts will happen between work items. */
    local_work_size = 1;
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel. FIX: clEnqueueWriteBuffer takes a `const void *`, so the
     * const-stripping (F*) casts of A and B were unnecessary and removed. */
    common_create_kernel_file(&cache->common, "matmul_row_local.cl", NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    /* Arg 3 is a local-memory buffer of n elements (no host pointer). */
    clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL);
    clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    /* Blocking read of the result matrix. */
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
Ejemplo n.º 10
0
/* Like row private, but to reduce global memory accesses,
 * we copy only once per work group to local memory.
 *
 * Each work group contains a few rows of A.
 *
 * We load the first column, multiply all rows by that column, synchronize,
 * load another column, and so on.
 *
 * This leads to a thread blockage / memory access tradeoff.
 *
 * We make work groups as large as possible to reload memory less times. */
/* Matrix multiply C = A * B (n x n): rows held in private memory, columns
 * staged through local memory once per work group. */
void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    char options[256];
    cl_uint ncl;
    size_t global_work_size, local_work_size, mat_sizeof;

    /* Setup variables. */
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Build the kernel with the private row size baked in.
     * FIX: n is size_t, so the conversion specifier must be %zu; the old
     * %ju requires a uintmax_t argument and is undefined behavior when
     * size_t and uintmax_t differ. */
    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%zu", n);
    common_create_kernel_file(&cache->common, "matmul_row_priv_col_local.cl", options);
    /* Use the largest work group the device allows, capped at n. */
    local_work_size = 0;
    clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL);
    local_work_size = zmin(local_work_size, n);
    /* clEnqueueWriteBuffer takes const void *, so no const-stripping casts. */
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    /* Arg 3 is a local-memory column buffer of n elements. */
    clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL);
    clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
Ejemplo n.º 11
0
int main(void) {
    /* A minimal kernel whose output value X is injected at compile time
     * through a -D build option. */
    const char *source =
        "__kernel void kmain(__global int *out) {\n"
        "    out[0] = X;\n"
        "}\n";
    const cl_int X = 1;
    cl_int io[] = {0};
    const size_t global_work_size = sizeof(io) / sizeof(io[0]);
    char options[256];
    cl_mem buffer;
    Common common;

    /* Build, run, and read the single-element result back. */
    snprintf(options, sizeof(options), "-DX=%d", X);
    common_init_options(&common, source, options);
    buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(io), io, NULL);
    clSetKernelArg(common.kernel, 0, sizeof(buffer), &buffer);
    clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL);
    clFlush(common.command_queue);
    clFinish(common.command_queue);
    clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(io), io, 0, NULL, NULL);

    /* The kernel must have written the compile-time constant. */
    assert(io[0] == X);

    /* Cleanup. */
    clReleaseMemObject(buffer);
    common_deinit(&common);
    return EXIT_SUCCESS;
}
Ejemplo n.º 12
0
/* Download the device buffer into a freshly allocated host buffer.
 * Returns nullptr when no device buffer exists; throws on read failure.
 * Ownership of the returned buffer transfers to the caller (delete[]). */
void* mitk::OclDataSet::TransferDataToCPU(cl_command_queue gpuComQueue)
{
  cl_int clErr = 0;

  // Cannot download without a device-side buffer.
  if( m_gpuBuffer == nullptr ){
    MITK_ERROR("ocl.DataSet") << "(mitk) No buffer present!\n";
    return nullptr;
  }

  // Host-side destination sized from element count * bytes per element.
  char* data = new char[m_bufferSize * (size_t)m_BpE];

  // debug info
  #ifdef SHOW_MEM_INFO
  oclPrintMemObjectInfo( m_gpuBuffer );
  #endif

  // Blocking read: data is valid as soon as the call returns.
  clErr = clEnqueueReadBuffer( gpuComQueue, m_gpuBuffer, CL_TRUE, 0, m_bufferSize * (size_t)m_BpE, data ,0, nullptr, nullptr);
  CHECK_OCL_ERR(clErr);

  if(clErr != CL_SUCCESS)
  {
    delete[] data;  // FIX: the host buffer was leaked on this error path.
    mitkThrow() << "openCL Error when reading Output Buffer";
  }

  clFlush( gpuComQueue );
  // the cpu data is same as gpu
  this->m_gpuModified = false;

  return (void*) data;
}
Ejemplo n.º 13
0
// --------------------
    /* Solve op(A) X = alpha B (side == MagmaLeft) or X op(A) = alpha B
     * (side == MagmaRight) on the device; dB is overwritten with X. */
    magma_err_t
magma_ztrsm(
        magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
        magma_int_t m, magma_int_t n,
        magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, size_t dA_offset, magma_int_t lda,
        magmaDoubleComplex_ptr       dB, size_t dB_offset, magma_int_t ldb,
        magma_queue_t queue )
{
    // Dispatch to clAmdBlas and flush so the work actually starts; the
    // raw clAmdBlas status is returned to the caller.
    cl_int err = clAmdBlasZtrsmEx(
        clAmdBlasColumnMajor,
        amdblas_side_const( side ),
        amdblas_uplo_const( uplo ),
        amdblas_trans_const( trans ),
        amdblas_diag_const( diag ),
        m, n,
        alpha,
        dA, dA_offset, lda,
        dB, dB_offset, ldb,
        1, &queue, 0, NULL, NULL );
    clFlush(queue);
    return err;
}
Ejemplo n.º 14
0
/* Flush the given command queue; exits the process if the flush fails. */
void CLContext::flush( cl_command_queue& command_queue )
{
	cl_int status;
	// FIX: flush the queue passed by the caller. The original flushed an
	// unrelated `commandQueue` and ignored its parameter entirely.
	status = clFlush( command_queue );
	if(!checkVal( status, CL_SUCCESS,  "clFlush failed."))
		exit(1);
}
Ejemplo n.º 15
0
// Read `size` elements of `size_of` bytes each from a device buffer into
// host memory at `ptr`, starting at byte `offset`.
void OclHost::readFromDevice(cl_mem buffer, cl_bool blocking_read,
                             size_t offset, size_t size, void * ptr, size_t size_of) {
    cl_int status = clEnqueueReadBuffer(oclCommandQueue, buffer, blocking_read,
                                        offset, size * size_of, ptr, 0, 0, 0);
    clFlush(oclCommandQueue);
    checkClError("Unable to read from device.", status);
}
Ejemplo n.º 16
0
void __accr_launchkernel(char* szKernelName, char* szKernelLib, int async_expr)
{
	//CL kernel file
	cl_int cl_error_code;
	staic size_t global_work_items[3];

	global_work_items[0] = gangs[0] * vectors[0];
	global_work_items[1] = gangs[1] * vectors[1];
	global_work_items[2] = gangs[2] * vectors[2];
		
	if(bIsAuto_cl_local_work_partition)
		cl_error_code = clEnqueueNDRangeKernel(context->cl_cq, current_cl_kernel_handle, cl_work_dim,
										NULL, global_work_items, NULL, 0, NULL, NULL);
	else		
		cl_error_code = clEnqueueNDRangeKernel(context->cl_cq, current_cl_kernel_handle, cl_work_dim, 
										NULL, global_work_items, vectors, 0, NULL, NULL);
	if(async_expr < 0)
	{
		cl_error_code = clFlush(context->cl_cq);
	  	cl_error_code = clFinish(context->cl_cq);
	}
	else //async
	{
	}
}
Ejemplo n.º 17
0
void BeforeCheckingExample2::parallelExecuteOrigin() {
    /*struct timeval tv1, tv2;
    gettimeofday(&tv1, NULL);
    */

    int p = 0;
    int clStatus;

    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_a );
    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_b );
    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_c );
    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_Q );
    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_P );
    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_int), &LOOP_SIZE );
    clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_int), &CALC_SIZE );

    size_t global_size0 = LOOP_SIZE;
    size_t local_size = 64;

    clStatus = clEnqueueNDRangeKernel(command_queue, loopKernelOrigin, 1, NULL, &global_size0, &local_size, 0, NULL, NULL);

    clFlush(command_queue);
    clFinish(command_queue);

    /*
    gettimeofday(&tv2, NULL);
    double used_time = (double) (tv2.tv_usec - tv1.tv_usec) + (double) (tv2.tv_sec - tv1.tv_sec) * 1000000;

    printf("time origin = %.2f\n", used_time);
    */

}
Ejemplo n.º 18
0
/** Perform Hermitian matrix-matrix product.
        \f$ C = \alpha A B + \beta C \f$ (side == MagmaLeft), or \n
        \f$ C = \alpha B A + \beta C \f$ (side == MagmaRight),   \n
        where \f$ A \f$ is Hermitian.

    @param[in]
    side    Whether A is on the left or right.

    @param[in]
    uplo    Whether the upper or lower triangle of A is referenced.

    @param[in]
    m       Number of rows of C. m >= 0.

    @param[in]
    n       Number of columns of C. n >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array on GPU device.
            If side == MagmaLeft, the m-by-m Hermitian matrix A of dimension (ldda,m), ldda >= max(1,m); \n
            otherwise,            the n-by-n Hermitian matrix A of dimension (ldda,n), ldda >= max(1,n).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dB      COMPLEX array on GPU device.
            The m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m).

    @param[in]
    lddb    Leading dimension of dB.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dC      COMPLEX array on GPU device.
            The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m).

    @param[in]
    lddc    Leading dimension of dC.

    @ingroup magma_cblas3
*/
extern "C" void
magma_chemm(
    magma_side_t side, magma_uplo_t uplo,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb,
    magmaFloatComplex beta,
    magmaFloatComplex_ptr       dC, size_t dC_offset, magma_int_t lddc,
    magma_queue_t queue )
{
    // Quick return for an empty C.
    if ( m <= 0 || n <= 0)
        return;

    // Dispatch the Hermitian matrix-matrix product to clBLAS.
    cl_int status = clblasChemm(
        clblasColumnMajor,
        clblas_side_const( side ),
        clblas_uplo_const( uplo ),
        m, n,
        alpha,
        dA, dA_offset, ldda,
        dB, dB_offset, lddb,
        beta,
        dC, dC_offset, lddc,
        1, &queue, 0, NULL, g_event );
    clFlush(queue);
    check_error( status );
}
// Unmap a previously mapped buffer and time until the unmap event has
// completed; log the elapsed time unless `quiet` is set.
void timedBufUnmap( cl_command_queue queue,
                    cl_mem buf,
                    void **ptr,
                    bool quiet )
{
    CPerfCounter t1;
    cl_event ev;
    cl_int ret;

    t1.Reset();
    t1.Start();

    ret = clEnqueueUnmapMemObject( queue, buf, (void *) *ptr, 0, NULL, &ev );
    ASSERT_CL_RETURN( ret );

    // Start the work, then busy-wait on the unmap event.
    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    t1.Stop();

    if( !quiet )
        tlog->Timer( "%32s  %lf s [ %8.2lf GB/s ]\n", "clEnqueueUnmapMemObject():", t1.GetElapsedTime(), nBytes, 1 );
}
Ejemplo n.º 20
0
/** Solve triangular matrix-matrix system (multiple right-hand sides).
        \f$ op(A) X = \alpha B \f$ (side == MagmaLeft), or \n
        \f$ X op(A) = \alpha B \f$ (side == MagmaRight),   \n
        where \f$ A \f$ is triangular.

    @param[in]
    side    Whether A is on the left or right.

    @param[in]
    uplo    Whether A is upper or lower triangular.

    @param[in]
    trans   Operation to perform on A.

    @param[in]
    diag    Whether the diagonal of A is assumed to be unit or non-unit.

    @param[in]
    m       Number of rows of B. m >= 0.

    @param[in]
    n       Number of columns of B. n >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array on GPU device.
            If side == MagmaLeft, the m-by-m triangular matrix A of dimension (ldda,m), ldda >= max(1,m); \n
            otherwise,            the n-by-n triangular matrix A of dimension (ldda,n), ldda >= max(1,n).

    @param[in]
    ldda    Leading dimension of dA.

    @param[in,out]
    dB      COMPLEX array on GPU device.
            On entry, m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m).
            On exit, overwritten with the solution matrix X.

    @param[in]
    lddb    Leading dimension of dB.

    @ingroup magma_cblas3
*/
extern "C" void
magma_ctrsm(
    magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_ptr       dB, size_t dB_offset, magma_int_t lddb,
    magma_queue_t queue )
{
    // Quick return for an empty B.
    if (m <= 0 || n <= 0)
        return;

    // Solve the triangular system on the device; dB is overwritten with X.
    cl_int status = clblasCtrsm(
        clblasColumnMajor,
        clblas_side_const( side ),
        clblas_uplo_const( uplo ),
        clblas_trans_const( trans ),
        clblas_diag_const( diag ),
        m, n,
        alpha,
        dA, dA_offset, ldda,
        dB, dB_offset, lddb,
        1, &queue, 0, NULL, g_event );
    clFlush(queue);
    check_error( status );
}
/* Time a non-blocking host-to-image write until its event completes and
 * log the elapsed time.
 * NOTE(review): unlike timedBufUnmap, this never calls t1.Reset() before
 * Start() — confirm CPerfCounter::Start implies a reset, otherwise the
 * reported time may accumulate across calls.
 * Uses the file-scope imageOrigin/imageRegion/nBytesRegion globals. */
void timedImageCLWrite( cl_command_queue queue,
                      cl_mem image,
                      void *ptr )
{
    CPerfCounter t1;
    cl_int ret;
    cl_event ev;

    t1.Start();

    /* Non-blocking write; completion is observed via `ev` below. */
    ret = clEnqueueWriteImage( queue,
                               image,
                               CL_FALSE,
                               imageOrigin, imageRegion,
                               0,0,
                               ptr,
                               0, NULL,
                               &ev );
    ASSERT_CL_RETURN( ret );

    /* Start the transfer, then busy-wait on its event. */
    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    t1.Stop();

    tlog->Timer( "%32s  %lf s   %8.2lf GB/s\n", "clEnqueueWriteImage():", t1.GetElapsedTime(), nBytesRegion, 1 );
}
Ejemplo n.º 22
0
/** Perform matrix-vector product.
        \f$ y = \alpha A   x + \beta y \f$  (transA == MagmaNoTrans), or \n
        \f$ y = \alpha A^T x + \beta y \f$  (transA == MagmaTrans),   or \n
        \f$ y = \alpha A^H x + \beta y \f$  (transA == MagmaConjTrans).

    @param[in]
    transA  Operation to perform on A.

    @param[in]
    m       Number of rows of A. m >= 0.

    @param[in]
    n       Number of columns of A. n >= 0.

    @param[in]
    alpha   Scalar \f$ \alpha \f$

    @param[in]
    dA      COMPLEX array of dimension (ldda,n), ldda >= max(1,m).
            The m-by-n matrix A, on GPU device.

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dx      COMPLEX array on GPU device.
            If transA == MagmaNoTrans, the n element vector x of dimension (1 + (n-1)*incx); \n
            otherwise,                 the m element vector x of dimension (1 + (m-1)*incx).

    @param[in]
    incx    Stride between consecutive elements of dx. incx != 0.

    @param[in]
    beta    Scalar \f$ \beta \f$

    @param[in,out]
    dy      COMPLEX array on GPU device.
            If transA == MagmaNoTrans, the m element vector y of dimension (1 + (m-1)*incy); \n
            otherwise,                 the n element vector y of dimension (1 + (n-1)*incy).

    @param[in]
    incy    Stride between consecutive elements of dy. incy != 0.

    @ingroup magma_cblas2
*/
extern "C" void
magma_cgemv(
    magma_trans_t transA,
    magma_int_t m, magma_int_t n,
    magmaFloatComplex alpha,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx,
    magmaFloatComplex beta,
    magmaFloatComplex_ptr       dy, size_t dy_offset, magma_int_t incy,
    magma_queue_t queue )
{
    // Quick return for an empty matrix.
    if ( m <= 0 || n <= 0 )
        return;

    // Dispatch the general matrix-vector product to clBLAS.
    cl_int status = clblasCgemv(
        clblasColumnMajor,
        clblas_trans_const( transA ),
        m, n,
        alpha,
        dA, dA_offset, ldda,
        dx, dx_offset, incx,
        beta,
        dy, dy_offset, incy,
        1, &queue, 0, NULL, g_event );
    clFlush(queue);
    check_error( status );
}
/* Time a device-side buffer-to-image copy until its event completes and
 * log the elapsed time.
 * NOTE(review): like timedImageCLWrite (and unlike timedBufUnmap), this
 * never calls t1.Reset() before Start() — confirm CPerfCounter::Start
 * implies a reset.
 * Uses the file-scope imageOrigin/imageRegion/nBytesRegion globals. */
void timedBufferImageCLCopy( cl_command_queue queue,
                             cl_mem srcBuf,
                             cl_mem dstImg )
{
    CPerfCounter t1;
    cl_int ret;
    cl_event ev;

    t1.Start();

    ret = clEnqueueCopyBufferToImage( queue,
                                      srcBuf,
                                      dstImg,
                                      0,
                                      imageOrigin,
                                      imageRegion,
                                      0, NULL,
                                      &ev );
    ASSERT_CL_RETURN( ret );

    /* Start the copy, then busy-wait on its event. */
    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    t1.Stop();

    tlog->Timer( "%32s  %lf s   %8.2lf GB/s\n", "clEnqueueCopyBufferToImage():", t1.GetElapsedTime(), nBytesRegion, 1 );
}
Ejemplo n.º 24
0
/** Perform triangular matrix-vector product.
        \f$ x = A   x \f$  (trans == MagmaNoTrans), or \n
        \f$ x = A^T x \f$  (trans == MagmaTrans),   or \n
        \f$ x = A^H x \f$  (trans == MagmaConjTrans).

    @param[in]
    uplo    Whether the upper or lower triangle of A is referenced.

    @param[in]
    trans   Operation to perform on A.

    @param[in]
    diag    Whether the diagonal of A is assumed to be unit or non-unit.

    @param[in]
    n       Number of rows and columns of A. n >= 0.

    @param[in]
    dA      COMPLEX array of dimension (ldda,n), ldda >= max(1,n).
            The n-by-n matrix A, on GPU device.

    @param[in]
    ldda    Leading dimension of dA.

    @param[in]
    dx      COMPLEX array on GPU device.
            The n element vector x of dimension (1 + (n-1)*incx).

    @param[in]
    incx    Stride between consecutive elements of dx. incx != 0.

    @ingroup magma_cblas2
*/
extern "C" void
magma_ctrmv(
    magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag,
    magma_int_t n,
    magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda,
    magmaFloatComplex_ptr       dx, size_t dx_offset, magma_int_t incx,
    magma_queue_t queue )
{
    // Quick return for an empty matrix.
    if ( n <= 0 )
        return;

    // clBLAS needs a device scratch vector the same length as x.
    magmaFloatComplex_ptr dwork;
    magma_cmalloc( &dwork, (1 + (n-1)*abs(incx)) );

    cl_int status = clblasCtrmv(
        clblasColumnMajor,
        clblas_uplo_const( uplo ),
        clblas_trans_const( trans ),
        clblas_diag_const( diag ),
        n,
        dA, dA_offset, ldda,
        dx, dx_offset, incx,
        dwork,
        1, &queue, 0, NULL, g_event );
    clFlush(queue);
    check_error( status );

    magma_free( dwork );
}
Ejemplo n.º 25
0
void vglGlToCl(VglImage* img)
{
    /* Transfer ownership of a GL-shared image to the OpenCL context.
       All pending GL work must be drained before CL acquires the
       shared object, and the CL queue must likewise be drained before
       the acquire is enqueued. */
    glFlush();
    glFinish();

    if (img->oclPtr == NULL)
    {
        vglClAlloc(img);
    }

    /* Nothing to do when the image is already owned by the CL context. */
    if (vglIsInContext(img, VGL_CL_CONTEXT))
    {
        return;
    }

    cl_int status = clFlush(cl.commandQueue);
    vglClCheckError(status, (char*) "clFlush");

    status = clFinish(cl.commandQueue);
    vglClCheckError(status, (char*) "clFinish");

    status = clEnqueueAcquireGLObjects(cl.commandQueue, 1, (cl_mem*) &img->oclPtr, 0, NULL, NULL);
    vglClCheckError(status, (char*) "clEnqueueAcquireGLObjects");

    vglSetContext(img, VGL_CL_CONTEXT);
}
Ejemplo n.º 26
0
void*
Optzd::eval(std::vector<uint8_t>* t){
  // Evaluate one frame on the device and return the blockiness score (BS).
  //
  // BUG FIX: the map was non-blocking (CL_FALSE) but sh[0]/sh[1] were
  // written immediately afterwards — per the OpenCL spec the host must
  // not touch the mapped region before the map completes, so the map is
  // now blocking (CL_TRUE).  The region is also both written (the zero
  // init) and read (after the kernel), so it is mapped READ|WRITE
  // instead of WRITE only.
  cl_float* sh = (cl_float*)clEnqueueMapBuffer(queue, clm_sh, CL_TRUE, CL_MAP_READ | CL_MAP_WRITE, 0, 2*sizeof(cl_float), 0, NULL, NULL, &status);
  uint8_t* pic = reinterpret_cast<uint8_t*>(t->data());
  sh[0] = 0;
  sh[1] = 0;
  // Copy frame to the device
  status = clEnqueueWriteBuffer(queue, clm_pic, CL_TRUE, 0, cols * rows * sizeof(cl_uchar), pic, 0, NULL, NULL);
  status = clEnqueueWriteBuffer(queue, clm_sh, CL_TRUE, 0, 2 * sizeof(cl_float), sh, 0, NULL, NULL);
  // Execute the OpenCL kernel on the list
  status = clEnqueueNDRangeKernel(queue, kern, 1, NULL, &global_threads, NULL, 0, NULL, NULL);
  // Blocking read: sh holds the kernel's two partial sums afterwards.
  status = clEnqueueReadBuffer(queue, clm_sh, CL_TRUE, 0, 2*sizeof(float), sh, 0, NULL, NULL);
  // Clean up and wait for all the comands to complete.
  status = clFlush(queue);
  status = clFinish(queue);
  // sh[1] = blocky sum, sh[0] = non-blocky sum.
  float shb = sh[1], shnb = sh[0];

  clEnqueueUnmapMemObject(queue, clm_sh, sh, 0, NULL, NULL);
  // Avoid division by zero when no non-blocky pixels were counted.
  if (shnb == 0) shnb = 4;
  *BS = shb/(shnb);
  return BS;
}
void cl_clean_up()
{
	// Clean up
	errcode = clFlush(clCommandQue);
	//errcode = clFinish(clCommandQue);
	errcode = clReleaseKernel(clKernel1);
	errcode = clReleaseKernel(clKernel2);
	errcode = clReleaseKernel(clKernel3);
	errcode = clReleaseKernel(clKernel4);
	errcode = clReleaseKernel(clKernel5);
	errcode = clReleaseKernel(clKernel6);
	errcode = clReleaseKernel(clConnect_1_6);
	errcode = clReleaseProgram(clProgram);
	errcode = clReleaseMemObject(a_mem_obj_k1);
	errcode = clReleaseMemObject(b_mem_obj_k1);
	errcode = clReleaseMemObject(a_mem_obj_k2);
	errcode = clReleaseMemObject(b_mem_obj_k2);
	errcode = clReleaseMemObject(a_mem_obj_k3);
	errcode = clReleaseMemObject(b_mem_obj_k3);
	errcode = clReleaseMemObject(a_mem_obj_k4);
	errcode = clReleaseMemObject(b_mem_obj_k4);
	errcode = clReleaseMemObject(a_mem_obj_k5);
	errcode = clReleaseMemObject(b_mem_obj_k5);
	errcode = clReleaseMemObject(a_mem_obj_k6);
	errcode = clReleaseMemObject(b_mem_obj_k6);
	errcode = clReleaseCommandQueue(clCommandQue);
	errcode = clReleaseContext(clGPUContext);
	if(errcode != CL_SUCCESS) printf("Error in cleanup\n");
}
Ejemplo n.º 28
0
int main(void) {
    const char *source =
        "__kernel void main(int in, __global int *out) {\n"
        "    out[0] = in + 1;\n"
        "}\n";
    cl_command_queue command_queue;
    cl_context context;
    cl_device_id device;
    cl_int input = 1;
    cl_kernel kernel;
    cl_mem buffer;
    cl_platform_id platform;
    cl_program program;

    clGetPlatformIDs(1, &platform, NULL);
    clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL);
    context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL);
    command_queue = clCreateCommandQueue(context, device, 0, NULL);
    buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL);
    program = clCreateProgramWithSource(context, 1, &source, NULL, NULL);
    clBuildProgram(program, 1, &device, "", NULL, NULL);
    kernel = clCreateKernel(program, "main", NULL);
    clSetKernelArg(kernel, 0, sizeof(cl_int), &input);
    clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer);
    clEnqueueTask(command_queue, kernel, 0, NULL, NULL);
    clFlush(command_queue);
    clFinish(command_queue);
    clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL);

    assert(input == 2);
    return EXIT_SUCCESS;
}
Ejemplo n.º 29
0
Archivo: Alg.cpp Proyecto: Freyr666/blc
void*
Alg::eval(std::vector<uint8_t>* pic, std::vector<uint8_t>* picprev){
  uint8_t* t = reinterpret_cast<uint8_t*>(pic->data());
  uint8_t* tp = reinterpret_cast<uint8_t*>(picprev->data());
  // Copy frame to the device
  float* result = (float*)clEnqueueMapBuffer(queue, clm_res, CL_FALSE, CL_MAP_READ, 0, 6*sizeof(cl_float), 0, NULL, NULL, &status);
  for (int i = 0; i < 6; i++) {
    result[i] = 0;
  }
  // Copy frame to the device
  status = clEnqueueWriteBuffer(queue, clm_pic, CL_FALSE, 0, cols * rows * sizeof(cl_uchar), t, 0, NULL, NULL);
  status = clEnqueueWriteBuffer(queue, clm_picprev, CL_FALSE, 0, cols * rows * sizeof(cl_uchar), tp, 0, NULL, NULL);
  status = clEnqueueWriteBuffer(queue, clm_res, CL_FALSE, 0, 6 * sizeof(float), result, 0, NULL, NULL);
  // Execute the OpenCL kernel on the list
  status = clEnqueueNDRangeKernel(queue, kern, 1, NULL, &global_threads, NULL, 0, NULL, NULL);
  status = clEnqueueReadBuffer(queue, clm_res, CL_FALSE, 0, 6*sizeof(float), result, 0, NULL, NULL);
  // Clean up and wait for all the comands to complete.
  status = clFlush(queue);
  status = clFinish(queue);
  for (int i = 0; i < 2; i++) {
    percentage[i] = result[i]*100.0 / (cols*rows);
  }
  for (int i = 0; i < 2; i++) {
    percentage[i + 2] = result[i + 2] / (cols*rows);
  }
  if (result[5])
    percentage[4] = result[4] / result[5];
  else
    percentage[4] = result[4] / 4;
  clEnqueueUnmapMemObject(queue, clm_res, result, 0, NULL, NULL);
  return percentage;
}
Ejemplo n.º 30
0
static int 
Initialize(int gpu)
{
	/* One-time application setup: GL state, compute device, GL shader
	 * program, FFT data, GL resources, then the compute kernel and its
	 * buffers — in that order.  Any failure prints a message and
	 * terminates the process with the failing call's status code. */
	int err;

	if ((err = SetupGraphics()) != GL_NO_ERROR)
	{
		printf ("Failed to setup OpenGL state!");
		exit (err);
	}

	if ((err = SetupComputeDevices(gpu)) != CL_SUCCESS)
	{
		printf ("Failed to connect to compute device! Error %d\n", err);
		exit (err);
	}

	if ((err = SetupGLProgram()) != 1)
	{
		printf ("Failed to setup OpenGL Shader! Error %d\n", err);
		exit (err);
	}

	if ((err = InitData()) != 1)
	{
		printf ("Failed to Init FFT Data! Error %d\n", err);
		exit (err);
	}

	if ((err = CreateGLResouce()) != 1)
	{
		printf ("Failed to create GL resource! Error %d\n", err);
		exit (err);
	}

	/* Drain pending GL work before the CL side touches shared resources. */
	glFinish();

	if ((err = SetupComputeKernel()) != CL_SUCCESS)
	{
		printf ("Failed to setup compute kernel! Error %d\n", err);
		exit (err);
	}

	if ((err = CreateComputeResource()) != CL_SUCCESS)
	{
		printf ("Failed to create compute result! Error %d\n", err);
		exit (err);
	}

	clFlush(ComputeCommands);

	return CL_SUCCESS;
}