cl_int clEnqueueNDRangeKernel_fusion ( cl_command_queue* command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event){ cl_event eventList[2]; int cpu_run=0, gpu_run=0; size_t global_offset[2]; size_t global_offset_start[2]; size_t remain_global_work_size[2]; int i; if(cpu_offset == 0){ gpu_run=1; } else if(cpu_offset == 100){ cpu_run=1; } else{ gpu_run=1; cpu_run=1; } for(i=0; i<work_dim; i++){ global_offset[i]=global_work_size[i]; remain_global_work_size[i]=global_work_size[i]; } global_offset[0]=((double)cpu_offset/100)*global_work_size[0]; int t1=global_offset[0], t2=local_work_size[0]; global_offset[0]=(t1/t2+(int)(t1%t2!=0))*t2; remain_global_work_size[0] = global_work_size[0]-global_offset[0]; if(remain_global_work_size[0] == 0) gpu_run = 0; global_offset_start[0]=global_offset[0]; global_offset_start[1]=0; if(gpu_run){ errcode = clEnqueueNDRangeKernel(command_queue[0], kernel, work_dim, global_offset_start, remain_global_work_size, local_work_size, 0, NULL, &(eventList[0])); if(errcode != CL_SUCCESS) printf("Error in gpu clEnqueueNDRangeKernel\n"); } // clFinish(command_queue[0]); if(cpu_run){ errcode = clEnqueueNDRangeKernel(command_queue[1], kernel, work_dim, NULL, global_offset, local_work_size, 0, NULL, &(eventList[1])); if(errcode != CL_SUCCESS) printf("Error in cpu clEnqueueNDRangeKernel\n"); } if(gpu_run) errcode = clFlush(command_queue[0]); if(cpu_run) errcode = clFlush(command_queue[1]); if(gpu_run) errcode = clWaitForEvents(1,&eventList[0]); if(cpu_run) errcode = clWaitForEvents(1,&eventList[1]); return errcode; }
/* Hand a GL-shared image back to OpenGL: release the CL hold on the GL
 * object, drain the CL command queue, and mark the image GL-resident. */
void vglClToGl(VglImage* img)
{
    if (!vglIsInContext(img, VGL_CL_CONTEXT)) {
        /* Image is not currently owned by OpenCL: nothing to release. */
        return;
    }

    cl_int status = clEnqueueReleaseGLObjects(cl.commandQueue, 1, (cl_mem*) &img->oclPtr, 0, NULL, NULL);
    vglClCheckError(status, (char*) "clEnqueueReleaseGLObjects");

    status = clFlush(cl.commandQueue);
    vglClCheckError(status, (char*) "clFlush");

    status = clFinish(cl.commandQueue);
    vglClCheckError(status, (char*) "clFinish");

    vglSetContext(img, VGL_GL_CONTEXT);
}
/*
 * Look up `kernel_name` in the runtime environment, launch it via the inner
 * runKernel overload, then read `out_len` doubles of the "out" buffer back
 * into `out`.
 *
 * NOTE(review): `start_index` is passed to clEnqueueReadBuffer as a raw byte
 * offset while the size is scaled by sizeof(double) — confirm callers pass
 * bytes, not element indices.
 */
void runKernel(cl_runtime_env env, std::string kernel_name, double* vars, double* out, int start_index, int out_len)
{
    /* Zero-initialize so an unmatched name does not launch garbage. */
    kernel kern = {};
    /* BUG FIX: start at 0 — the first registered kernel was unreachable. */
    for (int i = 0; i < env.num_kerns; i++) {
        if (env.kernels[i].name == kernel_name)
            kern = env.kernels[i];
    }
    runKernel(env.cv, env.cl_kernels[kernel_name], kern, env.gpu_data, vars);

    cl_int err;
    err = clEnqueueReadBuffer(env.cv.commands, env.gpu_data["out"].array, true,
                              start_index, sizeof(double) * out_len, out, 0, NULL, NULL);
    CHK_ERR(err);
    err = clFlush(env.cv.commands);
    CHK_ERR(err);
}
/* Enqueue a device-side buffer copy on whichever queue (CPU or GPU) the
 * scheduler selects, chaining events so consecutive copies serialize.
 * Events alternate between two slots indexed by (*index) % 2. */
void cl_copyBuffer(cl_mem dest, int destOffset, cl_mem src, int srcOffset, size_t size, int *index, cl_event *eventList, int *Flag_CPU_GPU, double *burden, int _CPU_GPU)
{
    int prevFlag = (*Flag_CPU_GPU);
    double prevBurden = (*burden);
    cl_int status;

    int target = cl_copyBufferscheduler(size, Flag_CPU_GPU, burden, _CPU_GPU);
    (*Flag_CPU_GPU) = target;

    if (*index != 0) {
        /* Wait on the previous copy's event before starting this one. */
        status = clEnqueueCopyBuffer(CommandQueue[target], src, dest, srcOffset, destOffset, size,
                                     1, &eventList[((*index) - 1) % 2], &eventList[(*index) % 2]);
        deschedule(prevFlag, prevBurden);
    } else {
        status = clEnqueueCopyBuffer(CommandQueue[target], src, dest, srcOffset, destOffset, size,
                                     0, NULL, &eventList[*index]);
    }
    (*index)++;

    if (status != CL_SUCCESS) {
        printf("Error %d in cl_copyBuffer, Line %u in file %s !!!\n\n", status, __LINE__, __FILE__);
        cl_clean(EXIT_FAILURE);
    }
    clFlush(CommandQueue[target]);
}
/** Perform Hermitian matrix-vector product, \f$ y = \alpha A x + \beta y \f$. @param[in] uplo Whether the upper or lower triangle of A is referenced. @param[in] n Number of rows and columns of A. n >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array of dimension (ldda,n), ldda >= max(1,n). The n-by-n matrix A, on GPU device. @param[in] ldda Leading dimension of dA. @param[in] dx COMPLEX array on GPU device. The m element vector x of dimension (1 + (m-1)*incx). @param[in] incx Stride between consecutive elements of dx. incx != 0. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dy COMPLEX array on GPU device. The n element vector y of dimension (1 + (n-1)*incy). @param[in] incy Stride between consecutive elements of dy. incy != 0. @ingroup magma_cblas2 */ extern "C" void magma_chemv( magma_uplo_t uplo, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, size_t dy_offset, magma_int_t incy, magma_queue_t queue ) { if ( n <= 0 ) return; cl_int err = clblasChemv( clblasColumnMajor, clblas_uplo_const( uplo ), n, alpha, dA, dA_offset, ldda, dx, dx_offset, incx, beta, dy, dy_offset, incy, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/**
    Returns index of element of vector x having max. absolute value;
    i.e., max (infinity) norm.

    @param[in] n     Number of elements in vector x. n >= 0.
    @param[in] dx    COMPLEX array on GPU device,
                     the n element vector x of dimension (1 + (n-1)*incx).
    @param[in] incx  Stride between consecutive elements of dx. incx > 0.

    @ingroup magma_cblas1
*/
extern "C" magma_int_t
magma_icamax(
    magma_int_t n,
    magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx,
    magma_queue_t queue )
{
    /* BUG FIX: quick return consistent with the other wrappers; previously an
     * empty vector still allocated buffers and returned an uninitialized
     * index read from device scratch memory. */
    if ( n <= 0 )
        return 0;

    magma_ptr dimax, scratchBuff;
    magma_malloc( &dimax, sizeof(unsigned int) );
    /* clBLAS requires a scratch buffer of at least 2*n elements. */
    magma_malloc( &scratchBuff, (2*n+1)*sizeof(magmaFloatComplex) );

    cl_int err = clblasiCamax(
        n, dimax, 0,
        dx, dx_offset, incx,
        scratchBuff,
        1, &queue, 0, NULL, g_event );
    /* BUG FIX: the status was previously ignored. */
    check_error( err );

    unsigned int imax_cpu;
    /* Blocking fetch on the same in-order queue: the iCamax result is ready. */
    magma_getvector( 1, sizeof(unsigned int), dimax, 0, 1, &imax_cpu, 1, queue );
    clFlush(queue);

    magma_free( dimax );
    magma_free( scratchBuff );

    return imax_cpu;
}
/** Perform Hermitian rank-2k update. \f$ C = \alpha A B^T + \alpha B A^T \beta C \f$ (trans == MagmaNoTrans), or \n \f$ C = \alpha A^T B + \alpha B^T A \beta C \f$ (trans == MagmaTrans), \n where \f$ C \f$ is Hermitian. @param[in] uplo Whether the upper or lower triangle of C is referenced. @param[in] trans Operation to perform on A and B. @param[in] n Number of rows and columns of C. n >= 0. @param[in] k Number of columns of A and B (for MagmaNoTrans) or rows of A and B (for MagmaTrans). k >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array on GPU device. If trans == MagmaNoTrans, the n-by-k matrix A of dimension (ldda,k), ldda >= max(1,n); \n otherwise, the k-by-n matrix A of dimension (ldda,n), ldda >= max(1,k). @param[in] ldda Leading dimension of dA. @param[in] dB COMPLEX array on GPU device. If trans == MagmaNoTrans, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n); \n otherwise, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k). @param[in] lddb Leading dimension of dB. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dC COMPLEX array on GPU device. The n-by-n Hermitian matrix C of dimension (lddc,n), lddc >= max(1,n). @param[in] lddc Leading dimension of dC. @ingroup magma_cblas3 */ extern "C" void magma_cher2k( magma_uplo_t uplo, magma_trans_t trans, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb, float beta, magmaFloatComplex_ptr dC, size_t dC_offset, magma_int_t lddc, magma_queue_t queue ) { if (n <= 0 || k <= 0) return; cl_int err = clblasCher2k( clblasColumnMajor, clblas_uplo_const( uplo ), clblas_trans_const( trans ), n, k, alpha, dA, dA_offset, ldda, dB, dB_offset, lddb, beta, dC, dC_offset, lddc, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/** Perform matrix-matrix product, \f$ C = \alpha op(A) op(B) + \beta C \f$. @param[in] transA Operation op(A) to perform on matrix A. @param[in] transB Operation op(B) to perform on matrix B. @param[in] m Number of rows of C and op(A). m >= 0. @param[in] n Number of columns of C and op(B). n >= 0. @param[in] k Number of columns of op(A) and rows of op(B). k >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array on GPU device. If transA == MagmaNoTrans, the m-by-k matrix A of dimension (ldda,k), ldda >= max(1,m); \n otherwise, the k-by-m matrix A of dimension (ldda,m), ldda >= max(1,k). @param[in] ldda Leading dimension of dA. @param[in] dB COMPLEX array on GPU device. If transB == MagmaNoTrans, the k-by-n matrix B of dimension (lddb,n), lddb >= max(1,k); \n otherwise, the n-by-k matrix B of dimension (lddb,k), lddb >= max(1,n). @param[in] lddb Leading dimension of dB. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dC COMPLEX array on GPU device. The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m). @param[in] lddc Leading dimension of dC. @ingroup magma_cblas3 */ extern "C" void magma_cgemm( magma_trans_t transA, magma_trans_t transB, magma_int_t m, magma_int_t n, magma_int_t k, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, size_t dC_offset, magma_int_t lddc, magma_queue_t queue ) { if ( m <= 0 || n <= 0 || k <= 0 ) return; cl_int err = clblasCgemm( clblasColumnMajor, clblas_trans_const( transA ), clblas_trans_const( transB ), m, n, k, alpha, dA, dA_offset, ldda, dB, dB_offset, lddb, beta, dC, dC_offset, lddc, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/* Let's see if this is any different from local memory. * Outcome: much slower than private memory, slower than naive method. */ void mat_mul_cl_row_local(const F *A, const F *B, F *C, size_t n, Cache *cache) { cl_uint ncl; size_t global_work_size, local_work_size, mat_sizeof; /* Setup variables. */ /* Cannot be larger than 1 on this example, otherwise memory conflicts * will happen between work items. */ local_work_size = 1; global_work_size = n; mat_sizeof = n * n * sizeof(F); ncl = n; /* Run kernel. */ common_create_kernel_file(&cache->common, "matmul_row_local.cl", NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL); clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL); clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a); clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b); clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c); clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL); clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl); clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); clFlush(cache->common.command_queue); clFinish(cache->common.command_queue); clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL); }
/* Like row private, but to reduce global memory accesses,
 * we copy only once per work group to local memory.
 *
 * Each work group contains a few rows of A. We load the first column,
 * multiply all rows by that column, synchronize, load another column,
 * and so on. This trades thread blockage against memory access count,
 * so work groups are made as large as possible. */
void mat_mul_cl_row_priv_col_local(const F *A, const F *B, F *C, size_t n, Cache *cache) {
    char options[256];
    cl_uint ncl;
    size_t global_work_size, local_work_size, mat_sizeof;

    /* Setup variables. */
    global_work_size = n;
    mat_sizeof = n * n * sizeof(F);
    ncl = n;

    /* Run kernel.
     * BUG FIX: "%ju" expects uintmax_t but `n` is size_t — undefined
     * behavior where the two types differ. "%zu" matches size_t exactly. */
    snprintf(options, sizeof(options), "-DPRIV_ROW_SIZE=%zu", n);
    common_create_kernel_file(&cache->common, "matmul_row_priv_col_local.cl", options);

    /* Use the largest work group the device supports, capped at n. */
    local_work_size = 0;
    clGetDeviceInfo(cache->common.device, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                    sizeof(local_work_size), &local_work_size, NULL);
    local_work_size = zmin(local_work_size, n);

    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_a, CL_TRUE, 0, mat_sizeof, (F*)A, 0, NULL, NULL);
    clEnqueueWriteBuffer(cache->common.command_queue, cache->buf_b, CL_TRUE, 0, mat_sizeof, (F*)B, 0, NULL, NULL);
    clSetKernelArg(cache->common.kernel, 0, sizeof(cache->buf_a), &cache->buf_a);
    clSetKernelArg(cache->common.kernel, 1, sizeof(cache->buf_b), &cache->buf_b);
    clSetKernelArg(cache->common.kernel, 2, sizeof(cache->buf_c), &cache->buf_c);
    clSetKernelArg(cache->common.kernel, 3, n * sizeof(F), NULL);
    clSetKernelArg(cache->common.kernel, 4, sizeof(ncl), &ncl);
    clEnqueueNDRangeKernel(cache->common.command_queue, cache->common.kernel, 1, NULL,
                           &global_work_size, &local_work_size, 0, NULL, NULL);
    clFlush(cache->common.command_queue);
    clFinish(cache->common.command_queue);
    clEnqueueReadBuffer(cache->common.command_queue, cache->buf_c, CL_TRUE, 0, mat_sizeof, C, 0, NULL, NULL);
}
int main(void) { char options[256]; const char *source = "__kernel void kmain(__global int *out) {\n" " out[0] = X;\n" "}\n"; cl_int io[] = {0}; const cl_int X = 1; cl_mem buffer; Common common; const size_t global_work_size = sizeof(io) / sizeof(io[0]); /* Run kernel. */ snprintf(options, sizeof(options), "-DX=%d", X); common_init_options(&common, source, options); buffer = clCreateBuffer(common.context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(io), io, NULL); clSetKernelArg(common.kernel, 0, sizeof(buffer), &buffer); clEnqueueNDRangeKernel(common.command_queue, common.kernel, 1, NULL, &global_work_size, NULL, 0, NULL, NULL); clFlush(common.command_queue); clFinish(common.command_queue); clEnqueueReadBuffer(common.command_queue, buffer, CL_TRUE, 0, sizeof(io), io, 0, NULL, NULL); /* Assertions. */ assert(io[0] == X); /* Cleanup. */ clReleaseMemObject(buffer); common_deinit(&common); return EXIT_SUCCESS; }
/* Copy the data set's GPU buffer into a freshly allocated host buffer and
 * return it; ownership of the returned memory passes to the caller.
 * Returns nullptr when no GPU buffer exists; throws on a failed read. */
void* mitk::OclDataSet::TransferDataToCPU(cl_command_queue gpuComQueue)
{
  cl_int clErr = 0;

  // if image created on GPU, needs to create mitk::Image
  if( m_gpuBuffer == nullptr ){
    MITK_ERROR("ocl.DataSet") << "(mitk) No buffer present!\n";
    return nullptr;
  }

  // host-side destination; caller owns and must delete[] it
  char* data = new char[m_bufferSize * (size_t)m_BpE];

  // debug info
#ifdef SHOW_MEM_INFO
  oclPrintMemObjectInfo( m_gpuBuffer );
#endif

  clErr = clEnqueueReadBuffer( gpuComQueue, m_gpuBuffer, CL_TRUE, 0,
                               m_bufferSize * (size_t)m_BpE, data, 0, nullptr, nullptr);
  CHECK_OCL_ERR(clErr);

  if(clErr != CL_SUCCESS)
  {
    // BUG FIX: release the host buffer before throwing — it leaked here.
    delete[] data;
    mitkThrow() << "openCL Error when reading Output Buffer";
  }

  clFlush( gpuComQueue );
  // the cpu data is same as gpu
  this->m_gpuModified = false;

  return (void*) data;
}
// -------------------- magma_err_t magma_ztrsm( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, magmaDoubleComplex alpha, magmaDoubleComplex_const_ptr dA, size_t dA_offset, magma_int_t lda, magmaDoubleComplex_ptr dB, size_t dB_offset, magma_int_t ldb, magma_queue_t queue ) { /* magmaDoubleComplex *hA, *hB; if(side==MagmaRight){ hA = (magmaDoubleComplex*)malloc(lda*n*sizeof(magmaDoubleComplex)); hB = (magmaDoubleComplex*)malloc(ldb*n*sizeof(magmaDoubleComplex)); magma_zgetmatrix(n, n, dA, dA_offset, lda, hA, 0, lda, queue); magma_zgetmatrix(m, n, dB, dB_offset, ldb, hB, 0, ldb, queue); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ztrsm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag, m, n, &alpha, hA, lda, hB, ldb); #else cblas_ztrsm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag, m, n, alpha, hA, lda, hB, ldb); #endif magma_zsetmatrix(m, n, hB, 0, ldb, dB, dB_offset, ldb, queue); free(hB); free(hA); }else{ hA = (magmaDoubleComplex*)malloc(lda*m*sizeof(magmaDoubleComplex)); hB = (magmaDoubleComplex*)malloc(ldb*n*sizeof(magmaDoubleComplex)); magma_zgetmatrix(m, m, dA, dA_offset, lda, hA, 0, lda, queue); magma_zgetmatrix(m, n, dB, dB_offset, ldb, hB, 0, ldb, queue); #if defined(PRECISION_z) || defined(PRECISION_c) cblas_ztrsm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag, m, n, &alpha, hA, lda, hB, ldb); #else cblas_ztrsm(CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, (CBLAS_TRANSPOSE)trans, (CBLAS_DIAG)diag, m, n, alpha, hA, lda, hB, ldb); #endif magma_zsetmatrix(m, n, hB, 0, ldb, dB, dB_offset, ldb, queue); free(hB); free(hA); } return CL_SUCCESS; */ cl_int err = clAmdBlasZtrsmEx( clAmdBlasColumnMajor, amdblas_side_const( side ), amdblas_uplo_const( uplo ), amdblas_trans_const( trans ), amdblas_diag_const( diag ), m, n, alpha, dA, dA_offset, lda, dB, 
dB_offset, ldb, 1, &queue, 0, NULL, NULL ); clFlush(queue); return err; }
/* Flush the supplied command queue, terminating the process on failure.
 * BUG FIX: the `command_queue` parameter was ignored and the member
 * `commandQueue` was flushed instead, leaving the argument dead. If the
 * member was intentional, the parameter should be removed instead. */
void CLContext::flush( cl_command_queue& command_queue )
{
    cl_int status;
    status = clFlush( command_queue );
    if(!checkVal( status, CL_SUCCESS, "clFlush failed."))
        exit(1);
}
/* Read `size` elements of `size_of` bytes each from a device buffer into
 * host memory `ptr`, starting at `offset`. Blocking behavior follows
 * `blocking_read`; errors are reported via checkClError. */
void OclHost::readFromDevice(cl_mem buffer, cl_bool blocking_read, size_t offset,
        size_t size, void * ptr, size_t size_of) {
    cl_int status = clEnqueueReadBuffer(oclCommandQueue, buffer, blocking_read,
            offset, size * size_of, ptr, 0, 0, 0);
    clFlush(oclCommandQueue);
    checkClError("Unable to read from device.", status);
}
void __accr_launchkernel(char* szKernelName, char* szKernelLib, int async_expr) { //CL kernel file cl_int cl_error_code; staic size_t global_work_items[3]; global_work_items[0] = gangs[0] * vectors[0]; global_work_items[1] = gangs[1] * vectors[1]; global_work_items[2] = gangs[2] * vectors[2]; if(bIsAuto_cl_local_work_partition) cl_error_code = clEnqueueNDRangeKernel(context->cl_cq, current_cl_kernel_handle, cl_work_dim, NULL, global_work_items, NULL, 0, NULL, NULL); else cl_error_code = clEnqueueNDRangeKernel(context->cl_cq, current_cl_kernel_handle, cl_work_dim, NULL, global_work_items, vectors, 0, NULL, NULL); if(async_expr < 0) { cl_error_code = clFlush(context->cl_cq); cl_error_code = clFinish(context->cl_cq); } else //async { } }
void BeforeCheckingExample2::parallelExecuteOrigin() { /*struct timeval tv1, tv2; gettimeofday(&tv1, NULL); */ int p = 0; int clStatus; clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_a ); clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_b ); clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_c ); clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_Q ); clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_mem), &device_P ); clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_int), &LOOP_SIZE ); clSetKernelArg(loopKernelOrigin, p++, sizeof(cl_int), &CALC_SIZE ); size_t global_size0 = LOOP_SIZE; size_t local_size = 64; clStatus = clEnqueueNDRangeKernel(command_queue, loopKernelOrigin, 1, NULL, &global_size0, &local_size, 0, NULL, NULL); clFlush(command_queue); clFinish(command_queue); /* gettimeofday(&tv2, NULL); double used_time = (double) (tv2.tv_usec - tv1.tv_usec) + (double) (tv2.tv_sec - tv1.tv_sec) * 1000000; printf("time origin = %.2f\n", used_time); */ }
/** Perform Hermitian matrix-matrix product. \f$ C = \alpha A B + \beta C \f$ (side == MagmaLeft), or \n \f$ C = \alpha B A + \beta C \f$ (side == MagmaRight), \n where \f$ A \f$ is Hermitian. @param[in] side Whether A is on the left or right. @param[in] uplo Whether the upper or lower triangle of A is referenced. @param[in] m Number of rows of C. m >= 0. @param[in] n Number of columns of C. n >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array on GPU device. If side == MagmaLeft, the m-by-m Hermitian matrix A of dimension (ldda,m), ldda >= max(1,m); \n otherwise, the n-by-n Hermitian matrix A of dimension (ldda,n), ldda >= max(1,n). @param[in] ldda Leading dimension of dA. @param[in] dB COMPLEX array on GPU device. The m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m). @param[in] lddb Leading dimension of dB. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dC COMPLEX array on GPU device. The m-by-n matrix C of dimension (lddc,n), lddc >= max(1,m). @param[in] lddc Leading dimension of dC. @ingroup magma_cblas3 */ extern "C" void magma_chemm( magma_side_t side, magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_const_ptr dB, size_t dB_offset, magma_int_t lddb, magmaFloatComplex beta, magmaFloatComplex_ptr dC, size_t dC_offset, magma_int_t lddc, magma_queue_t queue ) { if ( m <= 0 || n <= 0) return; cl_int err = clblasChemm( clblasColumnMajor, clblas_side_const( side ), clblas_uplo_const( uplo ), m, n, alpha, dA, dA_offset, ldda, dB, dB_offset, lddb, beta, dC, dC_offset, lddc, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/* Time a clEnqueueUnmapMemObject call (enqueue + completion) and, unless
 * `quiet`, log the elapsed time and effective bandwidth. */
void timedBufUnmap( cl_command_queue queue, cl_mem buf, void **ptr, bool quiet )
{
    CPerfCounter timer;
    cl_event ev;
    cl_int ret;

    timer.Reset();
    timer.Start();

    ret = clEnqueueUnmapMemObject( queue, buf, (void *) *ptr, 0, NULL, &ev );
    ASSERT_CL_RETURN( ret );

    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    timer.Stop();

    if( !quiet )
        tlog->Timer( "%32s %lf s [ %8.2lf GB/s ]\n", "clEnqueueUnmapMemObject():",
                     timer.GetElapsedTime(), nBytes, 1 );
}
/** Solve triangular matrix-matrix system (multiple right-hand sides). \f$ op(A) X = \alpha B \f$ (side == MagmaLeft), or \n \f$ X op(A) = \alpha B \f$ (side == MagmaRight), \n where \f$ A \f$ is triangular. @param[in] side Whether A is on the left or right. @param[in] uplo Whether A is upper or lower triangular. @param[in] trans Operation to perform on A. @param[in] diag Whether the diagonal of A is assumed to be unit or non-unit. @param[in] m Number of rows of B. m >= 0. @param[in] n Number of columns of B. n >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array on GPU device. If side == MagmaLeft, the m-by-m triangular matrix A of dimension (ldda,m), ldda >= max(1,m); \n otherwise, the n-by-n triangular matrix A of dimension (ldda,n), ldda >= max(1,n). @param[in] ldda Leading dimension of dA. @param[in,out] dB COMPLEX array on GPU device. On entry, m-by-n matrix B of dimension (lddb,n), lddb >= max(1,m). On exit, overwritten with the solution matrix X. @param[in] lddb Leading dimension of dB. @ingroup magma_cblas3 */ extern "C" void magma_ctrsm( magma_side_t side, magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_ptr dB, size_t dB_offset, magma_int_t lddb, magma_queue_t queue ) { if (m <= 0 || n <= 0) return; cl_int err = clblasCtrsm( clblasColumnMajor, clblas_side_const( side ), clblas_uplo_const( uplo ), clblas_trans_const( trans ), clblas_diag_const( diag ), m, n, alpha, dA, dA_offset, ldda, dB, dB_offset, lddb, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/* Time a non-blocking clEnqueueWriteImage of the configured region and log
 * elapsed time plus effective bandwidth. */
void timedImageCLWrite( cl_command_queue queue, cl_mem image, void *ptr )
{
    CPerfCounter timer;
    cl_event ev;
    cl_int ret;

    timer.Start();

    ret = clEnqueueWriteImage( queue, image, CL_FALSE,
                               imageOrigin, imageRegion,
                               0, 0, ptr, 0, NULL, &ev );
    ASSERT_CL_RETURN( ret );

    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    timer.Stop();

    tlog->Timer( "%32s %lf s %8.2lf GB/s\n", "clEnqueueWriteImage():",
                 timer.GetElapsedTime(), nBytesRegion, 1 );
}
/** Perform matrix-vector product. \f$ y = \alpha A x + \beta y \f$ (transA == MagmaNoTrans), or \n \f$ y = \alpha A^T x + \beta y \f$ (transA == MagmaTrans), or \n \f$ y = \alpha A^H x + \beta y \f$ (transA == MagmaConjTrans). @param[in] transA Operation to perform on A. @param[in] m Number of rows of A. m >= 0. @param[in] n Number of columns of A. n >= 0. @param[in] alpha Scalar \f$ \alpha \f$ @param[in] dA COMPLEX array of dimension (ldda,n), ldda >= max(1,m). The m-by-n matrix A, on GPU device. @param[in] ldda Leading dimension of dA. @param[in] dx COMPLEX array on GPU device. If transA == MagmaNoTrans, the n element vector x of dimension (1 + (n-1)*incx); \n otherwise, the m element vector x of dimension (1 + (m-1)*incx). @param[in] incx Stride between consecutive elements of dx. incx != 0. @param[in] beta Scalar \f$ \beta \f$ @param[in,out] dy COMPLEX array on GPU device. If transA == MagmaNoTrans, the m element vector y of dimension (1 + (m-1)*incy); \n otherwise, the n element vector y of dimension (1 + (n-1)*incy). @param[in] incy Stride between consecutive elements of dy. incy != 0. @ingroup magma_cblas2 */ extern "C" void magma_cgemv( magma_trans_t transA, magma_int_t m, magma_int_t n, magmaFloatComplex alpha, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_const_ptr dx, size_t dx_offset, magma_int_t incx, magmaFloatComplex beta, magmaFloatComplex_ptr dy, size_t dy_offset, magma_int_t incy, magma_queue_t queue ) { if ( m <= 0 || n <= 0 ) return; cl_int err = clblasCgemv( clblasColumnMajor, clblas_trans_const( transA ), m, n, alpha, dA, dA_offset, ldda, dx, dx_offset, incx, beta, dy, dy_offset, incy, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); }
/* Time a buffer-to-image copy of the configured region and log the elapsed
 * time plus effective bandwidth. */
void timedBufferImageCLCopy( cl_command_queue queue, cl_mem srcBuf, cl_mem dstImg )
{
    CPerfCounter timer;
    cl_event ev;
    cl_int ret;

    timer.Start();

    ret = clEnqueueCopyBufferToImage( queue, srcBuf, dstImg,
                                      0, imageOrigin, imageRegion,
                                      0, NULL, &ev );
    ASSERT_CL_RETURN( ret );

    clFlush( queue );
    spinForEventsComplete( 1, &ev );

    timer.Stop();

    tlog->Timer( "%32s %lf s %8.2lf GB/s\n", "clEnqueueCopyBufferToImage():",
                 timer.GetElapsedTime(), nBytesRegion, 1 );
}
/** Perform triangular matrix-vector product. \f$ x = A x \f$ (trans == MagmaNoTrans), or \n \f$ x = A^T x \f$ (trans == MagmaTrans), or \n \f$ x = A^H x \f$ (trans == MagmaConjTrans). @param[in] uplo Whether the upper or lower triangle of A is referenced. @param[in] trans Operation to perform on A. @param[in] diag Whether the diagonal of A is assumed to be unit or non-unit. @param[in] n Number of rows and columns of A. n >= 0. @param[in] dA COMPLEX array of dimension (ldda,n), ldda >= max(1,n). The n-by-n matrix A, on GPU device. @param[in] ldda Leading dimension of dA. @param[in] dx COMPLEX array on GPU device. The n element vector x of dimension (1 + (n-1)*incx). @param[in] incx Stride between consecutive elements of dx. incx != 0. @ingroup magma_cblas2 */ extern "C" void magma_ctrmv( magma_uplo_t uplo, magma_trans_t trans, magma_diag_t diag, magma_int_t n, magmaFloatComplex_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloatComplex_ptr dx, size_t dx_offset, magma_int_t incx, magma_queue_t queue ) { if ( n <= 0 ) return; magmaFloatComplex_ptr dwork; magma_cmalloc( &dwork, (1 + (n-1)*abs(incx)) ); cl_int err = clblasCtrmv( clblasColumnMajor, clblas_uplo_const( uplo ), clblas_trans_const( trans ), clblas_diag_const( diag ), n, dA, dA_offset, ldda, dx, dx_offset, incx, dwork, 1, &queue, 0, NULL, g_event ); clFlush(queue); check_error( err ); magma_free( dwork ); }
/* Hand a GL image over to OpenCL: finish pending GL work, lazily allocate
 * the CL buffer, acquire the GL object for CL use, and mark the image
 * CL-resident. No-op if the image already lives in the CL context. */
void vglGlToCl(VglImage* img)
{
    glFlush();
    glFinish();

    if (img->oclPtr == NULL)
    {
        vglClAlloc(img);
    }

    if (!vglIsInContext(img, VGL_CL_CONTEXT))
    {
        cl_int status = clFlush(cl.commandQueue);
        vglClCheckError(status, (char*) "clFlush");

        status = clFinish(cl.commandQueue);
        vglClCheckError(status, (char*) "clFinish");

        status = clEnqueueAcquireGLObjects(cl.commandQueue, 1, (cl_mem*) &img->oclPtr, 0, NULL, NULL);
        vglClCheckError(status, (char*) "clEnqueueAcquireGLObjects");

        vglSetContext(img, VGL_CL_CONTEXT);
    }
}
void* Optzd::eval(std::vector<uint8_t>* t){ cl_float* sh = (cl_float*)clEnqueueMapBuffer(queue, clm_sh, CL_FALSE, CL_MAP_WRITE, 0, 2*sizeof(cl_float), 0, NULL, NULL, &status); uint8_t* pic = reinterpret_cast<uint8_t*>(t->data()); sh[0] = 0; sh[1] = 0; // Copy frame to the device status = clEnqueueWriteBuffer(queue, clm_pic, CL_TRUE, 0, cols * rows * sizeof(cl_uchar), pic, 0, NULL, NULL); status = clEnqueueWriteBuffer(queue, clm_sh, CL_TRUE, 0, 2 * sizeof(cl_float), sh, 0, NULL, NULL); // Execute the OpenCL kernel on the list status = clEnqueueNDRangeKernel(queue, kern, 1, NULL, &global_threads, NULL, 0, NULL, NULL); status = clEnqueueReadBuffer(queue, clm_sh, CL_TRUE, 0, 2*sizeof(float), sh, 0, NULL, NULL); // Clean up and wait for all the comands to complete. status = clFlush(queue); status = clFinish(queue); //eval-ing BS float shb = sh[1], shnb = sh[0]; // for (int i = 0; i < (global_threads / 2); i++) { // shb += sh[i*2]; // shnb += sh[(i*2)+1]; // } clEnqueueUnmapMemObject(queue, clm_sh, sh, 0, NULL, NULL); if (shnb == 0) shnb = 4; *BS = shb/(shnb); return BS; }
void cl_clean_up() { // Clean up errcode = clFlush(clCommandQue); //errcode = clFinish(clCommandQue); errcode = clReleaseKernel(clKernel1); errcode = clReleaseKernel(clKernel2); errcode = clReleaseKernel(clKernel3); errcode = clReleaseKernel(clKernel4); errcode = clReleaseKernel(clKernel5); errcode = clReleaseKernel(clKernel6); errcode = clReleaseKernel(clConnect_1_6); errcode = clReleaseProgram(clProgram); errcode = clReleaseMemObject(a_mem_obj_k1); errcode = clReleaseMemObject(b_mem_obj_k1); errcode = clReleaseMemObject(a_mem_obj_k2); errcode = clReleaseMemObject(b_mem_obj_k2); errcode = clReleaseMemObject(a_mem_obj_k3); errcode = clReleaseMemObject(b_mem_obj_k3); errcode = clReleaseMemObject(a_mem_obj_k4); errcode = clReleaseMemObject(b_mem_obj_k4); errcode = clReleaseMemObject(a_mem_obj_k5); errcode = clReleaseMemObject(b_mem_obj_k5); errcode = clReleaseMemObject(a_mem_obj_k6); errcode = clReleaseMemObject(b_mem_obj_k6); errcode = clReleaseCommandQueue(clCommandQue); errcode = clReleaseContext(clGPUContext); if(errcode != CL_SUCCESS) printf("Error in cleanup\n"); }
int main(void) { const char *source = "__kernel void main(int in, __global int *out) {\n" " out[0] = in + 1;\n" "}\n"; cl_command_queue command_queue; cl_context context; cl_device_id device; cl_int input = 1; cl_kernel kernel; cl_mem buffer; cl_platform_id platform; cl_program program; clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); command_queue = clCreateCommandQueue(context, device, 0, NULL); buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL, NULL); program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); clBuildProgram(program, 1, &device, "", NULL, NULL); kernel = clCreateKernel(program, "main", NULL); clSetKernelArg(kernel, 0, sizeof(cl_int), &input); clSetKernelArg(kernel, 1, sizeof(cl_mem), &buffer); clEnqueueTask(command_queue, kernel, 0, NULL, NULL); clFlush(command_queue); clFinish(command_queue); clEnqueueReadBuffer(command_queue, buffer, CL_TRUE, 0, sizeof(cl_int), &input, 0, NULL, NULL); assert(input == 2); return EXIT_SUCCESS; }
void* Alg::eval(std::vector<uint8_t>* pic, std::vector<uint8_t>* picprev){ uint8_t* t = reinterpret_cast<uint8_t*>(pic->data()); uint8_t* tp = reinterpret_cast<uint8_t*>(picprev->data()); // Copy frame to the device float* result = (float*)clEnqueueMapBuffer(queue, clm_res, CL_FALSE, CL_MAP_READ, 0, 6*sizeof(cl_float), 0, NULL, NULL, &status); for (int i = 0; i < 6; i++) { result[i] = 0; } // Copy frame to the device status = clEnqueueWriteBuffer(queue, clm_pic, CL_FALSE, 0, cols * rows * sizeof(cl_uchar), t, 0, NULL, NULL); status = clEnqueueWriteBuffer(queue, clm_picprev, CL_FALSE, 0, cols * rows * sizeof(cl_uchar), tp, 0, NULL, NULL); status = clEnqueueWriteBuffer(queue, clm_res, CL_FALSE, 0, 6 * sizeof(float), result, 0, NULL, NULL); // Execute the OpenCL kernel on the list status = clEnqueueNDRangeKernel(queue, kern, 1, NULL, &global_threads, NULL, 0, NULL, NULL); status = clEnqueueReadBuffer(queue, clm_res, CL_FALSE, 0, 6*sizeof(float), result, 0, NULL, NULL); // Clean up and wait for all the comands to complete. status = clFlush(queue); status = clFinish(queue); for (int i = 0; i < 2; i++) { percentage[i] = result[i]*100.0 / (cols*rows); } for (int i = 0; i < 2; i++) { percentage[i + 2] = result[i + 2] / (cols*rows); } if (result[5]) percentage[4] = result[4] / result[5]; else percentage[4] = result[4] / 4; clEnqueueUnmapMemObject(queue, clm_res, result, 0, NULL, NULL); return percentage; }
static int Initialize(int gpu) { int err; err = SetupGraphics(); if (err != GL_NO_ERROR) { printf ("Failed to setup OpenGL state!"); exit (err); } err = SetupComputeDevices(gpu); if(err != CL_SUCCESS) { printf ("Failed to connect to compute device! Error %d\n", err); exit (err); } err = SetupGLProgram(); if (err != 1) { printf ("Failed to setup OpenGL Shader! Error %d\n", err); exit (err); } err = InitData(); if (err != 1) { printf ("Failed to Init FFT Data! Error %d\n", err); exit (err); } err = CreateGLResouce(); if (err != 1) { printf ("Failed to create GL resource! Error %d\n", err); exit (err); } glFinish(); err = SetupComputeKernel(); if (err != CL_SUCCESS) { printf ("Failed to setup compute kernel! Error %d\n", err); exit (err); } err = CreateComputeResource(); if(err != CL_SUCCESS) { printf ("Failed to create compute result! Error %d\n", err); exit (err); } clFlush(ComputeCommands); return CL_SUCCESS; }