/**
 * Submits mvt_kernel1 followed by mvt_kernel2 to @p queue and blocks until
 * the queue has finished.
 *
 * Both launches are 1-D with a global range of N work-items. Before each
 * launch getNewSizes() is called with DIM_LOCAL_WORK_GROUP_X as the input
 * local size; whatever it stores in the output array is used as the local
 * work size of that launch.
 *
 * @param queue command queue the kernels are enqueued on.
 */
void cl_launch_kernel(Queue& queue) {
  // The kernels receive the problem size by address, so keep it in a local.
  int problemSize = N;

  size_t requestedLocal[1] = { static_cast<size_t>(DIM_LOCAL_WORK_GROUP_X) };
  size_t globalRange[1] = { static_cast<size_t>(N) };
  size_t launchLocal[1];

  // ---- first kernel: operates on (a, x1, y1) ----
  getNewSizes(nullptr, requestedLocal, nullptr, launchLocal, "mvt_kernel1", 1);

  kernel1->setArgument(0, *a_mem_obj);
  kernel1->setArgument(1, *x1_mem_obj);
  kernel1->setArgument(2, *y1_mem_obj);
  kernel1->setArgument(3, sizeof(int), static_cast<void *>(&problemSize));

  queue.run(*kernel1, 1, 0, globalRange, launchLocal);

  // ---- second kernel: operates on (a, x2, y2) ----
  // The local size is queried again under the second kernel's name and
  // overwrites the value used for the first launch.
  getNewSizes(nullptr, requestedLocal, nullptr, launchLocal, "mvt_kernel2", 1);

  kernel2->setArgument(0, *a_mem_obj);
  kernel2->setArgument(1, *x2_mem_obj);
  kernel2->setArgument(2, *y2_mem_obj);
  kernel2->setArgument(3, sizeof(int), static_cast<void *>(&problemSize));

  queue.run(*kernel2, 1, 0, globalRange, launchLocal);

  queue.finish();
}
int main(void) { DATA_TYPE* A; DATA_TYPE* A_outputFromGpu; DATA_TYPE* R; DATA_TYPE* Q; ///////////////////////// // Kernel 1. size_t oldSizes[2] = { M, N }; size_t newSizes[2]; getNewSizes(oldSizes, NULL, newSizes, NULL, "gramschmidt_kernel1", 2); M = newSizes[0]; N = newSizes[1]; // Kernel 2. getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel2", 2); M = newSizes[0]; N = newSizes[1]; // Kernel 3. getNewSizes(newSizes, NULL, newSizes, NULL, "gramschmidt_kernel3", 2); M = newSizes[0]; N = newSizes[1]; ///////////////////////// A = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); A_outputFromGpu = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); R = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); Q = (DATA_TYPE*)malloc(M*N*sizeof(DATA_TYPE)); init_array(A); read_cl_file(); cl_initialization(device_id, clGPUContext, clCommandQue); cl_mem_init(A); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, a_mem_obj, CL_TRUE, 0, M*N*sizeof(DATA_TYPE), A_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); // gramschmidt(A, R, Q); // compareResults(A, A_outputFromGpu); cl_clean_up(); free(A); free(A_outputFromGpu); free(R); free(Q); return 0; }
int main(void) { DATA_TYPE *a; DATA_TYPE *x1; DATA_TYPE *x2; DATA_TYPE *x1_outputFromGpu; DATA_TYPE *x2_outputFromGpu; DATA_TYPE *y_1; DATA_TYPE *y_2; ///////////////////////// size_t oldSizes[1] = { N }; size_t newSizes[1]; getNewSizes(oldSizes, NULL, newSizes, NULL, "mvt_kernel1", 1); N = newSizes[0]; ///////////////////////// a = (DATA_TYPE *)malloc(N * N * sizeof(DATA_TYPE)); x1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); x2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); x1_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); x2_outputFromGpu = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); y_1 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); y_2 = (DATA_TYPE *)malloc(N * sizeof(DATA_TYPE)); init_arrays(a, x1, x2, y_1, y_2); platform = new Platform(PLATFORM_ID); context = platform->getContext(); Device device = platform->getDevice(DEVICE_ID); Queue queue(*context,device,Queue::EnableProfiling); cl_mem_init(a, x1, x2, y_1, y_2,queue); Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME); if(!program.build(device)){ std::cout << "Error building the program: \n"; std::cout <<program.getBuildLog(device); } kernel1=program.createKernel(kernel1Name.c_str()); kernel2=program.createKernel(kernel2Name.c_str()); cl_launch_kernel(queue); queue.readBuffer(*x1_mem_obj,N * sizeof(DATA_TYPE), x1_outputFromGpu); queue.readBuffer(*x2_mem_obj,N * sizeof(DATA_TYPE), x2_outputFromGpu); queue.finish(); runMvt(a, x1, x2, y_1, y_2, x1_outputFromGpu,x2_outputFromGpu); cl_clean_up(); free(a); free(x1); free(x2); free(x1_outputFromGpu); free(x2_outputFromGpu); free(y_1); free(y_2); return 0; }
/**
 * Enqueues the 2-D convolution kernel on @p queue and waits for it to finish.
 *
 * The launch is 2-D with a global range of NI x NJ. The local work size is
 * whatever getNewSizes() stores in its output array when given
 * DIM_LOCAL_WORK_GROUP_X x DIM_LOCAL_WORK_GROUP_Y as input.
 *
 * @param queue command queue the kernel is enqueued on.
 */
void cl_launch_kernel(Queue& queue) {
  // Scalar kernel arguments are passed by address, so they need storage.
  int dimI = NI;
  int dimJ = NJ;

  size_t requestedLocal[2] = {
    static_cast<size_t>(DIM_LOCAL_WORK_GROUP_X),
    static_cast<size_t>(DIM_LOCAL_WORK_GROUP_Y)
  };
  size_t globalRange[2] = {
    static_cast<size_t>(NI),
    static_cast<size_t>(NJ)
  };

  size_t launchLocal[2];
  getNewSizes(nullptr, requestedLocal, nullptr, launchLocal,
              "Convolution2D_kernel", 2);

  // Kernel arguments: two buffers, then the two dimensions.
  kernel->setArgument(0, *a_mem_obj);
  kernel->setArgument(1, *b_mem_obj);
  kernel->setArgument(2, sizeof(int), &dimI);
  kernel->setArgument(3, sizeof(int), &dimJ);

  queue.run(*kernel, 2, 0, globalRange, launchLocal);
  queue.finish();
}
int main(int argc, char *argv[]) { DATA_TYPE *A; DATA_TYPE *B_outputFromGpu; ///////////////////////// size_t oldSizes[2] = { NI, NJ }; size_t newSizes[2]; getNewSizes(oldSizes, NULL, newSizes, NULL, "Convolution2D_kernel", 2); NI = newSizes[0]; NJ = newSizes[1]; ///////////////////////// A = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE)); B_outputFromGpu = (DATA_TYPE *)malloc(NI * NJ * sizeof(DATA_TYPE)); init(A); platform = new Platform(PLATFORM_ID); context = platform->getContext(); Device device = platform->getDevice(DEVICE_ID); Queue queue = Queue(*context,device,Queue::EnableProfiling); cl_mem_init(A,queue); SourceFile kernelFile = KERNEL_DIRECTORY KERNEL_FILE_NAME; // Create a program from the kernel source Program program(context,kernelFile); if(!program.build(device)) { std::cout << "Error building the program: " << "\n"; std::cout << program.getBuildLog(device) << "\n"; return 1; } // Create the OpenCL kernel kernel = program.createKernel(kernelName.c_str()); cl_launch_kernel(queue); queue.readBuffer(*b_mem_obj,NI * NJ * sizeof(DATA_TYPE),(void*) B_outputFromGpu); queue.finish(); conv2D(A, B_outputFromGpu); free(A); free(B_outputFromGpu); cl_clean_up(); return 0; }
//----------------------------------------------------------------------------- void setNDRangeSizes() { std::vector<size_t> newGlobalWorkSize(globalWorkSize.size(), 0); std::vector<size_t> newLocalWorkSize(localWorkSize.size(), 0); getNewSizes(globalWorkSize.data(), localWorkSize.data(), newGlobalWorkSize.data(), newLocalWorkSize.data(), kernelName.c_str(), globalWorkSize.size()); globalWorkSize.clear(); localWorkSize.clear(); std::copy(newGlobalWorkSize.begin(), newGlobalWorkSize.end(), std::back_inserter(globalWorkSize)); std::copy(newLocalWorkSize.begin(), newLocalWorkSize.end(), std::back_inserter(localWorkSize)); }
int main(void) { DATA_TYPE *A; DATA_TYPE *y; DATA_TYPE *tmp; ///////////////////////// size_t oldSizes[1] = { NY }; size_t newSizes[1]; getNewSizes(oldSizes, NULL, newSizes, NULL, "atax_kernel2", 1); NY = newSizes[0]; ///////////////////////// A = (DATA_TYPE *)malloc(NX_DEFAULT * NY * sizeof(DATA_TYPE)); y = (DATA_TYPE *)malloc(NY * sizeof(DATA_TYPE)); tmp = (DATA_TYPE *)malloc(NX_DEFAULT * sizeof(DATA_TYPE)); init_array(tmp, A); platform = new Platform(PLATFORM_ID); context = platform->getContext(); Device device = platform->getDevice(DEVICE_ID); Queue queue(*context,device,Queue::EnableProfiling); cl_mem_init(A, y, tmp,queue); Program program(context,KERNEL_DIRECTORY KERNEL_FILE_NAME); if(!program.build(device)){ std::cout << "Error building the program: \n"; std::cout <<program.getBuildLog(device); } kernel2=program.createKernel(kernel2Name.c_str()); cl_launch_kernel(queue); queue.readBuffer(*y_mem_obj,NY * sizeof(DATA_TYPE), y); queue.finish(); atax_cpu(A, tmp, y); cl_clean_up(); free(A); free(y); free(tmp); return 0; }
int main(void) { DATA_TYPE* A; DATA_TYPE* B; DATA_TYPE* C; DATA_TYPE* C_outputFromGpu; ///////////////////////// size_t oldSizes[2] = { NJ, NI }; size_t newSizes[2]; getNewSizes(oldSizes, NULL, newSizes, NULL, "gemm", 2); NJ = newSizes[0]; NI = newSizes[1]; NK = NJ; ///////////////////////// A = (DATA_TYPE*)malloc(NI*NK*sizeof(DATA_TYPE)); B = (DATA_TYPE*)malloc(NK*NJ*sizeof(DATA_TYPE)); C = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); C_outputFromGpu = (DATA_TYPE*)malloc(NI*NJ*sizeof(DATA_TYPE)); init(A, B, C); read_cl_file(); cl_initialization(device_id, clGPUContext, clCommandQue); cl_mem_init(A, B, C); cl_load_prog(); cl_launch_kernel(); errcode = clEnqueueReadBuffer(clCommandQue, c_mem_obj, CL_TRUE, 0, NI*NJ*sizeof(DATA_TYPE), C_outputFromGpu, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in reading GPU mem\n"); gemm(A, B, C, C_outputFromGpu); cl_clean_up(); free(A); free(B); free(C); free(C_outputFromGpu); return 0; }
void cl_launch_kernel() { int ni=NI; int nj=NJ; int nk=NK; DATA_TYPE alpha = ALPHA; DATA_TYPE beta = BETA; size_t oldLocalWorkSize[2], globalWorkSize[2]; oldLocalWorkSize[0] = DIM_LOCAL_WORK_GROUP_X; oldLocalWorkSize[1] = DIM_LOCAL_WORK_GROUP_Y; globalWorkSize[0] = NJ; globalWorkSize[1] = NI; /////////////////////////////////////////////// size_t localWorkSize[2]; getNewSizes(NULL, oldLocalWorkSize, NULL, localWorkSize, "gemm", 2); /////////////////////////////////////////////// // Set the arguments of the kernel errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); errcode |= clSetKernelArg(clKernel, 3, sizeof(DATA_TYPE), (void *)&alpha); errcode |= clSetKernelArg(clKernel, 4, sizeof(DATA_TYPE), (void *)&beta); errcode |= clSetKernelArg(clKernel, 5, sizeof(int), (void *)&ni); errcode |= clSetKernelArg(clKernel, 6, sizeof(int), (void *)&nj); errcode |= clSetKernelArg(clKernel, 7, sizeof(int), (void *)&nk); if(errcode != CL_SUCCESS) printf("Error in seting arguments\n"); // Execute the OpenCL kernel errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in launching kernel\n"); clFinish(clCommandQue); }
/**
 * Enqueues atax_kernel2 on @p queue and waits for it to finish.
 *
 * The launch is 1-D with a global range of NY work-items. The local work
 * size is whatever getNewSizes() stores in its output array when given
 * DIM_LOCAL_WORK_GROUP_X as input.
 *
 * @param queue command queue the kernel is enqueued on.
 */
void cl_launch_kernel(Queue& queue) {
  // Scalar kernel arguments are passed by address, so they need storage.
  int dimX = NX_DEFAULT;
  int dimY = NY;

  size_t requestedLocal[1] = { static_cast<size_t>(DIM_LOCAL_WORK_GROUP_X) };
  size_t globalRange[1] = { static_cast<size_t>(NY) };

  size_t launchLocal[1];
  getNewSizes(nullptr, requestedLocal, nullptr, launchLocal, "atax_kernel2", 1);

  // Kernel arguments: three buffers, then the two dimensions.
  kernel2->setArgument(0, *a_mem_obj);
  kernel2->setArgument(1, *y_mem_obj);
  kernel2->setArgument(2, *tmp_mem_obj);
  kernel2->setArgument(3, sizeof(int), static_cast<void *>(&dimX));
  kernel2->setArgument(4, sizeof(int), static_cast<void *>(&dimY));

  queue.run(*kernel2, 1, 0, globalRange, launchLocal);
  queue.finish();
}
void cl_launch_kernel() { int m = M; int n = N; size_t oldLocalWorkSize[2], globalWorkSizeKernel1[2], localWorkSize[2]; size_t globalWorkSizeKernel2[2], globalWorkSizeKernel3[2]; oldLocalWorkSize[0] = DIM_THREAD_BLOCK_X; oldLocalWorkSize[1] = DIM_THREAD_BLOCK_Y; globalWorkSizeKernel1[0] = DIM_THREAD_BLOCK_X; globalWorkSizeKernel1[1] = DIM_THREAD_BLOCK_Y; globalWorkSizeKernel2[0] = N; globalWorkSizeKernel2[1] = 1; globalWorkSizeKernel3[0] = N; globalWorkSizeKernel3[1] = 1; /////////////////////////////////////////////// // Kernel 2. getNewSizes(NULL, oldLocalWorkSize, NULL, localWorkSize, "gramschmidt_kernel2", 2); // Kernel 3. getNewSizes(NULL, localWorkSize, NULL, localWorkSize, "gramschmidt_kernel3", 2); /////////////////////////////////////////////// int k; for (k = 0; k < 1; k++) { // Set the arguments of the kernel errcode = clSetKernelArg(clKernel1, 0, sizeof(cl_mem), (void *)&a_mem_obj); errcode = clSetKernelArg(clKernel1, 1, sizeof(cl_mem), (void *)&r_mem_obj); errcode |= clSetKernelArg(clKernel1, 2, sizeof(cl_mem), (void *)&q_mem_obj); errcode |= clSetKernelArg(clKernel1, 3, sizeof(int), (void *)&k); errcode |= clSetKernelArg(clKernel1, 4, sizeof(int), (void *)&m); errcode |= clSetKernelArg(clKernel1, 5, sizeof(int), (void *)&n); if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n"); // Execute the OpenCL kernel errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel1, 1, NULL, globalWorkSizeKernel1, localWorkSize, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in launching kernel1\n"); clEnqueueBarrier(clCommandQue); errcode = clSetKernelArg(clKernel2, 0, sizeof(cl_mem), (void *)&a_mem_obj); errcode = clSetKernelArg(clKernel2, 1, sizeof(cl_mem), (void *)&r_mem_obj); errcode |= clSetKernelArg(clKernel2, 2, sizeof(cl_mem), (void *)&q_mem_obj); errcode |= clSetKernelArg(clKernel2, 3, sizeof(int), (void *)&k); errcode |= clSetKernelArg(clKernel2, 4, sizeof(int), (void *)&m); errcode |= clSetKernelArg(clKernel2, 5, sizeof(int), (void *)&n); 
if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n"); // Execute the OpenCL kernel errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel2, 1, NULL, globalWorkSizeKernel2, localWorkSize, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in launching kernel2\n"); clEnqueueBarrier(clCommandQue); errcode = clSetKernelArg(clKernel3, 0, sizeof(cl_mem), (void *)&a_mem_obj); errcode = clSetKernelArg(clKernel3, 1, sizeof(cl_mem), (void *)&r_mem_obj); errcode |= clSetKernelArg(clKernel3, 2, sizeof(cl_mem), (void *)&q_mem_obj); errcode |= clSetKernelArg(clKernel3, 3, sizeof(int), (void *)&k); errcode |= clSetKernelArg(clKernel3, 4, sizeof(int), (void *)&m); errcode |= clSetKernelArg(clKernel3, 5, sizeof(int), (void *)&n); if(errcode != CL_SUCCESS) printf("Error in seting arguments1\n"); // Execute the OpenCL kernel errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel3, 1, NULL, globalWorkSizeKernel3, localWorkSize, 0, NULL, NULL); if(errcode != CL_SUCCESS) printf("Error in launching kernel3\n"); clEnqueueBarrier(clCommandQue); } clFinish(clCommandQue); }