void ClSetup::SetupCl(int platform_index, int device_index) { platform = GetPlatformID(platform_index); PrintPlatformInfo(platform); device = GetDeviceID(device_index); PrintDeviceInfo(device); context = CreateContext(); queue = CreateCommandQueue(); bInit = true; }
int main(int argc, char** argv) { glutInit(&argc, argv); glutInitDisplayMode(GLUT_DOUBLE | GLUT_RGBA); glutCreateWindow("GL interop"); glutIconifyWindow(); glutDisplayFunc(computeVBO); glutIdleFunc(computeVBO); initVBO(); CreateContext(); if(!context) { std::cerr << "Failed to create OpenCL context." << std::endl; return EXIT_FAILURE; } CreateCommandQueue(); if(!commandQueue) { Cleanup(); return EXIT_FAILURE; } CreateProgram(); if(!program) { Cleanup(); return EXIT_FAILURE; } kernel = clCreateKernel(program, "init_vbo_kernel", NULL); if(!kernel) { std::cerr << "Failed to create kernel" << std::endl; Cleanup(); return EXIT_FAILURE; } if (!CreateMemObjects()) { Cleanup(); return EXIT_FAILURE; } glutMainLoop(); std::cout << std::endl; std::cout << "Executed program succesfully." << std::endl; Cleanup(); }
inline void ID3D12DeviceEx::createCommandContext(CommandContext<T, N, L>* commandContext, const bool isHighPriority, const bool disableGpuTimeout) { static_assert(N > 0 && L > 0, "Invalid command context parameters."); assert(commandContext); // Fill out the command queue description. const D3D12_COMMAND_QUEUE_DESC queueDesc = { /* Type */ static_cast<D3D12_COMMAND_LIST_TYPE>(T), /* Priority */ isHighPriority ? D3D12_COMMAND_QUEUE_PRIORITY_HIGH : D3D12_COMMAND_QUEUE_PRIORITY_NORMAL, /* Flags */ disableGpuTimeout ? D3D12_COMMAND_QUEUE_FLAG_DISABLE_GPU_TIMEOUT : D3D12_COMMAND_QUEUE_FLAG_NONE, /* NodeMask */ nodeMask }; // Create a command queue. CHECK_CALL(CreateCommandQueue(&queueDesc, IID_PPV_ARGS(&commandContext->m_commandQueue)), "Failed to create a command queue."); // Create command allocators. for (size_t i = 0; i < N; ++i) { for (size_t j = 0; j < L; ++j) { CHECK_CALL(CreateCommandAllocator(static_cast<D3D12_COMMAND_LIST_TYPE>(T), IID_PPV_ARGS(&commandContext->m_commandAllocators[i][j])), "Failed to create a command list allocator."); } } // Set the initial frame allocator set index to 0. commandContext->m_frameAllocatorSet = 0; // Create command lists in the closed, NULL state using the initial allocator. for (size_t i = 0; i < L; ++i) { CHECK_CALL(CreateCommandList(nodeMask, static_cast<D3D12_COMMAND_LIST_TYPE>(T), commandContext->m_commandAllocators[0][i].Get(), nullptr, IID_PPV_ARGS(&commandContext->m_commandLists[i])), "Failed to create a command list."); CHECK_CALL(commandContext->m_commandLists[i]->Close(), "Failed to close the command list."); } // Create a 0-initialized fence object. commandContext->m_fenceValue = 0; CHECK_CALL(CreateFence(0, D3D12_FENCE_FLAG_NONE, IID_PPV_ARGS(&commandContext->m_fence)), "Failed to create a fence object."); // Set the last fence value for each command allocator to 0. for (size_t i = 0; i < N; ++i) { commandContext->m_lastFenceValues[i] = 0; } // Create a synchronization event. commandContext->m_syncEvent = CreateEvent(nullptr, FALSE, FALSE, nullptr); if (!commandContext->m_syncEvent) { CHECK_CALL(HRESULT_FROM_WIN32(GetLastError()), "Failed to create a synchronization event."); } }
int SetOpenCLParemeters(OpenCLParameters *opencl_paremeters, const char *file_name, const char *kernel_name) { opencl_paremeters->context = CreateContext(); if (opencl_paremeters->context == NULL) { return 0; } opencl_paremeters->command_queue = CreateCommandQueue(opencl_paremeters->context, &(opencl_paremeters->device)); if (opencl_paremeters->command_queue == NULL) { CleanOpenCLParemeters(opencl_paremeters); return 0; } opencl_paremeters->program = CreateProgram(opencl_paremeters->context, opencl_paremeters->device, file_name); if (opencl_paremeters->program == NULL) { CleanOpenCLParemeters(opencl_paremeters); return 0; } opencl_paremeters->kernel = clCreateKernel(opencl_paremeters->program, kernel_name, NULL); if (opencl_paremeters->kernel == NULL) { CleanOpenCLParemeters(opencl_paremeters); std::cerr << "Failed to create kernel!" << std::endl; return 0; } return 1; }
/// // main() for HelloWorld example // int main(int argc, char** argv) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernels[2] = { 0, 0 }; cl_mem memObjects[3] = { 0, 0, 0 }; cl_int errNum; // Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return 1; } // Create a command-queue on the first device available // on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } // Create OpenCL program from HelloWorld.cl kernel source program = CreateProgram(context, device, "simple.cl"); if (program == NULL) { Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } // Create OpenCL kernel //clCreateKernel(program, "hello_kernel", NULL); cl_uint numberOfKernels = 0; errNum = clCreateKernelsInProgram(program, 0, NULL, &numberOfKernels ); if (errNum != CL_SUCCESS) { std::cerr << "Failed to get number of kernels" << std::endl; Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } else { std::cout << "numberOfKernels is:" << numberOfKernels << std::endl; } assert(numberOfKernels == 2 && "number of kernels was not as expected"); errNum = clCreateKernelsInProgram(program, 2, kernels, NULL ); if (errNum != CL_SUCCESS) { std::cerr << "Failed to retrieve kernels" << std::endl; Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } // Create memory objects that will be used as arguments to // kernels. First create host memory arrays that will be // used to store the arguments to the kernel float result[ARRAY_SIZE]; float a[ARRAY_SIZE]; float b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { a[i] = (float)i; b[i] = (float)(i * 2); } if (!CreateMemObjects(context, memObjects, a, b)) { Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } for (int i = 0; i < numberOfKernels; ++i) { // Set the kernel arguments (result, a, b) errNum = clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &memObjects[0]); errNum |= clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &memObjects[1]); errNum |= clSetKernelArg(kernels[i], 2, sizeof(cl_mem), &memObjects[2]); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernels[" << i << "] arguments." << std::endl; Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } } size_t globalWorkSize[1] = { ARRAY_SIZE }; size_t localWorkSize[1] = { 1 }; cl_event waitFor = NULL; for (int i = 0; i < numberOfKernels; ++i) { cl_uint numToWaitFor = 0; cl_event waitList[1] = { 0 }; cl_event* waitListP = 0; if (waitFor != NULL) { numToWaitFor = 1; waitList[0] = waitFor; waitListP = waitList; } // Queue the kernel up for execution across the array errNum = clEnqueueNDRangeKernel(commandQueue, kernels[i], 1, NULL, globalWorkSize, localWorkSize, numToWaitFor, waitListP, &waitFor); if (errNum != CL_SUCCESS) { std::cerr << "Error queuing kernel for execution." << std::endl; Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } } // Read the output buffer back to the Host errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0, ARRAY_SIZE * sizeof(float), result, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error reading result buffer." << std::endl; Cleanup(context, commandQueue, program, kernels, memObjects); return 1; } // Output the result buffer for (int i = 0; i < ARRAY_SIZE; i++) { std::cout << result[i] << " "; } std::cout << std::endl; std::cout << "Executed program succesfully." << std::endl; Cleanup(context, commandQueue, program, kernels, memObjects); return 0; }
int main(int argc, char** argv) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_int status; char filename[] = "../../kernels/VectorUpdate_vec_kernel.cl"; char filename2[] = "../../common/types_kernel.h"; int profiling_info = 0; cl_event myEvent, myEvent2; if( argc != 4 ) { printf("Usage: %s vector_file1 vector_file2 alpha\n", argv[0]); return EXIT_FAILURE; } char xfilename[50]; char yfilename[50]; real alpha; strcpy(xfilename, argv[1]); strcpy(yfilename, argv[2]); alpha = strtod(argv[3], NULL); #ifdef PROFILE cl_ulong startTime, endTime, startTime2, endTime2; cl_ulong kernelExecTimeNs, readFromGpuTime; profiling_info = 1; #endif /* READING DATA FROM FILE */ real *x; real *y; real *ref_x; int N, M, N4; std::ifstream xfile; xfile.open (xfilename, std::ios::in); if (!xfile.is_open()) { printf("Error: cannot open file\n"); return EXIT_FAILURE; } xfile >> N; // it must be N%4 == 0 N4 = ((N & (4-1)) == 0) ? N : N+(4-(N&3)); HANDLE_ALLOC_ERROR(x = (real*)malloc(N4*sizeof(real))); for( int i = 0; i < N; i++) xfile >> x[i]; for(int i = N; i < N4; ++i) x[i] = 0; xfile.close(); // needed for checking result HANDLE_ALLOC_ERROR(ref_x = (real*)malloc(N*sizeof(real))); memcpy(ref_x, x, N*sizeof(real)); std::ifstream yfile; yfile.open (yfilename, std::ios::in); if (!yfile.is_open()) { printf("Error: cannot open file\n"); return EXIT_FAILURE; } yfile >> M; assert(N==M); HANDLE_ALLOC_ERROR(y = (real*)malloc(N4*sizeof(real))); for( int i = 0; i < N; i++) yfile >> y[i]; for(int i = N; i < N4; ++i) y[i] = 0; yfile.close(); int Ndev4 = N4/4; TIME start = tic(); TIME init = tic(); // Create an OpenCL context context = CreateContext(); if(context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } // Create a command queue commandQueue = CreateCommandQueue(context, &device, profiling_info); if(commandQueue == NULL) { std::cerr << "Failed to create OpenCL command queue." << std::endl; Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } // Create OpenCL program program = CreateProgram(context, device, filename, filename2); if (program == NULL) { Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } // Create OpenCL kernel kernel = clCreateKernel(program, "VectorUpdate", NULL); if(kernel == NULL) { std::cerr << "Failed to create kernel." << std::endl; Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } printf("%lf\n",toc(init)); /* QUERYING DEVICE INFO */ size_t kernelWorkGroupSize; // maximum work-group size that can be used to execute a kernel size_t sizeOfWarp; // the preferred multiple of workgroup size for launch cl_ulong localMemSize; // the amount of local memory in bytes being used by a kernel cl_ulong privateMemSize; // the minimum amount of private memory, in bytes, used by each workitem in the kernel. HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, NULL)); HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &sizeOfWarp, NULL)); #ifdef PRINT_INFO HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, NULL)); HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(cl_ulong), &privateMemSize, NULL)); #endif #ifdef PRINT_INFO printf("------------ Some info: --------------\n"); printf("kernelWorkGroupSize = %lu \n", kernelWorkGroupSize); printf("sizeOfWarp = %lu \n", sizeOfWarp); printf("localMemSize = %lu \n", localMemSize); printf("privateMemSize = %lu \n", privateMemSize); printf("------------------------ --------------\n"); #endif if( WORK_GROUP_SIZE > kernelWorkGroupSize ) { printf("Error: wrong work group size\n"); return EXIT_FAILURE; } size_t localWorkSize[1] = {WORK_GROUP_SIZE}; int numWorkGroups = (Ndev4-1)/WORK_GROUP_SIZE+1; size_t globalWorkSize[1] = {numWorkGroups*WORK_GROUP_SIZE}; TIME t = tic(); cl_mem DEV_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(real)*N4, x, &status); HANDLE_OPENCL_ERROR(status); cl_mem DEV_y = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(real)*N4, y, &status); HANDLE_OPENCL_ERROR(status); int n = 0; status = clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_x); status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_y); status |= clSetKernelArg(kernel, n++, sizeof(real), (void*)&alpha); status |= clSetKernelArg(kernel, n++, sizeof(int), (void*)&Ndev4); HANDLE_OPENCL_ERROR(status); printf("%lf\n",toc(t)); // Queue the kernel HANDLE_OPENCL_ERROR(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &myEvent)); // Read the output buffer back to the Host HANDLE_OPENCL_ERROR(clEnqueueReadBuffer(commandQueue, DEV_x, CL_TRUE, 0, N4*sizeof(real), x, 0, NULL, &myEvent2)); clFinish(commandQueue); // wait for all events to finish double elapsed_time = toc(start); /* CHECK RESULT */ TIME start_seq = tic(); for (int i = 0; i < N; i++) ref_x[i] += alpha*y[i]; double elapsed_time_seq = toc(start_seq); assert(ref_x[10] < 1000000); //std::cout << ref_x[0] << " " << x[0] << std::endl; // for (int i = 0; i < N; i++) // assert( abs(x[i] - ref_x[i]) < TOL ); //std::cout << "Verified..." << std::endl; #ifdef PROFILE clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime2, NULL); clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime2, NULL); kernelExecTimeNs = endTime-startTime; readFromGpuTime = endTime2-startTime2; printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)readFromGpuTime/1000000000.0); printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)kernelExecTimeNs/1000000000.0); #endif printf(/*"Total execution time: %lf\n"*/"%lf\n", elapsed_time); printf(/*"Total execution time (seq.):*/"%lf\n", elapsed_time_seq); Cleanup(context, commandQueue, program, kernel); free(x); free(y); clReleaseMemObject(DEV_x); clReleaseMemObject(DEV_y); return EXIT_SUCCESS; }
int main(int argc, char **argv) { int optindex = 0; char ch; struct option longopts[] = { {"dimension", required_argument, 0, 'd'}, //dimensions of src img {"components", required_argument, 0, 'c'}, //numger of components of src img {"depth", required_argument, 0, 'b'}, //bit depth of src img {"level", required_argument, 0, 'l'}, //level of dwt {"device", required_argument, 0, 'D'}, //cuda device {"forward", no_argument, 0, 'f'}, //forward transform {"reverse", no_argument, 0, 'r'}, //forward transform {"97", no_argument, 0, '9'}, //9/7 transform {"53", no_argument, 0, '5' }, //5/3transform {"write-visual",no_argument, 0, 'w' }, //write output (subbands) in visual (tiled) order instead of linear {"help", no_argument, 0, 'h'} }; int pixWidth = 0; //<real pixWidth int pixHeight = 0; //<real pixHeight int compCount = 3; //number of components; 3 for RGB or YUV, 4 for RGBA int bitDepth = 8; int dwtLvls = 3; //default numuber of DWT levels int device = 0; int forward = 1; //forward transform int dwt97 = 0; //1=dwt9/7, 0=dwt5/3 transform int writeVisual = 0; //write output (subbands) in visual (tiled) order instead of linear char * pos; while ((ch = getopt_long(argc, argv, "d:c:b:l:D:fr95wh", longopts, &optindex)) != -1) { switch (ch) { case 'd': pixWidth = atoi(optarg); pos = strstr(optarg, "x"); if (pos == NULL || pixWidth == 0 || (strlen(pos) >= strlen(optarg))) { usage(); return -1; } pixHeight = atoi(pos+1); break; case 'c': compCount = atoi(optarg); break; case 'b': bitDepth = atoi(optarg); break; case 'l': dwtLvls = atoi(optarg); break; case 'D': device = atoi(optarg); break; case 'f': forward = 1; break; case 'r': forward = 0; break; case '9': dwt97 = 1; break; case '5': dwt97 = 0; break; case 'w': writeVisual = 1; break; case 'h': usage(); return 0; case '?': return -1; default : usage(); return -1; } } argc -= optind; argv += optind; if (argc == 0) { // at least one filename is expected printf("Please supply src file name\n"); usage(); return -1; } if (pixWidth <= 0 || pixHeight <=0) { printf("Wrong or missing dimensions\n"); usage(); return -1; } if (forward == 0) { writeVisual = 0; //do not write visual when RDWT } // // device init // Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return 1; } // Create a command-queue on the first device available // on the created context commandQueue = CreateCommandQueue(context, &cldevice); if (commandQueue == NULL) { Cleanup(context, commandQueue, program, kernel); return 1; } // Create OpenCL program from com_dwt.cl kernel source program = CreateProgram(context, cldevice, "com_dwt.cl"); if (program == NULL) { printf("fail to create program!!\n"); } // Create OpenCL kernel c_CopySrcToComponents = clCreateKernel(program, "c_CopySrcToComponents", NULL); if (c_CopySrcToComponents == NULL) { std::cerr << "Failed to create kernel" << std::endl; } c_CopySrcToComponent = clCreateKernel(program, "c_CopySrcToComponent", NULL); if (c_CopySrcToComponent == NULL) { std::cerr << "Failed to create kernel" << std::endl; } kl_fdwt53Kernel = clCreateKernel(program, "cl_fdwt53Kernel", NULL); if (kl_fdwt53Kernel == NULL) { std::cerr<<"Failed to create kernel\n"; } //initialize struct dwt struct dwt *d; d = (struct dwt *)malloc(sizeof(struct dwt)); d->srcImg = NULL; d->pixWidth = pixWidth; d->pixHeight = pixHeight; d->components = compCount; d->dwtLvls = dwtLvls; // file names d->srcFilename = (char *)malloc(strlen(argv[0])); strcpy(d->srcFilename, argv[0]); if (argc == 1) { // only one filename supplyed d->outFilename = (char *)malloc(strlen(d->srcFilename)+4); strcpy(d->outFilename, d->srcFilename); strcpy(d->outFilename+strlen(d->srcFilename), ".dwt"); } else { d->outFilename = strdup(argv[1]); } //Input review printf("\nSource file:\t\t%s\n", d->srcFilename); printf(" Dimensions:\t\t%dx%d\n", pixWidth, pixHeight); printf(" Components count:\t%d\n", compCount); printf(" Bit depth:\t\t%d\n", bitDepth); printf(" DWT levels:\t\t%d\n", dwtLvls); printf(" Forward transform:\t%d\n", forward); printf(" 9/7 transform:\t\t%d\n", dwt97); //data sizes int inputSize = pixWidth*pixHeight*compCount; //<amount of data (in bytes) to proccess //load img source image d->srcImg = (unsigned char *) malloc (inputSize); if (getImg(d->srcFilename, d->srcImg, inputSize) == -1) return -1; // DWT // Create memory objects, Set arguments for kernel functions, Queue the kernel up for execution across the array, Read the output buffer back to the Host, Output the result buffer if (forward == 1) { if(dwt97 == 1 ) processDWT<float>(d, forward, writeVisual); else // 5/3 processDWT<int>(d, forward, writeVisual); } else { // reverse if(dwt97 == 1 ) processDWT<float>(d, forward, writeVisual); else // 5/3 processDWT<int>(d, forward, writeVisual); } Cleanup(context, commandQueue, program, kernel); clReleaseKernel(c_CopySrcToComponents); clReleaseKernel(c_CopySrcToComponent); return 0; }
/// // main() for GLinterop example // int main(int argc, char** argv) { cl_device_id device = 0; imWidth = 256; imHeight = 256; vbolen = imHeight; initGlut(argc, argv, imWidth, imHeight); vbo = initVBO(vbolen); initTexture(imWidth, imHeight); // Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return 1; } // Create a command-queue on the first device available // on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { Cleanup(); return 1; } // Create OpenCL program from GLinterop.cl kernel source program = CreateProgram(context, device, "kernel/GLinterop.cl"); if (program == NULL) { Cleanup(); return 1; } // Create OpenCL kernel kernel = clCreateKernel(program, "init_vbo_kernel", NULL); if (kernel == NULL) { std::cerr << "Failed to create kernel" << std::endl; Cleanup(); return 1; } tex_kernel = clCreateKernel(program, "init_texture_kernel", NULL); if (tex_kernel == NULL) { std::cerr << "Failed to create kernel" << std::endl; Cleanup(); return 1; } // Create memory objects that will be used as arguments to // kernel if (!CreateMemObjects(context, tex, vbo, &cl_vbo_mem, &cl_tex_mem)) { Cleanup(); return 1; } // Perform some queries to get information about the OpenGL objects performQueries(); glutMainLoop(); std::cout << std::endl; std::cout << "Executed program succesfully." << std::endl; Cleanup(); return 0; }
/// // main() for HelloWorld example // int main(int argc, char** argv) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_mem memObjects[2] = {0, 0}; cl_int errNum; assert( (ARRAY_SIZE & (ARRAY_SIZE -1)) == 0 && "Array size must be a power of 2"); int numberOfIterations = (int) ceil(log2(ARRAY_SIZE)); assert( (1 << numberOfIterations) == ARRAY_SIZE && "numberOfIterations calculation is incorrect"); // Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return 1; } // Create a command-queue on the first device available // on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Create OpenCL program from HelloWorld.cl kernel source program = CreateProgram(context, device, "SimplePrefixSum.cl"); if (program == NULL) { Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Create OpenCL kernel kernel = clCreateKernel(program, "prefix_sum", NULL); if (kernel == NULL) { std::cerr << "Failed to create kernel" << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Create memory objects that will be used as arguments to // kernel. First create host memory arrays that will be // used to store the arguments to the kernel cl_int a[ARRAY_SIZE]; cl_int b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { a[i] = i+1; b[i] = 0; } if (!CreateMemObjects(context, memObjects, a, b)) { Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Set the kernel arguments (result, a, b) errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); errNum |= clSetKernelArg(kernel, 2, sizeof(int), &numberOfIterations); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } size_t globalWorkSize[1] = { ARRAY_SIZE }; size_t localWorkSize[1] = { ARRAY_SIZE }; // Queue the kernel up for execution across the array errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error queuing kernel for execution." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // The location of the result depends on how many loop iterations were executed. cl_mem returnObject = (numberOfIterations % 2 == 0)? memObjects[0] : memObjects[1]; // Read the output buffer back to the Host errNum = clEnqueueReadBuffer(commandQueue, returnObject, CL_TRUE, 0, ARRAY_SIZE * sizeof(cl_int), a, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error reading result buffer." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Output the result buffer std::cout << "Result:" << std::endl; for (int i = 0; i < ARRAY_SIZE; i++) { std::cout << a[i] << " "; } std::cout << std::endl; std::cout << "Executed program succesfully." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 0; }
int main(int argc, char** argv) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_int status; int profiling_info = 0; cl_event myEvent, myEvent2; int ITEMS_PER_ROW; if( argc < 3 || argc > 4 ) { printf("Usage: %s matrix_file vector_file [ref_file]\n", argv[0]); return EXIT_FAILURE; } int verify = (argc == 4 ? 1 : 0); char Mfilename[50]; char xfilename[50]; char reffilename[50]; strcpy(Mfilename, argv[1]); strcpy(xfilename, argv[2]); if(verify) strcpy(reffilename, argv[3]); #ifdef PROFILE cl_ulong startTime, endTime, startTime2, endTime2; cl_ulong kernelExecTimeNs, readFromGpuTime; profiling_info = 1; #endif char filename[] = "../../kernels/MatVecMul_kernel.cl"; char filename2[] = "../../common/types_kernel.h"; /* READING DATA FROM FILE */ struct MatrixParams params; real *Values; int *RowPtr; int *ColInd; real *x; real * res; int N; std::ifstream Mfile; Mfile.open (Mfilename, std::ios::in); if (!Mfile.is_open()) { printf("Error: cannot open file\n"); return EXIT_FAILURE; } Mfile >> params.NRows; Mfile >> params.NCols; Mfile >> params.NNZ; HANDLE_ALLOC_ERROR(Values = (real*)malloc(params.NNZ*sizeof(real))); HANDLE_ALLOC_ERROR(ColInd = (int*)malloc(params.NNZ*sizeof(int))); HANDLE_ALLOC_ERROR(RowPtr = (int*)malloc((params.NRows+1)*sizeof(int))); HANDLE_ALLOC_ERROR(x = (real*)malloc(params.NCols*sizeof(real))); HANDLE_ALLOC_ERROR(res = (real*)malloc(params.NRows*sizeof(real))); for( int i = 0; i < params.NRows+1; i++) Mfile >> RowPtr[i]; for( int i = 0; i < params.NNZ; i++) Mfile >> ColInd[i]; for( int i = 0; i < params.NNZ; i++) Mfile >> Values[i]; Mfile.close(); std::ifstream xfile; xfile.open (xfilename, std::ios::in); if (!xfile.is_open()) { printf("Error: cannot open file\n"); return EXIT_FAILURE; } xfile >> N; assert(params.NRows == N); for( int i = 0; i < params.NCols; i++) xfile >> x[i]; xfile.close(); TIME start = tic(); TIME init = tic(); // Create an OpenCL context context = CreateContext(); if(context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } // Create a command queue commandQueue = CreateCommandQueue(context, &device, profiling_info); if(commandQueue == NULL) { std::cerr << "Failed to create OpenCL command queue." << std::endl; Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } program = CreateProgram(context, device, filename, filename2); if (program == NULL) { Cleanup(context, commandQueue, program, kernel); return 1; } // Create OpenCL kernel kernel = clCreateKernel(program, "MatVecMul", NULL); if(kernel == NULL) { std::cerr << "Failed to create kernel." << std::endl; Cleanup(context, commandQueue, program, kernel); return EXIT_FAILURE; } printf("%lf\n", toc(init)); /* QUERYING DEVICE INFO */ size_t kernelWorkGroupSize; // maximum work-group size that can be used to execute a kernel size_t sizeOfWarp; // the preferred multiple of workgroup size for launch cl_ulong localMemSize; // the amount of local memory in bytes being used by a kernel cl_ulong privateMemSize; // the minimum amount of private memory, in bytes, used by each workitem in the kernel. HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, NULL)); HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(size_t), &sizeOfWarp, NULL)); #ifdef PRINT_INFO HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &localMemSize, NULL)); HANDLE_OPENCL_ERROR(clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_PRIVATE_MEM_SIZE, sizeof(cl_ulong), &privateMemSize, NULL)); #endif #ifdef PRINT_INFO printf("------------ Some info: --------------\n"); printf("kernelWorkGroupSize = %lu \n", kernelWorkGroupSize); printf("sizeOfWarp = %lu \n", sizeOfWarp); printf("localMemSize = %lu \n", localMemSize); printf("privateMemSize = %lu \n", privateMemSize); printf("------------------------ --------------\n"); #endif if( WORK_GROUP_SIZE > kernelWorkGroupSize ) { printf("Error: wrong work group size\n"); return EXIT_FAILURE; } ITEMS_PER_ROW = sizeOfWarp; TIME t = tic(); int numWorkRows = params.NRows; size_t localWorkSize[1] = {WORK_GROUP_SIZE}; int numWarpsInGroup = WORK_GROUP_SIZE/ITEMS_PER_ROW; size_t globalWorkSize[1] = {((numWorkRows-1) / numWarpsInGroup + 1)*WORK_GROUP_SIZE}; cl_mem DEV_Values = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(real)*params.NNZ, Values, &status); HANDLE_OPENCL_ERROR(status); cl_mem DEV_ColInd = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*params.NNZ, ColInd, &status); HANDLE_OPENCL_ERROR(status); cl_mem DEV_RowPtr = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*(params.NRows+1), RowPtr, &status); HANDLE_OPENCL_ERROR(status); cl_mem DEV_x = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(real)*params.NCols, x, &status); HANDLE_OPENCL_ERROR(status); cl_mem DEV_res = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(real)*params.NRows, NULL, &status); HANDLE_OPENCL_ERROR(status); int p_dim = 2; int p[] = {ITEMS_PER_ROW, params.NRows}; cl_mem DEV_p = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(int)*p_dim, p, &status); HANDLE_OPENCL_ERROR(status); int n = 0; status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_Values); status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_ColInd); status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_RowPtr); status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_x); status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_res); status |= clSetKernelArg(kernel, n++, sizeof(cl_mem), (void*)&DEV_p); status |= clSetKernelArg(kernel, n++, sizeof(real)*WORK_GROUP_SIZE, NULL); HANDLE_OPENCL_ERROR(status); printf("%lf\n", toc(t)); // Queue the kernel HANDLE_OPENCL_ERROR(clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, &myEvent)); // Read the output buffer back to the Host HANDLE_OPENCL_ERROR(clEnqueueReadBuffer(commandQueue, DEV_res, CL_TRUE, 0, params.NRows*sizeof(real), res, 0, NULL, &myEvent2)); clFinish(commandQueue); // wait for all events to finish double elapsed_time = toc(start); /* CHECK RESULTS */ if( verify ) { std::ifstream reffile; reffile.open (reffilename, std::ios::in); if (!reffile.is_open()) { printf("Error: cannot open file\n"); return EXIT_FAILURE; } int N_ref; reffile >> N_ref; assert(N == N_ref); real *ref; HANDLE_ALLOC_ERROR(ref = (real*)malloc(N*sizeof(real))); for( int i = 0; i < N; i++) reffile >> ref[i]; reffile.close(); for( int i = 0; i < N; i++) { //std::cout << ref[i] << "==" << res[i] << ", "; assert(abs(ref[i] - res[i]) < TOL); } std::cout << "Verified..." << std::endl; } #ifdef PROFILE clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, NULL); clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime2, NULL); clGetEventProfilingInfo(myEvent2, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime2, NULL); kernelExecTimeNs = endTime-startTime; readFromGpuTime = endTime2-startTime2; printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)readFromGpuTime/1000000000.0); printf(/*"Kernel execution time: %lf\n"*/"%lf\n", (double)kernelExecTimeNs/1000000000.0); #endif printf(/*"Total execution time: %lf\n"*/"%lf\n", elapsed_time); Cleanup(context, commandQueue, program, kernel); free(Values); free(RowPtr); free(ColInd); free(x); free(res); clReleaseMemObject(DEV_Values); clReleaseMemObject(DEV_ColInd); clReleaseMemObject(DEV_RowPtr); clReleaseMemObject(DEV_x); clReleaseMemObject(DEV_res); clReleaseMemObject(DEV_p); return EXIT_SUCCESS; }
/// // main() for HelloBinaryWorld example // int main(int argc, char** argv) { time_t start, end; double duration = 0; time(&start); cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_mem imageObjects[2] = { 0, 0 }; cl_sampler sampler = 0; cl_int errNum; if (argc != 3) { std::cerr << "USAGE: " << argv[0] << " <inputImageFile> <outputImageFiles>" << std::endl; return 1; } // Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return 1; } // Create a command-queue on the first device available // on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Make sure the device supports images, otherwise exit cl_bool imageSupport = CL_FALSE; clGetDeviceInfo(device, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (imageSupport != CL_TRUE) { std::cerr << "OpenCL device does not support images." << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Load input image from file and load it into // an OpenCL image object int width, height; imageObjects[0] = LoadImage(context, argv[1], width, height); if (imageObjects[0] == 0) { std::cerr << "Error loading: " << std::string(argv[1]) << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Create ouput image object cl_image_format clImageFormat; clImageFormat.image_channel_order = CL_RGBA; clImageFormat.image_channel_data_type = CL_UNORM_INT8; imageObjects[1] = clCreateImage2D(context, CL_MEM_WRITE_ONLY, &clImageFormat, width, height, 0, NULL, &errNum); if (errNum != CL_SUCCESS) { std::cerr << "Error creating CL output image object." << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Create sampler for sampling image object sampler = clCreateSampler(context, CL_FALSE, // Non-normalized coordinates CL_ADDRESS_CLAMP_TO_EDGE, CL_FILTER_NEAREST, &errNum); if (errNum != CL_SUCCESS) { std::cerr << "Error creating CL sampler object." << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Create OpenCL program program = CreateProgram(context, device, "ImageFilter2D.cl"); if (program == NULL) { Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Create OpenCL kernel kernel = clCreateKernel(program, "gaussian_filter", NULL); if (kernel == NULL) { std::cerr << "Failed to create kernel" << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } size_t localWorkSize[1] = { 8 }; size_t globalWorkSize[1] = { RoundUp(localWorkSize[0], height) }; // Set the kernel arguments errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &imageObjects[0]); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &imageObjects[1]); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_sampler), &sampler); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_int), &width); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_int), &height); errNum |= clSetKernelArg(kernel, 5, sizeof(float) * ((width+2)*(localWorkSize[0]+2)*4), NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Queue the kernel up for execution errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error queuing kernel for execution." << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } // Read the output buffer back to the Host char *buffer = new char [width * height * 4]; size_t origin[3] = { 0, 0, 0 }; size_t region[3] = { width, height, 1}; errNum = clEnqueueReadImage(commandQueue, imageObjects[1], CL_TRUE, origin, region, 0, 0, buffer, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error reading result buffer." << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); return 1; } std::cout << std::endl; std::cout << "Executed program succesfully." << std::endl; //memset(buffer, 0xff, width * height * 4); // Save the image out to disk if (!SaveImage(argv[2], buffer, width, height)) { std::cerr << "Error writing output image: " << argv[2] << std::endl; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); delete [] buffer; return 1; } delete [] buffer; Cleanup(context, commandQueue, program, kernel, imageObjects, sampler); time(&end); duration = difftime(end, start); std::cout << "Total time taken: " << duration << " milliseconds" << std::endl; return 0; }
void QHoneycombWidget::initializeGL() { cl_device_id device = 0; connect(context(), &QOpenGLContext::aboutToBeDestroyed, this, &QHoneycombWidget::cleanup); initializeOpenGLFunctions(); initTexture(); // Create an OpenCL context on first available platform clcontext = CreateContext(); if (clcontext == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return ; } // Create a command-queue on the first device available // on the created context commandQueue = CreateCommandQueue(clcontext, &device); if (commandQueue == NULL) { cleanup(); return ; } // Create OpenCL program from GLinterop.cl kernel source program = CreateProgram(device, "Kelvin.cl"); if (program == NULL) { cleanup(); return; } // Create OpenCL kernel m_KernelCopyIntoTexture = clCreateKernel(program, "copy_texture_kernel", NULL); if (m_KernelCopyIntoTexture == NULL) { std::cerr << "Failed to create kernel" << std::endl; cleanup(); return; } m_KernelCopyIntoTextureBasic = clCreateKernel(program, "copy_texture_kernel_basic", NULL); if (m_KernelCopyIntoTextureBasic == NULL) { std::cerr << "Failed to create kernel" << std::endl; cleanup(); return; } m_KernelIteration = clCreateKernel(program, "KelvinIteration", NULL); if (m_KernelIteration == NULL) { std::cerr << "Failed to create kernel" << std::endl; cleanup(); return; } // Create memory objects that will be used as arguments to // kernel if (!CreateMemObjects()) { cleanup(); return ; } }
int main(int argc, char *argv[]){ if (MODE == 5){ printf("---OpenCL Test Code---\n\n"); cl_int errNum; cl_uint numPlatforms; cl_platform_id *platforms = NULL; cl_uint numDevices; cl_device_id *devices = NULL; //platform info fields char vendor[1024], name[1024], version[1024]; //device info fields size_t MAX_WORK_GROUP_SIZE; cl_ulong GLOBAL_MEM_CACHE_SIZE, GLOBAL_MEM_SIZE, LOCAL_MEM_SIZE, GLOBAL_MEM_CACHELINE_SIZE; cl_uint MAX_COMPUTE_UNITS, MAX_WORK_ITEM_DIMENSIONS; size_t MAX_WORK_ITEM_SIZES[3]; char DEVICE_NAME[1024], DEVICE_VENDOR[1024], DEVICE_VERSION[1024], DRIVER_VERSION[1024], EXTENSIONS[2048]; cl_device_mem_cache_type GLOBAL_MEM_CACHE_TYPE; //printf("Getting number of OpenCL Platforms...\n"); errNum = clGetPlatformIDs(0, NULL, &numPlatforms); if (errNum != CL_SUCCESS) { printf("Failed to get number of OpenCL platforms.\n"); return 0; } else { //printf("found %d.\n", numPlatforms); } //printf("Allocating space for the platform info...\n"); platforms = (cl_platform_id *)malloc(numPlatforms*sizeof(cl_platform_id)); printf("---Platform Info---\n"); errNum = clGetPlatformIDs(numPlatforms, platforms, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get platform info.\n"); return 0; } else { clGetPlatformInfo (platforms[0], CL_PLATFORM_VENDOR, sizeof(vendor), vendor, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_NAME, sizeof(name), name, NULL); clGetPlatformInfo (platforms[0], CL_PLATFORM_VERSION, sizeof(version), version, NULL); //printf("Got platform info.\n"); printf("Vendor: \t%s\n", vendor); printf("Name: \t%s\n", name); printf("Version:\t%s\n", version); } //printf("Getting number of devices...\n"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS) { printf("Failed to get number of devices.\n"); return 0; } else { //printf("Found %d.\n", numDevices); } //printf("Allocating space for device info...\n"); devices = (cl_device_id*)malloc(numDevices * sizeof(cl_device_id)); printf("\n---Device Info---"); errNum = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); if (errNum != CL_SUCCESS) { printf("Failed to get device info.\n"); return 0; } else { int i, j = 0; for (i = 0; i < numDevices; i++ ) { printf("\nDevice ID: %d\n", i+1); clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(DEVICE_NAME), DEVICE_NAME, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(DEVICE_VENDOR), DEVICE_VENDOR, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(DEVICE_VERSION), DEVICE_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(DRIVER_VERSION), DRIVER_VERSION, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_EXTENSIONS, sizeof(EXTENSIONS), EXTENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(MAX_COMPUTE_UNITS), &MAX_COMPUTE_UNITS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(GLOBAL_MEM_SIZE), &GLOBAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(LOCAL_MEM_SIZE), &LOCAL_MEM_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(MAX_WORK_ITEM_DIMENSIONS), &MAX_WORK_ITEM_DIMENSIONS, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(MAX_WORK_ITEM_SIZES), MAX_WORK_ITEM_SIZES, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(MAX_WORK_GROUP_SIZE), &MAX_WORK_GROUP_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_SIZE, sizeof(GLOBAL_MEM_CACHE_SIZE), &GLOBAL_MEM_CACHE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE, sizeof(GLOBAL_MEM_CACHELINE_SIZE), &GLOBAL_MEM_CACHELINE_SIZE, NULL); clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_CACHE_TYPE, sizeof(GLOBAL_MEM_CACHE_TYPE), &GLOBAL_MEM_CACHE_TYPE, NULL); printf("Device Name:\t%s\n", DEVICE_NAME); printf("Device Vendor:\t%s\n", DEVICE_VENDOR); printf("Device Version:\t%s\n", DEVICE_VERSION); printf("Driver Version:\t%s\n", DRIVER_VERSION); printf("EXTENSIONS:\t%s\n", EXTENSIONS); printf("Number of CUs:\t%d\n", MAX_COMPUTE_UNITS); printf("GMem:\t\t%lld (Bytes)\n", (long long) GLOBAL_MEM_SIZE); printf("GMem $ Size:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHE_SIZE); printf("GMem $ Line:\t%lld (Bytes)\n", (long long) GLOBAL_MEM_CACHELINE_SIZE); if(GLOBAL_MEM_CACHE_TYPE == CL_NONE) { printf("GMem $ Type:\tCL_NONE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_ONLY_CACHE) { printf("GMem $ Type:\tCL_READ_ONLY_CACHE\n"); } else if(GLOBAL_MEM_CACHE_TYPE == CL_READ_WRITE_CACHE) { printf("GMem $ Type:\tCL_READ_WRITE_CACHE\n"); } printf("LMem:\t\t%lld (Bytes)\n", (long long) LOCAL_MEM_SIZE); printf("Work Group Size:%d (Max)\n", (int) MAX_WORK_GROUP_SIZE); printf("Work Item Dim:\t%d (Max)\n", MAX_WORK_ITEM_DIMENSIONS); printf("Work Item Size:\t"); for(j = 0; j < MAX_WORK_ITEM_DIMENSIONS; j ++) { if (j != (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d, ", (int) MAX_WORK_ITEM_SIZES[j]); if (j == (MAX_WORK_ITEM_DIMENSIONS -1)) printf("%d ", (int) MAX_WORK_ITEM_SIZES[j]); } printf("(Max)\n"); } //printf("Got device info.\n"); } } else if (MODE == 4){ cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; //Create an OpenCL context on first available platform context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } //Create a command-queue on the first device available on the created context commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create commandQueue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL program and store the binary for future use. printf("Attempting to create kernel binary from source.\n"); program = CreateProgram(context, device, KERNELPATHIN); if (program == NULL) { printf("Failed to create Program"); Cleanup(context, commandQueue, program, NULL); return 1; } printf("Kernel is saved.\n"); if (SaveProgramBinary(program, device, KERNELPATHOUT) == false) { printf("Failed to write program binary.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //printf("---Done---"); //return 1; } else if (MODE == 3){ //todo free remaining objects not passed to cleanup //profiling int write_bytes = 0; int read_bytes = 0; /*unsigned long long start_cycles, stop_cycles; unsigned long long start_setup, stop_setup; unsigned long long start_write, stop_write; unsigned long long start_read, stop_read; unsigned long long start_finalize, stop_finalize; struct timespec start_time_t, stop_time_t;*/ printf("Stream Mode\n\n"); //clock_gettime(CLOCK_MONOTONIC, &start_time_t); //start_cycles = rdtsc(); int i; time_t t; srand((unsigned) time(&t)); // Create the two input vectors printf("\nHostside malloc(s)\n"); fflush(stdout); int *A = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *B = (int*)malloc(sizeof(int)*(SIZE*SIZE)); int *C = (int*)malloc(sizeof(int)*(SIZE*SIZE)); //profile //bytes += 3 * sizeof(int)*(SIZE*SIZE); printf("\nHostside mat init\n"); fflush(stdout); for(i = 0; i < (SIZE*SIZE); i++) { A[i] = B[i] = rand() % 10 + 1;; } //print matrix printf("Matrix A[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", A[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //print matrix printf("\nMatrix B[%d][%d]:\n", SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", B[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } //syscall(STATS_RESET); //Get platform and device information cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_uint err = 0; //char *filepath = NULL; //Create the context printf("\nCreateContext\n"); fflush(stdout); context = CreateContext(); if (context == NULL) { printf("Failed to create OpenCL context.\n"); return 1; } /* printf("\nEnd CreateContext\n"); fflush(stdout);*/ //Create a command-queue on the first device available on the created context printf("\nCreateCommandQueue\n"); fflush(stdout); commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { printf("Failed to create command queue.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } //create the program from the binary //program = CreateProgramFromBinary(context, device, "/home/stardica/Desktop/Kernels/vector.cl.bin.GPU"); //strcat(KERNELPATHOUT, ".GPU") printf("\nCreateProgramFromBinary\n"); fflush(stdout); program = CreateProgramFromBinary(context, device, KERNEL); if (program == NULL) { printf("Failed to load kernel binary,\n"); Cleanup(context, commandQueue, program, NULL); return 1; } // Create OpenCL kernel printf("\nclCreateKernel\n"); fflush(stdout); kernel = clCreateKernel(program, "Matrix", NULL); if (kernel == NULL) { printf("Failed to create kernel.\n"); Cleanup(context, commandQueue, program, NULL); return 1; } cl_mem a_mem_obj = 0; cl_mem b_mem_obj = 0; cl_mem c_mem_obj = 0; //Create memory buffers on the device for each vector printf("\nclCreateBuffer(s)\n"); fflush(stdout); if(LOCALMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the GPU's local memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 0) { //this creates uncached buffers in the system memory #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context,CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if(SYSMEM == 1 && CACHEDMEM == 1) { //this creates cached buffers in the system memory. #if M2S_CGM_OCL_SIM { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL, CL_FALSE); } #else { a_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); b_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); c_mem_obj = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, (sizeof(int)*(SIZE*SIZE)), NULL, NULL); } #endif } if (a_mem_obj == NULL || b_mem_obj == NULL || c_mem_obj == NULL) { printf("Failed to create memory objects.\n"); Cleanup(context, commandQueue, program, kernel); return 1; } //Copy the lists A and B to their respective memory buffers printf("\nclEnqueueWriteBuffer(s)\n"); fflush(stdout); write_bytes += 2 * sizeof(int)*(SIZE*SIZE); // start_write = rdtsc(); clEnqueueWriteBuffer(commandQueue, a_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), A, 0, NULL, NULL); clEnqueueWriteBuffer(commandQueue, b_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), B, 0, NULL, NULL); // stop_write = rdtsc(); // Set the arguments of the kernel int *size = (int *)SIZE; printf("\nclSetKernelArg(s)\n"); fflush(stdout); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&c_mem_obj); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&a_mem_obj); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&b_mem_obj); err = clSetKernelArg(kernel, 3, sizeof(int), (void *)&size); if (err != CL_SUCCESS) { printf("Kernel args not set.\n"); return 1; } // Execute the OpenCL kernel on the list size_t GlobalWorkSize[2], LocalWorkSize[2]; //Rember that in OpenCL we need to express the globalWorkSize in //terms of the total number of threads. The underlying OpenCL API //will look at the globalWorkSize and divide by the localWorkSize //to arrive at a 64 by 64 NDRange of 16 by 16 work groups. GlobalWorkSize[0] = GWS_0;//SIZE*SIZE*SIZE; // Process the entire lists GlobalWorkSize[1] = GWS_1;//SIZE*SIZE*SIZE; // Process the entire lists LocalWorkSize[0] = LWS_0; //SIZE Divide work items into groups of 64 LocalWorkSize[1] = LWS_1; //SIZE Divide work items into groups of 64 //used null for local, lets OpenCL determine the best local size. //err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); printf("\nclEnqueueNDRangeKernel\n"); fflush(stdout); err = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("ND range not enqueued. Code: %d\n", err); return 1; } //Read the memory buffer C on the device to the local variable C printf("\nclEnqueueReadBuffer\n"); fflush(stdout); read_bytes += sizeof(int)*(SIZE*SIZE); //start_read = rdtsc(); err = clEnqueueReadBuffer(commandQueue, c_mem_obj, CL_TRUE, 0, (sizeof(int)*(SIZE*SIZE)), C, 0, NULL, NULL); // stop_read = rdtsc(); if (err != CL_SUCCESS) { printf("Buffer not returned.\n"); return 1; } //syscall(STATS_STOP); //print matrix printf("\nMatrix C[%d][%d] = A[%d][%d]*B[%d][%d]:\n", SIZE, SIZE, SIZE, SIZE, SIZE, SIZE); for(i = 0; i < (SIZE*SIZE); i++) { printf("%3d ", C[i]); if(((i + 1) % SIZE) == 0) printf("\n"); } printf("\nHostside clean up\n"); fflush(stdout); err = clFlush(commandQueue); err = clFinish(commandQueue); Cleanup(context, commandQueue, program, kernel); err = clReleaseMemObject(a_mem_obj); err = clReleaseMemObject(b_mem_obj); err = clReleaseMemObject(c_mem_obj); free(A); free(B); free(C); //printf("---Done---"); /*stop_cycles = rdtsc(); clock_gettime(CLOCK_MONOTONIC, &stop_time_t); printf("Total cycles = %llu\n", (stop_cycles - start_cycles)); long int time_s = stop_time_t.tv_nsec - start_time_t.tv_nsec; printf("Approximate runtime (check) = %ld ms\n", (time_s/1000000)); printf("Bytes written %d\n", write_bytes); printf("transfer cycles = %llu\n", (stop_write - start_write)); printf("start at = %llu\n", (start_write - start_cycles)); printf("Bytes read %d\n", read_bytes); printf("transfer cycles = %llu\n", (stop_read - start_read)); printf("start at = %llu\n", (start_read - start_cycles));*/ } else if (MODE == 2){ printf("Multi Thread Mode\n"); //cal this: //assignToThisCore(0);//assign to core 0,1,2,... unsigned long long a, b; int i = 0; int j = 0; int k = 0; LoadMatrices(); pthread_t tid[SIZE*SIZE]; //printf("waiting\n"); //start our threads a = rdtsc(); syscall(BEGIN_PARALLEL_SECTION); for(i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ struct RowColumnData *RCData = (struct RowColumnData *) malloc(sizeof(struct RowColumnData)); RCData->RowNum = i; RCData->ColumnNum = j; //printf("Thread create %d Row %d Col %d\n", k, RCData->RowNum, RCData->ColumnNum); pthread_create(&tid[k], NULL, RowColumnMultiply, RCData); k++; } } //Join threads//////////////////////////// for (i=0;i<NUM_THREADS;i++) { pthread_join(tid[i], NULL); } syscall(END_PARALLEL_SECTION); b = rdtsc(); PrintMatrices(); //printf("\nend clock Cycles: %llu\n", b); printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 1) { printf("Single Thread Mode\n\n"); //unsigned long long a, b; //a = rdtsc(); //time_t t; int i,j,k; //srand((unsigned) time(&t)); LoadMatrices(); //multiply mats///////////////////////// for (i=0;i<SIZE;i++){ for(j=0;j<SIZE;j++){ for(k=0;k<SIZE;k++){ matC[i][j] = matC[i][j] + (matA[i][k] * matB[k][j]); } } } PrintMatrices(); //b = rdtsc(); //printf("\nDone. Number of clock Cycles: %llu\n", b-a); } else if (MODE == 0) { printf("---Misc Tests---\n\n"); printf("size of long long is %d\n", (int) sizeof(long long)); printf("size of long is %d\n", (int) sizeof(long)); printf("size of int is %d\n", (int) sizeof(int)); printf("size of short is %d\n", (int) sizeof(short)); printf("size of char * %d\n", (int) sizeof(char *)); printf("size of unsigned int (word) %d\n", (int) sizeof(unsigned int)); char *string = "test string"; printf("Here is the string 1: \"%s\"\n", string); //Using the struct //set string variable and point to print_me. object.string = strdup(string); object.print_me = (void (*)(void *)) print_me; //use of print_me object.print_me(object.string); //pointer fun struct Object *ptr = &object; printf("this is the value of the pointer to struct object: %p\n", ptr); object.next=&object; printf("this is the value of the pointer to struct object: %p\n", object.next); object_ptr = &object; object_ptr->next = &object; printf("this is the value of the pointer to struct object: %p\n", object_ptr->next); //Macro fun PRINT(ptr, ptr); PRINT(object.next, object.next); PRINT(object_ptr->next, object_ptr->next); int mmu_page_size = 1 << 12; printf("mmu_papge_size = %d\n", mmu_page_size); //setjmp and longjmp fun /*jmp_buf environment; int i; i = setjmp(environment); printf("\n\nsetjmp returned = %d\n", i); printf("Env 1:\n"); int x = 0; for(x = 0; x < 6; x++) { printf(" %x\n", environment[x]); } if (i < 3) { longjmp(environment, 3); } printf("longjmp finished with i = %d\n", i);*/ } else { printf("---Invalid Mode Set---\n\n"); } printf("\n---Done---\n"); return 1; }
int main(int argc, char** argv) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_mem memObjects[3] = { 0, 0, 0 }; cl_int errNum; context = CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; return 1; } commandQueue = CreateCommandQueue(context, &device); if (commandQueue == NULL) { Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } program = CreateProgram(context, device, "sun.cl"); if (program == NULL) { Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } kernel = clCreateKernel(program, "hello_kernel", NULL); if (kernel == NULL) { std::cerr << "Failed to create kernel" << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Create memory objects that will be used as arguments to // kernel. First create host memory arrays that will be // used to store the arguments to the kernel float result[ARRAY_SIZE]; float a[ARRAY_SIZE]; float b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { a[i] = (float)i; b[i] = (float)(i * 2); } //在设备上创建buffer对象 if (!CreateMemObjects(context, memObjects, a, b)) { Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Set the kernel arguments (result, a, b) errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &memObjects[2]); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } size_t globalWorkSize[1] = { ARRAY_SIZE }; size_t localWorkSize[1] = { 1 }; // Queue the kernel up for execution -----opencl runtime errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error queuing kernel for execution." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Read the output buffer back to the Host errNum = clEnqueueReadBuffer(commandQueue, memObjects[2], CL_TRUE, 0, ARRAY_SIZE * sizeof(float), result, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error reading result buffer." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 1; } // Output the result buffer for (int i = 0; i < ARRAY_SIZE; i++) { std::cout << result[i] << " "; } std::cout << std::endl; std::cout << "Executed program succesfully." << std::endl; Cleanup(context, commandQueue, program, kernel, memObjects); return 0; }