int setupCL(const char *filename) { //connect to a compute device err = clGetDeviceIDs(NULL,CL_DEVICE_TYPE_CPU, 1, &devices, NULL); //err = clGetDeviceIDs(NULL,CL_DEVICE_TYPE_GPU, 1, &devices, NULL); //get info about device size_t returned_size = 0; cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clGetDeviceInfo(devices, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err|= clGetDeviceInfo(devices, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); printf("Connecting to %s %s...\n", vendor_name, device_name); //read the program printf("Loading program '%s'\n\n", filename); char *program_source = loadProgramSource(filename); //create the context and command queue context = clCreateContext(0, 1, &devices, NULL, NULL, &err); cmd_queue = clCreateCommandQueue(context, devices, 0, NULL); //create program from .cl file program = clCreateProgramWithSource(context,1, (const char**)&program_source, NULL, &err); //build the kernel program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); printf("Error: %d\n", err); char build[2048]; clGetProgramBuildInfo(program, devices, CL_PROGRAM_BUILD_LOG, 2048, build, NULL); printf("Build Log:\n%s\n",build); //create the kernel kernel = clCreateKernel(program, "rhs", &err); //get size of buffers size_t w_n_buffer_size = sizeof(float)*KSIZE*X*Y; size_t K_buffer_size = sizeof(float)*KSIZE*X*Y; size_t p_buffer_size = sizeof(float)*X*Y; size_t H0_buffer_size = sizeof(float)*X*Y; //create buffers w_n_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, w_n_buffer_size, NULL, NULL); p_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, p_buffer_size, NULL, NULL); H0_mem = clCreateBuffer(context, CL_MEM_READ_ONLY, H0_buffer_size, NULL, NULL); K_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, K_buffer_size, NULL, NULL); //set kernel arguments err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &w_n_mem); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &K_mem); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &p_mem); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &H0_mem); err |= clSetKernelArg(kernel, 4, sizeof(int), &Y); err |= clSetKernelArg(kernel, 5, sizeof(int), &X); err |= clSetKernelArg(kernel, 6, sizeof(float), &dy); err |= clSetKernelArg(kernel, 7, sizeof(float), &dx); err |= clSetKernelArg(kernel, 8, sizeof(float), &alpha); return CL_SUCCESS; }
/*------------------------------------------------------------------------------------------------------------ * 指定したソースコードファイルからプログラムオブジェクとを作成する * filename:カーネルのソースファイル */ void ClHelper::preloadProgram(const char* filename) { cl_int status; char *source = 0; // ファイルからプログラムを読み込む try { source = loadProgramSource(filename); } catch (MyError err) { fprintf(stderr, "Error: %s\n", err.cstr()); throw MyError("failed to load compute program from file.", __FUNCTION__); } // プログラムオブジェクトを作成する mProgram = clCreateProgramWithSource(mContext, 1, (const char **)&source, NULL, &status); if (mProgram ==(cl_program)0) { printError(status); delete [] source; throw MyError("failed to create program object", __FUNCTION__); } // プログラムをビルドする cl_device_id devices[1]; devices[0] = mDevaiceId; status = clBuildProgram(mProgram, 1, devices, NULL, NULL, NULL); if (status != CL_SUCCESS) { printError(status); showBuildingLog(mProgram, devices[0]); delete [] source; throw MyError("failed to build program object.", __FUNCTION__); } delete [] source; }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem objA = NULL; cl_mem objB = NULL; cl_mem objC = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; cl_event event1; int i, j; float *A; float *B; float *C; A = (float *)malloc(4*4*sizeof(float)); B = (float *)malloc(4*4*sizeof(float)); C = (float *)malloc(4*4*sizeof(float)); /* Initialize input data */ for (i=0; i<4; i++) { for (j=0; j<4; j++) { A[i*4+j] = i*4+j+1; B[i*4+j] = j*4+i+1; } } /* Get Platform/Device Information*/ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs( platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* Create OpenCL Context */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* Create command queue */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* Create Buffer Object */ objA = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); objB = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); objC = clCreateBuffer(context, CL_MEM_READ_WRITE, 4*4*sizeof(float), NULL, &ret); /* * Creating an user event * As a user event is created, its execution status is set to be CL_SUBMITTED * and we tag the event to a callback so when event reaches CL_COMPLETE, it will * execute postProcess */ event1 = clCreateUserEvent(context, &ret); clSetEventCallback(event1, CL_COMPLETE, &postProcess, "Looks like its done."); /* Copy input data to the memory buffer */ ret = clEnqueueWriteBuffer(command_queue, objA, CL_TRUE, 0, 4*4*sizeof(float), A, 0, NULL, NULL ); printf("A has been written\n"); /* The next command will wait for event1 according to its status*/ ret = clEnqueueWriteBuffer(command_queue, objB, CL_TRUE, 0, 4*4*sizeof(float), B, 1, &event1, NULL); printf("B has been written\n"); /* Tell event1 to complete */ clSetUserEventStatus(event1, CL_COMPLETE); const char *file_names[] = {"sample_kernel.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create kernel program from source file*/ program = clCreateProgramWithSource(context, 1, (const char **)buffer, sizes, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); /* Create data parallel OpenCL kernel */ kernel = clCreateKernel(program, "sample", &ret); /* Set OpenCL kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&objA); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&objB); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&objC); size_t global_item_size = 4; size_t local_item_size = 1; /* Execute OpenCL kernel as data parallel */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); /* Transfer result to host */ ret = clEnqueueReadBuffer(command_queue, objC, CL_TRUE, 0, 4*4*sizeof(float), C, 0, NULL, NULL); /* Display Results */ for (i=0; i<4; i++) { for (j=0; j<4; j++) { printf("%7.2f ", C[i*4+j]); } printf("\n"); } /* Finalization */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(objA); ret = clReleaseMemObject(objB); ret = clReleaseMemObject(objC); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(A); free(B); free(C); return 0; }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { // Otherwise, get the CPU error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1, &device, NULL); } if(error != CL_SUCCESS) { perror("Can't locate any OpenCL compliant device"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"simple.cl", "simple_2.cl"}; const int NUMBER_OF_FILES = 2; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; const char options[] = "-cl-finite-math-only -cl-no-signed-zeros"; size_t log_size; //error = clBuildProgram(program, 1, &device, argv[1], NULL, NULL); // Uncomment the line below, comment the line above; re-build the program to use build options statically error = clBuildProgram(program, 1, &device, options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); for(cl_uint i = 0; i < numOfKernels; i++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[i], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); } }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; cl_command_queue queue; cl_uint numOfPlatforms; cl_int error; cl_mem matrixAMemObj; // input matrix A mem buffer cl_mem matrixBMemObj; // input matrix B mem buffer cl_mem matrixCMemObj; // input matrix C mem buffer cl_int* matrixA; // input matrix A cl_int* matrixB; // input matrix B cl_int* matrixC; // input matrix C cl_uint widthA = WIDTH_G; cl_uint heightA = HEIGHT_G; cl_uint widthB = WIDTH_G; cl_uint heightB = HEIGHT_G; { // allocate memory for input and output matrices // based on whatever matrix theory i know. matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int)); matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int)); matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int)); memset(matrixA, 0, widthA * heightA * sizeof(cl_int)); memset(matrixB, 0, widthB * heightB * sizeof(cl_int)); memset(matrixC, 0, widthB * heightA * sizeof(cl_int)); fillRandom(matrixA, widthA, heightA, 643); fillRandom(matrixB, widthB, heightB, 991); } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_int i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { perror("Can't locate a OpenCL compliant device i.e. GPU"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"simple_mm_mult.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; const char options[] = ""; size_t log_size; error = clBuildProgram(program, 1, &device, options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } // Queue is created with profiling enabled cl_command_queue_properties props; props |= CL_QUEUE_PROFILING_ENABLE; queue = clCreateCommandQueue(context, device, props, &error); cl_kernel kernel = clCreateKernel(program, "mmmult", &error); matrixAMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthA * heightA * sizeof(cl_int), matrixA, &error); matrixBMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthB * heightB * sizeof(cl_int), matrixB, &error); matrixCMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, widthB * heightA * sizeof(cl_int), 0, &error); clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB); clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA); clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj); clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj); clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj); size_t globalThreads[] = {widthB, heightA}; cl_event exeEvt; cl_ulong executionStart, executionEnd; error = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalThreads, NULL, 0, NULL, &exeEvt); clWaitForEvents(1, &exeEvt); if(error != CL_SUCCESS) { printf("Kernel execution failure!\n"); exit(-22); } // let's understand how long it took? clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL); clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL); clReleaseEvent(exeEvt); // printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000); printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart)); clEnqueueReadBuffer(queue, matrixCMemObj, CL_TRUE, 0, widthB * heightA * sizeof(cl_int), matrixC, 0, NULL, NULL); if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB)) printf("Passed!\n"); else printf("Failed!\n"); /* Clean up */ for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); clReleaseMemObject(matrixAMemObj); clReleaseMemObject(matrixBMemObj); clReleaseMemObject(matrixCMemObj); } free(matrixA); free(matrixB); free(matrixC); }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; /* Prepare an array of UserData via dynamic memory allocation */ UserData* ud_in = (UserData*) malloc( sizeof(UserData) * DATA_SIZE); // input to device UserData* ud_out = (UserData*) malloc( sizeof(UserData) * DATA_SIZE); // output from device for( int i = 0; i < DATA_SIZE; ++i) { (ud_in + i)->x = i; (ud_in + i)->y = i; (ud_in + i)->z = i; (ud_in + i)->w = 3 * i; } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { // Otherwise, get the CPU error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_CPU, 1, &device, NULL); } if(error != CL_SUCCESS) { perror("Can't locate any OpenCL compliant device"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"user_test.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; size_t log_size; error = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); for(cl_uint i = 0; i < numOfKernels; i++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[i], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[i], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); printf("About to create command queue and enqueue this kernel...\n"); /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, device, 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } /* Create a OpenCL buffer object */ cl_mem UDObj = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(UserData) * DATA_SIZE, ud_in, &error); if (error != CL_SUCCESS) { perror("Unable to create buffer object"); exit(1); } /* Let OpenCL know that the kernel is suppose to receive an argument */ error = clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &UDObj); if (error != CL_SUCCESS) { perror("Unable to set buffer object as kernel argument"); exit(1); } /* Enqueue the kernel to the command queue */ error = clEnqueueTask(cQ, kernels[i], 0, NULL, NULL); if (error != CL_SUCCESS) { perror("Unable to enqueue task to command-queue"); exit(1); } printf("Task has been enqueued successfully!\n"); /* Enqueue the read-back from device to host */ error = clEnqueueReadBuffer(cQ, UDObj, CL_TRUE, // blocking read 0, // write from the start sizeof(UserData) * DATA_SIZE, // how much to copy ud_out, 0, NULL, NULL); if ( valuesOK(ud_in, ud_out) ) { printf("Check passed!\n"); } else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); clReleaseMemObject(UDObj); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); } free(ud_in); free(ud_out); }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; cl_float16* ud_in = (cl_float16*) malloc( sizeof(cl_float16) * DATA_SIZE); // input to device cl_float16* ud_out = (cl_float16*) malloc( sizeof(cl_float16) * DATA_SIZE); // output from device for( int i = 0; i < DATA_SIZE; ++i) { ud_in[i] = (cl_float16){ (float)i,(float)i,(float)i,(float)i, (float)i,(float)i,(float)i,(float)i, (float)i,(float)i,(float)i,(float)i, (float)i,(float)i,(float)i,(float)i }; } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { cl_uint numOfDevices = 0; /* Determine how many devices are connected to your platform */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices); /* Load the information about your devices into the variable 'devices' */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } printf("Number of detected OpenCL devices: %d\n", numOfDevices); /* Create a context */ cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 }; context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* For each device, create a buffer and partition that data among the devices for compute! */ cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float16) * DATA_SIZE, ud_in, &error); if(error != CL_SUCCESS) { perror("Can't create a buffer"); exit(1); } int offset = 0; for(int i = 0; i < numOfDevices; ++i, ++offset ) { /* Load the two source files into temporary datastores */ const char *file_names[] = {"vector_load.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; size_t log_size; error = clBuildProgram(program, 1, &devices[i], NULL, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); /* Loop thru each kernel and execute on device */ for(cl_uint j = 0; j < numOfKernels; j++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); printf("About to create command queue and enqueue this kernel...\n"); /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } /* Create a buffer and copy the data from the main buffer */ cl_mem outobj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float16) * DATA_SIZE, 0, &error); if (error != CL_SUCCESS) { perror("Unable to create sub-buffer object"); exit(1); } /* Let OpenCL know that the kernel is suppose to receive an argument */ error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj); error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj); if (error != CL_SUCCESS) { perror("Unable to set buffer object in kernel"); exit(1); } /* Enqueue the kernel to the command queue */ size_t threadsPerGroup[] = {4}; size_t numOfGroups[] = { DATA_SIZE / threadsPerGroup[0] }; error = clEnqueueNDRangeKernel(cQ, kernels[j], 1, 0, numOfGroups, threadsPerGroup,0, NULL, NULL); if (error != CL_SUCCESS) { perror("Unable to enqueue task to command-queue"); exit(1); } printf("Task has been enqueued successfully!\n"); /* Enqueue the read-back from device to host */ error = clEnqueueReadBuffer(cQ, outobj, CL_TRUE, // blocking read 0, // read from the start sizeof(cl_float16)*DATA_SIZE, // how much to copy ud_out, 0, NULL, NULL); /* Check the returned data */ if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) { printf("Check passed!\n"); } else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); clReleaseMemObject(outobj); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); }// end of device loop and execution clReleaseMemObject(inobj); clReleaseContext(context); }// end of platform loop free(ud_in); free(ud_out); }