int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; 
cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_mxpa/histo_intermediates.cl", "src/opencl_mxpa/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = 
'\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. 
Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) / (int)final_localWS[0])*(int)final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // 
__syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, 
pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
static int init_cladsyn(CSOUND *csound, CLADSYN *p){ int asize, ipsize, fpsize, err; cl_device_id device_ids[32], device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel1, kernel2; cl_uint num = 0, nump = 0; cl_platform_id platforms[16]; uint i; if(p->fsig->overlap > 1024) return csound->InitError(csound, "overlap is too large\n"); err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num); if (err != CL_SUCCESS){ clGetPlatformIDs(16, platforms, &nump); int devs = 0; for(i=0; i < nump && devs < 32; i++){ char name[128]; clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL); csound->Message(csound, "available platform[%d] %s\n",i, name); err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs, &device_ids[devs], &num); if (err != CL_SUCCESS) csound->InitError(csound, "failed to find an OpenCL device! %s \n", cl_error_string(err)); } devs += num; } for(i=0; i < num; i++){ char name[128]; cl_device_type type; clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL); clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); if(type & CL_DEVICE_TYPE_CPU) csound->Message(csound, "available CPU[device %d] %s\n",i, name); else if(type & CL_DEVICE_TYPE_GPU) csound->Message(csound, "available GPU[device %d] %s\n",i, name); else if(type & CL_DEVICE_TYPE_ACCELERATOR) csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name); else csound->Message(csound, "available generic [device %d] %s\n",i, name);; } // SELECT THE GPU HERE if(*p->idev < num) device_id = device_ids[(int)*p->idev]; else device_id = device_ids[num-1]; context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) return csound->InitError(csound, "Failed to create a compute context! 
%s\n", cl_error_string(err)); // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) return csound->InitError(csound, "Failed to create a command commands! %s\n", cl_error_string(err)); // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) &code, NULL, &err); if (!program) return csound->InitError(csound, "Failed to create compute program! %s\n", cl_error_string(err)); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; csound->Message(csound, "Failed to build program executable! %s\n", cl_error_string(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); return csound->InitError(csound, "%s\n", buffer); } kernel1 = clCreateKernel(program, "sample", &err); if (!kernel1 || err != CL_SUCCESS) return csound->InitError(csound, "Failed to create sample compute kernel! %s\n", cl_error_string(err)); kernel2 = clCreateKernel(program, "update", &err); if (!kernel2 || err != CL_SUCCESS) return csound->InitError(csound,"Failed to create update compute kernel! %s\n", cl_error_string(err)); char name[128]; clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL); csound->Message(csound, "using device: %s\n",name); p->bins = (p->fsig->N)/2; if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum; p->vsamps = p->fsig->overlap; p->threads = p->bins*p->vsamps; p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps); asize = p->vsamps*sizeof(cl_float); ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long); fpsize = p->fsig->N*sizeof(cl_float); p->out = clCreateBuffer(context,0, asize, NULL, NULL); p->frame = clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL); p->ph = clCreateBuffer(context,0, ipsize, NULL, NULL); p->amps = clCreateBuffer(context,0,(p->bins > p->vsamps ? 
p->bins : p->vsamps)*sizeof(cl_float), NULL, NULL); // memset needed? asize = p->vsamps*sizeof(float); if(p->out_.auxp == NULL || p->out_.size < (unsigned long) asize) csound->AuxAlloc(csound, asize , &p->out_); csound->RegisterDeinitCallback(csound, p, destroy_cladsyn); p->count = 0; p->context = context; p->program = program; p->commands = commands; p->kernel1 = kernel1; p->kernel2 = kernel2; clGetKernelWorkGroupInfo(p->kernel1, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs1, NULL); clGetKernelWorkGroupInfo(p->kernel2, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs2, NULL); p->sr = csound->GetSr(csound); clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out); clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame); clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph); clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps); clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins); clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps); clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr); clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out); clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame); clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph); clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps); clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins); clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps); clSetKernelArg(p->kernel2, 7, sizeof(cl_float), &p->sr); return OK; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'max_int16int16.cl' */ source_code = read_buffer("max_int16int16.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 
1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "max_int16int16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_int16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_int16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_int16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_int16 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_int16)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_int16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ 
cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int16), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_int16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_int16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem 
* sizeof(cl_int16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufB, bufC; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clBLAS */ err = clblasSetup( ); /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer( ctx, CL_MEM_READ_ONLY, M * K * sizeof(*A), NULL, &err ); bufB = clCreateBuffer( ctx, CL_MEM_READ_ONLY, K * N * sizeof(*B), NULL, &err ); bufC = clCreateBuffer( ctx, CL_MEM_READ_WRITE, M * N * sizeof(*C), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufA, CL_TRUE, 0, M * K * sizeof( *A ), A, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufB, CL_TRUE, 0, K * N * sizeof( *B ), B, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof( *C ), C, 0, NULL, NULL ); /* Call clBLAS extended function. Perform gemm for the lower right sub-matrices */ err = clblasSgemm( clblasRowMajor, clblasNoTrans, clblasNoTrans, M, N, K, alpha, bufA, 0, lda, bufB, 0, ldb, beta, bufC, 0, ldc, 1, &queue, 0, NULL, &event ); /* Wait for calculations to be finished. */ err = clWaitForEvents( 1, &event ); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer( queue, bufC, CL_TRUE, 0, M * N * sizeof(*result), result, 0, NULL, NULL ); /* Release OpenCL memory objects. */ clReleaseMemObject( bufC ); clReleaseMemObject( bufB ); clReleaseMemObject( bufA ); /* Finalize work with clBLAS */ clblasTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
/* GPUBase constructor: picks the first GPU platform/device, creates a context
 * and command queue, concatenates the shared OpenCL source with the
 * filter-specific source, builds the program, and creates the named kernel.
 *
 * Fixes vs. the original:
 *  - the build-log branch fired on SUCCESS (if(!GPUError) is true when
 *    GPUError == CL_SUCCESS == 0); it now fires on failure,
 *  - sourceCL is allocated with room for the NUL that strcpy/strcat write,
 *  - the loaded source buffers and the concatenation buffer are freed,
 *  - oclLoadProgSource results are NULL-checked before use. */
GPUBase::GPUBase(char* source, char* KernelName)
{
	kernelFuncName = KernelName;
	size_t szKernelLength;
	size_t szKernelLengthFilter;
	size_t szKernelLengthSum;
	char* SourceOpenCLShared;
	char* SourceOpenCL;
	iBlockDimX = 16;
	iBlockDimY = 16;

	GPUError = oclGetPlatformID(&cpPlatform);
	CheckError(GPUError);

	cl_uint uiNumAllDevs = 0;

	// Get the number of GPU devices available to the platform
	GPUError = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumAllDevs);
	CheckError(GPUError);
	uiDevCount = uiNumAllDevs;

	// Create the device list
	cdDevices = new cl_device_id[uiDevCount];
	GPUError = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL);
	CheckError(GPUError);

	// Create the OpenCL context on a GPU device
	GPUContext = clCreateContext(0, uiNumAllDevs, cdDevices, NULL, NULL, &GPUError);
	CheckError(GPUError);

	// The command-queue can be used to queue a set of operations (commands) in order.
	GPUCommandQueue = clCreateCommandQueue(GPUContext, cdDevices[0], 0, &GPUError);
	CheckError(GPUError);

	oclPrintDevName(LOGBOTH, cdDevices[0]);

	// Load OpenCL kernel sources (shared code + caller-supplied filter code)
	SourceOpenCLShared = oclLoadProgSource("C:\\Dropbox\\MGR\\GPUFeatureExtraction\\GPU\\OpenCL\\GPUCode.cl", "// My comment\n", &szKernelLength);
	SourceOpenCL = oclLoadProgSource(source, "// My comment\n", &szKernelLengthFilter);
	if (!SourceOpenCLShared || !SourceOpenCL)  // FIX: was used unchecked
	{
		cout << "Could not load OpenCL program sources" << endl;
		return;
	}

	szKernelLengthSum = szKernelLength + szKernelLengthFilter;
	// FIX: +1 — strcpy/strcat also write the terminating NUL
	char* sourceCL = new char[szKernelLengthSum + 1];
	strcpy(sourceCL, SourceOpenCLShared);
	strcat(sourceCL, SourceOpenCL);

	GPUProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&sourceCL, &szKernelLengthSum, &GPUError);
	CheckError(GPUError);

	// Build the program with 'mad' optimization options
	const char* flags = "-cl-unsafe-math-optimizations -cl-fast-relaxed-math -cl-mad-enable";
	GPUError = clBuildProgram(GPUProgram, 0, NULL, flags, NULL, NULL);
	if (GPUError != CL_SUCCESS)  // FIX: was if(!GPUError), which is true on success
	{
		// Print the kernel compilation log on failure
		char programLog[1024];
		clGetProgramBuildInfo(GPUProgram, cdDevices[0], CL_PROGRAM_BUILD_LOG, 1024, programLog, 0);
		cout << programLog << endl;
	}

	// FIX: the runtime copied the source at clCreateProgramWithSource; free our copies
	free(SourceOpenCLShared);   // oclLoadProgSource allocates with malloc
	free(SourceOpenCL);
	delete[] sourceCL;

	cout << kernelFuncName << endl;
	GPUKernel = clCreateKernel(GPUProgram, kernelFuncName, &GPUError);
	CheckError(GPUError);
}
static void test_opencl_opengl_interop() { cl_int status; cl_device_id renderer; #ifndef __EMSCRIPTEN__ CGLContextObj gl_context = CGLGetCurrentContext(); // const char * err = CGLErrorString(kCGLContext); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(gl_context); cl_context_properties properties[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 0 }; clData.ctx = clCreateContext(properties, 0, 0, 0, 0, &status); CHECK_CL_ERROR(status, "clCreateContext"); // And now we can ask OpenCL which particular device is being used by // OpenGL to do the rendering, currently: clGetGLContextInfoAPPLE(clData.ctx, gl_context, CL_CGL_DEVICE_FOR_CURRENT_VIRTUAL_SCREEN_APPLE, sizeof(renderer), &renderer, NULL); #else cl_context_properties cps[] = { CL_GL_CONTEXT_KHR, (cl_context_properties) 0, CL_WGL_HDC_KHR, (cl_context_properties) 0, 0}; //Probably won't work because &dev should correspond to glContext clData.ctx = clCreateContext(cps, 1, &renderer, NULL, NULL, &status); CHECK_CL_ERROR(status, "clCreateContext"); #endif cl_uint id_in_use; clGetDeviceInfo(renderer, CL_DEVICE_VENDOR_ID, sizeof(cl_uint), &id_in_use, NULL); clData.device = renderer; cl_command_queue_properties qprops = 0; clData.queue = clCreateCommandQueue(clData.ctx, clData.device, qprops, &status); CHECK_CL_ERROR(status, "clCreateCommandQueue"); int extensionExists = 0; size_t extensionSize; int ciErrNum = clGetDeviceInfo( clData.device, CL_DEVICE_EXTENSIONS, 0, NULL, &extensionSize ); char* extensions = (char*) malloc( extensionSize); ciErrNum = clGetDeviceInfo( clData.device, CL_DEVICE_EXTENSIONS, extensionSize, extensions, &extensionSize); char * pch; //printf ("Splitting extensions string \"%s\" into tokens:\n",extensions); pch = strtok (extensions," "); while (pch != NULL) { printf ("%s\n",pch); if(strcmp(pch, GL_SHARING_EXTENSION) == 0) { printf("Device supports gl sharing\n"); extensionExists = 1; break; } pch = strtok (NULL, " "); } }
//! This function is documented in the header file void nbnxn_gpu_init(gmx_nbnxn_ocl_t **p_nb, const gmx_device_info_t *deviceInfo, const interaction_const_t *ic, const NbnxnListParameters *listParams, const nbnxn_atomdata_t *nbat, int rank, gmx_bool bLocalAndNonlocal) { gmx_nbnxn_ocl_t *nb; cl_int cl_error; cl_command_queue_properties queue_properties; assert(ic); if (p_nb == nullptr) { return; } snew(nb, 1); snew(nb->atdat, 1); snew(nb->nbparam, 1); snew(nb->plist[eintLocal], 1); if (bLocalAndNonlocal) { snew(nb->plist[eintNonlocal], 1); } nb->bUseTwoStreams = static_cast<cl_bool>(bLocalAndNonlocal); nb->timers = new cl_timers_t(); snew(nb->timings, 1); /* set device info, just point it to the right GPU among the detected ones */ nb->dev_info = deviceInfo; snew(nb->dev_rundata, 1); /* init nbst */ pmalloc(reinterpret_cast<void**>(&nb->nbst.e_lj), sizeof(*nb->nbst.e_lj)); pmalloc(reinterpret_cast<void**>(&nb->nbst.e_el), sizeof(*nb->nbst.e_el)); pmalloc(reinterpret_cast<void**>(&nb->nbst.fshift), SHIFTS * sizeof(*nb->nbst.fshift)); init_plist(nb->plist[eintLocal]); /* OpenCL timing disabled if GMX_DISABLE_GPU_TIMING is defined. 
*/ nb->bDoTime = static_cast<cl_bool>(getenv("GMX_DISABLE_GPU_TIMING") == nullptr); /* Create queues only after bDoTime has been initialized */ if (nb->bDoTime) { queue_properties = CL_QUEUE_PROFILING_ENABLE; } else { queue_properties = 0; } nbnxn_gpu_create_context(nb->dev_rundata, nb->dev_info, rank); /* local/non-local GPU streams */ nb->stream[eintLocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, nb->dev_info->device_name, cl_error); } if (nb->bUseTwoStreams) { init_plist(nb->plist[eintNonlocal]); nb->stream[eintNonlocal] = clCreateCommandQueue(nb->dev_rundata->context, nb->dev_info->ocl_gpu_id.ocl_device_id, queue_properties, &cl_error); if (CL_SUCCESS != cl_error) { gmx_fatal(FARGS, "On rank %d failed to create context for GPU #%s: OpenCL error %d", rank, nb->dev_info->device_name, cl_error); } } if (nb->bDoTime) { init_timers(nb->timers, nb->bUseTwoStreams == CL_TRUE); init_timings(nb->timings); } nbnxn_ocl_init_const(nb, ic, listParams, nbat); /* Enable LJ param manual prefetch for AMD or Intel or if we request through env. var. * TODO: decide about NVIDIA */ nb->bPrefetchLjParam = (getenv("GMX_OCL_DISABLE_I_PREFETCH") == nullptr) && ((nb->dev_info->vendor_e == OCL_VENDOR_AMD) || (nb->dev_info->vendor_e == OCL_VENDOR_INTEL) || (getenv("GMX_OCL_ENABLE_I_PREFETCH") != nullptr)); /* NOTE: in CUDA we pick L1 cache configuration for the nbnxn kernels here, * but sadly this is not supported in OpenCL (yet?). Consider adding it if * it becomes supported. */ nbnxn_gpu_compile_kernels(nb); nbnxn_gpu_init_kernels(nb); /* clear energy and shift force outputs */ nbnxn_ocl_clear_e_fshift(nb); *p_nb = nb; if (debug) { fprintf(debug, "Initialized OpenCL data structures.\n"); } }
/*---------------------------------------------------------
tmr_ocl_create_command_queues - creates a profiling-enabled command queue for
  every device matching Device_type on every platform matching Platform_index.
  Contexts must already exist in list_of_contexts (index 0 = CPU, 1 = GPU,
  2 = ACCELERATOR). Returns 1; exits the process if queue creation fails.
---------------------------------------------------------*/
int tmr_ocl_create_command_queues(
  FILE *Interactive_output, /* file or stdout to write messages */
  int Platform_index,       /* TMC_OCL_ALL_PLATFORMS or a single platform index */
  int Device_type,          /* TMC_OCL_ALL_DEVICES or one TMC_OCL_DEVICE_* kind */
  int Monitor               /* verbosity; > TMC_PRINT_INFO prints per-queue info */
  )
{
  // in a loop over all platforms
  int platform_index;
  for(platform_index=0; platform_index<tmv_ocl_struct.number_of_platforms; platform_index++){

    // shortcut for the global platform structure
    // NOTE(review): this copies the struct by value; the writes to
    // list_of_devices[...] below only persist in the global state if
    // list_of_devices is a pointer member shared with the original — confirm.
    tmt_ocl_platform_struct platform_struct = tmv_ocl_struct.list_of_platforms[platform_index];

    // if creating contexts for all platforms or just this one
    if(Platform_index == TMC_OCL_ALL_PLATFORMS || Platform_index == platform_index){

      // in a loop over all devices
      int idev;
      for(idev=0; idev<platform_struct.number_of_devices; idev++){

        // variable for storing device_id
        cl_device_id device = 0;

        // select context for the device (CPU context for CPU device, etc.)
        // (contexts are already created!,
        // icon is just the index in the platform structure)
        int icon;

        // check whether this is a CPU device - then context is no 0
        if(platform_struct.list_of_devices[idev].type == CL_DEVICE_TYPE_CPU){
          if(Device_type == TMC_OCL_ALL_DEVICES || Device_type == TMC_OCL_DEVICE_CPU){
            device = platform_struct.list_of_devices[idev].id;
            platform_struct.list_of_devices[idev].tmc_type = TMC_OCL_DEVICE_CPU;
            icon = 0;
          }
          else{
            device = NULL; // device present but filtered out by Device_type
          }
        }
        // check whether this is a GPU device - then context is no 1
        else if(platform_struct.list_of_devices[idev].type == CL_DEVICE_TYPE_GPU){
          if(Device_type == TMC_OCL_ALL_DEVICES || Device_type == TMC_OCL_DEVICE_GPU){
            device = platform_struct.list_of_devices[idev].id;
            platform_struct.list_of_devices[idev].tmc_type = TMC_OCL_DEVICE_GPU;
            icon = 1;
          }
          else{
            device = NULL;
          }
        }
        // check whether this is an ACCELERATOR device - then context is no 2
        else if(platform_struct.list_of_devices[idev].type == CL_DEVICE_TYPE_ACCELERATOR){
          if(Device_type == TMC_OCL_ALL_DEVICES || Device_type == TMC_OCL_DEVICE_ACCELERATOR){
            device = platform_struct.list_of_devices[idev].id;
            platform_struct.list_of_devices[idev].tmc_type = TMC_OCL_DEVICE_ACCELERATOR;
            icon = 2;
          }
          else{
            device = NULL;
          }
        }

        // only devices selected above get a queue; icon is always set when
        // device is non-NULL (they are assigned together in each branch)
        if(device != NULL){

          // choose OpenCL context selected for a device
          cl_context context = platform_struct.list_of_contexts[icon];
          platform_struct.list_of_devices[idev].context_index = icon;

          // if context exist
          if(context != NULL){

            if(Monitor>TMC_PRINT_INFO){
              if(platform_struct.list_of_devices[idev].tmc_type == TMC_OCL_DEVICE_CPU){
                fprintf(Interactive_output,"\nCreating command queue for CPU context %d, device index %d, platform %d\n", icon, idev, platform_index);
              }
              if(platform_struct.list_of_devices[idev].tmc_type == TMC_OCL_DEVICE_GPU){
                fprintf(Interactive_output,"\nCreating command queue for GPU context %d, device index %d, platform %d\n", icon, idev, platform_index);
              }
              if(platform_struct.list_of_devices[idev].tmc_type == TMC_OCL_DEVICE_ACCELERATOR){
                fprintf(Interactive_output,"\nCreating command queue for ACCELERATOR context %d, device index %d, platform %d\n", icon, idev, platform_index);
              }
            }

            // Create a command-queue on the device for the context
            // (profiling enabled; errcode_ret is not requested, so a NULL
            // queue is the only failure signal available here)
            cl_command_queue_properties prop = 0;
            prop |= CL_QUEUE_PROFILING_ENABLE;
            platform_struct.list_of_devices[idev].command_queue =
              clCreateCommandQueue(context, device, prop, NULL);
            if (platform_struct.list_of_devices[idev].command_queue == NULL)
            {
              fprintf(Interactive_output,"Failed to create command queue for context %d, device %d, platform %d\n", icon, idev, platform_index);
              exit(-1);
            }

          } // end if context exist for a given device

        } // end if device is of specified type

      } // end loop over devices

    } // end if platform is of specified type

  } // end loop over platforms

  return(1);
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasStbsv(order, uplo, trans, diag, N, K, bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStbsv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. 
*/ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STBSV placed in X array. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(int argc, char **argv) { cl_int ret; /* * Command line */ char *binary_path; if (argc != 2) { printf("syntax: %s <binary>\n", argv[0]); exit(1); } binary_path = argv[1]; /* * Platform */ /* Get platform */ cl_platform_id platform; cl_uint num_platforms; ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: second call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n", platform_name); printf("\n"); /* * Device */ /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* * Context */ /* Create context */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } /* * Command Queue */ /* Create command queue */ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("\n"); /* * Program */ /* Program binary */ const unsigned char *binary; size_t binary_length; /* Read binary */ binary = read_buffer(binary_path, &binary_length); if (!binary) { printf("error: %s: cannot open 
binary\n", binary_path); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithBinary(context, 1, &device, &binary_length, &binary, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* * Kernel */ /* Create kernel */ cl_kernel kernel; kernel = clCreateKernel(program, "vector_add", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } printf("\n"); /* * Buffers */ /* Create and allocate host buffers */ size_t num_elem = 10; cl_int *src1_host_buffer; cl_int *src2_host_buffer; cl_int *dst_host_buffer; src1_host_buffer = malloc(num_elem * sizeof(cl_int)); src2_host_buffer = malloc(num_elem * sizeof(cl_int)); dst_host_buffer = malloc(num_elem * sizeof(cl_int)); /* Initialize host source buffer */ int i; for (i = 0; i < num_elem; i++) { src1_host_buffer[i] = i; src2_host_buffer[i] = 100; } /* Create device source buffers */ cl_mem src1_device_buffer; cl_mem src2_device_buffer; src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); if (!src1_device_buffer || !src2_device_buffer) { printf("error: could not create destination buffer\n"); exit(1); } /* Create device destination buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), 
NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create destination buffer\n"); exit(1); } /* Copy buffer */ ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL); ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* * Kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* * Launch Kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; /* Launch the kernel */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* * Result */ /* Receive buffer */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Print result */ for (i = 0; i < num_elem; i++) printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]); printf("\n"); return 0; }
int FastWalshTransform::setupCL(void) { cl_int status = 0; cl_device_type dType; if(sampleArgs->deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //sampleArgs->deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(sampleArgs->isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = getPlatform(platform, sampleArgs->platformId, sampleArgs->isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "getPlatform() failed"); // Display available devices. retValue = displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR( status, "clCreateContextFromType failed."); // getting device on which to run the sample status = getDevices(context, &devices, sampleArgs->deviceId, sampleArgs->isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "getDevices() failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue( context, devices[sampleArgs->deviceId], prop, &status); CHECK_OPENCL_ERROR( status, "clCreateCommandQueue failed."); } //Set device info of given cl_device_id retValue = deviceInfo.setDeviceInfo(devices[sampleArgs->deviceId]); CHECK_ERROR(retValue, SDK_SUCCESS, "SDKDeviceInfo::setDeviceInfo() failed"); inputBuffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(cl_float) * length, 0, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. 
(inputBuffer)"); // create a CL program using the kernel source buildProgramData buildData; buildData.kernelName = std::string("FastWalshTransform_Kernels.cl"); buildData.devices = devices; buildData.deviceId = sampleArgs->deviceId; buildData.flagsStr = std::string(""); if(sampleArgs->isLoadBinaryEnabled()) { buildData.binaryName = std::string(sampleArgs->loadBinary.c_str()); } if(sampleArgs->isComplierFlagsSpecified()) { buildData.flagsFileName = std::string(sampleArgs->flags.c_str()); } retValue = buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "fastWalshTransform", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); return SDK_SUCCESS; }
int initGPU(int n) { #pragma mark Device Information // Find the CPU CL device, as a fallback err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL); assert(err == CL_SUCCESS); // Find the GPU CL device, this is what we really want // If there is no GPU device is CL capable, fall back to CPU err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) device = cpu; assert(device); // Get some information about the returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); assert(err == CL_SUCCESS); printf("Connecting to %s %s...", vendor_name, device_name); #pragma mark Context and Command Queue // Now create a context to perform our calculation with the // specified device context = clCreateContext(0, 1, &device, NULL, NULL, &err); assert(err == CL_SUCCESS); // And also a command queue for the context cmd_queue = clCreateCommandQueue(context, device, 0, NULL); #pragma mark Program and Kernel Creation // Load the program source from disk // The kernel/program is the project directory and in Xcode the executable // is set to launch from that directory hence we use a relative path const char * filename = "kernel.cl"; char *program_source = load_program_source(filename); program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err); assert(err == CL_SUCCESS); err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL); assert(err == CL_SUCCESS); // Now create the kernel "objects" that we want to use in the example file kernel[0] = clCreateKernel(program[0], "add", &err); assert(err == CL_SUCCESS); #pragma mark Memory Allocation // Allocate memory on the device to hold our data and store the results into buffer_size = sizeof(int) * n; mem_c_position = clCreateBuffer(context, 
CL_MEM_READ_ONLY, buffer_size, NULL, &err); mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); assert(err == CL_SUCCESS); mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err); assert(err == CL_SUCCESS); // Get all of the stuff written and allocated clFinish(cmd_queue); printf(" done\n"); return err; // CL_SUCCESS }
int BinomialOption::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; if(isThereGPU() == false) { std::cout << "GPU not found. Falling back to CPU device" << std::endl; dType = CL_DEVICE_TYPE_CPU; } } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_platform_id platform = NULL; int retValue = sampleCommon->getPlatform(platform, platformId, isPlatformEnabled()); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::getPlatform() failed"); // Display available devices. retValue = sampleCommon->displayDevices(platform, dType); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::displayDevices() failed"); /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType(cps, dType, NULL, NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateContextFromType failed."); // getting device on which to run the sample status = sampleCommon->getDevices(context, &devices, deviceId, isDeviceIdEnabled()); CHECK_ERROR(status, SDK_SUCCESS, "sampleCommon::getDevices() failed"); status = deviceInfo.setDeviceInfo(devices[deviceId]); CHECK_OPENCL_ERROR(status, "deviceInfo.setDeviceInfo failed"); { // The block is to move the declaration of prop closer to its use cl_command_queue_properties prop = 0; commandQueue = clCreateCommandQueue(context, devices[deviceId], prop, &status); CHECK_OPENCL_ERROR(status, "clCreateCommandQueue failed."); } // Create and initialize memory objects // Set Presistent memory only for AMD platform cl_mem_flags inMemFlags = CL_MEM_READ_ONLY; // if(isAmdPlatform()) // inMemFlags |= CL_MEM_USE_PERSISTENT_MEM_AMD; // Create memory object for stock price randBuffer = clCreateBuffer(context, inMemFlags, numSamples * 
sizeof(cl_float4), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (randBuffer)"); // Create memory object for output array outBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, numSamples * sizeof(cl_float4), NULL, &status); CHECK_OPENCL_ERROR(status, "clCreateBuffer failed. (outBuffer)"); // create a CL program using the kernel source streamsdk::buildProgramData buildData; buildData.kernelName = std::string("BinomialOption_Kernels.cl"); buildData.devices = devices; buildData.deviceId = deviceId; buildData.flagsStr = std::string(""); if(isLoadBinaryEnabled()) buildData.binaryName = std::string(loadBinary.c_str()); if(isComplierFlagsSpecified()) buildData.flagsFileName = std::string(flags.c_str()); retValue = sampleCommon->buildOpenCLProgram(program, context, buildData); CHECK_ERROR(retValue, SDK_SUCCESS, "sampleCommon::buildOpenCLProgram() failed"); // get a kernel object handle for a kernel with the given name kernel = clCreateKernel(program, "binomial_options", &status); CHECK_OPENCL_ERROR(status, "clCreateKernel failed."); status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed"); // If group-size is gerater than maximum supported on kernel if((size_t)(numSteps + 1) > kernelInfo.kernelWorkGroupSize) { if(!quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << (numSteps + 1) << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; std::cout << "Using appropiate group-size." << std::endl; std::cout << "-------------------------------------------" << std::endl; } numSteps = (cl_int)kernelInfo.kernelWorkGroupSize - 2; } return SDK_SUCCESS; }
// SETUP int CLContext::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; int gpu = 1; if(gpu == 0) dType = CL_DEVICE_TYPE_CPU; else //deviceType = "gpu" dType = CL_DEVICE_TYPE_GPU; /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. <----- LOL check out the amd propaganda */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return CL_FAILURE; if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) return CL_FAILURE; for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) return CL_FAILURE; platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) break; } delete[] platforms; } /* * If we could find our platform, use it. Otherwise pass a NULL and get whatever the * implementation thinks we should be using. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; /* Use NULL for backward compatibility */ cl_context_properties* cprops = (NULL == platform) ? 
NULL : cps; context = clCreateContextFromType( cprops, dType, NULL, NULL, &status); if(!checkVal( status, CL_SUCCESS, "clCreateContextFromType failed.")) return CL_FAILURE; size_t deviceListSize; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return CL_FAILURE; /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices==NULL) { cout << "Failed to allocate memory (devices)." << endl; return CL_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return CL_FAILURE; /* Create command queue */ commandQueue = clCreateCommandQueue( context, devices[0], 0, &status); if(!checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) return CL_FAILURE; /* Get Device specific Information */ status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return CL_FAILURE; status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return CL_FAILURE; maxWorkItemSizes = (size_t *)malloc(maxDimensions * sizeof(unsigned int)); status = clGetDeviceInfo( devices[0], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL); if(!checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return CL_FAILURE; status = clGetDeviceInfo( devices[0], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void *)&totalLocalMemory, NULL); if(!checkVal( status, CL_SUCCESS, 
"clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed.")) return CL_FAILURE; /* * Create and initialize memory objects */ /* create a CL program using the kernel source */ string content; fileH.open( "critterding.cl", content ); const char * source = content.c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource( context, 1, &source, sourceSize, &status); if(!checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return CL_FAILURE; /* create a cl program executable for all the devices specified */ status = clBuildProgram( program, 1, &devices[0], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return CL_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { cout << "Failed to allocate host memory. (buildLog)" << endl; return CL_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[0], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return CL_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return CL_FAILURE; } return CL_SUCCESS; }
bool Reduction::initContextResources() { //error code cl_int clError; //get platform ID V_RETURN_FALSE_CL(clGetPlatformIDs(1, &clPlatform, NULL), "Failed to get CL platform ID"); cl_uint numberDevices = 0; //get a reference to the first available GPU device V_RETURN_FALSE_CL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, 0, 0, &numberDevices), "No GPU device found."); cout << "Found " << numberDevices << " devices" << endl; std::vector<cl_device_id> devicesIds(numberDevices); V_RETURN_FALSE_CL(clGetDeviceIDs(clPlatform, CL_DEVICE_TYPE_GPU, numberDevices, devicesIds.data(), NULL), "No GPU device found."); //Additional attributes to OpenCL context creation //which associate an OpenGL context with the OpenCL context cl_context_properties props[] = { //OpenCL platform CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, //OpenGL context CL_GL_CONTEXT_KHR, (cl_context_properties) glXGetCurrentContext(), CL_GLX_DISPLAY_KHR , (cl_context_properties) glXGetCurrentDisplay() , 0 }; for(auto dev : devicesIds) { cl_device_id deviceToTry = dev; cl_context contextToTry = 0; contextToTry = clCreateContext( props, 1, &deviceToTry, 0, 0, &clError); if(clError == CL_SUCCESS) { clDevice = deviceToTry; clContext = contextToTry; break; } } char deviceName[1024]; V_RETURN_FALSE_CL(clGetDeviceInfo(clDevice, CL_DEVICE_NAME, 256, &deviceName, NULL), "Unable to query device name."); cout << "Device: " << deviceName << endl; //Finally, create the command queue. All the asynchronous commands to the device will be issued //from the CPU into this queue. This way the host program can continue the execution until some results //from that device are needed. 
clCommandQueue = clCreateCommandQueue(clContext, clDevice, 0, &clError); V_RETURN_FALSE_CL(clError, "Failed to create the command queue in the context"); //Now create and compile the programs size_t programSize = 0; QFile f(":/shaders/Reduce.cl"); if(!f.open(QIODevice::ReadOnly | QIODevice::Text)) return false; std::string programCodeStr = std::string(f.readAll().data()); const char *programCode = programCodeStr.c_str(); programSize = f.size(); clProgram = clCreateProgramWithSource(clContext, 1, (const char**) &programCode, &programSize, &clError); V_RETURN_FALSE_CL(clError, "Failed to create program file"); clError = clBuildProgram(clProgram, 1, &clDevice, NULL, NULL, NULL); if(clError != CL_SUCCESS) { PrintBuildLog(clProgram, clDevice); return false; } reduceHorizontalTransposeKernel = clCreateKernel(clProgram, "ReduceHorizontal", &clError); V_RETURN_FALSE_CL(clError, "Failed to compile kernel: ReduceHorizontal"); reduceVerticalKernel = clCreateKernel(clProgram, "ReduceVertical", &clError); V_RETURN_FALSE_CL(clError, "Failed to compile kernel: ReduceVertical"); return true; }
// Creates the wrapper's OpenCL context and command queue on the first
// platform's first device of the requested type (GPU or CPU), and stores
// them plus the device id and max work-group size in the _this pimpl.
// Idempotent guard: returns false if a context was already created.
// Returns true on success, false on any failure.
bool CLContextWrapper::createContext(DeviceType deviceType)
{
    // Check if has already created a context
    if(_hasCreatedContext)
    {
        return false;
    }

    cl_int err = CL_SUCCESS;

    // Get Platform
    cl_uint numPlatforms = 0;
    err = clGetPlatformIDs(0, nullptr, &numPlatforms);
    if(numPlatforms == 0)
    {
        return false;
    }

    // Get the first found platform
    cl_platform_id platform;
    err = clGetPlatformIDs(1, &platform, nullptr);
    if(err)
    {
        logError("Error: Failed get platorm id" , getError(err));
        return false;
    }

    // Get Device Info
    cl_device_type computeDeviceType = deviceType == DeviceType::GPU_DEVICE
        ? CL_DEVICE_TYPE_GPU
        : CL_DEVICE_TYPE_CPU; // for now, only these two types

    cl_device_id computeDeviceId;
    err = clGetDeviceIDs(platform, computeDeviceType, 1, &computeDeviceId, NULL);
    if (err != CL_SUCCESS)
    {
        logError("Error: Failed to locate a compute device!" , getError(err));
        return false;
    }

    size_t returnedSize = 0;
    size_t maxWorkGroupSize = 0;
    err = clGetDeviceInfo(computeDeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE,
                          sizeof(size_t), &maxWorkGroupSize, &returnedSize);
    if (err != CL_SUCCESS)
    {
        logError("Error: Failed to retrieve device info!", getError(err));
        return false;
    }

    cl_char vendorName[1024] = {0};
    cl_char deviceName[1024] = {0};
    err = clGetDeviceInfo(computeDeviceId, CL_DEVICE_VENDOR,
                          sizeof(vendorName), vendorName, &returnedSize);
    err|= clGetDeviceInfo(computeDeviceId, CL_DEVICE_NAME,
                          sizeof(deviceName), deviceName, &returnedSize);
    if (err != CL_SUCCESS)
    {
        logError("Error: Failed to retrieve device info!", getError(err));
        return false;
    }

    std::cout << "Connecting to " << vendorName << " - " << deviceName << "..." << std::endl;

    // Create Context
    cl_context context = clCreateContext(0, 1, &computeDeviceId, NULL, NULL, &err);
    // BUGFIX: the check previously tested _this->context — the member that is
    // only assigned further below — instead of the freshly created handle.
    if (!context || err != CL_SUCCESS)
    {
        logError("Error: Failed to create a compute ComputeContext!", getError(err));
        return false;
    }

    // Create Command Queue
    cl_command_queue commandQueue = clCreateCommandQueue(context, computeDeviceId, 0, &err);
    // BUGFIX: same member-vs-local confusion as above, and the error code was
    // never inspected; also release the context so it is not leaked here.
    if (!commandQueue || err != CL_SUCCESS)
    {
        logError("Error: Failed to create a command ComputeCommands!", getError(err));
        clReleaseContext(context);
        return false;
    }

    _this->deviceId = computeDeviceId;
    _this->context = context;
    _this->commandQueue = commandQueue;
    _this->maxWorkGroupSize = maxWorkGroupSize;

    std::cout << "Successfully created OpenCL context " << std::endl;

    _hasCreatedContext = true;
    _deviceType = deviceType;

    return true;
}
int main(void) { //############################################### // // Declare variables for OpenCL // //############################################### int err; // error code returned from OpenCL calls size_t global; // global domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_calculate_imagerowdots_iterations; // compute kernel cl_kernel ko_calculate_colorrow; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory int i; //############################################### // // Set values for mandelbrot // //############################################### //plane section values float x_ebene_min = -1; float y_ebene_min = -1; float x_ebene_max = 2; float y_ebene_max = 1; //monitor resolution values const long x_mon = 640; const long y_mon = 480; //Iterations long itr = 100; //abort condition float abort_value = 2; //Number of images per second long fps = 24; //video duration in seconds long video_duration = 3; //zoom speed in percentage float reduction = 5; //zoom dot my_complex_t zoom_dot; //############################################### // // Set up platform and GPU device // //############################################### cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); checkError(err, "Finding platforms"); if (numPlatforms == 0) { printf("Found 0 platforms!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); checkError(err, "Getting platforms"); // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) { break; } } if (device_id == NULL) checkError(err, "Finding a device"); err = output_device_info(device_id); checkError(err, "Printing device 
output"); //############################################### // // Create context, command queue and kernel // //############################################### // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); checkError(err, "Creating command queue"); //Read Kernel source FILE *fp; char *source_str; size_t source_size, program_size; fp = fopen("./kernel/calculate_iterations.cl", "r"); if (!fp) { printf("Failed to load kernel\n"); return 1; } fseek(fp, 0, SEEK_END); program_size = ftell(fp); rewind(fp); source_str = (char*) malloc(program_size + 1); source_str[program_size] = '\0'; fread(source_str, sizeof(char), program_size, fp); fclose(fp); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &source_str, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); return EXIT_FAILURE; } // Create the compute kernel from the program ko_calculate_imagerowdots_iterations = clCreateKernel(program, "calculate_imagerowdots_iterations", &err); checkError(err, "Creating kernel"); // Create the compute kernel from the program ko_calculate_colorrow = clCreateKernel(program, "calculate_colorrow", 
&err); checkError(err, "Creating kernel"); int number_images = 0; do { //Get memory for image long* h_image = (long*) calloc(x_mon * y_mon, sizeof(long)); unsigned char* h_image_pixel = (unsigned char*) calloc( x_mon * y_mon * 3, sizeof(unsigned char)); //############################################### //############################################### // // Loop to calculate image dot iterations // //############################################### //############################################### float y_value = y_ebene_max; float delta_y = delta(y_ebene_min, y_ebene_max, y_mon); for (int row = 0; row < y_mon; ++row) { //############################################### // // Create and write buffer // //############################################### //Get memory for row long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector d_a = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * x_mon, NULL, &err); checkError(err, "Creating buffer d_a"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); checkError(err, "Copying h_a to device at d_a"); //############################################### // // Set the arguments to our compute kernel // //############################################### err = clSetKernelArg(ko_calculate_imagerowdots_iterations, 0, sizeof(float), &x_ebene_min); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 1, sizeof(float), &x_ebene_max); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 2, sizeof(float), &y_value); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 3, sizeof(long), &x_mon); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 4, sizeof(float), &abort_value); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 5, sizeof(long), &itr); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 6, sizeof(cl_mem), &d_a); checkError(err, "Setting kernel arguments"); 
/*__kernel void calculate_imagerowdots_iterations(const float x_min, const float x_max, const float y_value, const long x_mon, const float abort_value, const long itr, __global long * imagerow)*/ // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = x_mon; err = clEnqueueNDRangeKernel(commands, ko_calculate_imagerowdots_iterations, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); // Wait for the commands to complete err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); // Read back the results from the compute device err = clEnqueueReadBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array!\n%s\n", err_code(err)); exit(1); } //reduce y y_value -= delta_y; //cope row to image memcpy(h_image + row * x_mon, h_image_row, sizeof(long) * x_mon); free(h_image_row); } // for (i = 0; i < x_mon * y_mon; ++i) { // printf("%ld ", h_image[i]); // } // fflush(stdout); //############################################### //############################################### // // End of loop to calculate image dot iterations // //############################################### //############################################### //############################################### //############################################### // // Beginn color calculation // //############################################### //############################################### for (int row = 0; row < y_mon; ++row) { //Get memory for row long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector memcpy(h_image_row, h_image + row * x_mon, sizeof(long) * x_mon); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(long) * x_mon, NULL, &err); checkError(err, "Creating buffer d_a"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 
0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); checkError(err, "Copying h_image_row to device at d_a"); unsigned char* h_imagepixel_row = (unsigned char*) calloc(x_mon * 3, sizeof(unsigned char)); // a vector d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned char) * x_mon * 3, NULL, &err); checkError(err, "Creating buffer d_b"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0, NULL, NULL); checkError(err, "Copying h_imagepixel_row to device at d_b"); //############################################### // // Set the arguments to our compute kernel // //############################################### err = clSetKernelArg(ko_calculate_colorrow, 0, sizeof(long), &x_mon); err |= clSetKernelArg(ko_calculate_colorrow, 1, sizeof(long), &itr); err |= clSetKernelArg(ko_calculate_colorrow, 2, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_calculate_colorrow, 3, sizeof(cl_mem), &d_b); checkError(err, "Setting kernel arguments"); /*__kernel void calculate_colorrow(const long width, long itr, long * imagerowvalues, unsigned char * imagerow)*/ // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = x_mon; err = clEnqueueNDRangeKernel(commands, ko_calculate_colorrow, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); // Wait for the commands to complete err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); // Read back the results from the compute device err = clEnqueueReadBuffer(commands, d_b, CL_TRUE, 0, sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array!\n%s\n", err_code(err)); exit(1); } memcpy(h_image_pixel + row * x_mon * 3, h_imagepixel_row, sizeof(unsigned char) * x_mon * 3); free(h_image_row); free(h_imagepixel_row); } if (number_images 
== 0) { zoom_dot = find_dot_to_zoom(x_ebene_min, x_ebene_max, y_ebene_min, y_ebene_max, h_image, y_mon, x_mon, itr); } reduce_plane_section_focus_dot(&x_ebene_min, &x_ebene_max, &y_ebene_min, &y_ebene_max, reduction, zoom_dot); // save the image char filename[50]; sprintf(filename, "img-%d.bmp", number_images); safe_image_to_bmp(x_mon, y_mon, h_image_pixel, filename); free(h_image); free(h_image_pixel); number_images++; itr = (long) (itr + itr * reduction / 100); printf("%d\n", number_images); fflush(stdout); } while (number_images < (fps * video_duration)); //############################################### // // cleanup then shutdown // //############################################### clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseProgram(program); clReleaseKernel(ko_calculate_imagerowdots_iterations); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
bool CLContextWrapper::createContextWithOpengl() { cl_platform_id platform; cl_int err = clGetPlatformIDs(1, &platform, nullptr); if(err) { logError("Error: Failed get platorm id" , getError(err)); return false; } // Get Device Info cl_device_id computeDeviceId; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &computeDeviceId, NULL); if (err != CL_SUCCESS) { logError("Error: Failed to locate a compute device!" , getError(err)); return false; } CGLContextObj kCGLContext = CGLGetCurrentContext(); CGLShareGroupObj kCGLShareGroup = CGLGetShareGroup(kCGLContext); // Create CL context properties, add handle & share-group enum cl_context_properties properties[] = { CL_CONTEXT_PROPERTY_USE_CGL_SHAREGROUP_APPLE, (cl_context_properties)kCGLShareGroup, 0 }; // Create a context with device in the CGL share group cl_context context = clCreateContext(properties, 0, &computeDeviceId, nullptr, 0, &err); size_t returnedSize = 0; size_t maxWorkGroupSize = 0; err = clGetDeviceInfo(computeDeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, &returnedSize); if (err != CL_SUCCESS) { logError("Error: Failed to retrieve device info!", getError(err)); return false; } if(err) { logError("Error creating OpenCL shared with with shared Opengl", getError(err)); return false; } // Create Command Queue auto commandQueue = clCreateCommandQueue(context, computeDeviceId, 0, &err); if (!commandQueue) { logError("Error: Failed to create a command ComputeCommands with shared Opengl!", getError(err)); return false; } _this->context = context; _this->deviceId = computeDeviceId; _this->commandQueue = commandQueue; _this->maxWorkGroupSize = maxWorkGroupSize; _hasCreatedContext = true; _deviceType = DeviceType::GPU_DEVICE; return true; }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in 
device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/(double)n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
bool CLContextWrapper::createContextWithOpengl() { cl_platform_id platform; cl_int err = clGetPlatformIDs(1, &platform, nullptr); if(err) { logError("Error: Failed get platorm id" , getError(err)); return false; } // Create CL context properties, add handle & share-group enum auto glContext = wglGetCurrentContext(); auto glDc = wglGetCurrentDC(); cl_context_properties properties[] = { CL_GL_CONTEXT_KHR, (cl_context_properties) glContext, CL_WGL_HDC_KHR, (cl_context_properties) glDc, CL_CONTEXT_PLATFORM,(cl_context_properties) platform, 0 }; // Get Device Info // The easy way cl_device_id computeDeviceId; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &computeDeviceId, NULL); if (err) { // The "hard" way if (!clGetGLContextInfoKHR) { clGetGLContextInfoKHR = (clGetGLContextInfoKHR_fn) clGetExtensionFunctionAddressForPlatform(platform, "clGetGLContextInfoKHR"); if (!clGetGLContextInfoKHR) { logError("Error: Failed to locate a compute device!" , "Failed to query proc address for clGetGLContextInfoKHR"); return false; } // Get the first err = clGetGLContextInfoKHR(properties, CL_DEVICES_FOR_GL_CONTEXT_KHR, sizeof(cl_device_id), &computeDeviceId, nullptr); if(!computeDeviceId) { logError("Error: Failed to locate a compute device!" 
, getError(err)); } } } // Create a context with device in the CGL share group cl_context context = clCreateContext(properties, 1, &computeDeviceId, nullptr, 0, &err); size_t returnedSize = 0; size_t maxWorkGroupSize = 0; err = clGetDeviceInfo(computeDeviceId, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, &returnedSize); if (err != CL_SUCCESS) { logError("Error: Failed to retrieve device info!", getError(err)); return false; } if(err) { logError("Error creating OpenCL shared with with shared Opengl", getError(err)); return false; } // Create Command Queue auto commandQueue = clCreateCommandQueue(context, computeDeviceId, 0, &err); if (!commandQueue) { logError("Error: Failed to create a command ComputeCommands with shared Opengl!", getError(err)); return false; } _this->deviceId = computeDeviceId; _this->context = context; _this->commandQueue = commandQueue; _this->maxWorkGroupSize = maxWorkGroupSize; _hasCreatedContext = true; _deviceType = DeviceType::GPU_DEVICE; return true; return false; }
// Main program //***************************************************************************** int main(int argc, char** argv) { // Locals used with command line args int p = 256; // workgroup X dimension int q = 1; // workgroup Y dimension nEdges = computeNumEdges(numBodies); pArgc = &argc; pArgv = argv; shrQAStart(argc, argv); // latch the executable path for other funcs to use cExecutablePath = argv[0]; // start logs and show command line help shrSetLogFileName ("oclNbody.txt"); shrLog("%s Starting...\n\n", cExecutablePath); shrLog("Command line switches:\n"); shrLog(" --qatest\t\tCheck correctness of GPU execution and measure performance)\n"); shrLog(" --noprompt\t\tQuit simulation automatically after a brief period\n"); shrLog(" --n=<numbodies>\tSpecify # of bodies to simulate (default = %d)\n", numBodies); shrLog(" --double\t\tUse double precision floating point values for simulation\n"); shrLog(" --p=<workgroup X dim>\tSpecify X dimension of workgroup (default = %d)\n", p); shrLog(" --q=<workgroup Y dim>\tSpecify Y dimension of workgroup (default = %d)\n\n", q); // Get command line arguments if there are any and set vars accordingly if (argc > 0) { shrGetCmdLineArgumenti(argc, (const char**)argv, "p", &p); shrGetCmdLineArgumenti(argc, (const char**)argv, "q", &q); shrGetCmdLineArgumenti(argc, (const char**)argv, "n", &numBodies); bDouble = (shrTRUE == shrCheckCmdLineFlag(argc, (const char**)argv, "double")); bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); bQATest = shrCheckCmdLineFlag(argc, (const char**)argv, "qatest"); } //Get the NVIDIA platform cl_int ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); shrLog("clGetPlatformID...\n\n"); if (bDouble) { shrLog("Double precision execution...\n\n"); } else { shrLog("Single precision execution...\n\n"); } flopsPerInteraction = bDouble ? 
30 : 20; //Get all the devices shrLog("Get the Device info and select Device...\n"); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumDevices); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); cdDevices = (cl_device_id *)malloc(uiNumDevices * sizeof(cl_device_id) ); ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiNumDevices, cdDevices, NULL); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Set target device and Query number of compute units on uiTargetDevice shrLog(" # of Devices Available = %u\n", uiNumDevices); if(shrGetCmdLineArgumentu(argc, (const char**)argv, "device", &uiTargetDevice)== shrTRUE) { uiTargetDevice = CLAMP(uiTargetDevice, 0, (uiNumDevices - 1)); } shrLog(" Using Device %u, ", uiTargetDevice); oclPrintDevName(LOGBOTH, cdDevices[uiTargetDevice]); cl_uint uiNumComputeUnits; clGetDeviceInfo(cdDevices[uiTargetDevice], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(uiNumComputeUnits), &uiNumComputeUnits, NULL); shrLog(" # of Compute Units = %u\n", uiNumComputeUnits); //Create the context shrLog("\n\n\nMikisko sa hlasi do sluzby...\n\n\n"); shrLog("clCreateContext...\n"); cxContext = clCreateContext(0, uiNumDevsUsed, &cdDevices[uiTargetDevice], NULL, NULL, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Create a command-queue shrLog("clCreateCommandQueue...\n\n"); cqCommandQueue = clCreateCommandQueue(cxContext, cdDevices[uiTargetDevice], CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); // Log and config for number of bodies shrLog("Number of Bodies = %d\n", numBodies); switch (numBodies) { case 1024: activeParams.m_clusterScale = 1.52f; activeParams.m_velocityScale = 2.f; break; case 2048: activeParams.m_clusterScale = 1.56f; activeParams.m_velocityScale = 2.64f; break; case 4096: activeParams.m_clusterScale = 1.68f; activeParams.m_velocityScale = 2.98f; break; case 7680: case 8000: case 8192: activeParams.m_clusterScale = 1.98f; activeParams.m_velocityScale = 2.9f; 
break; default: case 15360: case 16384: activeParams.m_clusterScale = 1.54f; activeParams.m_velocityScale = 8.f; break; case 30720: case 32768: activeParams.m_clusterScale = 1.44f; activeParams.m_velocityScale = 11.f; break; } if ((q * p) > 256) { p = 256 / q; shrLog("Setting p=%d to maintain %d threads per block\n", p, 256); } if ((q == 1) && (numBodies < p)) { p = numBodies; shrLog("Setting p=%d because # of bodies < p\n", p); } shrLog("Workgroup Dims = (%d x %d)\n\n", p, q); // Initialize OpenGL items if using GL if (bQATest == shrFALSE) { shrLog("Calling InitGL...\n"); InitGL(&argc, argv); } else { shrLog("Skipping InitGL...\n"); } shrLog("Calling InitGL...\n"); // CL/GL interop disabled bUsePBO = (false && (bQATest == shrFALSE)); InitNbody(cdDevices[uiTargetDevice], cxContext, cqCommandQueue, numBodies, p, q, bUsePBO, bDouble, NBODY_CONFIG_SHELL); ResetSim(nbody, numBodies, NBODY_CONFIG_SHELL, bUsePBO); shrLog("Calling InitGL...\n"); // init timers shrDeltaT(DEMOTIME); // timer 0 is for timing demo periods shrDeltaT(FUNCTIME); // timer 1 is for logging function delta t's shrDeltaT(FPSTIME); // timer 2 is for fps measurement // Standard simulation if (bQATest == shrFALSE) { shrLog("Running standard oclNbody simulation...\n\n"); glutDisplayFunc(DisplayGL); glutReshapeFunc(ReshapeGL); glutMouseFunc(MouseGL); glutMotionFunc(MotionGL); glutKeyboardFunc(KeyboardGL); glutSpecialFunc(SpecialGL); glutIdleFunc(IdleGL); glutMainLoop(); } // Compare to host, profile and write out file for regression analysis if (bQATest == shrTRUE) { bool bTestResults = false; shrLog("Running oclNbody Results Comparison...\n\n"); bTestResults = CompareResults(numBodies); shrLog("Profiling oclNbody...\n\n"); RunProfiling(100, (unsigned int)(p * q)); // 100 iterations shrQAFinish(argc, (const char **)argv, bTestResults ? QA_PASSED : QA_FAILED); } else { // Cleanup/exit bNoPrompt = shrTRUE; shrQAFinish2(false, *pArgc, (const char **)pArgv, QA_PASSED); } Cleanup(EXIT_SUCCESS); }
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) { cl_context context; cl_command_queue commandQueue; cl_program program; cl_kernel kernel; size_t dataBytes, kernelLength; cl_int errorCode; cl_mem gridBuffer; cl_device_id* devices; cl_device_id gpu; cl_uint numPlatforms; errorCode = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id platforms[numPlatforms]; errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL); checkError(errorCode); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0}; context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode); checkError(errorCode); errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); devices = malloc(dataBytes); errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL); gpu = devices[0]; commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode); checkError(errorCode); gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode); checkError(errorCode); const char* programBuffer = readFile("kernel.cl"); kernelLength = strlen(programBuffer); program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode); checkError(errorCode); errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (errorCode == CL_BUILD_PROGRAM_FAILURE) { // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log free(log); printf("%s\n", log); } checkError(errorCode); kernel = clCreateKernel(program, "diffuse", &errorCode); checkError(errorCode); size_t localWorkSize[2] = {workGroupSize, workGroupSize}, 
globalWorkSize[2] = {width, width}; errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer); errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL); errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width); errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize); errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations); checkError(errorCode); errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkError(errorCode); errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL); checkError(errorCode); free(devices); free((void *)programBuffer); clReleaseContext(context); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Determine the platform ID: NULL platform IDs lead to // "platform specific" behavior! cl_platform_id platforms[8]; uint32_t num_platforms; err = clGetPlatformIDs(8, platforms, &num_platforms); if(err != CL_SUCCESS) { printf("Error: failed to get platform ids!\n"); return EXIT_FAILURE; } printf("%u platform ids found\n", num_platforms); // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(platforms[0], gpu ? 
CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = 
clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
void opencl_init(void) { // get the platform cl_uint num_platforms; clError = clGetPlatformIDs(0, NULL, &num_platforms); checkErr(clError, "clGetPlatformIDs( 0, NULL, &num_platforms );"); if (num_platforms <= 0) { std::cout << "No platform..." << std::endl; exit(1); } cl_platform_id* platforms = new cl_platform_id[num_platforms]; clError = clGetPlatformIDs(num_platforms, platforms, NULL); checkErr(clError, "clGetPlatformIDs( num_platforms, &platforms, NULL );"); if (num_platforms > 1) { char platformName[256]; clError = clGetPlatformInfo(platforms[0], CL_PLATFORM_VENDOR, sizeof(platformName), platformName, NULL); std::cerr << "Multiple platforms found defaulting to: " << platformName << std::endl; } platform_id = platforms[0]; if (getenv("OPENCL_PLATEFORM")) platform_id = platforms[1]; delete platforms; // Connect to a compute device // cl_uint device_count = 0; clError = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &device_count); checkErr(clError, "Failed to create a device group"); cl_device_id* deviceIds = (cl_device_id*) malloc( sizeof(cl_device_id) * device_count); clError = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, device_count, deviceIds, NULL); if (device_count > 1) { char device_name[256]; int compute_units; clError = clGetDeviceInfo(deviceIds[0], CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); checkErr(clError, "clGetDeviceInfo failed"); clError = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, NULL); checkErr(clError, "clGetDeviceInfo failed"); std::cerr << "Multiple devices found defaulting to: " << device_name; std::cerr << " with " << compute_units << " compute units" << std::endl; } device_id = deviceIds[0]; delete deviceIds; // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &clError); checkErr(clError, "Failed to create a compute context!"); // Create a command commands // commandQueue = clCreateCommandQueue(context, device_id, 0, &clError); 
checkErr(clError, "Failed to create a command commands!"); // READ KERNEL FILENAME std::string filename = "NOTDEFINED.cl"; char const* tmp_name = getenv("OPENCL_KERNEL"); if (tmp_name) { filename = std::string(tmp_name); } else { filename = std::string(__FILE__); filename = filename.substr(0, filename.length() - 17); filename += "/kernels.cl"; } // READ OPENCL_PARAMETERS std::string compile_parameters = ""; char const* tmp_params = getenv("OPENCL_PARAMETERS"); if (tmp_params) { compile_parameters = std::string(tmp_params); } std::ifstream kernelFile(filename.c_str(), std::ios::in); if (!kernelFile.is_open()) { std::cout << "Unable to open " << filename << ". " << __FILE__ << ":" << __LINE__ << "Please set OPENCL_KERNEL" << std::endl; exit(1); } /* * Read the kernel file into an output stream. * Convert this into a char array for passing to OpenCL. */ std::ostringstream outputStringStream; outputStringStream << kernelFile.rdbuf(); std::string srcStdStr = outputStringStream.str(); const char* charSource = srcStdStr.c_str(); kernelFile.close(); // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) &charSource, NULL, &clError); if (!program) { printf("Error: Failed to create compute program!\n"); exit(1); } // Build the program executable // clError = clBuildProgram(program, 0, NULL, compile_parameters.c_str(), NULL, NULL); /* Get the size of the build log. */ size_t logSize = 0; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); if (clError != CL_SUCCESS) { if (logSize > 1) { char* log = new char[logSize]; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); std::string stringChars(log, logSize); std::cerr << "Build log:\n " << stringChars << std::endl; delete[] log; } printf("Error: Failed to build program executable!\n"); exit(1); } return; }
/*
 * Host driver for the "composition_population" OpenCL kernel.
 *
 * Flow: load kernel source from disk -> pick the default platform/device ->
 * create context + queue -> build the program -> allocate three device
 * buffers -> map buffer 0, fill it with encoded input text, unmap ->
 * set kernel arguments -> run the kernel -> map buffers 1 and 2 read-only
 * and dump their contents as hex.
 *
 * Returns 0 on success, 1 on any OpenCL failure (exit(1) on file-read
 * failure).  Relies on file-scope helpers/constants defined elsewhere:
 * success_verification, getInfo, text_code, v16us, MAX_SOURCE_SIZE,
 * newspaper_byte_size, TABLET_LONG, WORD_THICK, NEWSPAPER_LONG, TRUE.
 */
int main(void) {
  /* OpenCL handles, zero-initialised so partially-completed setup is visible. */
  cl_context context = 0;
  cl_command_queue command_waiting_line = 0;
  cl_program program = 0;
  cl_device_id device_id = 0;
  cl_kernel kernel = 0;
  // int numberOfMemoryObjects = 3;
  cl_mem memoryObjects[3] = {0, 0, 0};
  cl_platform_id platform_id = NULL;
  cl_uint ret_num_devices;
  cl_int errorNumber;
  cl_int ret;

  /* Load the source code containing the kernel*/
  char fileName[] = "source/parallel/composition_population.cl";
  FILE *fp;
  char *source_str;
  size_t source_size;
  fp = fopen(fileName, "r");
  cl_uint ret_num_platforms;
  if (!fp) {
    fprintf(stderr, "Failed to load kernel %s:%d.\n", __FILE__, __LINE__);
    exit(1);
  }
  /* NOTE(review): source_str is never freed; acceptable here only because
   * main() exits right after use. */
  source_str = (char *)malloc(MAX_SOURCE_SIZE);
  source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp);
  fclose(fp);
  // printf("file: %s :file", source_str);
  getInfo();

  /* Take the first available platform and its default device. */
  ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to get platform id's. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id,
                       &ret_num_devices);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to get OpenCL devices. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to create an OpenCL context. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* clCreateCommandQueue is deprecated from OpenCL 2.0 on, hence the split. */
#ifdef CL_VERSION_2_0
  command_waiting_line =
      clCreateCommandQueueWithProperties(context, device_id, 0, &ret);
#else
  command_waiting_line = clCreateCommandQueue(context, device_id, 0, &ret);
#endif
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to create the OpenCL command queue. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }

  /* create program */
  program = clCreateProgramWithSource(context, 1, (const char **)&source_str,
                                      (const size_t *)&source_size, &ret);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL program. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* Build Kernel Program */
  ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL);
  if (!success_verification(ret)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to build OpenCL program. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  kernel = clCreateKernel(program, "composition_population", &errorNumber);
  if (!success_verification(errorNumber)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL kernel. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* [Setup memory] */
  /* Number of elements in the arrays of input and output data. */
  /* The buffers are the size of the arrays. */
  uint16_t activity_atom_size = MAX_INDEPENDENTCLAUSE_TABLET * 1;
  uint8_t program_size = 1;
  uint8_t population_size = 4;
  size_t activity_atom_byte_size = activity_atom_size * sizeof(v16us);
  uint16_t population_byte_size =
      (uint16_t)(program_size * (uint16_t)(population_size * sizeof(v16us)));
  /*
   * Ask the OpenCL implementation to allocate buffers for the data.
   * We ask the OpenCL implemenation to allocate memory rather than allocating
   * it on the CPU to avoid having to copy the data later.
   * The read/write flags relate to accesses to the memory from within the
   * kernel.
   */
  int createMemoryObjectsSuccess = TRUE;
  /* [0] = kernel input; [1] = resulting population; [2] = "newspaper" output.
   * newspaper_byte_size is presumably a file-scope constant — defined outside
   * this view. */
  memoryObjects[0] =
      clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     activity_atom_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);
  memoryObjects[1] =
      clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     population_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);
  memoryObjects[2] =
      clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR,
                     newspaper_byte_size, NULL, &errorNumber);
  createMemoryObjectsSuccess &= success_verification(errorNumber);
  if (!createMemoryObjectsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to create OpenCL buffer. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* [Setup memory] */

  /* [Map the buffers to pointers] */
  /* Map the memory buffers created by the OpenCL implementation to pointers so
   * we can access them on the CPU. */
  int mapMemoryObjectsSuccess = TRUE;
  v16us *activity_atom = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[0], CL_TRUE, CL_MAP_WRITE, 0,
      activity_atom_byte_size, 0, NULL, NULL, &errorNumber);
  mapMemoryObjectsSuccess &= success_verification(errorNumber);
  // cl_int *inputB = (cl_int *)clEnqueueMapBuffer(
  //     command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_WRITE, 0,
  //     bufferSize, 0,
  //     NULL, NULL, &errorNumber);
  // mapMemoryObjectsSuccess &= success_verification(errorNumber);
  if (!mapMemoryObjectsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }
  /* [Map the buffers to pointers] */

  /* [Initialize the input data] */
  const char *activity_atom_text = "nyistu htoftu hnattu hnamtu";
  const uint16_t activity_atom_text_size =
      (uint16_t)(strlen(activity_atom_text));
  /* Adjacent string literals — note there is deliberately no space between
   * the concatenated segments. */
  const char *quiz_independentClause_list_text =
      "zrundoka hwindocayu hwindokali"
      "hwindoka tyutdocayu tyindokali"
      "tyutdoka tyutdocayu hfutdokali"
      "tyindoka fwandocayu nyatdokali";
  //"bu.hnac.2.hnac.buka bu.hnac.2.hnac.buca yu "
  //"bu.hnac.4.hnac.bukali";
  const uint16_t quiz_independentClause_list_text_size =
      (uint16_t)strlen(quiz_independentClause_list_text);
  uint16_t quiz_independentClause_list_size = 4;
  v16us quiz_independentClause_list[8];
  uint16_t text_remainder = 0;
  // uint16_t program_worth = 0;
  uint64_t random_seed = 0x0123456789ABCDEF;
  uint16_t tablet_indexFinger = 0;
  // uint8_t champion = 0;
  // uint16_t champion_worth = 0;
  // v16us program_;
  // v16us population[4];
  memset(quiz_independentClause_list, 0,
         (size_t)(quiz_independentClause_list_size * TABLET_LONG * WORD_THICK));
  /* Encode the human-readable text into the mapped device buffer.
   * text_remainder == 0 means the whole text was consumed. */
  text_code(activity_atom_text_size, activity_atom_text, &activity_atom_size,
            activity_atom, &text_remainder);
  assert(text_remainder == 0);
  text_code(quiz_independentClause_list_text_size,
            quiz_independentClause_list_text, &quiz_independentClause_list_size,
            quiz_independentClause_list, &text_remainder);
  /* [Initialize the input data] */

  /* [Un-map the buffers] */
  /*
   * Unmap the memory objects as we have finished using them from the CPU side.
   * We unmap the memory because otherwise:
   * - reads and writes to that memory from inside a kernel on the OpenCL side
   *   are undefined.
   * - the OpenCL implementation cannot free the memory when it is finished.
   */
  if (!success_verification(
          clEnqueueUnmapMemObject(command_waiting_line, memoryObjects[0],
                                  activity_atom, 0, NULL, NULL))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  // if (!success_verification(clEnqueueUnmapMemObject(command_waiting_line,
  //                                                   memoryObjects[1],
  //                                                   inputB, 0, NULL, NULL))) {
  //   cleanUpOpenCL(context, command_waiting_line, program, kernel,
  //                 memoryObjects, numberOfMemoryObjects);
  //   cerr << "Unmapping memory objects failed " << __FILE__ << ":" << __LINE__
  //        << endl;
  //   return 1;
  //}
  /* [Un-map the buffers] */

  /* [Set the kernel arguments] */
  /* Arguments 5 and 7 pass NULL with a size: per clSetKernelArg this
   * reserves __local (work-group scratch) memory for the kernel. */
  int setKernelArgumentsSuccess = TRUE;
  printf("arg0\n");
  setKernelArgumentsSuccess &= success_verification(clSetKernelArg(
      kernel, 0, sizeof(uint8_t), (uint8_t *)&activity_atom_size));
  printf("arg1\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjects[0]));
  printf("arg2\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 2, sizeof(uint16_t), (uint16_t *)&program_size));
  printf("arg3\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 3, sizeof(uint8_t), (uint8_t *)&population_size));
  printf("arg4\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 4, sizeof(uint64_t), (uint64_t *)&random_seed));
  printf("arg5\n");
  setKernelArgumentsSuccess &= success_verification(clSetKernelArg(kernel, 5,
                                                    sizeof(uint64_t *), NULL));
  printf("arg6\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 6, sizeof(cl_mem), &memoryObjects[1]));
  printf("arg7\n");
  setKernelArgumentsSuccess &= success_verification(clSetKernelArg(kernel, 7,
                                                    sizeof(uint8_t *), NULL));
  printf("arg8\n");
  setKernelArgumentsSuccess &= success_verification(
      clSetKernelArg(kernel, 8, sizeof(cl_mem), &memoryObjects[2]));
  if (!setKernelArgumentsSuccess) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed setting OpenCL kernel arguments. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }
  /* [Set the kernel arguments] */

  /* An event to associate with the Kernel. Allows us to retrieve profiling
   * information later. */
  cl_event event = 0;

  /* [Global work size] */
  /*
   * Each instance of our OpenCL kernel operates on a single element of each
   * array so the number of
   * instances needed is the number of elements in the array.
   */
  size_t globalWorksize[1] = {population_size};
  size_t localWorksize[1] = {2};
  /* Enqueue the kernel */
  if (!success_verification(clEnqueueNDRangeKernel(
          command_waiting_line, kernel, 1, NULL, globalWorksize, localWorksize,
          0, NULL, &event))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed enqueuing the kernel. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  /* [Global work size] */

  /* Wait for kernel execution completion. */
  if (!success_verification(clFinish(command_waiting_line))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed waiting for kernel execution to finish. %s:%d\n",
            __FILE__, __LINE__);
    return 1;
  }

  /* Print the profiling information for the event. */
  // printProfilingInfo(event);
  /* Release the event object. */
  if (!success_verification(clReleaseEvent(event))) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed releasing the event object. %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }

  /* Get a pointer to the output data. */
  printf("clOut\n");
  /* NOTE(review): only the second map's errorNumber is checked — a failure
   * of the first map is masked if the second succeeds. */
  v16us *output = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[1], CL_TRUE, CL_MAP_READ, 0,
      population_byte_size, 0, NULL, NULL, &errorNumber);
  v16us *newspaper = (v16us *)clEnqueueMapBuffer(
      command_waiting_line, memoryObjects[2], CL_TRUE, CL_MAP_READ, 0,
      newspaper_byte_size, 0, NULL, NULL, &errorNumber);
  if (!success_verification(errorNumber)) {
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Failed to map buffer. %s:%d\n", __FILE__, __LINE__);
    return 1;
  }

  /* [Output the results] */
  /* Dump both output buffers as 16-bit hex words, 16 per row. */
  for (tablet_indexFinger = 0;
       tablet_indexFinger < (population_size * TABLET_LONG);
       ++tablet_indexFinger) {
    if (tablet_indexFinger % 0x10 == 0)
      printf("\n");
    printf("%04X ", (uint)((uint16_t *)output)[tablet_indexFinger]);
  }
  printf("\n");
  // printf("program %04X \n", (uint)*((uint16_t *)&(output[1])));
  printf("newspaper \n");
  for (tablet_indexFinger = 0;
       tablet_indexFinger < (NEWSPAPER_LONG * TABLET_LONG);
       ++tablet_indexFinger) {
    if (tablet_indexFinger % 0x10 == 0)
      printf("\n");
    printf("%04X ", (uint)((uint16_t *)newspaper)[tablet_indexFinger]);
  }
  printf("\n");
  /* [Output the results] */

  /* Unmap the memory object as we are finished using them from the CPU side. */
  if (!success_verification(clEnqueueUnmapMemObject(
          command_waiting_line, memoryObjects[1], output, 0, NULL, NULL))) {
    printf("unmapping\n");
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  if (!success_verification(clEnqueueUnmapMemObject(
          command_waiting_line, memoryObjects[2], newspaper, 0, NULL, NULL))) {
    printf("unmapping\n");
    // cleanUpOpenCL(context, command_waiting_line, program, kernel,
    //               memoryObjects, numberOfMemoryObjects);
    fprintf(stderr, "Unmapping memory objects failed %s:%d\n", __FILE__,
            __LINE__);
    return 1;
  }
  printf("releasing\n");
  /* Release OpenCL objects. */
  /* NOTE(review): cleanup is commented out, so context/queue/program/kernel
   * and the buffers leak at process exit — confirm whether intentional. */
  // cleanUpOpenCL(context, command_waiting_line, program, kernel,
  //               memoryObjects, numberOfMemoryObjects);
}
/* Attach a command queue to the given CL state and report the creation status. */
void StartQueue(CL* cl){
    cl_int status;

    /* In-order queue with no special properties on this context/device pair. */
    cl->queue = clCreateCommandQueue(cl->context, cl->device, 0, &status);

    printf("QUEUE STATUS\t");
    cl_error(status);
}
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, scratchBuff; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. 
*/ bufAP = clCreateBuffer(ctx, CL_MEM_READ_ONLY, numElementsAP * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, N * sizeof(cl_float), NULL, &err); scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_float), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); err = clblasStpmv(order, uplo, clblasTrans, clblasUnit, N, bufAP, 0 /*offA */, bufX, 0 /*offX */, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStpmv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STRMV placed in Y array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(scratchBuff); clReleaseMemObject(bufX); clReleaseMemObject(bufAP); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
/*
 * Auto-tune the local work size (LWS) for the PBKDF2 kernel on device
 * jtrUniqDevNo and record its inverse execution time in globalObj[].
 *
 * jtrUniqDevNo - index into the file-scope context[]/devices[]/globalObj[]
 *                arrays (defined elsewhere).
 * gpu_perf     - divisor used to shrink the benchmark batch on slower GPUs.
 *
 * Side effects: toggles the file-scope PROFILE flag, resets event_ctr and
 * kernelExecTimeNs, and leaves the chosen LWS / exec_time_inv in
 * globalObj[jtrUniqDevNo].  Temporary host buffers are allocated with the
 * project's mem_alloc/MEM_FREE helpers and a private profiling-enabled
 * command queue is created and released.
 */
static void find_best_workgroup(int jtrUniqDevNo, unsigned int gpu_perf)
{
	size_t _lws=0;
	cl_device_type dTyp;
	cl_command_queue cmdq;
	cl_int err;
	/* Cap the key batch at what fits in a single device allocation, rounded
	 * down to a multiple of 8192 (minus one block), else MAX_KEYS_PER_CRYPT. */
	unsigned int max_kpc = get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf) < MAX_KEYS_PER_CRYPT ? ((get_max_mem_alloc_size(jtrUniqDevNo) / sizeof(temp_buf)) / 8192 - 1) * 8192 : MAX_KEYS_PER_CRYPT;
	/* Host-side scratch for the benchmark run; batch is clamped to 65536. */
	cl_uint *dcc_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	cl_uint *dcc2_hash_host = (cl_uint*)mem_alloc(4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	cl_uint *hmac_sha1_out = (cl_uint*)mem_alloc(5 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	cl_uint salt_api[9], length = 10;

	event_ctr = 0;

	//HANDLE_CLERROR(clGetDeviceInfo(devices[jtrUniqDevNo], CL_DEVICE_TYPE, sizeof(cl_device_type), &dTyp, NULL), "Failed Device Info");
	dTyp = get_device_type(jtrUniqDevNo);
	/* Starting LWS guess: CPUs prefer 1 work-item per group, GPUs 16. */
	if (dTyp == CL_DEVICE_TYPE_CPU)
		globalObj[jtrUniqDevNo].lws = 1;
	else
		globalObj[jtrUniqDevNo].lws = 16;

	///Set Dummy DCC hash , unicode salt and ascii salt(username) length
	memset(dcc_hash_host, 0xb5, 4 * sizeof(cl_uint) * ((max_kpc < 65536) ? max_kpc : 65536));
	memset(salt_api, 0xfe, 9 * sizeof(cl_uint));

	/* Dedicated queue with profiling so exec_pbkdf2 can time the kernel. */
	cmdq = clCreateCommandQueue(context[jtrUniqDevNo], devices[jtrUniqDevNo], CL_QUEUE_PROFILING_ENABLE, &err);
	HANDLE_CLERROR(err, "Error creating command queue");

	PROFILE = 1;
	kernelExecTimeNs = CL_ULONG_MAX;

	///Find best local work size
	/* exec_pbkdf2 is expected to bump globalObj[...].lws while profiling;
	 * the loop ends once a run no longer increases it — TODO confirm against
	 * exec_pbkdf2's implementation (defined elsewhere). */
	while (1) {
		_lws = globalObj[jtrUniqDevNo].lws;
		if (dTyp == CL_DEVICE_TYPE_CPU)
			exec_pbkdf2(dcc_hash_host, salt_api, length, 10240, dcc2_hash_host, 4096, jtrUniqDevNo, cmdq, hmac_sha1_out);
		else
			exec_pbkdf2(dcc_hash_host, salt_api, length, 10240, dcc2_hash_host, (((max_kpc < 65536) ? max_kpc : 65536) / gpu_perf), jtrUniqDevNo, cmdq, hmac_sha1_out);
		if (globalObj[jtrUniqDevNo].lws <= _lws)
			break;
	}

	/* Normalise the measured speed to a comparable scale (CPU ran a fixed
	 * 4096 batch; GPU ran max_kpc/gpu_perf keys). */
	if (dTyp == CL_DEVICE_TYPE_CPU)
		globalObj[jtrUniqDevNo].exec_time_inv = globalObj[jtrUniqDevNo].exec_time_inv / 16;
	else
		globalObj[jtrUniqDevNo].exec_time_inv *= (((max_kpc < 65536) ? max_kpc : 65536) / (long double) gpu_perf) / 65536;

	PROFILE = 0;

	if (options.verbosity > 2) {
		fprintf(stderr, "Optimal Work Group Size:%d\n", (int)globalObj[jtrUniqDevNo].lws);
		fprintf(stderr, "Kernel Execution Speed (Higher is better):%Lf\n", globalObj[jtrUniqDevNo].exec_time_inv);
	}

	MEM_FREE(dcc_hash_host);
	MEM_FREE(dcc2_hash_host);
	MEM_FREE(hmac_sha1_out);
	HANDLE_CLERROR(clReleaseCommandQueue(cmdq), "Release Command Queue:Failed");
}
/*
 * Acquire the basic OpenCL objects every regular OpenCL application needs:
 * platform, device, context and command queue.  All of them are stored into
 * the caller-provided ocl structure (see ocl_args_d_t near the top of this
 * file), so this runs once before any compute kernels are launched.
 *
 * ocl        - output structure receiving context, device and commandQueue.
 * deviceType - which kind of device (CPU/GPU/...) the context is built for.
 *
 * Returns CL_SUCCESS, or the failing OpenCL error code / CL_INVALID_VALUE
 * when no suitable platform exists.
 */
int SetupOpenCL(ocl_args_d_t *ocl, cl_device_type deviceType)
{
    // Every OpenCL call below reports its result through this status code.
    cl_int status = CL_SUCCESS;

    // Enumerate the platforms and pick the one whose name contains "Intel"
    // and that exposes a device of the requested type.
    cl_platform_id platformId = FindOpenCLPlatform("Intel", deviceType);
    if (platformId == NULL)
    {
        LogError("Error: Failed to find OpenCL platform.\n");
        return CL_INVALID_VALUE;
    }

    // Build a context holding devices of the requested type on that platform.
    // Creation is synchronous: no pfn_notify callback, no user_data.
    cl_context_properties contextProperties[] =
        {CL_CONTEXT_PLATFORM, (cl_context_properties)platformId, 0};
    ocl->context = clCreateContextFromType(contextProperties, deviceType,
                                           NULL, NULL, &status);
    if (status != CL_SUCCESS || ocl->context == NULL)
    {
        LogError("Couldn't create a context, clCreateContextFromType() returned '%s'.\n", TranslateOpenCLError(status));
        return status;
    }

    // Ask the context which device it was created around.
    status = clGetContextInfo(ocl->context, CL_CONTEXT_DEVICES,
                              sizeof(cl_device_id), &ocl->device, NULL);
    if (status != CL_SUCCESS)
    {
        LogError("Error: clGetContextInfo() to get list of devices returned %s.\n", TranslateOpenCLError(status));
        return status;
    }

    // Record the platform's and device's OpenCL / OpenCL C versions in ocl.
    GetPlatformAndDeviceVersion(platformId, ocl);

    // Create a simple in-order command queue: kernels enqueued here run one
    // after another on the target device.  OpenCL 2.0 deprecated
    // clCreateCommandQueue, hence the compile-time/run-time split below.
#ifdef CL_VERSION_2_0
    if (OPENCL_VERSION_2_0 == ocl->deviceVersion)
    {
        const cl_command_queue_properties properties[] =
            {CL_QUEUE_PROPERTIES, 0, 0};
        ocl->commandQueue = clCreateCommandQueueWithProperties(
            ocl->context, ocl->device, properties, &status);
    }
    else
    {
        // default behavior: OpenCL 1.2
        cl_command_queue_properties properties = 0;
        ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device,
                                                 properties, &status);
    }
#else
    // default behavior: OpenCL 1.2
    cl_command_queue_properties properties = 0;
    ocl->commandQueue = clCreateCommandQueue(ocl->context, ocl->device,
                                             properties, &status);
#endif
    if (status != CL_SUCCESS)
    {
        LogError("Error: clCreateCommandQueue() returned %s.\n", TranslateOpenCLError(status));
        return status;
    }

    return CL_SUCCESS;
}
std::shared_ptr<XdevLComputeDeviceQueue> XdevLComputeDeviceContextCL::createCommandQueue() { cl_int ret; cl_command_queue commandQueue = clCreateCommandQueue(m_context, m_deviceID, 0, &ret); auto tmp = std::make_shared<XdevLComputeDeviceQueueCL>(commandQueue); return tmp; }