cl_kernel get_kernel(char *kernel_name, cl_context *context, cl_device_id *device) { cl_int status = CL_SUCCESS; const char* program_source = load_program_source(PROGRAM_SRC); if(program_source == NULL) { fprintf(stderr, "Programm can not be created. File was not found."); abort(); } cl_program program = clCreateProgramWithSource(*context, 1, &program_source, NULL, &status); CL_CHECK_ERROR(status); status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); /* Print build log */ char buf[0x10000]; clGetProgramBuildInfo(program, *device, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL); if(status != CL_SUCCESS) { printf("\n-------BUILD LOG:\n %s \n-------\n", buf); fprintf(stderr, "Programm can not be build. (%s)", opencl_map_error(status)); abort(); } return clCreateKernel(program, kernel_name, &status); }
void createKernel(const char* kernel, const char* path, int indice) { // TheContext* tc = new TheContext(); // cl_context GPUContext_K = tc->getMyContext()->getContextCL(); // cl_device_id cdDevice_K = tc->getMyContext()->getDeviceCL(); // Creates the program // Uses NVIDIA helper functions to get the code string and it's size (in bytes) //size_t src_size = 0; char full_path[256]; #ifdef _VIVID_STATIC_LIB sprintf(full_path, "%s", path); #else sprintf(full_path, "%s", path); #endif char *program_source = load_program_source(full_path); if (program_source == NULL) { printf("Error: Failed to read the OpenCL kernel: %s\n",path); exit(-1); } cl_int err; program_list[indice] = clCreateProgramWithSource(GPUContext_K, 1, (const char **) &program_source, NULL, &err); if (!program_list[indice]) { printf("Error: Failed to create compute program for device %d Kernel: (%s)!\n", indice,kernel); printf("************\n%s\n************\n", program_source); } // Build the program executable const char * options = "-cl-fast-relaxed-math"; err = clBuildProgram(program_list[indice], 0, NULL, options, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[10000]; printf("Error: Failed to build program executable for device %d kernel: (%s)!\n",err,kernel); cl_int get_err=clGetProgramBuildInfo(program_list[indice], cdDevice_K, CL_PROGRAM_BUILD_LOG, sizeof (buffer), buffer, &len); printf("%d %s\n", get_err, buffer); } kernel_list[indice] = clCreateKernel(program_list[indice], kernel, &err); if (!kernel_list[indice] || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel for device %d Kernel: (%s)!\n", indice, full_path); exit(1); } }
void init_cl(int ker_id, char *kernel_path, t_cl *cl) { cl_device_id device; char *source; cl_program program; clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (!cl->context) { cl->context = clCreateContext(0, 1, &device, NULL, NULL, NULL); cl->cmd_queue = clCreateCommandQueue(cl->context, device, 0, NULL); } source = load_program_source(kernel_path); program = clCreateProgramWithSource(cl->context, 1, (const char **)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl->kernel[ker_id] = clCreateKernel(program, "thread", NULL); cl->input_cl_mem[ker_id] = clCreateBuffer(cl->context, CL_MEM_READ_ONLY, cl->gws[ker_id] * sizeof(float), NULL, NULL); cl->output_cl_mem[ker_id] = clCreateBuffer(cl->context, CL_MEM_READ_WRITE, cl->gws[ker_id] * sizeof(float), NULL, NULL); }
void opencl_setup(CLEnv& env) { /*****************************************/ /* Initialize OpenCL */ /*****************************************/ clGetPlatformIDs(1, &env.cpPlatform, NULL); clGetDeviceIDs(env.cpPlatform, CL_DEVICE_TYPE_GPU, 1, &env.cdDevice, NULL); env.context = clCreateContext(0, 1, &env.cdDevice, NULL, NULL, &env.errcode); //env.context = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU,NULL, NULL, &env.errcode); opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ ); // get the list of GPU devices associated with context env.errcode = clGetContextInfo(env.context, CL_CONTEXT_DEVICES, 0, NULL,&env.device_size); env.devices = (cl_device_id *) malloc(env.device_size); env.errcode |= clGetContextInfo(env.context, CL_CONTEXT_DEVICES, env.device_size, env.devices, NULL); opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ ); //Create a command-queue env.command_queue = clCreateCommandQueue(env.context, env.cdDevice, 0, &env.errcode); opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ ); // Load and build OpenCL kernel const char * filename = "kernel.cl"; char* kernel_source = load_program_source(filename); env.program = clCreateProgramWithSource(env.context, 1, (const char**)&kernel_source, NULL, &env.errcode); opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ ); env.errcode = clBuildProgram(env.program, 0, NULL, NULL, NULL, NULL); opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ ); env.kernel = clCreateKernel(env.program, "matrix_mul", &env.errcode); opencl_check_error(env.errcode, CL_SUCCESS, __FILE__ , __LINE__ ); free(kernel_source); }
cl_int load_kernel(cl_context context, cl_device_id *devices, unsigned int devc, cl_program *prog, cl_kernel *kern) { cl_int err; char* source = load_program_source("ConwayKernel.cl"); *prog = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, &err); if (*prog == NULL) { printf("clCreateProgramWithSource failed! (Error: %d)\n", err); return err; } err = clBuildProgram(*prog, devc, devices, NULL, NULL, NULL); if (err) { printf("clBuildProgram failed! (Error: %d)\n", err); size_t length; char buffer[2048]; clGetProgramBuildInfo(*prog, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); printf("Build log: %s\n", buffer); return err; } *kern = clCreateKernel(*prog, "evaluate_bit", &err); if (*kern == NULL) { printf("clCreateKernel failed! (Error: %d)\n", err); return err; } printf("Kernel build completed successfully\n"); return 0; }
int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem buff_A, buff_B, buff_C; int mult = 1; uint32_t uiWA, uiHA, uiWB, uiHB, uiWC, uiHC; uiWA = WA * mult; uiHA = HA * mult; uiWB = WB * mult; uiHB = HB * mult; uiWC = WC * mult; uiHC = HC * mult; printf("sizes WA %u HA %u WB %u HB %u WC %u HC %u\n", uiWA, uiHA, uiWB, uiHB, uiWC, uiHC); uint32_t size_A = uiWA * uiHA; uint32_t size_B = uiWB * uiHB; uint32_t size_C = uiWC * uiHC; size_t mem_size_A = size_A * sizeof(float); size_t mem_size_B = size_B * sizeof(float); size_t mem_size_C = size_C * sizeof(float); float *data_A = (float *)malloc(mem_size_A); float *data_B = (float *)malloc(mem_size_B); float *data_C = (float *)malloc(mem_size_C); srand(2012); shrFillArray(data_A, size_A); shrFillArray(data_B, size_B); size_t global_work_size[2]; size_t local_work_size[] = { BLOCK_SIZE, BLOCK_SIZE }; global_work_size[0] = shrRoundUp(BLOCK_SIZE, uiWC); global_work_size[1] = shrRoundUp(BLOCK_SIZE, uiHA); const char *source = load_program_source("MatrixMul.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-cl-fast-relaxed-math"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); kernel = clCreateKernel(program, "matrixMul", &err); printf("kernel %p err %d\n", kernel, err); buff_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_A, data_A, NULL); printf("buff_A %p\n", buff_A); buff_B = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, mem_size_B, data_B, NULL); printf("buff_B %p\n", buff_B); buff_C = clCreateBuffer(context, CL_MEM_WRITE_ONLY, mem_size_C, NULL, NULL); printf("buff_C %p\n", buff_C); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buff_C); printf("err %d\n", err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&buff_A); printf("err %d\n", err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&buff_B); printf("err %d\n", err); err = clSetKernelArg(kernel, 3, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 4, sizeof(float) * BLOCK_SIZE * BLOCK_SIZE, NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 5, sizeof(cl_int), (void*)&uiWA); printf("err %d\n", err); err = clSetKernelArg(kernel, 6, sizeof(cl_int), (void*)&uiWB); printf("err %d\n", err); err = clSetKernelArg(kernel, 7, sizeof(cl_int), (void*)&uiHA); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); err = clFlush(queue); printf("err %d\n", err); err = clFinish(queue); printf("err %d\n", err); err = clEnqueueReadBuffer(queue, buff_C, CL_TRUE, 0, mem_size_C, data_C, 0, NULL, NULL); printf("err %d\n", err); int i; for (i = 0; i < size_C; i++) { printf("%d %f\n", i, data_C[i]); } clReleaseMemObject(buff_A); clReleaseMemObject(buff_B); clReleaseMemObject(buff_C); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); }
CLKernel::CLKernel(cl_context context, cl_command_queue commands, cl_device_id device, const char * filename, const char * name, const char * options) { this->device = device; this->commands = commands; //this->profiling = true; //Profiling doesn't work on neuro this->profiling = false; this->elapsed = 0; #ifdef PV_USE_OPENCL int status = CL_SUCCESS; // Create the compute program from the source buffer // char * source = load_program_source(filename); program = clCreateProgramWithSource(context, 1, (const char **) &source, NULL, &status); if (!program || status != CL_SUCCESS) { printf("Error: Failed to create compute program!\n"); CLDevice::print_error_code(status); exit(status); } // Build the program executable // // TODO - fix include path status = clBuildProgram(program, 0, NULL, options, NULL, NULL); if (status != CL_SUCCESS) { size_t len; char buffer[150641]; //[12050]; //[8192]; printf("Error: Failed to build program executable!\n"); CLDevice::print_error_code(status); status = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); if (status != CL_SUCCESS) { printf("CLKernel: error buffer length may be too small, is %ld, should be %ld\n", sizeof(buffer), len); CLDevice::print_error_code(status); } printf("%s\n", buffer); exit(status); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, name, &status); if (!kernel || status != CL_SUCCESS) { fprintf(stderr, "Error: Failed to create compute kernel!\n"); CLDevice::print_error_code(status); exit(status); } #endif // PV_USE_OPENCL }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 10; struct timespec start, finish; // Host input vectors int *h_a; int *h_b; // Host output vector int *h_c; double elapsed; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(int); // Allocate memory for each vector on host h_a = (int*)malloc(bytes); h_b = (int*)malloc(bytes); h_c = (int*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = i; h_b[i] = i; } size_t globalSize, localSize; cl_int err; int workgrp; int wrkitm; //wrkitm=atoi(argv[1]); // Number of work items in each local work group // localSize = wrkitm ; //workgrp=atoi(argv[2]); // Number of total work items - localSize must be devisor globalSize = n;//ceil(n/(float)localSize)*localSize; //cl_uint platformCount; //cl_platform_id* platforms; //clGetPlatformIDs(0, NULL, &platformCount); // platforms = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); // clGetPlatformIDs(platformCount, platforms, NULL); //printf("%d",platformCount); // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); } // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) printf("Error: Failed to create a compute context!\n"); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); //loading external cl file const char *file="vectadd.cl"; const char *kernelSource = load_program_source(file); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL); clFinish(queue); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); clock_gettime(CLOCK_MONOTONIC, &start); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); clock_gettime(CLOCK_MONOTONIC, &finish); elapsed = (finish.tv_sec - start.tv_sec); elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0; // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); clFinish(queue); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("Work Item/threads = %d \n",wrkitm); printf("time taken by GPU = %le\n ",elapsed); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
int main(int argc, char **argv) { int err; cl_device_id device_id; cl_command_queue commands; cl_context context; cl_mem output_buffer; cl_mem input_buffer; cl_mem partials_buffer; size_t typesize; int pass_count = 0; size_t* group_counts = 0; size_t* work_item_counts = 0; int* operation_counts = 0; int* entry_counts = 0; int use_gpu = 1; int i; int c; // Parse command line options // for( i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) { use_gpu = 0; } else if(strstr(argv[i], "gpu")) { use_gpu = 1; } } channels=1; // Create some random input data on the host // time_t tstart=0; (void) time(&tstart); srand48((long) tstart); float *float_data = (float*)malloc(count * channels * sizeof(float)); for (i = 0; i < count * channels; i++) { float_data[i] = drand48(); } //SETUP PLATFORM cl_uint numPlatforms; err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS) { fprintf(stderr,"could not get platform\n"); exit(EXIT_FAILURE); } cl_platform_id platform_id; if(numPlatforms > 0) { //we have at least one //cl_platform_id* platforms = new cl_platform_id[numPlatforms]; cl_platform_id* platforms = calloc(numPlatforms, sizeof(cl_platform_id)); err = clGetPlatformIDs(numPlatforms, platforms, NULL); if (err != CL_SUCCESS) { fprintf(stderr,"could not get platform id\n"); exit(EXIT_FAILURE); } fprintf(stderr,"Found %d platforms\n", numPlatforms); platform_id = platforms[0]; //delete[] platforms; free(platforms); } else exit(0); // Connect to a compute device // err = clGetDeviceIDs(platform_id, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to locate a compute device!\n"); return EXIT_FAILURE; } size_t returned_size = 0; size_t max_workgroup_size = 0; err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve device info!\n"); return EXIT_FAILURE; } cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve device info!\n"); return EXIT_FAILURE; } printf(SEPARATOR); printf("Connecting to %s %s...\n", vendor_name, device_name); // Load the compute program from disk into a cstring buffer // typesize = (sizeof(float)); const char* filename = 0; filename = "apple-reduce-kernel-float.cl"; printf(SEPARATOR); printf("Loading program '%s'...\n", filename); printf(SEPARATOR); char *source = load_program_source(filename); if(!source) { printf("Error: Failed to load compute program from file!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the input buffer on the device // size_t buffer_size = typesize * count * channels; input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); if (!input_buffer) { printf("Error: Failed to allocate input buffer on device!\n"); return EXIT_FAILURE; } // Fill the input buffer with the host allocated random data // void *input_data = (void*)float_data; err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); return EXIT_FAILURE; } // Create an intermediate data buffer for intra-level results // partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); if (!partials_buffer) { printf("Error: Failed to allocate partial sum buffer on device!\n"); return EXIT_FAILURE; } // Create the output buffer on the device // output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); if (!output_buffer) { printf("Error: Failed to allocate result buffer on device!\n"); return EXIT_FAILURE; } // Determine the reduction pass configuration for each level in the pyramid // create_reduction_pass_counts( count, max_workgroup_size, MAX_GROUPS, MAX_WORK_ITEMS, &pass_count, &group_counts, &work_item_counts, &operation_counts, &entry_counts); // Create specialized programs and kernels for each level of the reduction // cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program)); memset(programs, 0, pass_count * sizeof(cl_program)); cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel)); memset(kernels, 0, pass_count * sizeof(cl_kernel)); for(i = 0; i < pass_count; i++) { char *block_source = malloc(strlen(source) + 1024); size_t source_length = strlen(source) + 1024; memset(block_source, 0, source_length); // Insert macro definitions to specialize the kernel to a particular group size // const char group_size_macro[] = "#define GROUP_SIZE"; const char operations_macro[] = "#define OPERATIONS"; sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n", group_size_macro, (int)group_counts[i], operations_macro, (int)operation_counts[i], source); // Create the compute program from the source buffer // programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err); if (!programs[i] || err != CL_SUCCESS) { printf("%s\n", block_source); printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t length; char build_log[2048]; printf("%s\n", block_source); printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length); printf("%s\n", build_log); return EXIT_FAILURE; } // Create the compute kernel from within the program // kernels[i] = clCreateKernel(programs[i], "reduce", &err); if (!kernels[i] || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); return EXIT_FAILURE; } free(block_source); } // Do the reduction for each level // this is one pass over it to establish the kernel args and such, so // it is negligible time // cl_mem pass_swap; cl_mem pass_input = output_buffer; cl_mem pass_output = input_buffer; for(i = 0; i < pass_count; i++) { size_t global = group_counts[i] * work_item_counts[i]; size_t local = work_item_counts[i]; unsigned int operations = operation_counts[i]; unsigned int entries = entry_counts[i]; size_t shared_size = typesize * channels * local * operations; printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n", i, (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries); // Swap the inputs and outputs for each pass // pass_swap = pass_input; pass_input = pass_output; pass_output = pass_swap; err = CL_SUCCESS; err |= clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &pass_output); err |= clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &pass_input); err |= clSetKernelArg(kernels[i], 2, shared_size, NULL); err |= clSetKernelArg(kernels[i], 3, sizeof(int), &entries); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // After the first pass, use the partial sums for the next input values // if(pass_input == input_buffer) pass_input = partials_buffer; err = CL_SUCCESS; err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: Failed to wait for command queue to finish! %d\n", err); return EXIT_FAILURE; } // Start the timing loop and execute the kernel over several iterations // printf(SEPARATOR); printf("Timing %d iterations of reduction with %d elements of type %s%s...\n", iterations, count, "float", (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4"); printf(SEPARATOR); int k; err = CL_SUCCESS; time_t t1 = clock(); for (k = 0 ; k < iterations; k++) { for(i = 0; i < pass_count; i++) { size_t global = group_counts[i] * work_item_counts[i]; size_t local = work_item_counts[i]; err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } } } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: Failed to wait for command queue to finish! %d\n", err); return EXIT_FAILURE; } time_t t2 = clock(); // Calculate the statistics for execution time and throughput // double t = (t2-t1)/( (double)CLOCKS_PER_SEC ); printf("Exec Time: %.2f ms\n", t); printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t); printf(SEPARATOR); // Read back the results that were computed on the device // void *computed_result = malloc(typesize * channels); memset(computed_result, 0, typesize * channels); err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL); if (err) { printf("Error: Failed to read back results from the device!\n"); return EXIT_FAILURE; } // now do the speed test on standard float reference=0; t1 = clock(); for (k=0; k<iterations; k++) { reference = reduce_validate_float(float_data, count); } t2 = clock(); double tcpu = (t2-t1)/( (double)CLOCKS_PER_SEC ); printf("CPU Exec Time: %.2f ms\n", tcpu); printf("CPU Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / tcpu); printf("GPU is faster by %.16g\n", tcpu/t); printf(SEPARATOR); float result= ( (float *)computed_result )[0]; float ferror = fabs(reference - result)/reference; if (ferror > MIN_ERROR) { printf("Result %.16g != %.16g\n", reference, result); printf("Error: Incorrect results obtained! Rel error %.16g > Max allowed = %.16g\n", ferror, MIN_ERROR); return EXIT_FAILURE; } else { printf("Results Validated!\n"); printf(SEPARATOR); } // Shutdown and cleanup // for(i = 0; i < pass_count; i++) { clReleaseKernel(kernels[i]); clReleaseProgram(programs[i]); } clReleaseMemObject(input_buffer); clReleaseMemObject(output_buffer); clReleaseMemObject(partials_buffer); clReleaseCommandQueue(commands); clReleaseContext(context); free(group_counts); free(work_item_counts); free(operation_counts); free(entry_counts); free(computed_result); free(kernels); free(float_data); return 0; }
int main(int argc, char **argv){ printf("Check OpenCL environtment\n"); cl_platform_id platid; cl_device_id devid; cl_int res; size_t param; /* Query OpenCL, get some information about the returned device */ clGetPlatformIDs(1u, &platid, NULL); clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL); cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL); clGetDeviceInfo(devid, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name); clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), ¶m, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param); clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), ¶m, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param); clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param); /* Check if kernel source exists, we compile argv[1] passed kernel */ if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); } char *kernel_source; if(load_program_source(argv[1], &kernel_source)) return 1; printf("Building from OpenCL source: \t%s\n", argv[1]); printf("Compile/query OpenCL_program:\t%s\n", argv[2]); /* Create context and kernel program */ cl_context context = clCreateContext(0, 1, &devid, NULL, NULL, NULL); cl_program pro = clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL); res = clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL); if(res != CL_SUCCESS){ printf("clBuildProgram failed: %d\n", res); char buf[0x10000]; clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL); printf("\n%s\n", buf); return(-1); } cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); check_return(res); /* Get the maximum work-group size for executing the kernel on the device */ size_t global, local; res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL); check_return(res); printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local); res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); check_return(res); printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param); cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL); if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); } local = 4; int n = 2 * local; //num_group * local workgroup size global = n; int num_groups= global / local, allocated_local= sizeof(data) * local + sizeof(debug) * local; data *DP __attribute__ ((aligned(16))); DP = calloc(n, sizeof(data) *1); debug *dbg __attribute__ ((aligned(16))); dbg = calloc(n, sizeof(debug)); printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups); printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256)); printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local); cl_mem cl_DP, cl_EC, cl_INV, DEBUG; cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res); check_return(res); cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(Elliptic_Curve), NULL, &res); check_return(res); //_constant address space cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(u8) * 0x80, NULL, &res); check_return(res); DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res); check_return(res); Elliptic_Curve EC; /* Curve domain parameters, (test vectors) ------------------------------------------------------------------------------------- p: c1c627e1638fdc8e24299bb041e4e23af4bb5427 is prime a: c1c627e1638fdc8e24299bb041e4e23af4bb5424 divisor g = 62980 b: 877a6d84155a1de374b72d9f9d93b36bb563b2ab divisor g = 227169643 Gx: 010aff82b3ac72569ae645af3b527be133442131 divisor g = 32209245 Gy: 46b8ec1e6d71e5ecb549614887d57a287df573cc divisor g = 972 precomputed_per_curve_constants: U: c1c627e1638fdc8e24299bb041e4e23af4bb5425 V: 3e39d81e9c702371dbd6644fbe1b1dc50b44abd9 already prepared mod p to test: a: 07189f858e3f723890a66ec1079388ebd2ed509c b: 6043379beb0dade6eed1e9d6de64f4a0c50639d4 gx: 5ef84aacf4f0ea6752f572d0741f40049f354dca gy: 418c695435af6b3d4d7cbb72967395016ef67239 resulting point: P.x: 01718f862ebe9423bd661a65355aa1c86ba330f8 program MUST got this point !! P.y: 557e8ed53ffbfe2c990a121967b340f62e0e4fe2 taken mod p: P.x: 41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3 P.y: 73ca143c9badedf2d9d3c7573307115ccfe04f13 */ u8 *t; t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427"); memcpy(EC.p, t, 20); t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c"); memcpy(EC.a, t, 20); t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4"); memcpy(EC.b, t, 20); t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca"); memcpy(EC.Gx, t, 20); t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239"); memcpy(EC.Gy, t, 20); t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425"); memcpy(EC.U, t, 20); t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9"); memcpy(EC.V, t, 20); /* we need to map buffer now to load some k into data */ DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710"); for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21); free(t); //d for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1); /* we can alter just a byte into a chosen k to verify that we'll get a different point! */ //DP[2].k[2] = 0x09; //no res = clEnqueueWriteBuffer(cmd_queue, cl_DP, CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_EC, CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL); check_return(res); res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP); /* i/o buffer */ res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC); res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV); res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG); //this used to debug kernel output check_return(res); // printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local); cl_event NDRangeEvent; cl_ulong start, end; /* Execute NDrange */ res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent); check_return(res); // res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent); check_return(res); printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data)); DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res); check_return(res); /* using clEnqueueReadBuffer template */ // res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL); check_return(res); clFlush(cmd_queue); clFinish(cmd_queue); /* get NDRange execution time with internal ocl profiler */ res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); check_return(res); printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000)); //relative to NDRange call printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start))); printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n"); for(int i = 0; i < n; i++) { // if(i %local == 0) { printf("%d \t", i); //printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7)); /* silence this doubled debug info printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3], dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]); */ //printf("%d %d\n", P[i].dig, P[i].c); bn_print("", DP[i].k, 21, 1); bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1); printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3], DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]); // } } /* Release OpenCL stuff, free the rest */ clReleaseMemObject(cl_DP); clReleaseMemObject(cl_EC); clReleaseMemObject(cl_INV); clReleaseMemObject(DEBUG); clReleaseKernel(kernelobj); clReleaseProgram(pro); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(kernel_source); puts("Done!"); return 0; }
int main(int argc, char *argv[]) { /* Variables used to manage the OpenCL environment. */ cl_int rc; size_t return_size[1]; unsigned int column_span = 0; static cl_device_type device_type = CL_DEVICE_TYPE_DEFAULT; static cl_uint kernel_type = KERNEL_DEFAULT; static int gpu_wgsz = MAX_WGSZ; /* The external file containing the matrix data in Matrix Market format */ static char *file_name; /* These variables deal with the source file for the kernel, and the names of the kernels contained therein. */ char kernel_source_file[8] = "spmv.cl"; char kernel_name_LS[21] = "tiled_spmv_kernel_LS"; char kernel_name_AWGC[23] = "tiled_spmv_kernel_AWGC"; char kernel_name[32]; /* Basic "size of problem" variables. */ unsigned int nx; /* Number of elements in the X direction (length of the "input" vector. */ unsigned int ny; /* Number of elements in the Y direction (length of the "answer" vector. */ unsigned int non_zero; /* Number of non_zero elements in the matrix. */ unsigned int nx_pad, nyround; /* Rounded versions of nx and ny. */ /* Variables used to hold user-specified overrides and intermediate control values derived from them. */ unsigned int *slab_startrow = NULL; unsigned int segcachesize; unsigned int max_slabheight; /* Maximum matrix chunksize. */ unsigned int i, j, pdex = 0, ddex = 0; size_t param_value_size_ret; /* ================================================================================== */ /* Read in command line arguments. */ /* ================================================================================== */ int opt; int option_index; struct option long_options[] = { {"help", no_argument, NULL, 'h'}, {"accel", no_argument, NULL, 'a'}, {"cpu", no_argument, NULL, 'c'}, {"gpu", no_argument, NULL, 'g'}, {"ls", no_argument, NULL, 'L'}, {"awgc", no_argument, NULL, 'A'}, {"verify", no_argument, NULL, 'v'}, {"lwgsize", required_argument, NULL, 'l'}, {"filename", required_argument, NULL, 'f'}, {NULL, 0, NULL, 0} }; char *name; /* ================================================================================== */ /* Change current working directory to that of the invocation path so that spmv can */ /* be run from any current working directory. */ /* ================================================================================== */ name = basename(argv[0]); (void)chdir(dirname(argv[0])); while (1) { opt = getopt_long(argc, argv, "hacgLAl:f:", long_options, &option_index); if (opt == -1) break; switch (opt) { /* -h, --help */ case 'h': usage(); exit(EXIT_SUCCESS); /* -a, --accel */ case 'a': device_type = CL_DEVICE_TYPE_ACCELERATOR; break; /* -c, --cpu */ case 'c': device_type = CL_DEVICE_TYPE_CPU; break; /* -g, --gpu */ case 'g': device_type = CL_DEVICE_TYPE_GPU; break; /* -L, --ls */ case 'L': kernel_type = KERNEL_LS; break; /* -A, --awgc */ case 'A': kernel_type = KERNEL_AWGC; break; /* -l, --lwgsize */ case 'l': gpu_wgsz = atoi(optarg); break; /* -f, --filename */ case 'f': posix_memalign((void **) &file_name, 128, 1+strlen(optarg)); strcpy(file_name, optarg); break; case '?': printf("Try '%s --help' for more information.\n", name); exit(EXIT_FAILURE); } } if (optind != argc) { printf("%s: unrecognized option '%s'.\n", name, argv[optind]); printf("Try '%s --help' for more information.\n", name); exit(EXIT_FAILURE); } /* ================================================================================== */ /* Start up OpenCL. */ /* ================================================================================== */ cl_uint preferred_alignment = 16; // used by "MEMORY_ALLOC_CHECK" macro cl_uint num_platforms; rc = clGetPlatformIDs(0, (cl_platform_id *) NULL, &num_platforms); CHECK_RESULT("clGetPlatformIDs(num_platforms)") platform_struct *platform; MEMORY_ALLOC_CHECK(platform, num_platforms * sizeof(platform_struct), "platform"); cl_mem *buffer; MEMORY_ALLOC_CHECK(buffer, num_platforms * sizeof(cl_mem), "buffer"); cl_platform_id *temp_platform_id_array; MEMORY_ALLOC_CHECK(temp_platform_id_array, num_platforms * sizeof(cl_platform_id), "temp_platform_id_array"); rc = clGetPlatformIDs(num_platforms, temp_platform_id_array, (cl_uint *) NULL); CHECK_RESULT("clGetPlatform IDs(Platform IDs)") for (i=0; i<num_platforms; ++i) { platform[i].id = temp_platform_id_array[i]; } free(temp_platform_id_array); printf("[START RUN]\n"); printf("command line: "); for (i=0; i<(unsigned int) argc; ++i) { printf("%s ", argv[i]); } printf("\n"); //printf("num_platforms = %d\n\n", num_platforms); for (i=0; i<num_platforms; ++i) { rc = clGetPlatformInfo(platform[i].id, CL_PLATFORM_NAME, (size_t) 0, NULL, (size_t *) ¶m_value_size_ret); CHECK_RESULT("clGetPlatformInfo(size of platform name)") MEMORY_ALLOC_CHECK(platform[i].name, param_value_size_ret, "platform name"); rc = clGetPlatformInfo(platform[i].id, CL_PLATFORM_NAME, param_value_size_ret, platform[i].name, (size_t *) NULL); CHECK_RESULT("clGetPlatformInfo(platform name)") rc = clGetDeviceIDs(platform[i].id, CL_DEVICE_TYPE_ALL, 0, NULL, (cl_uint *) &(platform[i].num_devices)); CHECK_RESULT("clGetDeviceIDs(number of devices)") MEMORY_ALLOC_CHECK(platform[i].device, platform[i].num_devices * sizeof(device_struct), "device structure"); cl_device_id *tmpdevices; MEMORY_ALLOC_CHECK(tmpdevices, platform[i].num_devices * sizeof(cl_device_id), "tmpdevices"); rc = clGetDeviceIDs(platform[i].id, CL_DEVICE_TYPE_ALL, platform[i].num_devices, tmpdevices, NULL); CHECK_RESULT("clGetDeviceIDs(list of device IDs)") for (j=0; j<platform[i].num_devices; ++j) { platform[i].device[j].id = tmpdevices[j]; rc = clGetDeviceInfo(platform[i].device[j].id, CL_DEVICE_TYPE, sizeof(cl_device_type), &platform[i].device[j].type, NULL); CHECK_RESULT("clGetDeviceInfo(device type)") } free(tmpdevices); } /* ================================================================================== */ /* Choose the best device to use, if one is not explicitly called for. */ /* If a device is specified, ensure that device is present on this hardware. */ /* ================================================================================== */ if (device_type == CL_DEVICE_TYPE_DEFAULT) { int accel_found = 0; for (i=0; i<num_platforms; ++i) { for (j=0; j<platform[i].num_devices; ++j) { if (platform[i].device[j].type == CL_DEVICE_TYPE_ACCELERATOR) { accel_found = 1; pdex = i; ddex = j; } } } if (!accel_found) { int gpu_found = 0; for (i=0; i<num_platforms; ++i) { for (j=0; j<platform[i].num_devices; ++j) { if ((gpu_found == 0) && (platform[i].device[j].type == CL_DEVICE_TYPE_GPU)) { gpu_found = 1; pdex = i; ddex = j; } } } if (!gpu_found) { int cpu_found = 0; for (i=0; i<num_platforms; ++i) { for (j=0; j<platform[i].num_devices; ++j) { if (platform[i].device[j].type == CL_DEVICE_TYPE_CPU) { cpu_found = 1; pdex = i; ddex = j; } } } if (!cpu_found) { fprintf(stderr, "no devices of any kind were found on this system. Leaving...\n"); fflush(stderr); exit(EXIT_FAILURE); } } } } else { int device_found = 0; for (i=0; i<num_platforms; ++i) for (j=0; j<platform[i].num_devices; ++j) { if (platform[i].device[j].type == device_type) { device_found = 1; pdex = i; ddex = j; } } if (device_found == 0) { fprintf(stderr, "no devices of the requested type were found on this system. Leaving...\n"); fflush(stderr); exit(EXIT_FAILURE); } } /* ================================================================================== */ /* Choose the best kernel to use, if one is not explicitly called for. */ /* ================================================================================== */ if (kernel_type == KERNEL_DEFAULT) { kernel_type = (platform[pdex].device[ddex].type == CL_DEVICE_TYPE_ACCELERATOR) ? KERNEL_AWGC : KERNEL_LS; } /* ================================================================================== */ /* Create a context. */ /* ================================================================================== */ cl_context_properties properties[3]; properties[0] = CL_CONTEXT_PLATFORM; properties[1] = (const cl_context_properties) platform[pdex].id; properties[2] = 0; platform[pdex].context = clCreateContext((const cl_context_properties *) properties, 1, &(platform[pdex].device[ddex].id), NULL, NULL, &rc); CHECK_RESULT("clCreateContext") /* ================================================================================== */ /* Build the kernel, create the Command Queue, and print kernel/device info. */ /* ================================================================================== */ switch (kernel_type) { case KERNEL_LS: strcpy(kernel_name, kernel_name_LS); break; case KERNEL_AWGC: strcpy(kernel_name, kernel_name_AWGC); break; } char *kernel_source; kernel_source = load_program_source(kernel_source_file); if (kernel_source == NULL) { fprintf(stderr, "Error: Failed to load compute program from file!\n"); exit(EXIT_FAILURE); } platform[pdex].program = clCreateProgramWithSource(platform[pdex].context, 1, (const char **) &kernel_source, NULL, &rc); CHECK_RESULT("clCreateProgramWithSource") free(kernel_source); rc = clBuildProgram(platform[pdex].program, 1, &(platform[pdex].device[ddex].id), "", NULL, NULL); CHECK_RESULT("clBuildProgram") platform[pdex].kernel = clCreateKernel(platform[pdex].program, kernel_name, &rc); CHECK_RESULT("clCreateKernel") platform[pdex].device[ddex].ComQ = clCreateCommandQueue(platform[pdex].context, platform[pdex].device[ddex].id, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &rc); CHECK_RESULT("clCreateCommandQueue") rc = clGetDeviceInfo(platform[pdex].device[ddex].id, CL_DEVICE_NAME, (size_t) 0, NULL, (size_t *) ¶m_value_size_ret); CHECK_RESULT("clGetDeviceInfo(size of CL_DEVICE_NAME)") MEMORY_ALLOC_CHECK(platform[pdex].device[ddex].name, param_value_size_ret, "device name"); rc = clGetDeviceInfo(platform[pdex].device[ddex].id, CL_DEVICE_NAME, (size_t) param_value_size_ret, platform[pdex].device[ddex].name, (size_t *) NULL); CHECK_RESULT("clGetDeviceInfo(CL_DEVICE_NAME)") printf("We'll run kernel %s on device %s\n", ((kernel_type == KERNEL_LS) ? "kernel_ls" : "kernel_awgc"), platform[pdex].device[ddex].name); /* ================================================================================== */ /* Determine device alignment, and whether "out-of-order" processing is supported. */ /* ================================================================================== */ rc = clGetDeviceInfo(platform[pdex].device[ddex].id, CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(cl_uint), &preferred_alignment, NULL); CHECK_RESULT("clGetDeviceInfo(CL_DEVICE_MEM_BASE_ADDR_ALIGN)") if (preferred_alignment > 1024) preferred_alignment = 1024; preferred_alignment /= 8; /* Convert from units of bits to units of bytes. */ cl_command_queue_properties command_queue_properties; clGetDeviceInfo (platform[pdex].device[ddex].id, CL_DEVICE_QUEUE_PROPERTIES, sizeof(cl_command_queue_properties), &command_queue_properties, NULL); command_queue_properties &= CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE; /* ================================================================================== */ /* Determine local memory size and maximum compute units. */ /* ================================================================================== */ size_t kernel_wg_size; rc = clGetKernelWorkGroupInfo (platform[pdex].kernel, platform[pdex].device[ddex].id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &kernel_wg_size, return_size); CHECK_RESULT("clGetKernelWorkGroupInfo(CL_KERNEL_WORK_GROUP_SIZE)") cl_ulong total_local_mem; rc = clGetDeviceInfo (platform[pdex].device[ddex].id, CL_DEVICE_LOCAL_MEM_SIZE, sizeof (cl_ulong), (void *) &total_local_mem, NULL); CHECK_RESULT("clGetDeviceInfo(CL_DEVICE_LOCAL_MEM_SIZE)") cl_ulong used_local_mem; rc = clGetKernelWorkGroupInfo (platform[pdex].kernel, platform[pdex].device[ddex].id, CL_KERNEL_LOCAL_MEM_SIZE, sizeof (cl_ulong), &used_local_mem, NULL); CHECK_RESULT("clGetKernelWorkGroupInfo(CL_KERNEL_LOCAL_MEM_SIZE)") cl_ulong local_mem_size; local_mem_size = total_local_mem - used_local_mem; cl_uint max_compute_units; clGetDeviceInfo (platform[pdex].device[ddex].id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); /* ================================================================================== */ /* Set up parameter structure and call the function that builds the tiled matrix. */ /* ================================================================================== */ matrix_gen_struct mgs; unsigned int nslabs_round, memsize; packet *seg_workspace; slab_header *matrix_header; unsigned int num_header_packets; unsigned int *row_index_array = NULL; unsigned int *x_index_array = NULL; float *data_array = NULL; mgs.matrix_header = &matrix_header; mgs.seg_workspace = &seg_workspace; mgs.num_header_packets = &num_header_packets; mgs.row_index_array = &row_index_array; mgs.x_index_array = &x_index_array; mgs.data_array = &data_array; mgs.nx_pad = &nx_pad; mgs.nyround = &nyround; mgs.slab_startrow = &slab_startrow; mgs.nx = &nx; mgs.ny = &ny; mgs.non_zero = &non_zero; mgs.file_name = (char *) file_name; mgs.preferred_alignment = preferred_alignment; mgs.max_compute_units = &max_compute_units; mgs.kernel_type = kernel_type; mgs.column_span = &column_span; mgs.local_mem_size = (unsigned int) local_mem_size; mgs.segcachesize = &segcachesize; mgs.max_slabheight = &max_slabheight; mgs.device_type = platform[pdex].device[ddex].type, mgs.gpu_wgsz = &gpu_wgsz, mgs.kernel_wg_size = kernel_wg_size; mgs.nslabs_round = &nslabs_round; mgs.memsize = &memsize; rc = matrix_gen(&mgs); /* =============================================================================================== */ /* Compute the local and global work group sizes. */ /* =============================================================================================== */ unsigned int ndims; unsigned int team_size; size_t global_work_size[3]; size_t local_work_size[3]; if (kernel_type == KERNEL_AWGC) { ndims = 1; global_work_size[0] = nslabs_round; local_work_size[0] = 1; } else { ndims = 2; team_size = (platform[pdex].device[ddex].type == CL_DEVICE_TYPE_GPU) ? 16 : 1; global_work_size[1] = nslabs_round; local_work_size[1] = 1; global_work_size[0] = local_work_size[0] = (platform[pdex].device[ddex].type == CL_DEVICE_TYPE_GPU) ? gpu_wgsz : CPU_WGSZ; int max_aggregate_local_work_group_size = 0; int aggregate_local_work_group_size = 1; for (i=0; i<ndims; ++i) { aggregate_local_work_group_size *= local_work_size[i]; } max_aggregate_local_work_group_size = aggregate_local_work_group_size; if (max_aggregate_local_work_group_size > (int) kernel_wg_size) { while (max_aggregate_local_work_group_size > (int) kernel_wg_size) { local_work_size[0] /= 2; gpu_wgsz /= 2; max_aggregate_local_work_group_size /= 2; } printf("coercing work group size to fit within hardware limits. New size is %d\n", gpu_wgsz); } } /* =============================================================================================== */ /* Our Tiled format is now complete, but still in "working storage". We cannot allocate its */ /* buffer in OpenCL until we know how big it is, and now, we finally know how big it is. So, we */ /* create the Input and Output arrays, and the final array to hold the Tiled Format of the Matrix. */ /* =============================================================================================== */ /* Arrays to hold input and output data, and the finished tiled matrix data. */ float *input_array, *output_array, *output_array_verify; unsigned int *tilebuffer; MEMORY_ALLOC_CHECK(output_array_verify, (nyround * sizeof(float)), "output_array_verify") if (output_array_verify == NULL) { fprintf(stderr, "insufficient memory to perform this workload.\n"); fflush(stderr); exit(EXIT_FAILURE); } cl_mem input_buffer; cl_mem matrix_buffer; cl_mem output_buffer; unsigned int input_buffer_size; unsigned int matrix_buffer_size; /* Create the input and matrix buffer memory objects. */ input_buffer_size = (nx_pad * sizeof(float)); input_buffer = clCreateBuffer(platform[pdex].context, CL_MEM_ALLOC_HOST_PTR, input_buffer_size, NULL, &rc); CHECK_RESULT("clCreateBuffer(input_buffer)") matrix_buffer_size = memsize; matrix_buffer = clCreateBuffer(platform[pdex].context, CL_MEM_ALLOC_HOST_PTR, matrix_buffer_size, NULL, &rc); CHECK_RESULT("clCreateBuffer(matrix_buffer)") cl_event events[2]; unsigned int output_buffer_size; output_buffer_size = (slab_startrow[nslabs_round] - slab_startrow[0]) * sizeof(float); output_buffer = clCreateBuffer(platform[pdex].context, CL_MEM_ALLOC_HOST_PTR, output_buffer_size, NULL, &rc); CHECK_RESULT("clCreateBuffer(output_buffer)") /* =============================================================================================== */ /* Map these buffers to allocate pointers into these buffers that we can use to load them. */ /* =============================================================================================== */ input_array = (float *) clEnqueueMapBuffer(platform[pdex].device[ddex].ComQ, input_buffer, CL_TRUE, CL_MAP_WRITE, 0, (size_t) input_buffer_size, 0, NULL, NULL, &rc); CHECK_RESULT("clEnqueueMapBuffer(input_array)") tilebuffer = (unsigned int *) clEnqueueMapBuffer(platform[pdex].device[ddex].ComQ, matrix_buffer, CL_TRUE, CL_MAP_WRITE, 0, (size_t) matrix_buffer_size, 0, NULL, NULL, &rc); CHECK_RESULT("clEnqueueMapBuffer(tilebuffer)") output_array = (float *) clEnqueueMapBuffer(platform[pdex].device[ddex].ComQ, output_buffer, CL_TRUE, CL_MAP_WRITE, 0, (size_t) output_buffer_size, 0, NULL, NULL, &rc); CHECK_RESULT("clEnqueueMapBuffer(output_array)") /* =============================================================================================== */ /* Copy the tiled matrix into the memory buffer, and then unmap it. */ /* =============================================================================================== */ memcpy(tilebuffer, seg_workspace, sizeof(packet) * (matrix_header[nslabs_round].offset)); rc = clEnqueueUnmapMemObject(platform[pdex].device[ddex].ComQ, matrix_buffer, tilebuffer, 0, NULL, &events[0]); CHECK_RESULT("clEnqueueUnmapMemObject(tilebuffer)") clWaitForEvents(1, events); /* Load random data into the input array. */ /* The user can substitute initialization of real data at this point in the code. */ for (i=0; i<nx; ++i) { float rval; rval = ((float) (rand() & 0x7fff)) * 0.001f - 15.0f; input_array[i] = rval; } /* Zero out the output array. */ /* Note that this is only needed because some matrices are singular and have whole rows */ /* that are all zero, which is detected, and no work is done on those rows, so that they */ /* will never get written by the kernel, so to be safe, we zero it all out here, as well. */ memset((void *) output_array, 0, output_buffer_size); /* =============================================================================================== */ /* Unmap the input and output memory buffers, to prepare for kernel execution. */ /* =============================================================================================== */ rc = clEnqueueUnmapMemObject(platform[pdex].device[ddex].ComQ, input_buffer, input_array, 0, NULL, &events[0]); CHECK_RESULT("clEnqueueUnmapMemObject(input_array)") rc = clEnqueueUnmapMemObject(platform[pdex].device[ddex].ComQ, output_buffer, output_array, 0, NULL, &events[1]); CHECK_RESULT("clEnqueueUnmapMemObject(output_array)") clWaitForEvents(2, events); /* =============================================================================================== */ /* Execution: Multiplication of the input array times the Tiled Format of the Matrix. */ /* =============================================================================================== */ /* Run once to verifying correct answer, and computing a baseline number of repetitions for later performance measurements. */ rc = clSetKernelArg(platform[pdex].kernel, 0, sizeof(cl_mem), (const void *) &input_buffer); CHECK_RESULT("clSetKernelArg(0)") rc = clSetKernelArg(platform[pdex].kernel, 1, sizeof(cl_mem), (const void *) &output_buffer); CHECK_RESULT("clSetKernelArg(1)") rc = clSetKernelArg(platform[pdex].kernel, 2, sizeof(cl_mem), (const void *) &matrix_buffer); CHECK_RESULT("clSetKernelArg(2)") rc = clSetKernelArg(platform[pdex].kernel, 3, sizeof(cl_uint), &column_span); CHECK_RESULT("clSetKernelArg(3)") rc = clSetKernelArg(platform[pdex].kernel, 4, sizeof(cl_uint), &max_slabheight); CHECK_RESULT("clSetKernelArg(4)") if (kernel_type == KERNEL_LS) { rc = clSetKernelArg(platform[pdex].kernel, 5, sizeof(cl_uint), &team_size); CHECK_RESULT("clSetKernelArg(5)") rc = clSetKernelArg(platform[pdex].kernel, 6, sizeof(cl_uint), &num_header_packets); CHECK_RESULT("clSetKernelArg(6)") rc = clSetKernelArg(platform[pdex].kernel, 7, (size_t) (max_slabheight * sizeof(float)), (void *) NULL); CHECK_RESULT("clSetKernelArg(7)") }
int main(int argc, char * argv[]) { init_rpc(argv[1]); cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem d_in_pos, d_in_vel, d_out_pos, d_out_vel; int iterations = 100; int num_bodies = 1024; float espSqr = 500.0f; float delT = 0.005f; int exchange = 1; size_t buf_size = 4 * num_bodies * sizeof(float); float *ref_pos = (float *)malloc(buf_size); float *ref_vel = (float *)malloc(buf_size); int i, j; for (i = 0; i < num_bodies; i++) { int index = 4 * i; for (j = 0; j < 3; ++j) { ref_pos[index + j] = frandom(3, 50); } ref_pos[index + 3] = frandom(1, 1000); for (j = 0; j < 3; ++j) { ref_vel[index + j] = 0.0f; } ref_vel[3] = 0.0f; } size_t local_work_size[1]; size_t global_work_size[1]; local_work_size[0] = 256; global_work_size[0] = num_bodies; const char *source = load_program_source("NBody.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = ""; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); kernel = clCreateKernel(program, "nbody_sim", NULL); printf("kernel %p\n", kernel); d_in_pos = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, buf_size, ref_pos, &err); printf("d_in_pos %p err %d\n", d_in_pos, err); d_in_vel = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, buf_size, ref_vel, &err); printf("d_in_vel %p err %d\n", d_in_vel, err); d_out_pos = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err); printf("d_out_pos %p err %d\n", d_out_pos, err); d_out_vel = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &err); printf("d_out_vel %p err %d\n", d_out_vel, err); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_in_pos); printf("err %d\n", err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_in_vel); printf("err %d\n", err); err = clSetKernelArg(kernel, 2, sizeof(int), (void*)&num_bodies); printf("err %d\n", err); err = clSetKernelArg(kernel, 3, sizeof(float), (void*)&delT); printf("err %d\n", err); err = clSetKernelArg(kernel, 4, sizeof(float), (void*)&espSqr); printf("err %d\n", err); err = clSetKernelArg(kernel, 5, 256 * 4 * sizeof(float), NULL); printf("err %d\n", err); err = clSetKernelArg(kernel, 6, sizeof(cl_mem), (void*)&d_out_pos); printf("err %d\n", err); err = clSetKernelArg(kernel, 7, sizeof(cl_mem), (void*)&d_out_vel); printf("err %d\n", err); for (i = 0; i < iterations; i++) { err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); clFinish(queue); err = clSetKernelArg(kernel, exchange ? 6 : 0, sizeof(cl_mem), (void*)&d_in_pos); printf("err %d\n", err); err = clSetKernelArg(kernel, exchange ? 7 : 1, sizeof(cl_mem), (void*)&d_in_vel); printf("err %d\n", err); err = clSetKernelArg(kernel, exchange ? 0 : 6, sizeof(cl_mem), (void*)&d_out_pos); printf("err %d\n", err); err = clSetKernelArg(kernel, exchange ? 1 : 7, sizeof(cl_mem), (void*)&d_out_vel); printf("err %d\n", err); exchange = exchange ? 0 : 1; } err = clEnqueueReadBuffer(queue, d_out_pos, CL_TRUE, 0, buf_size, ref_pos, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueReadBuffer(queue, d_out_vel, CL_TRUE, 0, buf_size, ref_vel, 0, NULL, NULL); printf("err %d\n", err); for (i = 0; i < num_bodies ; i++) { printf("%i %f %f\n", i, ref_pos[i], ref_vel[i]); } clReleaseMemObject(d_in_pos); clReleaseMemObject(d_in_vel); clReleaseMemObject(d_out_pos); clReleaseMemObject(d_out_vel); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); }
int main(int argc, char *argv[]) { cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel_one, kernel_path; cl_mem d_mt_state, d_mt_emit, d_max_prob_old; cl_mem d_max_prob_new, d_path, v_prob, v_path; int wg_size = 256; int n_state = 256*16; int n_emit = 128; int n_obs = 100; size_t init_prob_size = sizeof(float) * n_state; size_t mt_state_size = sizeof(float) * n_state * n_state; size_t mt_emit_size = sizeof(float) * n_emit * n_state; float *init_prob = (float *) malloc(init_prob_size); float *mt_state = (float *) malloc(mt_state_size); float *mt_emit = (float *) malloc(mt_emit_size); int *obs = (int *) malloc(sizeof(int) * n_obs); int *viterbi_gpu = (int *) malloc(sizeof(int) * n_obs); srand(2012); initHMM(init_prob, mt_state, mt_emit, n_state, n_emit); int i; for (i = 0; i < n_obs; i++) { obs[i] = i % 15; } const char *source = load_program_source("Viterbi.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-cl-fast-relaxed-math"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); /* char tmp[102400]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(tmp), tmp, NULL); printf("error %s\n", tmp); */ kernel_one = clCreateKernel(program, "ViterbiOneStep", &err); printf("kernel %p err %d\n", kernel_one, err); kernel_path = clCreateKernel(program, "ViterbiPath", &err); printf("kernel %p err %d\n", kernel_path, err); d_mt_state = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_state_size, NULL, &err); printf("buffer %p\n", d_mt_state); d_mt_emit = clCreateBuffer(context, CL_MEM_READ_ONLY, mt_emit_size, NULL, &err); printf("buffer %p\n", d_mt_emit); d_max_prob_new = clCreateBuffer(context, CL_MEM_READ_WRITE, init_prob_size, NULL, &err); printf("buffer %p\n", d_max_prob_new); d_max_prob_old = clCreateBuffer(context, CL_MEM_READ_WRITE, init_prob_size, NULL, &err); printf("buffer %p\n", d_max_prob_old); d_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*(n_obs-1)*n_state, NULL, &err); printf("buffer %p\n", d_path); v_prob = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float), NULL, &err); printf("buffer %p\n", v_prob); v_path = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int)*n_obs, NULL, &err); printf("buffer %p\n", v_prob); err = clEnqueueWriteBuffer(queue, d_mt_state, CL_TRUE, 0, mt_state_size, mt_state, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueWriteBuffer(queue, d_mt_emit, CL_TRUE, 0, mt_emit_size, mt_emit, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueWriteBuffer(queue, d_max_prob_old, CL_TRUE, 0, init_prob_size, init_prob, 0, NULL, NULL); printf("err %d\n", err); // max_wg_size is 1024 for Intel Core 2 CPU size_t max_wg_size; err = clGetKernelWorkGroupInfo(kernel_one, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &max_wg_size, NULL); printf("max_wg_size %d\n", max_wg_size); size_t local_work_size[2], global_work_size[2]; local_work_size[0] = wg_size; local_work_size[1] = 1; global_work_size[0] = local_work_size[0] * 256; global_work_size[1] = n_state/256; for (i = 1; i < n_obs; i++) { err = clSetKernelArg(kernel_one, 0, sizeof(cl_mem), (void*)&d_max_prob_new); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 1, sizeof(cl_mem), (void*)&d_path); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 2, sizeof(cl_mem), (void*)&d_max_prob_old); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 3, sizeof(cl_mem), (void*)&d_mt_state); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 4, sizeof(cl_mem), (void*)&d_mt_emit); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 5, sizeof(float)*local_work_size[0], NULL); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 6, sizeof(int)*local_work_size[0], NULL); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 7, sizeof(int), (void*)&n_state); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 8, sizeof(int), (void*)&(obs[i])); printf("err %d\n", err); err = clSetKernelArg(kernel_one, 9, sizeof(int), (void*)&i); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel_one, 2, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueCopyBuffer(queue, d_max_prob_new, d_max_prob_old, 0, 0, sizeof(float)*n_state, 0, NULL, NULL); printf("err %d\n", err); } local_work_size[0] = 1; global_work_size[0] = 1; err = clSetKernelArg(kernel_path, 0, sizeof(cl_mem), (void*)&v_prob); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 1, sizeof(cl_mem), (void*)&v_path); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 2, sizeof(cl_mem), (void*)&d_max_prob_new); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 3, sizeof(cl_mem), (void*)&d_path); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 4, sizeof(int), (void*)&n_state); printf("err %d\n", err); err = clSetKernelArg(kernel_path, 5, sizeof(int), (void*)&n_obs); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel_path, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); clFinish(queue); printf("finish done\n"); err = clEnqueueReadBuffer(queue, v_path, CL_TRUE, 0, sizeof(int)*n_obs, viterbi_gpu, 0, NULL, NULL); printf("err %d\n", err); for (i = 0; i < n_obs; i++) { printf("%d %d\n", i, viterbi_gpu[i]); } clReleaseMemObject(d_mt_state); clReleaseMemObject(d_mt_emit); clReleaseMemObject(d_max_prob_old); clReleaseMemObject(d_max_prob_new); clReleaseMemObject(d_path); clReleaseMemObject(v_prob); clReleaseMemObject(v_path); clReleaseProgram(program); clReleaseKernel(kernel_one); clReleaseKernel(kernel_path); clReleaseCommandQueue(queue); }
int main(int argc, char *argv[]) { init_rpc(argv[1]); cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem buffer; size_t i; int scale = 8; // scale should be higher than 8 size_t array_size = powl(2, scale) * 4; cl_int *input = (cl_int *) malloc(sizeof(cl_int) * array_size); cl_int *output = (cl_int *) malloc(sizeof(cl_int) * array_size); cl_int dir = 1; cl_int no_stages = 0; cl_int temp; generateInput(input, array_size); //ExecuteSortReference(input, array_size, dir); for (temp = array_size; temp > 2; temp >>= 1) { no_stages++; } size_t local_work_size[1]; size_t global_work_size[1]; const char *source = load_program_source("BitonicSort.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-cl-fast-relaxed-math"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); kernel = clCreateKernel(program, "BitonicSort", NULL); printf("kernel %p\n", kernel); buffer = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(cl_int) * array_size, input, &err); printf("buffer %p err %d\n", buffer, err); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&buffer); printf("err %d\n", err); err = clSetKernelArg(kernel, 3, sizeof(cl_int), (void*)&dir); printf("err %d\n", err); cl_int stage, pass_stage; for (stage = 0; stage < no_stages; stage++) { err = clSetKernelArg(kernel, 1, sizeof(cl_int), (void*)&stage); printf("err %d\n", err); for (pass_stage = stage; pass_stage >= 0; pass_stage--) { err = clSetKernelArg(kernel, 2, sizeof(cl_int), (void*)&pass_stage); printf("err %d\n", err); size_t gsz = array_size/(2*4); global_work_size[0] = pass_stage ? gsz : gsz << 1; local_work_size[0] = 128; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); } } clFinish(queue); err = clEnqueueReadBuffer(queue, buffer, CL_TRUE, 0, sizeof(cl_int) * array_size, output, 0, NULL, NULL); printf("err %d\n", err); for (i = 0; i < array_size; i++) { printf("%i %i\n", i, output[i]); } clReleaseMemObject(buffer); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); }
int main( int argc, char* argv[] ) { // Length of vectors int m = atoi(argv[4]); unsigned int n=(256*m); //matrix variable // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; //########################Vector Add Variables // Host input vectors int *h_a; int *h_b; // Host output vector int *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; // cl_kernel *kernel; cl_platform_id* cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context //cl_command_queue* queue; // command queue //cl_command_queue queue; // command queue // cl_program *program; // program cl_platform_id* platforms; // platform id, // differnt for all the device we have in the system cl_uint platformCount; //keeps the divice count // Size, in bytes, of each vector size_t bytes = n*sizeof(int); // Allocate memory for each vector on host h_a = (int*)malloc(bytes); h_b = (int*)malloc(bytes); h_c = (int*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = i; h_b[i] = i; // printf("%d ",h_a[i]); } size_t globalSize, localSize; //similar to cuda cl_int err;//for errors int workgrp; int wrkitm; int num_ker; num_ker=atoi(argv[2]); wrkitm=atoi(argv[3]);// i have tried automating lots of data, // Number of work items in each local work group localSize = wrkitm ; // Number of total work items - localSize must be devisor globalSize = n; //################################# Done vector ################### //#############Matrix Multiplication Variables ############### int WA,HA,WB,HB,WC,HC; WA = n; HA = WA; WB = WA; HB = WB; WC = WA; HC = WA; // set seed for rand() srand(2006); // 1. allocate host memory for matrices A and B //automate the size of the matrix unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); // 4. allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); // 2. initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); //######################## matrix done ####################### //mallocing for array of queues (break through) cl_command_queue * queue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue)); cl_kernel *kernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel)); cl_program *program=(cl_program *)malloc(num_ker * sizeof(cl_kernel)); //defining platform clGetPlatformIDs(0, NULL, &platformCount); cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here int choice = atoi(argv[1]); if(choice ==1) { // we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here // we can these multiple times depending on requirements err = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL); if (err != CL_SUCCESS) printf("Error: Failed to create a device group!\n"); } else { // Get ID for the device err = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); } } context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); //malloc file and kernel variable char **file=(char **)malloc(num_ker * sizeof(char *)); char **KernelSource=(char **)malloc(num_ker * sizeof(char *)); for(i=0;i<num_ker;i++) { queue[i] = clCreateCommandQueue(context, device_id, 0, &err); } file[0]="vectadd.cl"; KernelSource[0] = load_program_source(file[0]); file[1]="matxm.cl"; KernelSource[1] = load_program_source(file[1]); for(i=0;i<num_ker;i++) { // Create the compute program from the source buffer program[i] = clCreateProgramWithSource(context, 1, (const char **) & KernelSource[i], NULL, &err); // Build the program executable clBuildProgram(program[i], 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel[i] = clCreateKernel(program[i], file[i], &err); } //Vector Start // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); //vector finsih //matrix start d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_B, h_B, &err); //matrix finish // Write our data set into the input array in device memory for(i=0;i<num_ker;i++) { if(i=0)//for vectorADD { err = clEnqueueWriteBuffer(queue[i], d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue[i], d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), &d_a); err = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), &d_b); err = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), &d_c); err = clSetKernelArg(kernel[i], 3, sizeof(unsigned int), &n); // Get the maximum work group size for executing the kernel on the device if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } } else if(i=1) { err = clEnqueueWriteBuffer(queue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL); //size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel[i], 0, sizeof(cl_mem), (void *)&d_C); err = clSetKernelArg(kernel[i], 1, sizeof(cl_mem), (void *)&d_A); err = clSetKernelArg(kernel[i], 2, sizeof(cl_mem), (void *)&d_B); err = clSetKernelArg(kernel[i], 3, sizeof(int), (void *)&wA); err = clSetKernelArg(kernel[i], 4, sizeof(int), (void *)&wC); } } /* // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Get the maximum work group size for executing the kernel on the device if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } */ //struct timeval tim; //double t1,t2; //gettimeofday(&tim, NULL); //t1=tim.tv_sec+(tim.tv_usec/1000000.0); //need to work on work size############################# for(i=0;i<num_ker;i++) { err = clEnqueueNDRangeKernel(queue[i], kernel[i], 1, NULL, &globalSize, &localSize, 0, NULL, NULL); } //for(i=0;i<num_ker;i++) //clFinish(queue[i]); //gettimeofday(&tim, NULL); // t2=tim.tv_sec+(tim.tv_usec/1000000.0); //printf("GPU time %.4lf\t",(t2-t1)); for(i=0;i<num_ker;++i) { if(i=0) { clEnqueueReadBuffer(queue[i], d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); } else if(i=1) { err = clEnqueueReadBuffer(queue[i], d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); } } for(i=0;i<num_ker;++i) { clFinish(queue[i]); } // release OpenCL resources free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); // clReleaseProgram(program); // clReleaseKernel(kernel); for(i=0;i<num_ker;++i) { clReleaseCommandQueue(queue[i]); clReleaseKernel(kernel[i]); clReleaseProgram(program[i]); } clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
int main(int argc, char *argv[]) { init_rpc(argv[1]); cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem d_input_r; cl_mem d_input_i; int length = 1024; size_t buf_size = length * sizeof(float); float *input_r, *input_i, *output_r, *output_i; posix_memalign((void **)&input_r, 16, buf_size); posix_memalign((void **)&input_i, 16, buf_size); posix_memalign((void **)&output_r, 16, buf_size); posix_memalign((void **)&output_i, 16, buf_size); fill_rand(input_r, length, 0, 255); fill_rand(input_i, length, 0, 0); memcpy(output_r, input_r, buf_size); memcpy(output_i, input_i, buf_size); size_t local_work_size[1]; size_t global_work_size[1]; local_work_size[0] = 64; global_work_size[0] = 64; const char *source = load_program_source("FFT.cl"); size_t source_len = strlen(source);; cl_uint err = 0; char *flags = "-x clc++"; clGetPlatformIDs(1, &platform, NULL); printf("platform %p err %d\n", platform, err); clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, &err); printf("device %p err %d\n", device, err); context = clCreateContext(0, 1, &device, NULL, NULL, &err); printf("context %p err %d\n", context, err); queue = clCreateCommandQueue(context, device, 0, &err); printf("queue %p err %d\n", queue, err); program = clCreateProgramWithSource(context, 1, &source, &source_len, &err); printf("program %p err %d\n", program, err); err = clBuildProgram(program, 0, NULL, flags, NULL, NULL); printf("err %d\n", err); kernel = clCreateKernel(program, "kfft", NULL); printf("kernel %p\n", kernel); d_input_r = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, buf_size, input_r, &err); printf("d_input_r %p err %d\n", d_input_r, err); d_input_i = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, buf_size, input_i, &err); printf("d_input_i %p err %d\n", d_input_i, err); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void*)&d_input_r); printf("err %d\n", err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&d_input_i); printf("err %d\n", err); err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); printf("err %d\n", err); clFinish(queue); err = clEnqueueReadBuffer(queue, d_input_r, CL_TRUE, 0, buf_size, output_r, 0, NULL, NULL); printf("err %d\n", err); err = clEnqueueReadBuffer(queue, d_input_i, CL_TRUE, 0, buf_size, output_i, 0, NULL, NULL); printf("err %d\n", err); int i; for (i = 0; i < length; i++) { printf("%i %f %f\n", i, output_r[i], output_i[i]); } clReleaseMemObject(d_input_r); clReleaseMemObject(d_input_i); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_platform_id platform_id = NULL; // compute device platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array cl_event event; // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; //Connect to a platform on device err = clGetPlatformIDs(1, &platform_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to locate opencl platform!\n"); return EXIT_FAILURE; } // Connect to a compute device // int gpu = 0; err = clGetDeviceIDs(platform_id, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } //Use function and load the kernel source from .cl files in the same folder // char *KernelSource = load_program_source("hello.cl"); // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel! - %d\n",err); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, NULL, 0, NULL, &event); if (err) { printf("Error: Failed to execute kernel!-%d\n",err); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clWaitForEvents(1, &event); clFinish(commands); cl_ulong time_start, time_end; double total_time; clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time = time_end - time_start; printf("cl:main timing:opencl clEnqueueNDRangeKernel %0.3f us\n", total_time / 1000.0); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main(int argc, char* argv[]) { int num_ker=0,num_queue; num_ker=atoi(argv[2]); num_queue=atoi(argv[3]); //variables /*#define WA 1024 #define HA 1024 #define WB 1024 #define HB WA #define WC WB #define HC HA */ struct timeval tim,ftim; double t1,t2,tim1,tim2; // gettimeofday(&tim, NULL); // t1=tim.tv_sec+(tim.tv_usec/1000000.0); gettimeofday(&ftim, NULL); tim1=ftim.tv_sec+(ftim.tv_usec/1000000.0); int m,WA,HA,WB,HB,WC,HC; m = atoi(argv[5]); WA=(256*m); HA = WA; WB = WA; HB = WB; WC = WA; HC = WA; // set seed for rand() srand(2006); // 1. allocate host memory for matrices A and B //automate the size of the matrix unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(int) * size_A; int* h_A = (int*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(int) * size_B; int* h_B = (int*) malloc(mem_size_B); // 2. initialize host memory randomInit(h_A, size_A); randomInit(h_B, size_B); /* // 3. print out A and B printf("\n\nMatrix A\n"); for(i = 0; i < size_A; i++) { printf("%f ", h_A[i]); if(((i + 1) % WA) == 0) printf("\n"); } printf("\n\nMatrix B\n"); for(i = 0; i < size_B; i++) { printf("%f ", h_B[i]); if(((i + 1) % WB) == 0) printf("\n"); } */ // 4. allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(int) * size_C; int* h_C = (int*) malloc(mem_size_C); // 5. Initialize OpenCL // OpenCL specific variables cl_context clGPUContext; // cl_command_queue* clCommandQue; //cl_program clProgram; //cl_kernel clKernel; cl_platform_id* cpPlatform; // OpenCL platform cl_uint platformCount; //keeps the divice count size_t dataBytes; size_t kernelLength; cl_int errcode; // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; /*****************************************/ /* Initialize OpenCL */ /*****************************************/ //cl_platform_id* cpPlatform; // OpenCL platform //cl_device_id device_id;// = (cl_device_id)malloc(sizeof(cl_device_id)); // Bind to platform // errcode = clGetPlatformIDs(1, &cpPlatform, NULL); clGetPlatformIDs(0, NULL, &platformCount); cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here cl_device_id device_id; int choice =atoi(argv[1]); if(choice ==1) { // Length of vectors // n = 64; // Connect to a compute device // we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here //depending what device are we working on // we can these multiple times depending on requirements errcode = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL); if (errcode != CL_SUCCESS) printf("Error: Failed to create a device group!\n"); } else { // errcode = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device errcode = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (errcode != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); } } //printf("here"); // Create a context clGPUContext = clCreateContext(0, 1, &device_id, NULL, NULL, &errcode); // Create a command queue //printf("here"); /*clGPUContext = clCreateContextFromType(NULL, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); //shrCheckError(errcode, CL_SUCCESS); // get the list of GPU devices associated // with context errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); //shrCheckError(errcode, CL_SUCCESS); */ //malloc for command queue, kernel and program cl_kernel *clKernel=(cl_kernel *)malloc(num_ker * sizeof(cl_kernel)); cl_program *clProgram=(cl_program *)malloc(num_ker * sizeof(cl_kernel)); cl_command_queue * clCommandQue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue)); //Create a command-queue for(i=0;i<num_queue;i++) { clCommandQue[i] = clCreateCommandQueue(clGPUContext, device_id, 0, &errcode); } //shrCheckError(errcode, CL_SUCCESS); /* // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, printf("\nhere"); CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); */ char *file="matxm.cl"; char *KernelSource = load_program_source(file); for(i=0;i<num_ker;i++) { clProgram[i] = clCreateProgramWithSource(clGPUContext, 1, (const char **) & KernelSource, NULL, &errcode); //shrCheckError(errcode, CL_SUCCESS); errcode = clBuildProgram(clProgram[i], 0, NULL, NULL, NULL, NULL); //shrCheckError(errcode, CL_SUCCESS); clKernel[i] = clCreateKernel(clProgram[i], "matrixMul", &errcode); } //shrCheckError(errcode, CL_SUCCESS); // Setup device memory d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_B, h_B, &errcode); // Write our data set into the input array in device memory for(i=0;i<num_queue;i++){ errcode = clEnqueueWriteBuffer(clCommandQue[i], d_A, CL_TRUE, 0,mem_size_A, h_A, 0, NULL, NULL); errcode = clEnqueueWriteBuffer(clCommandQue[i], d_B, CL_TRUE, 0,mem_size_B, h_B, 0, NULL, NULL); } // 7. Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; for(i=0;i<num_ker;i++) { errcode = clSetKernelArg(clKernel[i], 0, sizeof(cl_mem), (void *)&d_C); errcode = clSetKernelArg(clKernel[i], 1, sizeof(cl_mem), (void *)&d_A); errcode = clSetKernelArg(clKernel[i], 2, sizeof(cl_mem), (void *)&d_B); errcode = clSetKernelArg(clKernel[i], 3, sizeof(int), (void *)&wA); errcode = clSetKernelArg(clKernel[i], 4, sizeof(int), (void *)&wC); } // shrCheckError(errcode, CL_SUCCESS); //struct timespec start, finish; int value; value =atoi(argv[4]); localWorkSize[0] = value ; localWorkSize[1] = value ; globalWorkSize[0] = HA; globalWorkSize[1] = HA; //clFinish(clCommandQue); //timer starting // clock_gettime(CLOCK_MONOTONIC, &start); //struct timeval tim; //double t1,t2; // gettimeofday(&tim, NULL); // t1=tim.tv_sec+(tim.tv_usec/1000000.0); gettimeofday(&tim, NULL); t1=tim.tv_sec+(tim.tv_usec/1000000.0); //multikernels inside queues int j=0; for(j=0;j<num_queue;j++) { for(i=0;i<num_ker;i++){ errcode = clEnqueueNDRangeKernel(clCommandQue[j], clKernel[i], 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); } } for(i=0;i<num_queue;i++) { clFinish(clCommandQue[i]); } gettimeofday(&tim, NULL); t2=tim.tv_sec+(tim.tv_usec/1000000.0); printf("%.6lf\t",(t2-t1)); // shrCheckError(errcode, CL_SUCCESS); /* clock_gettime(CLOCK_MONOTONIC, &finish); elapsed = (finish.tv_sec - start.tv_sec); elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0; printf("Work Item/threads = %d \n",value); printf("time taken by GPU = %le\n ",elapsed); */ // 8. Retrieve result from device for(i=0;i<num_queue;i++) { errcode = clEnqueueReadBuffer(clCommandQue[i], d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); //shrCheckError(errcode, CL_SUCCESS); } for(i=0;i<num_queue;i++) { clFinish(clCommandQue[i]); } // shrCheckError(errcode, CL_SUCCESS); //clock_gettime(CLOCK_MONOTONIC, &finish); // elapsed = (finish.tv_sec - start.tv_sec); // elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000000.0; //printf("Work Item/threads = %d \n",value); //printf("time taken by GPU = %le\n ",elapsed); // 9. print out the results /*printf("\n\nMatrix C (Results)\n"); for(i = 0; i < size_C; i++) { printf("%f ", h_C[i]); if(((i + 1) % WC) == 0) printf("\n"); } printf("\n");*/ // 10. clean up memory free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); // free(device_id); free(KernelSource); clReleaseContext(clGPUContext); for(i=0;i<num_ker;i++) { clReleaseKernel(clKernel[i]); clReleaseProgram(clProgram[i]); } for(i=0;i<num_queue;i++){ clReleaseCommandQueue(clCommandQue[i]); } gettimeofday(&ftim, NULL); tim2=ftim.tv_sec+(ftim.tv_usec/1000000.0); printf("%.6lf\t",(tim2-tim1)); printf("\n"); exit(0); }
int main(int argc, char* argv[]) { int device_gpu = 1; const char *source_files[1] = { "mtgp32-opencl.cl"}; const char *buildOptions="-I. -Werror"; const char *program_source[1]; cl_int clerr; cl_platform_id platform_ids[32]; unsigned int num_platforms; clerr = clGetPlatformIDs(32, platform_ids, &num_platforms); CLERR; for (unsigned int i=0; i < num_platforms; ++i) { clerr = clGetDeviceIDs (platform_ids[i], device_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (CL_SUCCESS == clerr) { platform_id = platform_ids[i]; break; } else if (CL_DEVICE_NOT_FOUND == clerr) continue; CLERR; } { char platform_name[1024]; char platform_vendor[1024]; char device_name[1024]; clerr = clGetPlatformInfo(platform_id, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); CLERR; clerr = clGetPlatformInfo(platform_id, CL_PLATFORM_VENDOR, sizeof(platform_vendor), platform_vendor, NULL); CLERR; clerr = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); CLERR; printf("Platform name: %s\nPlatform vendor: %s\nDevice name: %s\n", platform_name, platform_vendor, device_name); } context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &clerr); CLERR; commands = clCreateCommandQueue(context, device_id, 0, &clerr); CLERR; program_source[0] = load_program_source(source_files[0]); program = clCreateProgramWithSource(context, 1, program_source, NULL, &clerr); CLERR; clerr = clBuildProgram(program, 0, NULL, buildOptions, NULL, NULL); //CLERR; size_t log_size; clerr = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); CLERR; char* build_log = (char*) malloc(log_size); clerr = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); CLERR; build_log[log_size-1] = '\0'; printf("BUILD LOG %s\n", build_log); free(build_log); free((void*)program_source[0]); mtgp32_uint32_kernel = clCreateKernel(program, "mtgp32_uint32_kernel", &clerr); CLERR; mtgp32_single_kernel = clCreateKernel(program, "mtgp32_single_kernel", &clerr); CLERR; // LARGE_SIZE is a multiple of 16 int num_data = 10000000; int block_num; int num_unit; int r; cl_mem d_status; cl_mem d_params; int mb, mp; block_num = 96; /* if (argc >= 2) { errno = 0; block_num = strtol(argv[1], NULL, 10); if (errno) { printf("%s number_of_block number_of_output\n", argv[0]); return 1; } if (block_num < 1 || block_num > BLOCK_NUM_MAX) { printf("%s block_num should be between 1 and %d\n", argv[0], BLOCK_NUM_MAX); return 1; } errno = 0; num_data = strtol(argv[2], NULL, 10); if (errno) { printf("%s number_of_block number_of_output\n", argv[0]); return 1; } argc -= 2; argv += 2; } else { printf("%s number_of_block number_of_output\n", argv[0]); block_num = get_suitable_block_num(device, &mb, &mp, sizeof(uint32_t), THREAD_NUM, LARGE_SIZE); if (block_num <= 0) { printf("can't calculate sutable number of blocks.\n"); return 1; } printf("the suitable number of blocks for device 0 " "will be multiple of %d, or multiple of %d\n", block_num, (mb - 1) * mp); return 1; } */ num_unit = LARGE_SIZE * block_num; d_status = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(struct mtgp32_kernel_status_t) * block_num, NULL, &clerr); CLERR; d_params = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(struct mtgp32_param_t), NULL, &clerr); CLERR; // ccudaMalloc((void**)&d_status, sizeof(mtgp32_kernel_status_t) * block_num); r = num_data % num_unit; if (r != 0) { num_data = num_data + num_unit - r; } make_constant(commands, d_params, MTGPDC_PARAM_TABLE, block_num); make_kernel_data32(commands, d_status, MTGPDC_PARAM_TABLE, block_num); make_uint32_random(d_status, d_params, num_data, block_num); make_single_random(d_status, d_params, num_data, block_num); clReleaseMemObject(d_status); clReleaseMemObject(d_params); /*Close connection with devices*/ clReleaseKernel(mtgp32_uint32_kernel); clReleaseKernel(mtgp32_single_kernel); clReleaseProgram(program); clReleaseCommandQueue(commands); clReleaseContext(context); }
int main(int argc, char **argv) { uint64_t t1 = 0; uint64_t t2 = 0; int err; cl_device_id device_id; cl_command_queue commands; cl_context context; cl_mem output_buffer; cl_mem input_buffer; cl_mem partials_buffer; size_t typesize; int pass_count = 0; size_t* group_counts = 0; size_t* work_item_counts = 0; int* operation_counts = 0; int* entry_counts = 0; int use_gpu = 1; int i; int c; // Parse command line options // for( i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) { use_gpu = 0; } else if(strstr(argv[i], "gpu")) { use_gpu = 1; } else if(strstr(argv[i], "float2")) { integer = false; channels = 2; } else if(strstr(argv[i], "float4")) { integer = false; channels = 4; } else if(strstr(argv[i], "float")) { integer = false; channels = 1; } else if(strstr(argv[i], "int2")) { integer = true; channels = 2; } else if(strstr(argv[i], "int4")) { integer = true; channels = 4; } else if(strstr(argv[i], "int")) { integer = true; channels = 1; } } // Create some random input data on the host // float *float_data = (float*)malloc(count * channels * sizeof(float)); int *integer_data = (int*)malloc(count * channels * sizeof(int)); for (i = 0; i < count * channels; i++) { float_data[i] = ((float) rand() / (float) RAND_MAX); integer_data[i] = (int) (255.0f * float_data[i]); } // Connect to a compute device // err = clGetDeviceIDs(NULL, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to locate a compute device!\n"); return EXIT_FAILURE; } size_t returned_size = 0; size_t max_workgroup_size = 0; err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, &returned_size); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve device info!\n"); return EXIT_FAILURE; } cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err = clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err|= clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve device info!\n"); return EXIT_FAILURE; } printf(SEPARATOR); printf("Connecting to %s %s...\n", vendor_name, device_name); // Load the compute program from disk into a cstring buffer // typesize = integer ? (sizeof(int)) : (sizeof(float)); const char* filename = 0; switch(channels) { case 4: filename = integer ? "reduce_int4_kernel.cl" : "reduce_float4_kernel.cl"; break; case 2: filename = integer ? "reduce_int2_kernel.cl" : "reduce_float2_kernel.cl"; break; case 1: filename = integer ? "reduce_int_kernel.cl" : "reduce_float_kernel.cl"; break; default: printf("Invalid channel count specified!\n"); return EXIT_FAILURE; }; printf(SEPARATOR); printf("Loading program '%s'...\n", filename); printf(SEPARATOR); char *source = load_program_source(filename); if(!source) { printf("Error: Failed to load compute program from file!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the input buffer on the device // size_t buffer_size = typesize * count * channels; input_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); if (!input_buffer) { printf("Error: Failed to allocate input buffer on device!\n"); return EXIT_FAILURE; } // Fill the input buffer with the host allocated random data // void *input_data = (integer) ? (void*)integer_data : (void*)float_data; err = clEnqueueWriteBuffer(commands, input_buffer, CL_TRUE, 0, buffer_size, input_data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); return EXIT_FAILURE; } // Create an intermediate data buffer for intra-level results // partials_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); if (!partials_buffer) { printf("Error: Failed to allocate partial sum buffer on device!\n"); return EXIT_FAILURE; } // Create the output buffer on the device // output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, buffer_size, NULL, NULL); if (!output_buffer) { printf("Error: Failed to allocate result buffer on device!\n"); return EXIT_FAILURE; } // Determine the reduction pass configuration for each level in the pyramid // create_reduction_pass_counts( count, max_workgroup_size, MAX_GROUPS, MAX_WORK_ITEMS, &pass_count, &group_counts, &work_item_counts, &operation_counts, &entry_counts); // Create specialized programs and kernels for each level of the reduction // cl_program *programs = (cl_program*)malloc(pass_count * sizeof(cl_program)); memset(programs, 0, pass_count * sizeof(cl_program)); cl_kernel *kernels = (cl_kernel*)malloc(pass_count * sizeof(cl_kernel)); memset(kernels, 0, pass_count * sizeof(cl_kernel)); for(i = 0; i < pass_count; i++) { char *block_source = malloc(strlen(source) + 1024); size_t source_length = strlen(source) + 1024; memset(block_source, 0, source_length); // Insert macro definitions to specialize the kernel to a particular group size // const char group_size_macro[] = "#define GROUP_SIZE"; const char operations_macro[] = "#define OPERATIONS"; sprintf(block_source, "%s (%d) \n%s (%d)\n\n%s\n", group_size_macro, (int)group_counts[i], operations_macro, (int)operation_counts[i], source); // Create the compute program from the source buffer // programs[i] = clCreateProgramWithSource(context, 1, (const char **) & block_source, NULL, &err); if (!programs[i] || err != CL_SUCCESS) { printf("%s\n", block_source); printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(programs[i], 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t length; char build_log[2048]; printf("%s\n", block_source); printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(programs[i], device_id, CL_PROGRAM_BUILD_LOG, sizeof(build_log), build_log, &length); printf("%s\n", build_log); return EXIT_FAILURE; } // Create the compute kernel from within the program // kernels[i] = clCreateKernel(programs[i], "reduce", &err); if (!kernels[i] || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); return EXIT_FAILURE; } free(block_source); } // Do the reduction for each level // cl_mem pass_swap; cl_mem pass_input = output_buffer; cl_mem pass_output = input_buffer; for(i = 0; i < pass_count; i++) { size_t global = group_counts[i] * work_item_counts[i]; size_t local = work_item_counts[i]; unsigned int operations = operation_counts[i]; unsigned int entries = entry_counts[i]; size_t shared_size = typesize * channels * local * operations; printf("Pass[%4d] Global[%4d] Local[%4d] Groups[%4d] WorkItems[%4d] Operations[%d] Entries[%d]\n", i, (int)global, (int)local, (int)group_counts[i], (int)work_item_counts[i], operations, entries); // Swap the inputs and outputs for each pass // pass_swap = pass_input; pass_input = pass_output; pass_output = pass_swap; err = CL_SUCCESS; err |= clSetKernelArg(kernels[i], 0, sizeof(cl_mem), &pass_output); err |= clSetKernelArg(kernels[i], 1, sizeof(cl_mem), &pass_input); err |= clSetKernelArg(kernels[i], 2, shared_size, NULL); err |= clSetKernelArg(kernels[i], 3, sizeof(int), &entries); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // After the first pass, use the partial sums for the next input values // if(pass_input == input_buffer) pass_input = partials_buffer; err = CL_SUCCESS; err |= clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: Failed to wait for command queue to finish! %d\n", err); return EXIT_FAILURE; } // Start the timing loop and execute the kernel over several iterations // printf(SEPARATOR); printf("Timing %d iterations of reduction with %d elements of type %s%s...\n", iterations, count, integer ? "int" : "float", (channels <= 1) ? (" ") : (channels == 2) ? "2" : "4"); printf(SEPARATOR); int k; err = CL_SUCCESS; t1 = current_time(); for (k = 0 ; k < iterations; k++) { for(i = 0; i < pass_count; i++) { size_t global = group_counts[i] * work_item_counts[i]; size_t local = work_item_counts[i]; err = clEnqueueNDRangeKernel(commands, kernels[i], 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } } } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: Failed to wait for command queue to finish! %d\n", err); return EXIT_FAILURE; } t2 = current_time(); // Calculate the statistics for execution time and throughput // double t = subtract_time_in_seconds(t2, t1); printf("Exec Time: %.2f ms\n", 1000.0 * t / (double)(iterations)); printf("Throughput: %.2f GB/sec\n", 1e-9 * buffer_size * iterations / t); printf(SEPARATOR); // Read back the results that were computed on the device // void *computed_result = malloc(typesize * channels); memset(computed_result, 0, typesize * channels); err = clEnqueueReadBuffer(commands, pass_output, CL_TRUE, 0, typesize * channels, computed_result, 0, NULL, NULL); if (err) { printf("Error: Failed to read back results from the device!\n"); return EXIT_FAILURE; } // Verify the results are correct // if(integer) { int reference[4] = { 0, 0, 0, 0}; switch(channels) { case 4: reduce_validate_int4(integer_data, count, reference); break; case 2: reduce_validate_int2(integer_data, count, reference); break; case 1: reduce_validate_int(integer_data, count, reference); break; default: printf("Invalid channel count specified!\n"); return EXIT_FAILURE; } int result[4] = { 0.0f, 0.0f, 0.0f, 0.0f}; for(c = 0; c < channels; c++) { int v = ((int*) computed_result)[c]; result[c] += v; } float error = 0.0f; float diff = 0.0f; for(c = 0; c < channels; c++) { diff = fabs(reference[c] - result[c]); error = diff > error ? diff : error; } if (error > MIN_ERROR) { for(c = 0; c < channels; c++) printf("Result[%d] %d != %d\n", c, reference[c], result[c]); printf("Error: Incorrect results obtained! Max error = %f\n", error); return EXIT_FAILURE; } else { printf("Results Validated!\n"); printf(SEPARATOR); } } else { float reference[4] = { 0.0f, 0.0f, 0.0f, 0.0f}; switch(channels) { case 4: reduce_validate_float4(float_data, count, reference); break; case 2: reduce_validate_float2(float_data, count, reference); break; case 1: reduce_validate_float(float_data, count, reference); break; default: printf("Invalid channel count specified!\n"); return EXIT_FAILURE; } float result[4] = { 0.0f, 0.0f, 0.0f, 0.0f}; for(c = 0; c < channels; c++) { float v = ((float*) computed_result)[c]; result[c] += v; } float error = 0.0f; float diff = 0.0f; for(c = 0; c < channels; c++) { diff = fabs(reference[c] - result[c]); error = diff > error ? diff : error; } if (error > MIN_ERROR) { for(c = 0; c < channels; c++) printf("Result[%d] %f != %f\n", c, reference[c], result[c]); printf("Error: Incorrect results obtained! Max error = %f\n", error); return EXIT_FAILURE; } else { printf("Results Validated!\n"); printf(SEPARATOR); } } // Shutdown and cleanup // for(i = 0; i < pass_count; i++) { clReleaseKernel(kernels[i]); clReleaseProgram(programs[i]); } clReleaseMemObject(input_buffer); clReleaseMemObject(output_buffer); clReleaseMemObject(partials_buffer); clReleaseCommandQueue(commands); clReleaseContext(context); free(group_counts); free(work_item_counts); free(operation_counts); free(entry_counts); free(computed_result); free(kernels); free(float_data); free(integer_data); return 0; }
int initGPU(int n) { #pragma mark Device Information // Find the CPU CL device, as a fallback err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_CPU, 1, &cpu, NULL); assert(err == CL_SUCCESS); // Find the GPU CL device, this is what we really want // If there is no GPU device is CL capable, fall back to CPU err |= clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) device = cpu; assert(device); // Get some information about the returned device cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; err |= clGetDeviceInfo(device, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, &returned_size); err |= clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &returned_size); assert(err == CL_SUCCESS); printf("Connecting to %s %s...", vendor_name, device_name); #pragma mark Context and Command Queue // Now create a context to perform our calculation with the // specified device context = clCreateContext(0, 1, &device, NULL, NULL, &err); assert(err == CL_SUCCESS); // And also a command queue for the context cmd_queue = clCreateCommandQueue(context, device, 0, NULL); #pragma mark Program and Kernel Creation // Load the program source from disk // The kernel/program is the project directory and in Xcode the executable // is set to launch from that directory hence we use a relative path const char * filename = "kernel.cl"; char *program_source = load_program_source(filename); program[0] = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, &err); assert(err == CL_SUCCESS); err |= clBuildProgram(program[0], 0, NULL, NULL, NULL, NULL); assert(err == CL_SUCCESS); // Now create the kernel "objects" that we want to use in the example file kernel[0] = clCreateKernel(program[0], "add", &err); assert(err == CL_SUCCESS); #pragma mark Memory Allocation // Allocate memory on the device to hold our data and store the results into buffer_size = sizeof(int) * n; mem_c_position = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); mem_c_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); mem_p_angle = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); mem_p_velocity = clCreateBuffer(context, CL_MEM_READ_ONLY, buffer_size, NULL, &err); assert(err == CL_SUCCESS); mem_fitness = clCreateBuffer(context, CL_MEM_WRITE_ONLY, buffer_size, NULL, &err); assert(err == CL_SUCCESS); // Get all of the stuff written and allocated clFinish(cmd_queue); printf(" done\n"); return err; // CL_SUCCESS }