int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'mad_hi_uint8uint8uint8.cl' */ source_code = read_buffer("mad_hi_uint8uint8uint8.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "mad_hi_uint8uint8uint8", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_uint8 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_uint8)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_uint8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uint8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uint8), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_uint8 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_uint8)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_uint8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uint8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uint8), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 2 */ cl_uint8 *src_2_host_buffer; src_2_host_buffer = malloc(num_elem * sizeof(cl_uint8)); for (int i = 0; i < num_elem; i++) src_2_host_buffer[i] = (cl_uint8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 2 */ cl_mem src_2_device_buffer; src_2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uint8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_2_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uint8), src_2_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_uint8 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_uint8)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_uint8)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_uint8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &src_2_device_buffer); ret |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_uint8), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_uint8)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 2 */ free(src_2_host_buffer); /* Free device side src buffer 2 */ ret = clReleaseMemObject(src_2_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
static int build_program_from_file(const char *filename, const char *options, cl_context context, cl_device_id device, cl_program *program_out, cl_int *err) { cl_int _err; FILE *file; char *program_source = NULL; size_t program_source_size; cl_program program = NULL; char *build_log = NULL; assert(filename != NULL); assert(program_out != NULL); if (!err) err = &_err; file = fopen(filename, "r"); if (!file) { ERROR("Couldn't open file \"%s\"", filename); goto error; } if (fseek(file, 0L, SEEK_END)) { ERROR("Cannot determine file size of \"%s\"", filename); goto error; } program_source_size = ftell(file); if (fseek(file, 0L, SEEK_SET)) { ERROR("Cannot determine file size of \"%s\"", filename); goto error; } program_source = malloc(sizeof(*program_source) * (program_source_size + 1)); CHECK_ALLOCATION(program_source); if (fread(program_source, 1, program_source_size, file) != program_source_size) { ERROR("Failed to read file \"%s\"", filename); goto error; } program_source[program_source_size] = '\0'; fclose(file); program = clCreateProgramWithSource(context, 1, (const char **)&program_source, NULL, err); CHECK_CL_ERROR(*err); *err = clBuildProgram(program, 0, NULL, options, NULL, NULL); if (*err == CL_BUILD_PROGRAM_FAILURE) { size_t build_log_size; *err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &build_log_size); CHECK_CL_ERROR(*err); build_log = malloc(sizeof(*build_log) * build_log_size); CHECK_ALLOCATION(build_log); *err = clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, build_log_size, build_log, NULL); CHECK_CL_ERROR(*err); if (options) ERROR("Failed to build program in file \"%s\" with options \"%s\"", filename, options); else ERROR("Failed to build program in file \"%s\"", filename); debug_printf("================================== BUILD LOG ===================================\n" "%s", NULL, 0, LOGGING_MSG_ERROR, build_log); goto error; } CHECK_CL_ERROR(*err); *program_out = program; return 0; error: free(build_log); if (program) clReleaseProgram(program); *program_out = NULL; free(program_source); return -1; }
// Helper function to create and build program and kernel // ********************************************************************* cl_kernel getReductionKernel(ReduceType datatype, int whichKernel, int blockSize, int isPowOf2) { // compile cl program size_t program_length; char *source; std::ostringstream preamble; // create the program // with type specification depending on datatype argument switch (datatype) { default: case REDUCE_INT: preamble << "#define T int" << std::endl; break; case REDUCE_FLOAT: preamble << "#define T float" << std::endl; break; } // set blockSize at compile time preamble << "#define blockSize " << blockSize << std::endl; // set isPow2 at compile time preamble << "#define nIsPow2 " << isPowOf2 << std::endl; // Load the source code and prepend the preamble source = oclLoadProgSource(source_path, preamble.str().c_str(), &program_length); oclCheckError(source != NULL, shrTRUE); cl_program cpProgram = clCreateProgramWithSource(cxGPUContext, 1,(const char **) &source, &program_length, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); free(source); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-fast-relaxed-math", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLogEx(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclReduction.ptx"); oclCheckError(ciErrNum, CL_SUCCESS); } // create Kernel std::ostringstream kernelName; kernelName << "reduce" << whichKernel; cl_kernel ckKernel = clCreateKernel(cpProgram, kernelName.str().c_str(), &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); size_t wgSize; ciErrNum = clGetKernelWorkGroupInfo(ckKernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &wgSize, NULL); if (wgSize == 64) smallBlock = true; else smallBlock = false; // NOTE: the program will get deleted when the kernel is also released clReleaseProgram(cpProgram); return ckKernel; }
int main(int argc, char **argv) { /* test name */ char name[] = "test_sampler_address_clamp"; size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 }; size_t srcdir_length, name_length, filename_size; char *filename = NULL; char *source = NULL; cl_device_id devices[1]; cl_context context = NULL; cl_command_queue queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_int result; int retval = -1; /* image parameters */ cl_uchar4 *imageData; cl_image_format image_format; cl_image_desc image_desc; printf("Running test %s...\n", name); memset(&image_desc, 0, sizeof(cl_image_desc)); image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; image_desc.image_width = 4; image_desc.image_height = 4; image_format.image_channel_order = CL_RGBA; image_format.image_channel_data_type = CL_UNSIGNED_INT8; imageData = (cl_uchar4*)malloc (4 * 4 * sizeof(cl_uchar4)); if (imageData == NULL) { puts("out of host memory\n"); goto error; } memset (imageData, 1, 4*4*sizeof(cl_uchar4)); /* determine file name of kernel source to load */ srcdir_length = strlen(SRCDIR); name_length = strlen(name); filename_size = srcdir_length + name_length + 16; filename = (char *)malloc(filename_size + 1); if (!filename) { puts("out of memory"); goto error; } snprintf(filename, filename_size, "%s/%s.cl", SRCDIR, name); /* read source code */ source = poclu_read_file (filename); TEST_ASSERT (source != NULL && "Kernel .cl not found."); /* setup an OpenCL context and command queue using default device */ context = poclu_create_any_context(); if (!context) { puts("clCreateContextFromType call failed\n"); goto error; } result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), devices, NULL); if (result != CL_SUCCESS) { puts("clGetContextInfo call failed\n"); goto error; } queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queue) { puts("clCreateCommandQueue call failed\n"); goto error; } /* Create image */ cl_mem image = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &image_format, &image_desc, imageData, &result); if (result != CL_SUCCESS) { puts("image creation failed\n"); goto error; } /* create and build program */ program = clCreateProgramWithSource (context, 1, (const char **)&source, NULL, NULL); if (!program) { puts("clCreateProgramWithSource call failed\n"); goto error; } result = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (result != CL_SUCCESS) { puts("clBuildProgram call failed\n"); goto error; } /* execute the kernel with give name */ kernel = clCreateKernel(program, name, NULL); if (!kernel) { puts("clCreateKernel call failed\n"); goto error; } result = clSetKernelArg( kernel, 0, sizeof(cl_mem), &image); if (result) { puts("clSetKernelArg failed\n"); goto error; } result = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (result != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } result = clFinish(queue); if (result == CL_SUCCESS) retval = 0; error: if (image) { clReleaseMemObject (image); } if (kernel) { clReleaseKernel(kernel); } if (program) { clReleaseProgram(program); } if (queue) { clReleaseCommandQueue(queue); } if (context) { clUnloadCompiler (); clReleaseContext (context); } if (source) { free(source); } if (filename) { free(filename); } if (imageData) { free(imageData); } if (retval) { printf("FAIL\n"); return 1; } printf("OK\n"); return 0; }
int main() { // START:context cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); // END:context // START:queue cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); // END:queue // START:kernel char* source = read_source("multiply_arrays.cl"); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL); // END:kernel // START:buffers cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS]; random_fill(a, NUM_ELEMENTS); random_fill(b, NUM_ELEMENTS); cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, a, NULL); cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, b, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL); // END:buffers // START:execute clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); size_t work_units = NUM_ELEMENTS; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL); // END:execute // START:results cl_float results[NUM_ELEMENTS]; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL); // END:results // START:cleanup clReleaseMemObject(inputA); clReleaseMemObject(inputB); clReleaseMemObject(output); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); // END:cleanup for (int i = 0; i < NUM_ELEMENTS; ++i) { printf("%f * %f = %f\n", a[i], b[i], results[i]); } return 0; }
int CommandGenerate::execute(const std::vector<std::string>& p_args) { if(p_args.size() < 10) { help(); return -1; } unsigned int platformId = atol(p_args[1].c_str()); unsigned int deviceId = atol(p_args[2].c_str()); unsigned int staggerSize = atol(p_args[3].c_str()); unsigned int threadsNumber = atol(p_args[4].c_str()); unsigned int hashesNumber = atol(p_args[5].c_str()); unsigned int nonceSize = PLOT_SIZE * staggerSize; std::cerr << "Threads number: " << threadsNumber << std::endl; std::cerr << "Hashes number: " << hashesNumber << std::endl; unsigned int numjobs = (p_args.size() - 5)/4; std::cerr << numjobs << " plot(s) to do." << std::endl; unsigned int staggerMbSize = staggerSize / 4; std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl; std::vector<std::string> paths(numjobs); std::vector<std::ofstream *> out_files(numjobs); std::vector<unsigned long long> addresses(numjobs); std::vector<unsigned long long> startNonces(numjobs); std::vector<unsigned long long> endNonces(numjobs); std::vector<unsigned int> noncesNumbers(numjobs); std::vector<unsigned char*> buffersCpu(numjobs); std::vector<bool> saving_thread_flags(numjobs); std::vector<std::future<void>> save_threads(numjobs); unsigned long long maxNonceNumber = 0; unsigned long long totalNonces = 0; int returnCode = 0; try { for (unsigned int i = 0; i < numjobs; i++) { std::cerr << "----" << std::endl; std::cerr << "Job number " << i << std::endl; unsigned int argstart = 6 + i*4; paths[i] = std::string(p_args[argstart]); addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10); startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10); noncesNumbers[i] = atol(p_args[argstart+3].c_str()); maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]); totalNonces += noncesNumbers[i]; std::ostringstream outFile; outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \ noncesNumbers[i] << "_" << staggerSize; std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc; out_files[i] = new std::ofstream(outFile.str(), file_mode); assert(out_files[i]); if(noncesNumbers[i] % staggerSize != 0) { noncesNumbers[i] -= noncesNumbers[i] % staggerSize; noncesNumbers[i] += staggerSize; } endNonces[i] = startNonces[i] + noncesNumbers[i]; unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024; std::cerr << "Path: " << outFile.str() << std::endl; std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl; std::cerr << "Creating CPU buffer" << std::endl; buffersCpu[i] = new unsigned char[nonceSize]; if(!buffersCpu[i]) { throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)"); } saving_thread_flags[i] = false; std::cerr << "----" << std::endl; } cl_platform_id platforms[4]; cl_uint platformsNumber; cl_device_id devices[32]; cl_uint devicesNumber; cl_context context = 0; cl_command_queue commandQueue = 0; cl_mem bufferGpuGen = 0; cl_mem bufferGpuScoops = 0; cl_program program = 0; cl_kernel kernelStep1 = 0; cl_kernel kernelStep2 = 0; cl_kernel kernelStep3 = 0; int error; std::cerr << "Retrieving OpenCL platforms" << std::endl; error = clGetPlatformIDs(4, platforms, &platformsNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL platforms"); } if(platformId >= platformsNumber) { throw std::runtime_error("No platform found with the provided id"); } std::cerr << "Retrieving OpenCL GPU devices" << std::endl; error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL devices"); } if(deviceId >= devicesNumber) { throw std::runtime_error("No device found with the provided id"); } std::cerr << "Creating OpenCL context" << std::endl; context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL context"); } std::cerr << "Creating OpenCL command queue" << std::endl; commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL command queue"); } std::cerr << "Creating OpenCL GPU generation buffer" << std::endl; bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer"); } std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl; bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer"); } std::cerr << "Creating OpenCL program" << std::endl; std::string source = loadSource("kernel/nonce.cl"); const char* sources[] = {source.c_str()}; size_t sourcesLength[] = {source.length()}; program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL program"); } std::cerr << "Building OpenCL program" << std::endl; error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0); if(error != CL_SUCCESS) { size_t logSize; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize); char* log = new char[logSize]; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0); std::cerr << log << std::endl; delete[] log; throw OpenclError(error, "Unable to build the OpenCL program"); } std::cerr << "Creating OpenCL step1 kernel" << std::endl; kernelStep1 = clCreateKernel(program, "nonce_step1", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step2 kernel" << std::endl; kernelStep2 = clCreateKernel(program, "nonce_step2", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step3 kernel" << std::endl; kernelStep3 = clCreateKernel(program, "nonce_step3", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize); error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen); error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } size_t globalWorkSize = staggerSize; size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber; time_t startTime = time(0); unsigned int totalNoncesCompleted = 0; for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) { for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) { unsigned long long nonce = startNonces[jobnum] + nonce_ordinal; if (nonce > endNonces[jobnum]) { break; } std::cout << "Running with start nonce " << nonce << std::endl; // Is a cl_ulong always an unsigned long long? unsigned int error = 0; error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step1 kernel launch"); } unsigned int hashesSize = hashesNumber * HASH_SIZE; for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) { error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce); error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset); error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel launch"); } error = clFinish(commandQueue); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel finish"); } } totalNoncesCompleted += staggerSize; double percent = 100.0 * (double)totalNoncesCompleted / totalNonces; time_t currentTime = time(0); double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0; double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed; std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s"; std::cerr << "... "; error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step3 kernel launch"); } if (saving_thread_flags[jobnum]) { save_threads[jobnum].wait(); // Wait for last job to finish saving_thread_flags[jobnum] = false; } error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in synchronous read"); } saving_thread_flags[jobnum] = true; save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]); } } //Clean up for (unsigned int i = 0; i < paths.size(); i += 1) { if (saving_thread_flags[i]) { std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl; save_threads[i].wait(); saving_thread_flags[i] = false; std::cerr << "done waiting for final save" << std::endl; if (buffersCpu[i]) { delete[] buffersCpu[i]; } } } if(kernelStep3) { clReleaseKernel(kernelStep3); } if(kernelStep2) { clReleaseKernel(kernelStep2); } if(kernelStep1) { clReleaseKernel(kernelStep1); } if(program) { clReleaseProgram(program); } if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); } if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); } if(commandQueue) { clReleaseCommandQueue(commandQueue); } if(context) { clReleaseContext(context); } time_t currentTime = time(0); double elapsedTime = difftime(currentTime, startTime) / 60.0; double speed = (double)totalNonces / elapsedTime; std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s"; std::cerr << " " << std::endl; } catch(const OpenclError& ex) { std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl; returnCode = -1; } catch(const std::exception& ex) { std::cerr << "[ERROR] " << ex.what() << std::endl; returnCode = -1; } return returnCode; }
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", devices_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." << std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); memObjects[0] = input_buffer; memObjects[1] = output_buffer; double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err)); printf("setting up kernel args cl_mem:%lx \n",input_buffer); fflush(stdout); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA; i++) { double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;; CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { double data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL)); //printf(" %lg", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int main(int argc, char *argv[]) { //FILE *fp; cl_platform_id platform_id[2]; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret_code; cl_mem image_in_mem = NULL; cl_mem image_out_mem = NULL; cl_mem twiddle_factors_mem = NULL; cl_float2 *image_in_host; cl_float2 *twiddle_factors_host; cl_kernel kernel_twiddle_factors; cl_kernel kernel_matriz_transpose; cl_kernel kernel_lowpass_filter; pgm_t ipgm; pgm_t opgm; image_file_t *image_filename; char *output_filename; FILE *fp; const char *kernel_filename = C_NOME_ARQ_KERNEL; size_t source_size; char *source_str; cl_int i, j,n ,m; cl_int raio = 0; size_t global_wg[2]; size_t local_wg[2]; float *image_amplitudes; size_t log_size; char *log_file; cl_event kernels_events_out_fft[4]; cl_ulong kernel_runtime = (cl_ulong) 0; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; struct event_in_fft_t *fft_events; //=== Timer count start ============================================================================== timer_reset(); timer_start(); //=================================================================================================== if (argc < 2) { printf("**Erro: O arquivo de entrada eh necessario.\n"); exit(EXIT_FAILURE); } image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(EXIT_FAILURE); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); //=================================================================================================== /* Abrindo imagem do arquivo para objeto de memoria local*/ if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); n = ipgm.width; raio = n/8; m = (cl_int)(log((double)n)/log(2.0)); image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2)); twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j]; image_in_host[n*i + j].s[1] = (float)0; } } fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t)); kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event)); //=================================================================================================== CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms)); if (ret_num_platforms == 0 ) { fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_id[1]); //=================================================================================================== context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code); //=================================================================================================== /* Transfer data to memory buffer */ CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event)); image_tam = n*n*sizeof(cl_float2); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code); //=================================================================================================== ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //=================================================================================================== if (ret_code != CL_SUCCESS) { // Determine the size of the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); //=================================================================================================== // Allocate memory for the log log_file = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL); printf("%s\n", log_file); system("pause"); exit(0); } kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code); kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code); kernel_lowpass_filter = clCreateKernel(program, "lowpass_filter", &ret_code); /* Processa os fatores Wn*/ //=================================================================================================== CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem)); CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n/2, 1); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0])); //=================================================================================================== /* Executa a FFT em N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1])); //=================================================================================================== /* Executa a FFT N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]); //=================================================================================================== /* Processa o filtro passa baixa */ CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2])); //=================================================================================================== /* Obtem a FFT inversa*/ fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3])); //=================================================================================================== fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event)); //=================================================================================================== //== Total time elapsed ============================================================================ timer_stop(); tempo_total = get_elapsed_time(); //================================================================================================== //====== Get time of Profile Info ================================================================== // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); for (i = 0; i < MAX_CALL_FFT; i++) { kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; if (fft_events[i].kernel_normalize != NULL) { CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } } for (j=0; j < MAX_CALL_FFT*m; j++){ kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; /* save_log_debug(write_host_dev_run_time,fp); save_log_debug(read_dev_host_run_time,fp); close_log_debug(fp); */ image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //================================================================================================== save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== image_amplitudes = (float*)malloc(n*n*sizeof(float)); for (i=0; i < n; i++) { for (j=0; j < n; j++) { image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1])); } } //clFlush(cmd_queue); //clFinish(cmd_queue); opgm.width = n; opgm.height = n; normalizar_pgm(&opgm, image_amplitudes); escrever_pgm(&opgm, output_filename); //=================================================================================================== clFinish(cmd_queue); clReleaseKernel(kernel_twiddle_factors); clReleaseKernel(kernel_matriz_transpose); clReleaseKernel(kernel_lowpass_filter); clReleaseProgram(program); clReleaseMemObject(image_in_mem); clReleaseMemObject(image_out_mem); clReleaseMemObject(twiddle_factors_mem); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); clReleaseEvent(read_dev_host_event); clReleaseEvent(write_host_dev_event); clReleaseEvent(kernels_events_out_fft[0]); clReleaseEvent(kernels_events_out_fft[1]); clReleaseEvent(kernels_events_out_fft[2]); clReleaseEvent(kernels_events_out_fft[3]); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(image_amplitudes); free(source_str); free(image_in_host); free(image_filename); free(twiddle_factors_host); free(output_filename); free(fft_events); free(kernel_butter_events); //_CrtDumpMemoryLeaks(); return 0; }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelINIT; cl_event kernel_event, finish_event; cl_mem objPARTICULAS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "initparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelINIT = clCreateKernel(program, "calc_particles_init", &ret); // Creamos el buffer para las partÃculas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret); const size_t global = 4; const size_t local_work_size = 1; // Transferimos el frame al dispositivo cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N); // Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseKernel(kernelINIT); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength, int bLength, int keyLength, uint32_t** matches, char* clFile, int x, int y) { int gap = 0, myoffset = 0; cl_platform_id *platforms; cl_uint num_platforms = 0; cl_device_id *devices; cl_uint num_devices = 0; cl_context context; cl_command_queue command_queue; cl_image_format imgFormat; cl_mem aImg; cl_mem bImg; cl_mem res_buf; cl_program program; cl_kernel kernel; cl_uint *results; FILE *prgm_fptr; struct stat prgm_sbuf; char *prgm_data; size_t prgm_size; size_t offset; size_t count; const size_t global_work_size[] = { x, y }; const size_t origin[] = { 0, 0, 0 }; const size_t region[] = { aLength, 1, 1 }; cl_int ret; cl_uint i; cl_bool imageSupport; struct timeval t1, t2; double elapsedTime; results = malloc(sizeof(cl_uint) * aLength); imgFormat.image_channel_order = CL_RGBA; imgFormat.image_channel_data_type = CL_UNSIGNED_INT32; /* figure out how many CL platforms are available */ ret = clGetPlatformIDs(0, NULL, &num_platforms); if (CL_SUCCESS != ret) { print_error ("Error getting the number of platform IDs: %d", ret); exit(EXIT_FAILURE); } if (0 == num_platforms) { print_error ("No CL platforms were found."); exit(EXIT_FAILURE); } /* allocate space for each available platform ID */ if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get all of the platform IDs */ ret = clGetPlatformIDs(num_platforms, platforms, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting platform IDs: %d", ret); exit(EXIT_FAILURE); } /* find a platform that supports given device type */ // print_error ("Number of platforms found: %d", num_platforms); for (i = 0; i < num_platforms; i++) { ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL, &num_devices); if (CL_SUCCESS != ret) continue; if (0 < num_devices) break; } /* make sure at least one device was found */ if (num_devices == 0) { print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU")); exit(EXIT_FAILURE); } /* only one device is necessary... */ num_devices = 1; if (NULL == (devices = malloc((sizeof *devices) * num_devices))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get one device id */ ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices, devices, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting device IDs: %d", ret); exit(EXIT_FAILURE); } ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to get Device Info: %d", ret); exit(EXIT_FAILURE); } if(imageSupport == CL_FALSE) { print_error ("Failure: Images are not supported!"); exit(EXIT_FAILURE); } /* create a context for the CPU device that was found earlier */ context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret); if (NULL == context || CL_SUCCESS != ret) { print_error ("Failed to create context: %d", ret); exit(EXIT_FAILURE); } /* create a command queue for the CPU device */ command_queue = clCreateCommandQueue(context, devices[0], 0, &ret); if (NULL == command_queue || CL_SUCCESS != ret) { print_error ("Failed to create a command queue: %d", ret); exit(EXIT_FAILURE); } /* create buffers on the CL device */ aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == aImg || CL_SUCCESS != ret) { print_error ("Failed to create a image: %d", ret); exit(EXIT_FAILURE); } bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == bImg || CL_SUCCESS != ret) { print_error ("Failed to create b image: %d", ret); exit(EXIT_FAILURE); } int res_bufSize = aLength; res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * res_bufSize, NULL, &ret); if (NULL == res_buf || CL_SUCCESS != ret) { print_error ("Failed to create b buffer: %d", ret); exit(EXIT_FAILURE); } /* read the opencl program code into a string */ prgm_fptr = fopen(clFile, "r"); if (NULL == prgm_fptr) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } if (0 != stat(clFile, &prgm_sbuf)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } prgm_size = prgm_sbuf.st_size; prgm_data = malloc(prgm_size); if (NULL == prgm_data) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* make sure all data is read from the file (just in case fread returns * short) */ offset = 0; while (prgm_size - offset != (count = fread(prgm_data + offset, 1, prgm_size - offset, prgm_fptr))) offset += count; if (0 != fclose(prgm_fptr)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } /* create a 'program' from the source */ program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data, &prgm_size, &ret); if (NULL == program || CL_SUCCESS != ret) { print_error ("Failed to create program with source: %d", ret); exit(EXIT_FAILURE); } /* compile the program.. (it uses llvm or something) */ ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL); if (CL_SUCCESS != ret) { size_t size; char *log = calloc(1, 4000); if (NULL == log) { print_error ("Out of memory"); exit(EXIT_FAILURE); } print_error ("Failed to build program: %d", ret); ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 4096, log, &size); if (CL_SUCCESS != ret) { print_error ("Failed to get program build info: %d", ret); exit(EXIT_FAILURE); } fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log); exit(EXIT_FAILURE); } /* pull out a reference to your kernel */ kernel = clCreateKernel(program, "cgm_kernel", &ret); if (NULL == kernel || CL_SUCCESS != ret) { print_error ("Failed to create kernel: %d", ret); exit(EXIT_FAILURE); } gettimeofday(&t1, NULL); /* write data to these buffers */ clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0, (void*) aImg, 0, NULL, NULL); clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0, (void*) bImg, 0, NULL, NULL); /* set your kernel's arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 4, sizeof(int), &gap); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } /* make sure buffers have been written before executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* enque this kernel for execution... */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue kernel: %d", ret); exit(EXIT_FAILURE); } /* wait for the kernel to finish executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* copy the contents of dev_buf from the CL device to the host (CPU) */ ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint) * aLength, results, 0, NULL, NULL); gettimeofday(&t2, NULL); elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms if (CL_SUCCESS != ret) { print_error ("Failed to copy data from device to host: %d", ret); exit(EXIT_FAILURE); } ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* make sure the content of the buffer are what we expect */ //for (i = 0; i < aLength; i++) // printf("%d\n", results[i]); /* free up resources */ ret = clReleaseKernel(kernel); if (CL_SUCCESS != ret) { print_error ("Failed to release kernel: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseProgram(program); if (CL_SUCCESS != ret) { print_error ("Failed to release program: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(aImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(bImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) { print_error ("Failed to release command queue: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseContext(context))) { print_error ("Failed to release context: %d", ret); exit(EXIT_FAILURE); } matches = &results; return elapsedTime; }
void JNIContext::dispose(JNIEnv *jenv, Config* config) { //fprintf(stdout, "dispose()\n"); cl_int status = CL_SUCCESS; jenv->DeleteGlobalRef(kernelObject); jenv->DeleteGlobalRef(kernelClass); if (context != 0){ status = clReleaseContext(context); //fprintf(stdout, "dispose context %0lx\n", context); CLException::checkCLError(status, "clReleaseContext()"); context = (cl_context)0; } if (commandQueue != 0){ if (config->isTrackingOpenCLResources()){ commandQueueList.remove((cl_command_queue)commandQueue, __LINE__, __FILE__); } status = clReleaseCommandQueue((cl_command_queue)commandQueue); //fprintf(stdout, "dispose commandQueue %0lx\n", commandQueue); CLException::checkCLError(status, "clReleaseCommandQueue()"); commandQueue = (cl_command_queue)0; } if (program != 0){ status = clReleaseProgram((cl_program)program); //fprintf(stdout, "dispose program %0lx\n", program); CLException::checkCLError(status, "clReleaseProgram()"); program = (cl_program)0; } if (kernel != 0){ status = clReleaseKernel((cl_kernel)kernel); //fprintf(stdout, "dispose kernel %0lx\n", kernel); CLException::checkCLError(status, "clReleaseKernel()"); kernel = (cl_kernel)0; } if (argc > 0){ for (int i=0; i< argc; i++){ KernelArg *arg = args[i]; if (!arg->isPrimitive()){ if (arg->arrayBuffer != NULL){ if (arg->arrayBuffer->mem != 0){ if (config->isTrackingOpenCLResources()){ memList.remove((cl_mem)arg->arrayBuffer->mem, __LINE__, __FILE__); } status = clReleaseMemObject((cl_mem)arg->arrayBuffer->mem); //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->arrayBuffer->mem); CLException::checkCLError(status, "clReleaseMemObject()"); arg->arrayBuffer->mem = (cl_mem)0; } if (arg->arrayBuffer->javaArray != NULL) { jenv->DeleteWeakGlobalRef((jweak) arg->arrayBuffer->javaArray); } delete arg->arrayBuffer; arg->arrayBuffer = NULL; } } if (arg->name != NULL){ free(arg->name); arg->name = NULL; } if (arg->javaArg != NULL ) { jenv->DeleteGlobalRef((jobject) arg->javaArg); } delete arg; arg=args[i]=NULL; } delete[] args; args=NULL; // do we need to call clReleaseEvent on any of these that are still retained.... delete[] readEvents; readEvents = NULL; delete[] writeEvents; writeEvents = NULL; delete[] executeEvents; executeEvents = NULL; if (config->isProfilingEnabled()) { if (config->isProfilingCSVEnabled()) { if (profileFile != NULL && profileFile != stderr) { fclose(profileFile); } } delete[] readEventArgs; readEventArgs=0; delete[] writeEventArgs; writeEventArgs=0; } } if (config->isTrackingOpenCLResources()){ fprintf(stderr, "after dispose{ \n"); commandQueueList.report(stderr); memList.report(stderr); readEventList.report(stderr); executeEventList.report(stderr); writeEventList.report(stderr); fprintf(stderr, "}\n"); } }
static void clrpc_client_test2(void) { int err; int size = 1024; cl_uint nplatforms = 0; cl_platform_id* platforms = 0; cl_uint nplatforms_ret; clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); printf( "after call one i get nplatforms_ret = %d", nplatforms_ret); if (nplatforms_ret == 0) exit(1); nplatforms = nplatforms_ret; platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id)); clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); int i; for(i=0;i<nplatforms;i++) { clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj; int is_rpc; if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) { printf( "platforms[%d] local=%p remote=%p\n", i,(void*)tmp->local, (void*)tmp->remote); } else { printf( "platforms[%d] not RPC\n",i); } } char buffer[1024]; size_t sz; cl_platform_id rpc_platform = 0; for(i=0;i<nplatforms;i++) { clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } int iplat; for(iplat=0;iplat<nplatforms;iplat++) { printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat); cl_uint ndevices = 0; cl_device_id* devices = 0; cl_uint ndevices_ret; clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); printf( "after call one i get ndevices_ret = %d\n", ndevices_ret); if (ndevices_ret > 10) exit(-1); ndevices = ndevices_ret; devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id)); clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); if (!ndevices_ret) { //printf("no devices, stopping.\n"); //exit(1); printf("no devices, skipping.\n"); continue; } for(i=0;i<ndevices;i++) { clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj; clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz); printf( "CL_DEVICE_NAME |%s|\n",buffer); cl_platform_id tmpid; clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz); printf("%p\n",platforms[iplat]); fflush(stdout); printf("%p\n",tmpid); fflush(stdout); clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 }; printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]); cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err); cl_command_queue* cmdq = (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue)); for(i=0;i<ndevices;i++) { cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err); printf( "cmdq %d %p",i,cmdq[i]); } cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); int* a = (int*)malloc(1024*sizeof(int)); int* b = (int*)malloc(1024*sizeof(int)); int* c = (int*)malloc(1024*sizeof(int)); int* d = (int*)malloc(1024*sizeof(int)); char* prgsrc[] = { "__kernel void my_kern( int n, __global int* a, __global int* b )\n" " { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" }; size_t prgsrc_sz = strlen(prgsrc[0]) + 1; cl_program prg = clCreateProgramWithSource(ctx,1, (const char**)prgsrc,&prgsrc_sz,&err); clBuildProgram(prg,ndevices,devices,0,0,0); cl_kernel krn = clCreateKernel(prg,"my_kern",&err); int idev; for(idev=0;idev<ndevices;idev++) { printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat); for(i=0;i<size;i++) a[i] = i*10; for(i=0;i<size;i++) b[i] = i*10+1; for(i=0;i<size;i++) c[i] = 0; for(i=0;i<size;i++) d[i] = 0; cl_event ev[8]; for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n"); clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a, 0,0,&ev[0]); clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b, 1,ev,&ev[1]); clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 2,ev,&ev[2]); clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 3,ev,&ev[3]); size_t offset = 0; size_t gwsz = 128; size_t lwsz = 16; clSetKernelArg(krn,0,sizeof(int),&size); clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]); clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]); clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 6,ev,&ev[6]); clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 7,ev,&ev[7]); clFlush(cmdq[idev]); clWaitForEvents(8,ev); for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n"); for(i=0;i<8;i++) clReleaseEvent(ev[i]); } clReleaseKernel(krn); clReleaseProgram(prg); clReleaseMemObject(a_buf); clReleaseMemObject(b_buf); clReleaseMemObject(c_buf); clReleaseMemObject(d_buf); clReleaseCommandQueue(cmdq[0]); clReleaseContext(ctx); // printf("sleeping ...\n"); // sleep(1); } // clrpc_final(); }
//////////////////////////////////////////////////////////////////////////////////// // Measure the local memoy to local memoy bandwidth. //////////////////////////////////////////////////////////////////////////////////// int measureLocalMemory(cl_device_id device_id, cl_context context, cl_command_queue commands, unsigned int type, int f4, unsigned int elements, unsigned int iterations, bool larg, double time_taken[2]) { cl_int err = CL_SUCCESS; const char* source_path = "mem_streaming.cl"; char buf[512]; int elementsToAlloc = elements; size_t local, global; for(size_t ws = 0; ws <= 1; ++ws) { if(ws == 0) { // Execute the kernel using just one single workitem local = 1; global = 1; } else { // Execute the kernel using the max number of threads on each processor _DEVICE_INFO* info = get_device_info(device_id); size_t* tmp = info->max_work_item_sizes; local = tmp[0]; free(tmp); global = info->max_compute_units; while(local > elements) local /= 2; global *= local; } if(type == 1) elementsToAlloc = (elements + local-1)/local; if(f4 == 0) sprintf(buf, "#define dtype float\n"); else sprintf(buf, "#define dtype float%d\n", (int)pow(2.0, f4)); sprintf(buf+strlen(buf), "#define VEC %d\n#define ELEMENTS %d\n#define localRange %lu\n", f4, elementsToAlloc, local); if(larg) sprintf(buf+strlen(buf), "#define LARG\n"); cl_program program = load_kernel(source_path, context, buf); if(!program) { fprintf(stderr, "Error: Failed to create compute program!\n"); return 1; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { size_t len; char buffer[8096]; fprintf(stderr, "Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); fprintf(stderr, "%s\n", buffer); return 1; } // Create the compute kernel cl_kernel kernel; switch(type) { case 1: kernel = clCreateKernel(program, "private_mem", &err); break; case 2: kernel = clCreateKernel(program, "global_mem", &err); break; default: kernel = clCreateKernel(program, "local_mem", &err); } if (!kernel || err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to create compute kernel!\n"); return 1; } float* hOutput = (float*)malloc(global * sizeof(float)); memset(hOutput, 0, global * sizeof(float)); cl_mem output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * global, hOutput, NULL); if (!output || err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to allocate device memory!\n"); return 1; } // Set the arguments to our compute kernel err = CL_SUCCESS; err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &output); cl_mem g1, g2; switch(type) { case 1: break; case 2: switch(f4) { case(1): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2) * elements*2, NULL, NULL); break; case(2): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * elements*2, NULL, NULL); break; case(3): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * elements*2, NULL, NULL); break; case(4): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float16) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float16) * elements*2, NULL, NULL); break; default: g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * elements*2, NULL, NULL); break; break; } err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &g1); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &g2); break; default: if(larg) switch(f4) { case(1): err |= clSetKernelArg(kernel, 1, sizeof(cl_float2)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float2)*elements*2, NULL); break; case(2): err |= clSetKernelArg(kernel, 1, sizeof(cl_float4)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float4)*elements*2, NULL); break; case(3): err |= clSetKernelArg(kernel, 1, sizeof(cl_float8)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float8)*elements*2, NULL); break; case(4): err |= clSetKernelArg(kernel, 1, sizeof(cl_float8)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float8)*elements*2, NULL); break; default: err |= clSetKernelArg(kernel, 1, sizeof(cl_float)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float)*elements*2, NULL); break; break; } } if (err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to set kernel arguments! %d\n", err); return 1; } // warmup for(unsigned i = 0; i < WARMUP_CYCLES; ++i) { err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); } // start actual measurement unsigned long start_time = current_msecs(); for(unsigned i = 0; i < iterations; ++i) { err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { fprintf(stderr, "Error %i: Failed to execute kernel!\n%s\n", err, oclErrorString(err)); return 1; } clFlush(commands); } clFinish(commands); time_taken[ws] = elapsed_msecs(start_time) / 1000.0; /* cl_event read; err = clEnqueueReadBuffer(commands, output, CL_FALSE, 0, global*sizeof(float), hOutput, 0, NULL, &read); if (err) { fprintf(stderr, "Error %i: Failed read buffer!\n%s\n", err, oclErrorString(err)); return 1; } clWaitForEvents(1, &read); for(size_t i = 0; i < global; ++i) printf(", %d %f ", i, hOutput[i]); printf("\n\n"); */ free(hOutput); clReleaseMemObject(output); if(type == 2) { clReleaseMemObject(g1); clReleaseMemObject(g2); } clReleaseProgram(program); clReleaseKernel(kernel); } return err; }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; cl_command_queue queue; cl_uint numOfPlatforms; cl_int error; cl_mem matrixAMemObj; // input matrix A mem buffer cl_mem matrixBMemObj; // input matrix B mem buffer cl_mem matrixCMemObj; // input matrix C mem buffer cl_int* matrixA; // input matrix A cl_int* matrixB; // input matrix B cl_int* matrixC; // input matrix C cl_uint widthA = WIDTH_G; cl_uint heightA = HEIGHT_G; cl_uint widthB = WIDTH_G; cl_uint heightB = HEIGHT_G; { // allocate memory for input and output matrices // based on whatever matrix theory i know. matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int)); matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int)); matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int)); memset(matrixA, 0, widthA * heightA * sizeof(cl_int)); memset(matrixB, 0, widthB * heightB * sizeof(cl_int)); memset(matrixC, 0, widthB * heightA * sizeof(cl_int)); fillRandom(matrixA, widthA, heightA, 643); fillRandom(matrixB, widthB, heightB, 991); } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_int i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { perror("Can't locate a OpenCL compliant device i.e. GPU"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"mmult.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; const char options[] = ""; size_t log_size; error = clBuildProgram(program, 1, &device, options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } // Queue is created with profiling enabled cl_command_queue_properties props; props |= CL_QUEUE_PROFILING_ENABLE; queue = clCreateCommandQueue(context, device, props, &error); cl_kernel kernel = clCreateKernel(program, "mmmult", &error); matrixAMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthA * heightA * sizeof(cl_int), matrixA, &error); matrixBMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthB * heightB * sizeof(cl_int), matrixB, &error); matrixCMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, widthB * heightA * sizeof(cl_int), 0, &error); clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB); clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA); clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj); clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj); clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj); size_t globalThreads[] = {heightA}; size_t localThreads[] = {256}; cl_event exeEvt; cl_ulong executionStart, executionEnd; error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &exeEvt); clWaitForEvents(1, &exeEvt); if(error != CL_SUCCESS) { printf("Kernel execution failure!\n"); exit(-22); } // let's understand how long it took? clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL); clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL); clReleaseEvent(exeEvt); printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000); printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart)); clEnqueueReadBuffer(queue, matrixCMemObj, CL_TRUE, 0, widthB * heightA * sizeof(cl_int), matrixC, 0, NULL, NULL); if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB)) printf("Passed!\n"); else printf("Failed!\n"); /* Clean up */ for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); clReleaseMemObject(matrixAMemObj); clReleaseMemObject(matrixBMemObj); clReleaseMemObject(matrixCMemObj); } free(matrixA); free(matrixB); free(matrixC); }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
double runCode(double input,double input2){ /* OpenCL structures */ cl_device_id device; cl_context context; cl_program program; cl_kernel kernel; cl_command_queue queue; cl_int err; size_t global_size; double output; cl_mem output_buffer; cl_mem input_buffer; /* Create device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build program */ program = build_program(context, device, PROGRAM_FILE); /* Create data buffer */ //This effectively means having only a single work-item, which means no //paraellizm. That's okay, this is only a test. global_size = 1; input_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double), &input, &err); if(err < 0) { fprintf(stderr,"Couldn't create input Buffer: %d\n",err); exit(1); }; output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double), NULL, &err); if(err < 0) { fprintf(stderr,"Couldn't create output Buffer: %d\n",err); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Create a kernel */ //kernel = clCreateKernel(program, KERNEL_FUNC, &err); kernel = clCreateKernel(program, "test", &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 2, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 3, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 4, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 5, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 6, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_buffer); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL); if(err < 0) { fprintf(stderr,"Couldn't enqueue the kernel, error code %d\n",err); exit(1); } /* Read the kernel's output */ err = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(output), &output, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Deallocate resources */ clReleaseKernel(kernel); clReleaseMemObject(output_buffer); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return output; }