OPENCL_EXPERIMENTS_EXPORT cl_int opencl_plugin_create(opencl_plugin *plugin_out) { cl_int err = CL_SUCCESS; opencl_plugin plugin; cl_int i; cl_int num_queues = 50; assert(plugin_out != NULL); plugin = calloc(1, sizeof(*plugin)); CHECK_ALLOCATION(plugin); if (get_desired_platform("NVIDIA", &plugin->selected_platform, &err)) goto error; if (get_gpu_device_id(plugin->selected_platform, &plugin->selected_device, CL_TRUE, &err)) goto error; if (create_context(plugin->selected_platform, plugin->selected_device, &plugin->context, &err)) goto error; if (build_program_from_file("program.cl", NULL, plugin->context, plugin->selected_device, &plugin->program, &err)) goto error; plugin->queue = clCreateCommandQueue(plugin->context, plugin->selected_device, 0, &err); CHECK_CL_ERROR(err); plugin->num_queues = num_queues; plugin->queues = calloc(num_queues, sizeof(cl_command_queue)); CHECK_ALLOCATION(plugin->queues); for (i = 0; i < num_queues; i++) { plugin->queues[i] = clCreateCommandQueue(plugin->context, plugin->selected_device, 0, &err); CHECK_CL_ERROR(err); } plugin->voxelize_kernel = clCreateKernel(plugin->program, "voxelize", &err); CHECK_CL_ERROR(err); *plugin_out = plugin; return 0; error: if (plugin) { if (plugin->voxelize_kernel) clReleaseKernel(plugin->voxelize_kernel); if (plugin->queue) clReleaseCommandQueue(plugin->queue); if (plugin->queues) { for (i = 0; i < num_queues; i++) { if (plugin->queues[i]) clReleaseCommandQueue(plugin->queues[i]); } free(plugin->queues); } if (plugin->context) clReleaseContext(plugin->context); free(plugin); } return -1; }
T profileReduce(ReduceType datatype, cl_int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int whichKernel, int testIterations, bool cpuFinalReduction, int cpuFinalThreshold, double* dTotalTime, T* h_odata, cl_mem d_idata, cl_mem d_odata) { T gpu_result = 0; bool needReadBack = true; cl_kernel finalReductionKernel[10]; int finalReductionIterations=0; //shrLog("Profile Kernel %d\n", whichKernel); cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) ); clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata); clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata); clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n); clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL); if( !cpuFinalReduction ) { int s=numBlocks; int threads = 0, blocks = 0; int kernel = (whichKernel == 6) ? 5 : whichKernel; while(s > cpuFinalThreshold) { getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) ); clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata); clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata); clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n); clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL); if (kernel < 3) s = (s + threads - 1) / threads; else s = (s + (threads*2-1)) / (threads*2); finalReductionIterations++; } } size_t globalWorkSize[1]; size_t localWorkSize[1]; for (int i = 0; i < testIterations; ++i) { gpu_result = 0; clFinish(cqCommandQueue); if(i>0) shrDeltaT(1); // execute the kernel globalWorkSize[0] = numBlocks * numThreads; localWorkSize[0] = numThreads; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize, 0, NULL, NULL); // check if kernel execution generated an error oclCheckError(ciErrNum, CL_SUCCESS); if (cpuFinalReduction) { // sum partial sums from each block on CPU // copy result from device to host clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T), h_odata, 0, NULL, NULL); for(int i=0; i<numBlocks; i++) { gpu_result += h_odata[i]; } needReadBack = false; } else { // sum partial block sums on GPU int s=numBlocks; int kernel = (whichKernel == 6) ? 5 : whichKernel; int it = 0; while(s > cpuFinalThreshold) { int threads = 0, blocks = 0; getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); globalWorkSize[0] = threads * blocks; localWorkSize[0] = threads; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0, globalWorkSize, localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); if (kernel < 3) s = (s + threads - 1) / threads; else s = (s + (threads*2-1)) / (threads*2); it++; } if (s > 1) { // copy result from device to host clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T), h_odata, 0, NULL, NULL); for(int i=0; i < s; i++) { gpu_result += h_odata[i]; } needReadBack = false; } } clFinish(cqCommandQueue); if(i>0) *dTotalTime += shrDeltaT(1); } if (needReadBack) { // copy final sum from device to host clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T), &gpu_result, 0, NULL, NULL); } // Release the kernels clReleaseKernel(reductionKernel); if( !cpuFinalReduction ) { for(int it=0; it<finalReductionIterations; ++it) { clReleaseKernel(finalReductionKernel[it]); } } return gpu_result; }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
int main() { // START:context cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); // END:context // START:queue cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); // END:queue // START:kernel char* source = read_source("multiply_arrays.cl"); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL); // END:kernel // START:buffers cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS]; random_fill(a, NUM_ELEMENTS); random_fill(b, NUM_ELEMENTS); cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, a, NULL); cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, b, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL); // END:buffers // START:execute clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); size_t work_units = NUM_ELEMENTS; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL); // END:execute // START:results cl_float results[NUM_ELEMENTS]; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL); // END:results // START:cleanup clReleaseMemObject(inputA); clReleaseMemObject(inputB); clReleaseMemObject(output); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); // END:cleanup for (int i = 0; i < NUM_ELEMENTS; ++i) { printf("%f * %f = %f\n", a[i], b[i], results[i]); } return 0; }
int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_mxpa/histo_intermediates.cl", "src/opencl_mxpa/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) / (int)final_localWS[0])*(int)final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // __syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
void clInvert3D(CL* cl, VglImage* img){ cl_int err; cl_image_desc desc = getDesc(img->shape[0], img->shape[1], 3, img->shape[2]); cl_image_desc descOut = getDesc(img->shape[0], img->shape[1], 3, img->shape[2]); cl_image_format src; cl_image_format out; switch(img->nChannels){ case 1: src.image_channel_order = CL_A; out.image_channel_order = CL_A; break; case 3: rgb2rgba(NULL, img); src.image_channel_order = CL_RGBA; out.image_channel_order = CL_RGBA; break; case 4: src.image_channel_order = CL_RGBA; out.image_channel_order = CL_RGBA; break; default: printf("Numero de canais não suportado\n"); exit; } src.image_channel_data_type = CL_UNORM_INT8; out.image_channel_data_type = CL_UNORM_INT8; cl_mem src_mem = clCreateImage(cl->context, CL_MEM_READ_ONLY, &src, &desc, NULL, &err); printf("IMAGE STATUS SOURCE\t"); cl_error(err); cl_mem out_mem = clCreateImage(cl->context, CL_MEM_WRITE_ONLY, &out, &descOut, NULL, &err); printf("IMAGE STATUS OUT\t"); cl_error(err); clGetMemObjectInfo(src_mem, CL_MEM_TYPE, sizeof(cl_int), &err, NULL); if(err == CL_MEM_OBJECT_IMAGE3D) printf("IMAGE TYPE:\t\tCL_MEM_OBJECT_IMAGE3D\n"); size_t *src_origin=(size_t*)malloc(sizeof(size_t)*3); src_origin[0] = 0; src_origin[1] = 0; src_origin[2] = 0; size_t *src_region=(size_t*)malloc(sizeof(size_t)*3); src_region[0] = img->shape[0]; src_region[1] = img->shape[1]; src_region[2] = img->shape[2]; err = clEnqueueWriteImage(cl->queue, src_mem, CL_TRUE, src_origin, src_region, 0, 0, (void*)img->ndarray, 0, 0, NULL); printf("ENQUEUE IMAGE STATUS "); cl_error(err); cl_program program; cl_kernel kernel; const char* k = "./CLdemos/CL/Invert3D_RGBA.cl"; const char* k2 = "./CLdemos/CL/Invert3D_A.cl"; char** fonte; if(img->nChannels==1) fonte = (char**)getKernelPtr(k2); if(img->nChannels==4) fonte = (char**)getKernelPtr(k); program = clCreateProgramWithSource(cl->context, 1, (const char**)fonte, NULL, &err); printf("CREATE PROGRAM STATUS: "); cl_error(err); clBuildProgram(program, 0, NULL, NULL, NULL, &err); printf("BUILD PROGRAM STATUS: "); cl_error(err); kernel = clCreateKernel(program, "invert", &err); printf("KERNEL STATUS "); cl_error(err); err = clSetKernelArg( kernel, 0, sizeof( cl_mem ), (void *) &src_mem); printf("SET 1 KERNEL ARG "); cl_error(err); err = clSetKernelArg( kernel, 1, sizeof( cl_mem ), (void *) &out_mem); printf("SET 2 KERNEL ARG "); cl_error(err); size_t worksize[] = { img->shape[0], img->shape[1], img->shape[2]}; err = clEnqueueNDRangeKernel(cl->queue, kernel, 2, NULL, worksize, 0, 0, 0, 0); printf("ENQUEUE ND KERNEL STATUS "); cl_error(err); clFinish(cl->queue); char* auxout = (char*)malloc(img->shape[0]*img->shape[1]*img->shape[2]*img->nChannels); err = clEnqueueReadImage(cl->queue, out_mem, CL_TRUE, src_origin, src_region, 0, 0, auxout, 0, NULL, NULL); printf("READ NEW IMAGE STATUS "); cl_error(err); for(int i=0; i<(img->shape[0]*img->nChannels*img->shape[1]*img->shape[2]); i++) img->ndarray[i] = auxout[i]; free(auxout); clReleaseKernel(kernel); clReleaseProgram(program); }
int fft_main(cl_mem dst, cl_mem src, cl_mem twiddles, cl_int m, enum Tipo direcao, struct event_in_fft_t *fft_event) { cl_int ret_code; cl_int iter; cl_uint flag; size_t global_wg[2]; size_t local_wg[2]; cl_int n = 1 << m; cl_kernel kernel_bits_rev = NULL; cl_kernel kernel_butterfly_op = NULL; cl_kernel kernel_normalize = NULL; kernel_bits_rev = clCreateKernel(program, "bits_reverse", &ret_code); kernel_butterfly_op = clCreateKernel(program, "butterfly_operation", &ret_code); kernel_normalize = clCreateKernel(program, "normalizar", &ret_code); switch (direcao) { case direta:flag = 0x00000000; break; case inversa:flag = 0x80000000; break; } CL_CHECK(clSetKernelArg(kernel_bits_rev, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(clSetKernelArg(kernel_bits_rev, 1, sizeof(cl_mem), (void *)&src)); CL_CHECK(clSetKernelArg(kernel_bits_rev, 2, sizeof(cl_int), (void *)&m)); CL_CHECK(clSetKernelArg(kernel_bits_rev, 3, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 1, sizeof(cl_mem), (void *)&twiddles)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 2, sizeof(cl_int), (void *)&m)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 3, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_butterfly_op, 5, sizeof(cl_uint), (void *)&flag)); CL_CHECK(clSetKernelArg(kernel_normalize, 0, sizeof(cl_mem), (void *)&dst)); CL_CHECK(clSetKernelArg(kernel_normalize, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_bits_rev, 2, NULL, global_wg, local_wg, 0, NULL, &fft_event->kernel_bitsrev)); config_workgroup_size(global_wg, local_wg, n/2, n); for (iter = 1; iter <= m; iter++) { CL_CHECK(clSetKernelArg(kernel_butterfly_op, 4, sizeof(cl_int), (void *)&iter)); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_butterfly_op, 2, NULL, global_wg, local_wg, 0, NULL, &kernel_butter_events[butter_event_it])); butter_event_it++; } fft_event->kernel_normalize = NULL; if (direcao == inversa) { config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_normalize, 2, NULL, global_wg, local_wg, 0, NULL, &fft_event->kernel_normalize)); } clReleaseKernel(kernel_bits_rev); clReleaseKernel(kernel_butterfly_op); clReleaseKernel(kernel_normalize); return 0; }
int MemoryOptimizations::cleanup() { /* Releases OpenCL resources (Context, Memory etc.) */ cl_int status; for(int i = 0; i < NUM_KERNELS; i++) { status = clReleaseKernel(kernel[i]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseKernel failed.")) return SDK_FAILURE; } status = clReleaseProgram(program); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseProgram failed.")) return SDK_FAILURE; status = clReleaseMemObject(inputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseMemObject(outputBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseMemObject failed.")) return SDK_FAILURE; status = clReleaseCommandQueue(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseCommandQueue failed.")) return SDK_FAILURE; status = clReleaseContext(context); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clReleaseContext failed.")) return SDK_FAILURE; /* release program resources (input memory etc.) */ if(input) free(input); if(output) free(output); /* release device list */ if(devices) free(devices); if(maxWorkItemSizes) free(maxWorkItemSizes); return SDK_SUCCESS; }
void JNIContext::dispose(JNIEnv *jenv, Config* config) { //fprintf(stdout, "dispose()\n"); cl_int status = CL_SUCCESS; jenv->DeleteGlobalRef(kernelObject); jenv->DeleteGlobalRef(kernelClass); if (context != 0){ status = clReleaseContext(context); //fprintf(stdout, "dispose context %0lx\n", context); CLException::checkCLError(status, "clReleaseContext()"); context = (cl_context)0; } if (commandQueue != 0){ if (config->isTrackingOpenCLResources()){ commandQueueList.remove((cl_command_queue)commandQueue, __LINE__, __FILE__); } status = clReleaseCommandQueue((cl_command_queue)commandQueue); //fprintf(stdout, "dispose commandQueue %0lx\n", commandQueue); CLException::checkCLError(status, "clReleaseCommandQueue()"); commandQueue = (cl_command_queue)0; } if (program != 0){ status = clReleaseProgram((cl_program)program); //fprintf(stdout, "dispose program %0lx\n", program); CLException::checkCLError(status, "clReleaseProgram()"); program = (cl_program)0; } if (kernel != 0){ status = clReleaseKernel((cl_kernel)kernel); //fprintf(stdout, "dispose kernel %0lx\n", kernel); CLException::checkCLError(status, "clReleaseKernel()"); kernel = (cl_kernel)0; } if (argc > 0){ for (int i=0; i< argc; i++){ KernelArg *arg = args[i]; if (!arg->isPrimitive()){ if (arg->arrayBuffer != NULL){ if (arg->arrayBuffer->mem != 0){ if (config->isTrackingOpenCLResources()){ memList.remove((cl_mem)arg->arrayBuffer->mem, __LINE__, __FILE__); } status = clReleaseMemObject((cl_mem)arg->arrayBuffer->mem); //fprintf(stdout, "dispose arg %d %0lx\n", i, arg->arrayBuffer->mem); CLException::checkCLError(status, "clReleaseMemObject()"); arg->arrayBuffer->mem = (cl_mem)0; } if (arg->arrayBuffer->javaArray != NULL) { jenv->DeleteWeakGlobalRef((jweak) arg->arrayBuffer->javaArray); } delete arg->arrayBuffer; arg->arrayBuffer = NULL; } } if (arg->name != NULL){ free(arg->name); arg->name = NULL; } if (arg->javaArg != NULL ) { jenv->DeleteGlobalRef((jobject) arg->javaArg); } delete arg; arg=args[i]=NULL; } delete[] args; args=NULL; // do we need to call clReleaseEvent on any of these that are still retained.... delete[] readEvents; readEvents = NULL; delete[] writeEvents; writeEvents = NULL; delete[] executeEvents; executeEvents = NULL; if (config->isProfilingEnabled()) { if (config->isProfilingCSVEnabled()) { if (profileFile != NULL && profileFile != stderr) { fclose(profileFile); } } delete[] readEventArgs; readEventArgs=0; delete[] writeEventArgs; writeEventArgs=0; } } if (config->isTrackingOpenCLResources()){ fprintf(stderr, "after dispose{ \n"); commandQueueList.report(stderr); memList.report(stderr); readEventList.report(stderr); executeEventList.report(stderr); writeEventList.report(stderr); fprintf(stderr, "}\n"); } }
int main(int argc, char *argv[]) { //FILE *fp; cl_platform_id platform_id[2]; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret_code; cl_mem image_in_mem = NULL; cl_mem image_out_mem = NULL; cl_mem twiddle_factors_mem = NULL; cl_float2 *image_in_host; cl_float2 *twiddle_factors_host; cl_kernel kernel_twiddle_factors; cl_kernel kernel_matriz_transpose; cl_kernel kernel_lowpass_filter; pgm_t ipgm; pgm_t opgm; image_file_t *image_filename; char *output_filename; FILE *fp; const char *kernel_filename = C_NOME_ARQ_KERNEL; size_t source_size; char *source_str; cl_int i, j,n ,m; cl_int raio = 0; size_t global_wg[2]; size_t local_wg[2]; float *image_amplitudes; size_t log_size; char *log_file; cl_event kernels_events_out_fft[4]; cl_ulong kernel_runtime = (cl_ulong) 0; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; struct event_in_fft_t *fft_events; //=== Timer count start ============================================================================== timer_reset(); timer_start(); //=================================================================================================== if (argc < 2) { printf("**Erro: O arquivo de entrada eh necessario.\n"); exit(EXIT_FAILURE); } image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(EXIT_FAILURE); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); //=================================================================================================== /* Abrindo imagem do arquivo para objeto de memoria local*/ if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); n = ipgm.width; raio = n/8; m = (cl_int)(log((double)n)/log(2.0)); image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2)); twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j]; image_in_host[n*i + j].s[1] = (float)0; } } fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t)); kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event)); //=================================================================================================== CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms)); if (ret_num_platforms == 0 ) { fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_id[1]); //=================================================================================================== context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code); //=================================================================================================== /* Transfer data to memory buffer */ CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event)); image_tam = n*n*sizeof(cl_float2); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code); //=================================================================================================== ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //=================================================================================================== if (ret_code != CL_SUCCESS) { // Determine the size of the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); //=================================================================================================== // Allocate memory for the log log_file = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL); printf("%s\n", log_file); system("pause"); exit(0); } kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code); kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code); kernel_lowpass_filter = clCreateKernel(program, "lowpass_filter", &ret_code); /* Processa os fatores Wn*/ //=================================================================================================== CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem)); CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n/2, 1); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0])); //=================================================================================================== /* Executa a FFT em N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1])); //=================================================================================================== /* Executa a FFT N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]); //=================================================================================================== /* Processa o filtro passa baixa */ CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2])); //=================================================================================================== /* Obtem a FFT inversa*/ fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3])); //=================================================================================================== fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event)); //=================================================================================================== //== Total time elapsed ============================================================================ timer_stop(); tempo_total = get_elapsed_time(); //================================================================================================== //====== Get time of Profile Info ================================================================== // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); for (i = 0; i < MAX_CALL_FFT; i++) { kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; if (fft_events[i].kernel_normalize != NULL) { CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } } for (j=0; j < MAX_CALL_FFT*m; j++){ kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; /* save_log_debug(write_host_dev_run_time,fp); save_log_debug(read_dev_host_run_time,fp); close_log_debug(fp); */ image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //================================================================================================== save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== image_amplitudes = (float*)malloc(n*n*sizeof(float)); for (i=0; i < n; i++) { for (j=0; j < n; j++) { image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1])); } } //clFlush(cmd_queue); //clFinish(cmd_queue); opgm.width = n; opgm.height = n; normalizar_pgm(&opgm, image_amplitudes); escrever_pgm(&opgm, output_filename); //=================================================================================================== clFinish(cmd_queue); clReleaseKernel(kernel_twiddle_factors); clReleaseKernel(kernel_matriz_transpose); clReleaseKernel(kernel_lowpass_filter); clReleaseProgram(program); clReleaseMemObject(image_in_mem); clReleaseMemObject(image_out_mem); clReleaseMemObject(twiddle_factors_mem); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); clReleaseEvent(read_dev_host_event); clReleaseEvent(write_host_dev_event); clReleaseEvent(kernels_events_out_fft[0]); clReleaseEvent(kernels_events_out_fft[1]); clReleaseEvent(kernels_events_out_fft[2]); clReleaseEvent(kernels_events_out_fft[3]); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(image_amplitudes); free(source_str); free(image_in_host); free(image_filename); free(twiddle_factors_host); free(output_filename); free(fft_events); free(kernel_butter_events); //_CrtDumpMemoryLeaks(); return 0; }
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength, int bLength, int keyLength, uint32_t** matches, char* clFile, int x, int y) { int gap = 0, myoffset = 0; cl_platform_id *platforms; cl_uint num_platforms = 0; cl_device_id *devices; cl_uint num_devices = 0; cl_context context; cl_command_queue command_queue; cl_image_format imgFormat; cl_mem aImg; cl_mem bImg; cl_mem res_buf; cl_program program; cl_kernel kernel; cl_uint *results; FILE *prgm_fptr; struct stat prgm_sbuf; char *prgm_data; size_t prgm_size; size_t offset; size_t count; const size_t global_work_size[] = { x, y }; const size_t origin[] = { 0, 0, 0 }; const size_t region[] = { aLength, 1, 1 }; cl_int ret; cl_uint i; cl_bool imageSupport; struct timeval t1, t2; double elapsedTime; results = malloc(sizeof(cl_uint) * aLength); imgFormat.image_channel_order = CL_RGBA; imgFormat.image_channel_data_type = CL_UNSIGNED_INT32; /* figure out how many CL platforms are available */ ret = clGetPlatformIDs(0, NULL, &num_platforms); if (CL_SUCCESS != ret) { print_error ("Error getting the number of platform IDs: %d", ret); exit(EXIT_FAILURE); } if (0 == num_platforms) { print_error ("No CL platforms were found."); exit(EXIT_FAILURE); } /* allocate space for each available platform ID */ if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get all of the platform IDs */ ret = clGetPlatformIDs(num_platforms, platforms, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting platform IDs: %d", ret); exit(EXIT_FAILURE); } /* find a platform that supports given device type */ // print_error ("Number of platforms found: %d", num_platforms); for (i = 0; i < num_platforms; i++) { ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL, &num_devices); if (CL_SUCCESS != ret) continue; if (0 < num_devices) break; } /* make sure at least one device was found */ if (num_devices == 0) { print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU")); exit(EXIT_FAILURE); } /* only one device is necessary... */ num_devices = 1; if (NULL == (devices = malloc((sizeof *devices) * num_devices))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get one device id */ ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices, devices, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting device IDs: %d", ret); exit(EXIT_FAILURE); } ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to get Device Info: %d", ret); exit(EXIT_FAILURE); } if(imageSupport == CL_FALSE) { print_error ("Failure: Images are not supported!"); exit(EXIT_FAILURE); } /* create a context for the CPU device that was found earlier */ context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret); if (NULL == context || CL_SUCCESS != ret) { print_error ("Failed to create context: %d", ret); exit(EXIT_FAILURE); } /* create a command queue for the CPU device */ command_queue = clCreateCommandQueue(context, devices[0], 0, &ret); if (NULL == command_queue || CL_SUCCESS != ret) { print_error ("Failed to create a command queue: %d", ret); exit(EXIT_FAILURE); } /* create buffers on the CL device */ aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == aImg || CL_SUCCESS != ret) { print_error ("Failed to create a image: %d", ret); exit(EXIT_FAILURE); } bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == bImg || CL_SUCCESS != ret) { print_error ("Failed to create b image: %d", ret); exit(EXIT_FAILURE); } int res_bufSize = aLength; res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * res_bufSize, NULL, &ret); if (NULL == res_buf || CL_SUCCESS != ret) { print_error ("Failed to create b buffer: %d", ret); exit(EXIT_FAILURE); } /* read the opencl program code into a string */ prgm_fptr = fopen(clFile, "r"); if (NULL == prgm_fptr) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } if (0 != stat(clFile, &prgm_sbuf)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } prgm_size = prgm_sbuf.st_size; prgm_data = malloc(prgm_size); if (NULL == prgm_data) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* make sure all data is read from the file (just in case fread returns * short) */ offset = 0; while (prgm_size - offset != (count = fread(prgm_data + offset, 1, prgm_size - offset, prgm_fptr))) offset += count; if (0 != fclose(prgm_fptr)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } /* create a 'program' from the source */ program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data, &prgm_size, &ret); if (NULL == program || CL_SUCCESS != ret) { print_error ("Failed to create program with source: %d", ret); exit(EXIT_FAILURE); } /* compile the program.. (it uses llvm or something) */ ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL); if (CL_SUCCESS != ret) { size_t size; char *log = calloc(1, 4000); if (NULL == log) { print_error ("Out of memory"); exit(EXIT_FAILURE); } print_error ("Failed to build program: %d", ret); ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 4096, log, &size); if (CL_SUCCESS != ret) { print_error ("Failed to get program build info: %d", ret); exit(EXIT_FAILURE); } fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log); exit(EXIT_FAILURE); } /* pull out a reference to your kernel */ kernel = clCreateKernel(program, "cgm_kernel", &ret); if (NULL == kernel || CL_SUCCESS != ret) { print_error ("Failed to create kernel: %d", ret); exit(EXIT_FAILURE); } gettimeofday(&t1, NULL); /* write data to these buffers */ clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0, (void*) aImg, 0, NULL, NULL); clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0, (void*) bImg, 0, NULL, NULL); /* set your kernel's arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 4, sizeof(int), &gap); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } /* make sure buffers have been written before executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* enque this kernel for execution... */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue kernel: %d", ret); exit(EXIT_FAILURE); } /* wait for the kernel to finish executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* copy the contents of dev_buf from the CL device to the host (CPU) */ ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint) * aLength, results, 0, NULL, NULL); gettimeofday(&t2, NULL); elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms if (CL_SUCCESS != ret) { print_error ("Failed to copy data from device to host: %d", ret); exit(EXIT_FAILURE); } ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* make sure the content of the buffer are what we expect */ //for (i = 0; i < aLength; i++) // printf("%d\n", results[i]); /* free up resources */ ret = clReleaseKernel(kernel); if (CL_SUCCESS != ret) { print_error ("Failed to release kernel: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseProgram(program); if (CL_SUCCESS != ret) { print_error ("Failed to release program: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(aImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(bImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) { print_error ("Failed to release command queue: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseContext(context))) { print_error ("Failed to release context: %d", ret); exit(EXIT_FAILURE); } matches = &results; return elapsedTime; }
static void clrpc_client_test2(void) { int err; int size = 1024; cl_uint nplatforms = 0; cl_platform_id* platforms = 0; cl_uint nplatforms_ret; clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); printf( "after call one i get nplatforms_ret = %d", nplatforms_ret); if (nplatforms_ret == 0) exit(1); nplatforms = nplatforms_ret; platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id)); clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); int i; for(i=0;i<nplatforms;i++) { clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj; int is_rpc; if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) { printf( "platforms[%d] local=%p remote=%p\n", i,(void*)tmp->local, (void*)tmp->remote); } else { printf( "platforms[%d] not RPC\n",i); } } char buffer[1024]; size_t sz; cl_platform_id rpc_platform = 0; for(i=0;i<nplatforms;i++) { clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } int iplat; for(iplat=0;iplat<nplatforms;iplat++) { printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat); cl_uint ndevices = 0; cl_device_id* devices = 0; cl_uint ndevices_ret; clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); printf( "after call one i get ndevices_ret = %d\n", ndevices_ret); if (ndevices_ret > 10) exit(-1); ndevices = ndevices_ret; devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id)); clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); if (!ndevices_ret) { //printf("no devices, stopping.\n"); //exit(1); printf("no devices, skipping.\n"); continue; } for(i=0;i<ndevices;i++) { clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj; clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz); printf( "CL_DEVICE_NAME |%s|\n",buffer); cl_platform_id tmpid; clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz); printf("%p\n",platforms[iplat]); fflush(stdout); printf("%p\n",tmpid); fflush(stdout); clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 }; printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]); cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err); cl_command_queue* cmdq = (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue)); for(i=0;i<ndevices;i++) { cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err); printf( "cmdq %d %p",i,cmdq[i]); } cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); int* a = (int*)malloc(1024*sizeof(int)); int* b = (int*)malloc(1024*sizeof(int)); int* c = (int*)malloc(1024*sizeof(int)); int* d = (int*)malloc(1024*sizeof(int)); char* prgsrc[] = { "__kernel void my_kern( int n, __global int* a, __global int* b )\n" " { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" }; size_t prgsrc_sz = strlen(prgsrc[0]) + 1; cl_program prg = clCreateProgramWithSource(ctx,1, (const char**)prgsrc,&prgsrc_sz,&err); clBuildProgram(prg,ndevices,devices,0,0,0); cl_kernel krn = clCreateKernel(prg,"my_kern",&err); int idev; for(idev=0;idev<ndevices;idev++) { printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat); for(i=0;i<size;i++) a[i] = i*10; for(i=0;i<size;i++) b[i] = i*10+1; for(i=0;i<size;i++) c[i] = 0; for(i=0;i<size;i++) d[i] = 0; cl_event ev[8]; for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n"); clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a, 0,0,&ev[0]); clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b, 1,ev,&ev[1]); clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 2,ev,&ev[2]); clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 3,ev,&ev[3]); size_t offset = 0; size_t gwsz = 128; size_t lwsz = 16; clSetKernelArg(krn,0,sizeof(int),&size); clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]); clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]); clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 6,ev,&ev[6]); clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 7,ev,&ev[7]); clFlush(cmdq[idev]); clWaitForEvents(8,ev); for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n"); for(i=0;i<8;i++) clReleaseEvent(ev[i]); } clReleaseKernel(krn); clReleaseProgram(prg); clReleaseMemObject(a_buf); clReleaseMemObject(b_buf); clReleaseMemObject(c_buf); clReleaseMemObject(d_buf); clReleaseCommandQueue(cmdq[0]); clReleaseContext(ctx); // printf("sleeping ...\n"); // sleep(1); } // clrpc_final(); }
//////////////////////////////////////////////////////////////////////////////////// // Measure the local memoy to local memoy bandwidth. //////////////////////////////////////////////////////////////////////////////////// int measureLocalMemory(cl_device_id device_id, cl_context context, cl_command_queue commands, unsigned int type, int f4, unsigned int elements, unsigned int iterations, bool larg, double time_taken[2]) { cl_int err = CL_SUCCESS; const char* source_path = "mem_streaming.cl"; char buf[512]; int elementsToAlloc = elements; size_t local, global; for(size_t ws = 0; ws <= 1; ++ws) { if(ws == 0) { // Execute the kernel using just one single workitem local = 1; global = 1; } else { // Execute the kernel using the max number of threads on each processor _DEVICE_INFO* info = get_device_info(device_id); size_t* tmp = info->max_work_item_sizes; local = tmp[0]; free(tmp); global = info->max_compute_units; while(local > elements) local /= 2; global *= local; } if(type == 1) elementsToAlloc = (elements + local-1)/local; if(f4 == 0) sprintf(buf, "#define dtype float\n"); else sprintf(buf, "#define dtype float%d\n", (int)pow(2.0, f4)); sprintf(buf+strlen(buf), "#define VEC %d\n#define ELEMENTS %d\n#define localRange %lu\n", f4, elementsToAlloc, local); if(larg) sprintf(buf+strlen(buf), "#define LARG\n"); cl_program program = load_kernel(source_path, context, buf); if(!program) { fprintf(stderr, "Error: Failed to create compute program!\n"); return 1; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { size_t len; char buffer[8096]; fprintf(stderr, "Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); fprintf(stderr, "%s\n", buffer); return 1; } // Create the compute kernel cl_kernel kernel; switch(type) { case 1: kernel = clCreateKernel(program, "private_mem", &err); break; case 2: kernel = clCreateKernel(program, "global_mem", &err); break; default: kernel = clCreateKernel(program, "local_mem", &err); } if (!kernel || err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to create compute kernel!\n"); return 1; } float* hOutput = (float*)malloc(global * sizeof(float)); memset(hOutput, 0, global * sizeof(float)); cl_mem output = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * global, hOutput, NULL); if (!output || err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to allocate device memory!\n"); return 1; } // Set the arguments to our compute kernel err = CL_SUCCESS; err |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &output); cl_mem g1, g2; switch(type) { case 1: break; case 2: switch(f4) { case(1): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float2) * elements*2, NULL, NULL); break; case(2): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * elements*2, NULL, NULL); break; case(3): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * elements*2, NULL, NULL); break; case(4): g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float16) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float16) * elements*2, NULL, NULL); break; default: g1 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * elements, NULL, NULL); g2 = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float) * elements*2, NULL, NULL); break; break; } err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &g1); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &g2); break; default: if(larg) switch(f4) { case(1): err |= clSetKernelArg(kernel, 1, sizeof(cl_float2)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float2)*elements*2, NULL); break; case(2): err |= clSetKernelArg(kernel, 1, sizeof(cl_float4)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float4)*elements*2, NULL); break; case(3): err |= clSetKernelArg(kernel, 1, sizeof(cl_float8)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float8)*elements*2, NULL); break; case(4): err |= clSetKernelArg(kernel, 1, sizeof(cl_float8)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float8)*elements*2, NULL); break; default: err |= clSetKernelArg(kernel, 1, sizeof(cl_float)*elements, NULL); err |= clSetKernelArg(kernel, 2, sizeof(cl_float)*elements*2, NULL); break; break; } } if (err != CL_SUCCESS) { fprintf(stderr, "Error: Failed to set kernel arguments! %d\n", err); return 1; } // warmup for(unsigned i = 0; i < WARMUP_CYCLES; ++i) { err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); } // start actual measurement unsigned long start_time = current_msecs(); for(unsigned i = 0; i < iterations; ++i) { err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { fprintf(stderr, "Error %i: Failed to execute kernel!\n%s\n", err, oclErrorString(err)); return 1; } clFlush(commands); } clFinish(commands); time_taken[ws] = elapsed_msecs(start_time) / 1000.0; /* cl_event read; err = clEnqueueReadBuffer(commands, output, CL_FALSE, 0, global*sizeof(float), hOutput, 0, NULL, &read); if (err) { fprintf(stderr, "Error %i: Failed read buffer!\n%s\n", err, oclErrorString(err)); return 1; } clWaitForEvents(1, &read); for(size_t i = 0; i < global; ++i) printf(", %d %f ", i, hOutput[i]); printf("\n\n"); */ free(hOutput); clReleaseMemObject(output); if(type == 2) { clReleaseMemObject(g1); clReleaseMemObject(g2); } clReleaseProgram(program); clReleaseKernel(kernel); } return err; }
bool runTest( int argc, const char** argv, ReduceType datatype) { int size = 1<<24; // number of elements to reduce int maxThreads; cl_kernel reductionKernel = getReductionKernel(datatype, 0, 64, 1); clReleaseKernel(reductionKernel); if (smallBlock) maxThreads = 64; // number of threads per block else maxThreads = 128; int whichKernel = 6; int maxBlocks = 64; bool cpuFinalReduction = false; int cpuFinalThreshold = 1; shrGetCmdLineArgumenti( argc, (const char**) argv, "n", &size); shrGetCmdLineArgumenti( argc, (const char**) argv, "threads", &maxThreads); shrGetCmdLineArgumenti( argc, (const char**) argv, "kernel", &whichKernel); shrGetCmdLineArgumenti( argc, (const char**) argv, "maxblocks", &maxBlocks); shrLog(" %d elements\n", size); shrLog(" %d threads (max)\n", maxThreads); cpuFinalReduction = (shrCheckCmdLineFlag( argc, (const char**) argv, "cpufinal") == shrTRUE); shrGetCmdLineArgumenti( argc, (const char**) argv, "cputhresh", &cpuFinalThreshold); bool runShmoo = (shrCheckCmdLineFlag(argc, (const char**) argv, "shmoo") == shrTRUE); #ifdef GPU_PROFILING if (runShmoo) { shmoo<T>(1, 33554432, maxThreads, maxBlocks, datatype); return true; } else #endif { // create random input data on CPU unsigned int bytes = size * sizeof(T); T* h_idata = (T*)malloc(bytes); for(int i=0; i<size; i++) { // Keep the numbers small so we don't get truncation error in the sum if (datatype == REDUCE_INT) h_idata[i] = (T)(rand() & 0xFF); else h_idata[i] = (rand() & 0xFF) / (T)RAND_MAX; } int numBlocks = 0; int numThreads = 0; getNumBlocksAndThreads(whichKernel, size, maxBlocks, maxThreads, numBlocks, numThreads); if (numBlocks == 1) cpuFinalThreshold = 1; shrLog(" %d blocks\n\n", numBlocks); // allocate mem for the result on host side T* h_odata = (T*)malloc(numBlocks * sizeof(T)); // allocate device memory and data cl_mem d_idata = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, bytes, h_idata, NULL); cl_mem d_odata = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, numBlocks * sizeof(T), NULL, NULL); int testIterations = 100; double dTotalTime = 0.0; T gpu_result = 0; gpu_result = profileReduce<T>(datatype, size, numThreads, numBlocks, maxThreads, maxBlocks, whichKernel, testIterations, cpuFinalReduction, cpuFinalThreshold, &dTotalTime, h_odata, d_idata, d_odata); #ifdef GPU_PROFILING double reduceTime = dTotalTime/(double)testIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclReduction, Throughput = %.4f GB/s, Time = %.5f s, Size = %u Elements, NumDevsUsed = %d, Workgroup = %u\n", 1.0e-9 * ((double)bytes)/reduceTime, reduceTime, size, 1, numThreads); #endif // compute reference solution shrLog("\nComparing against Host/C++ computation...\n"); T cpu_result = reduceCPU<T>(h_idata, size); if (datatype == REDUCE_INT) { shrLog(" GPU result = %d\n", gpu_result); shrLog(" CPU result = %d\n\n", cpu_result); shrLog("%s\n\n", (gpu_result == cpu_result) ? "PASSED" : "FAILED"); } else { shrLog(" GPU result = %.9f\n", gpu_result); shrLog(" CPU result = %.9f\n\n", cpu_result); double threshold = (datatype == REDUCE_FLOAT) ? 1e-8 * size : 1e-12; double diff = abs((double)gpu_result - (double)cpu_result); shrLog("%s\n\n", (diff < threshold) ? "PASSED" : "FAILED"); } // cleanup free(h_idata); free(h_odata); clReleaseMemObject(d_idata); clReleaseMemObject(d_odata); return (gpu_result == cpu_result); } }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelINIT; cl_event kernel_event, finish_event; cl_mem objPARTICULAS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "initparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelINIT = clCreateKernel(program, "calc_particles_init", &ret); // Creamos el buffer para las partÃculas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret); const size_t global = 4; const size_t local_work_size = 1; // Transferimos el frame al dispositivo cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N); // Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseKernel(kernelINIT); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
int main(int argc, char **argv) { int start,end; unsigned long p[64], c[64], k[56]; unsigned long res; build_samples (p, c, k, 0); set_low_keys(k); cl_platform_id cpPlatform; clGetPlatformIDs(1, &cpPlatform, NULL); cl_device_id cdDevice; clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); char cBuffer[1024]; clGetDeviceInfo(cdDevice, CL_DEVICE_NAME, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DEVICE_NAME:\t\t%s\n", cBuffer); clGetDeviceInfo(cdDevice, CL_DRIVER_VERSION, sizeof(cBuffer), &cBuffer, NULL); printf("CL_DRIVER_VERSION:\t%s\n\n", cBuffer); cl_uint compute_units; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(compute_units), &compute_units, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS:\t%u\n", compute_units); size_t workitem_dims; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(workitem_dims), &workitem_dims, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS:\t%u\n", workitem_dims); size_t workitem_size[3]; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(workitem_size), &workitem_size, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES:\t%u / %u / %u \n", workitem_size[0], workitem_size[1], workitem_size[2]); size_t workgroup_size; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(workgroup_size), &workgroup_size, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE:\t%u\n", workgroup_size); cl_uint clock_frequency; clGetDeviceInfo(cdDevice, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(clock_frequency), &clock_frequency, NULL); printf("CL_DEVICE_MAX_CLOCK_FREQUENCY:\t%u MHz\n", clock_frequency); cl_context GPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, NULL); cl_command_queue cqCommandQueue = clCreateCommandQueue(GPUContext, cdDevice, 0, NULL); cl_mem GPUVector1 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, p, NULL); cl_mem GPUVector2 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 64, c, NULL); cl_mem GPUVector3 = clCreateBuffer(GPUContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, sizeof(unsigned long) * 56, k, NULL); cl_mem GPUOutputVector = clCreateBuffer(GPUContext, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(unsigned long), &res, NULL); size_t szKernelLength; char* cSourceCL = oclLoadProgSource("ocl_deseval.cl", "", &szKernelLength); cl_program OpenCLProgram = clCreateProgramWithSource(GPUContext, 1, (const char **)&cSourceCL, &szKernelLength, NULL); if (clBuildProgram(OpenCLProgram, 0, NULL, NULL, NULL, NULL)!=CL_SUCCESS) { char cBuffer[2048]; if(clGetProgramBuildInfo(OpenCLProgram,cdDevice,CL_PROGRAM_BUILD_LOG,sizeof(cBuffer),cBuffer,NULL)==CL_SUCCESS); printf("Build error:\n%s\n",cBuffer); exit(1); } cl_kernel OpenCLVectorAdd = clCreateKernel(OpenCLProgram, "keysearch", NULL); clSetKernelArg(OpenCLVectorAdd, 0, sizeof(cl_mem), (void*)&GPUOutputVector); clSetKernelArg(OpenCLVectorAdd, 1, sizeof(cl_mem), (void*)&GPUVector1); clSetKernelArg(OpenCLVectorAdd, 2, sizeof(cl_mem), (void*)&GPUVector2); clSetKernelArg(OpenCLVectorAdd, 3, sizeof(cl_mem), (void*)&GPUVector3); size_t WorkSize[1] = {1024}; start=clock(); for (int i=0; i<1024; i++) { //clEnqueueWriteBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // 56 * sizeof(unsigned long), k, 0, NULL, NULL); clEnqueueNDRangeKernel(cqCommandQueue, OpenCLVectorAdd, 1, NULL, WorkSize, NULL, 0, NULL, NULL); //clEnqueueReadBuffer(cqCommandQueue, GPUOutputVector, CL_TRUE, 0, // sizeof(unsigned long), &res, 0, NULL, NULL); if(res!=0) { printf("Key found\n"); //key_found(res,k); break; } increment_key (k); } end=clock(); clReleaseKernel(OpenCLVectorAdd); clReleaseProgram(OpenCLProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(GPUContext); clReleaseMemObject(GPUVector1); clReleaseMemObject(GPUVector2); clReleaseMemObject(GPUOutputVector); printf ("Searched %i keys in %.3f seconds\n", 1000000, ((double)(end-start))/CLOCKS_PER_SEC); return 0; }
int CommandGenerate::execute(const std::vector<std::string>& p_args) { if(p_args.size() < 10) { help(); return -1; } unsigned int platformId = atol(p_args[1].c_str()); unsigned int deviceId = atol(p_args[2].c_str()); unsigned int staggerSize = atol(p_args[3].c_str()); unsigned int threadsNumber = atol(p_args[4].c_str()); unsigned int hashesNumber = atol(p_args[5].c_str()); unsigned int nonceSize = PLOT_SIZE * staggerSize; std::cerr << "Threads number: " << threadsNumber << std::endl; std::cerr << "Hashes number: " << hashesNumber << std::endl; unsigned int numjobs = (p_args.size() - 5)/4; std::cerr << numjobs << " plot(s) to do." << std::endl; unsigned int staggerMbSize = staggerSize / 4; std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl; std::vector<std::string> paths(numjobs); std::vector<std::ofstream *> out_files(numjobs); std::vector<unsigned long long> addresses(numjobs); std::vector<unsigned long long> startNonces(numjobs); std::vector<unsigned long long> endNonces(numjobs); std::vector<unsigned int> noncesNumbers(numjobs); std::vector<unsigned char*> buffersCpu(numjobs); std::vector<bool> saving_thread_flags(numjobs); std::vector<std::future<void>> save_threads(numjobs); unsigned long long maxNonceNumber = 0; unsigned long long totalNonces = 0; int returnCode = 0; try { for (unsigned int i = 0; i < numjobs; i++) { std::cerr << "----" << std::endl; std::cerr << "Job number " << i << std::endl; unsigned int argstart = 6 + i*4; paths[i] = std::string(p_args[argstart]); addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10); startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10); noncesNumbers[i] = atol(p_args[argstart+3].c_str()); maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]); totalNonces += noncesNumbers[i]; std::ostringstream outFile; outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \ noncesNumbers[i] << "_" << staggerSize; std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc; out_files[i] = new std::ofstream(outFile.str(), file_mode); assert(out_files[i]); if(noncesNumbers[i] % staggerSize != 0) { noncesNumbers[i] -= noncesNumbers[i] % staggerSize; noncesNumbers[i] += staggerSize; } endNonces[i] = startNonces[i] + noncesNumbers[i]; unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024; std::cerr << "Path: " << outFile.str() << std::endl; std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl; std::cerr << "Creating CPU buffer" << std::endl; buffersCpu[i] = new unsigned char[nonceSize]; if(!buffersCpu[i]) { throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)"); } saving_thread_flags[i] = false; std::cerr << "----" << std::endl; } cl_platform_id platforms[4]; cl_uint platformsNumber; cl_device_id devices[32]; cl_uint devicesNumber; cl_context context = 0; cl_command_queue commandQueue = 0; cl_mem bufferGpuGen = 0; cl_mem bufferGpuScoops = 0; cl_program program = 0; cl_kernel kernelStep1 = 0; cl_kernel kernelStep2 = 0; cl_kernel kernelStep3 = 0; int error; std::cerr << "Retrieving OpenCL platforms" << std::endl; error = clGetPlatformIDs(4, platforms, &platformsNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL platforms"); } if(platformId >= platformsNumber) { throw std::runtime_error("No platform found with the provided id"); } std::cerr << "Retrieving OpenCL GPU devices" << std::endl; error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL devices"); } if(deviceId >= devicesNumber) { throw std::runtime_error("No device found with the provided id"); } std::cerr << "Creating OpenCL context" << std::endl; context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL context"); } std::cerr << "Creating OpenCL command queue" << std::endl; commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL command queue"); } std::cerr << "Creating OpenCL GPU generation buffer" << std::endl; bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer"); } std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl; bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer"); } std::cerr << "Creating OpenCL program" << std::endl; std::string source = loadSource("kernel/nonce.cl"); const char* sources[] = {source.c_str()}; size_t sourcesLength[] = {source.length()}; program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL program"); } std::cerr << "Building OpenCL program" << std::endl; error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0); if(error != CL_SUCCESS) { size_t logSize; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize); char* log = new char[logSize]; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0); std::cerr << log << std::endl; delete[] log; throw OpenclError(error, "Unable to build the OpenCL program"); } std::cerr << "Creating OpenCL step1 kernel" << std::endl; kernelStep1 = clCreateKernel(program, "nonce_step1", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step2 kernel" << std::endl; kernelStep2 = clCreateKernel(program, "nonce_step2", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step3 kernel" << std::endl; kernelStep3 = clCreateKernel(program, "nonce_step3", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize); error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen); error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } size_t globalWorkSize = staggerSize; size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber; time_t startTime = time(0); unsigned int totalNoncesCompleted = 0; for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) { for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) { unsigned long long nonce = startNonces[jobnum] + nonce_ordinal; if (nonce > endNonces[jobnum]) { break; } std::cout << "Running with start nonce " << nonce << std::endl; // Is a cl_ulong always an unsigned long long? unsigned int error = 0; error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step1 kernel launch"); } unsigned int hashesSize = hashesNumber * HASH_SIZE; for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) { error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce); error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset); error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel launch"); } error = clFinish(commandQueue); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel finish"); } } totalNoncesCompleted += staggerSize; double percent = 100.0 * (double)totalNoncesCompleted / totalNonces; time_t currentTime = time(0); double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0; double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed; std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s"; std::cerr << "... "; error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step3 kernel launch"); } if (saving_thread_flags[jobnum]) { save_threads[jobnum].wait(); // Wait for last job to finish saving_thread_flags[jobnum] = false; } error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in synchronous read"); } saving_thread_flags[jobnum] = true; save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]); } } //Clean up for (unsigned int i = 0; i < paths.size(); i += 1) { if (saving_thread_flags[i]) { std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl; save_threads[i].wait(); saving_thread_flags[i] = false; std::cerr << "done waiting for final save" << std::endl; if (buffersCpu[i]) { delete[] buffersCpu[i]; } } } if(kernelStep3) { clReleaseKernel(kernelStep3); } if(kernelStep2) { clReleaseKernel(kernelStep2); } if(kernelStep1) { clReleaseKernel(kernelStep1); } if(program) { clReleaseProgram(program); } if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); } if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); } if(commandQueue) { clReleaseCommandQueue(commandQueue); } if(context) { clReleaseContext(context); } time_t currentTime = time(0); double elapsedTime = difftime(currentTime, startTime) / 60.0; double speed = (double)totalNonces / elapsedTime; std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s"; std::cerr << " " << std::endl; } catch(const OpenclError& ex) { std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl; returnCode = -1; } catch(const std::exception& ex) { std::cerr << "[ERROR] " << ex.what() << std::endl; returnCode = -1; } return returnCode; }
XdevLComputeKernelCL::~XdevLComputeKernelCL() { XDEVL_MODULEX_INFO(XdevLComputeKernelCL, "~XdevLComputeKernelCL()\n"); if(nullptr != m_kernel) { clReleaseKernel(m_kernel); } }
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", devices_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." << std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); memObjects[0] = input_buffer; memObjects[1] = output_buffer; double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err)); printf("setting up kernel args cl_mem:%lx \n",input_buffer); fflush(stdout); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA; i++) { double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;; CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { double data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL)); //printf(" %lg", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) { cl_context context; cl_command_queue commandQueue; cl_program program; cl_kernel kernel; size_t dataBytes, kernelLength; cl_int errorCode; cl_mem gridBuffer; cl_device_id* devices; cl_device_id gpu; cl_uint numPlatforms; errorCode = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id platforms[numPlatforms]; errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL); checkError(errorCode); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0}; context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode); checkError(errorCode); errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); devices = malloc(dataBytes); errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL); gpu = devices[0]; commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode); checkError(errorCode); gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode); checkError(errorCode); const char* programBuffer = readFile("kernel.cl"); kernelLength = strlen(programBuffer); program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode); checkError(errorCode); errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (errorCode == CL_BUILD_PROGRAM_FAILURE) { // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log free(log); printf("%s\n", log); } checkError(errorCode); kernel = clCreateKernel(program, "diffuse", &errorCode); checkError(errorCode); size_t localWorkSize[2] = {workGroupSize, workGroupSize}, globalWorkSize[2] = {width, width}; errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer); errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL); errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width); errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize); errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations); checkError(errorCode); errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkError(errorCode); errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL); checkError(errorCode); free(devices); free((void *)programBuffer); clReleaseContext(context); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); }
int main(int argc, char **argv) { /* test name */ char name[] = "test_sampler_address_clamp"; size_t global_work_size[1] = { 1 }, local_work_size[1]= { 1 }; size_t srcdir_length, name_length, filename_size; char *filename = NULL; char *source = NULL; cl_device_id devices[1]; cl_context context = NULL; cl_command_queue queue = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_int result; int retval = -1; /* image parameters */ cl_uchar4 *imageData; cl_image_format image_format; cl_image_desc image_desc; printf("Running test %s...\n", name); memset(&image_desc, 0, sizeof(cl_image_desc)); image_desc.image_type = CL_MEM_OBJECT_IMAGE2D; image_desc.image_width = 4; image_desc.image_height = 4; image_format.image_channel_order = CL_RGBA; image_format.image_channel_data_type = CL_UNSIGNED_INT8; imageData = (cl_uchar4*)malloc (4 * 4 * sizeof(cl_uchar4)); if (imageData == NULL) { puts("out of host memory\n"); goto error; } memset (imageData, 1, 4*4*sizeof(cl_uchar4)); /* determine file name of kernel source to load */ srcdir_length = strlen(SRCDIR); name_length = strlen(name); filename_size = srcdir_length + name_length + 16; filename = (char *)malloc(filename_size + 1); if (!filename) { puts("out of memory"); goto error; } snprintf(filename, filename_size, "%s/%s.cl", SRCDIR, name); /* read source code */ source = poclu_read_file (filename); TEST_ASSERT (source != NULL && "Kernel .cl not found."); /* setup an OpenCL context and command queue using default device */ context = poclu_create_any_context(); if (!context) { puts("clCreateContextFromType call failed\n"); goto error; } result = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), devices, NULL); if (result != CL_SUCCESS) { puts("clGetContextInfo call failed\n"); goto error; } queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (!queue) { puts("clCreateCommandQueue call failed\n"); goto error; } /* Create image */ cl_mem image = clCreateImage (context, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &image_format, &image_desc, imageData, &result); if (result != CL_SUCCESS) { puts("image creation failed\n"); goto error; } /* create and build program */ program = clCreateProgramWithSource (context, 1, (const char **)&source, NULL, NULL); if (!program) { puts("clCreateProgramWithSource call failed\n"); goto error; } result = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (result != CL_SUCCESS) { puts("clBuildProgram call failed\n"); goto error; } /* execute the kernel with give name */ kernel = clCreateKernel(program, name, NULL); if (!kernel) { puts("clCreateKernel call failed\n"); goto error; } result = clSetKernelArg( kernel, 0, sizeof(cl_mem), &image); if (result) { puts("clSetKernelArg failed\n"); goto error; } result = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (result != CL_SUCCESS) { puts("clEnqueueNDRangeKernel call failed\n"); goto error; } result = clFinish(queue); if (result == CL_SUCCESS) retval = 0; error: if (image) { clReleaseMemObject (image); } if (kernel) { clReleaseKernel(kernel); } if (program) { clReleaseProgram(program); } if (queue) { clReleaseCommandQueue(queue); } if (context) { clUnloadCompiler (); clReleaseContext (context); } if (source) { free(source); } if (filename) { free(filename); } if (imageData) { free(imageData); } if (retval) { printf("FAIL\n"); return 1; } printf("OK\n"); return 0; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'divide_short8short8.cl' */ source_code = read_buffer("divide_short8short8.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "divide_short8short8", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_short8 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_short8)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_short8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short8), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_short8 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_short8)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_short8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short8), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float8 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float8)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float8)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float8), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float8)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
double runCode(double input,double input2){ /* OpenCL structures */ cl_device_id device; cl_context context; cl_program program; cl_kernel kernel; cl_command_queue queue; cl_int err; size_t global_size; double output; cl_mem output_buffer; cl_mem input_buffer; /* Create device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build program */ program = build_program(context, device, PROGRAM_FILE); /* Create data buffer */ //This effectively means having only a single work-item, which means no //paraellizm. That's okay, this is only a test. global_size = 1; input_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(double), &input, &err); if(err < 0) { fprintf(stderr,"Couldn't create input Buffer: %d\n",err); exit(1); }; output_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(double), NULL, &err); if(err < 0) { fprintf(stderr,"Couldn't create output Buffer: %d\n",err); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Create a kernel */ //kernel = clCreateKernel(program, KERNEL_FUNC, &err); kernel = clCreateKernel(program, "test", &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 1, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 2, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 3, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 4, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 5, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 6, sizeof(cl_double), (void*)&input2); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } err = clSetKernelArg(kernel, 7, sizeof(cl_mem), &output_buffer); if(err < 0) { fprintf(stderr,"Error setting kernel arguments, code: %d \n",err); } /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, NULL); if(err < 0) { fprintf(stderr,"Couldn't enqueue the kernel, error code %d\n",err); exit(1); } /* Read the kernel's output */ err = clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, 0, sizeof(output), &output, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Deallocate resources */ clReleaseKernel(kernel); clReleaseMemObject(output_buffer); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return output; }