int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_device_id device; cl_context context; cl_command_queue queue; cl_uint numOfPlatforms; cl_int error; cl_mem matrixAMemObj; // input matrix A mem buffer cl_mem matrixBMemObj; // input matrix B mem buffer cl_mem matrixCMemObj; // input matrix C mem buffer cl_int* matrixA; // input matrix A cl_int* matrixB; // input matrix B cl_int* matrixC; // input matrix C cl_uint widthA = WIDTH_G; cl_uint heightA = HEIGHT_G; cl_uint widthB = WIDTH_G; cl_uint heightB = HEIGHT_G; { // allocate memory for input and output matrices // based on whatever matrix theory i know. matrixA = (cl_int*)malloc(widthA * heightA * sizeof(cl_int)); matrixB = (cl_int*)malloc(widthB * heightB * sizeof(cl_int)); matrixC = (cl_int*)malloc(widthB * heightA * sizeof(cl_int)); memset(matrixA, 0, widthA * heightA * sizeof(cl_int)); memset(matrixB, 0, widthB * heightB * sizeof(cl_int)); memset(matrixC, 0, widthB * heightA * sizeof(cl_int)); fillRandom(matrixA, widthA, heightA, 643); fillRandom(matrixB, widthB, heightB, 991); } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_int i = 0; i < numOfPlatforms; i++ ) { // Get the GPU device error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(error != CL_SUCCESS) { perror("Can't locate a OpenCL compliant device i.e. GPU"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* Load the two source files into temporary datastores */ const char *file_names[] = {"mmult.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; const char options[] = ""; size_t log_size; error = clBuildProgram(program, 1, &device, options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } // Queue is created with profiling enabled cl_command_queue_properties props; props |= CL_QUEUE_PROFILING_ENABLE; queue = clCreateCommandQueue(context, device, props, &error); cl_kernel kernel = clCreateKernel(program, "mmmult", &error); matrixAMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthA * heightA * sizeof(cl_int), matrixA, &error); matrixBMemObj = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, widthB * heightB * sizeof(cl_int), matrixB, &error); matrixCMemObj = clCreateBuffer(context, CL_MEM_WRITE_ONLY|CL_MEM_ALLOC_HOST_PTR, widthB * heightA * sizeof(cl_int), 0, &error); clSetKernelArg(kernel, 0, sizeof(cl_int),(void*)&widthB); clSetKernelArg(kernel, 1, sizeof(cl_int),(void*)&heightA); clSetKernelArg(kernel, 2, sizeof(cl_mem),(void*)&matrixAMemObj); clSetKernelArg(kernel, 3, sizeof(cl_mem),(void*)&matrixBMemObj); clSetKernelArg(kernel, 4, sizeof(cl_mem),(void*)&matrixCMemObj); size_t globalThreads[] = {heightA}; size_t localThreads[] = {256}; cl_event exeEvt; cl_ulong executionStart, executionEnd; error = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &exeEvt); clWaitForEvents(1, &exeEvt); if(error != CL_SUCCESS) { printf("Kernel execution failure!\n"); exit(-22); } // let's understand how long it took? clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_START, sizeof(executionStart), &executionStart, NULL); clGetEventProfilingInfo(exeEvt, CL_PROFILING_COMMAND_END, sizeof(executionEnd), &executionEnd, NULL); clReleaseEvent(exeEvt); printf("Execution the matrix-matrix multiplication took %lu.%lu s\n", (executionEnd - executionStart)/1000000000, (executionEnd - executionStart)%1000000000); printf("Execution the matrix-matrix multiplication took %lu s\n", (executionEnd - executionStart)); clEnqueueReadBuffer(queue, matrixCMemObj, CL_TRUE, 0, widthB * heightA * sizeof(cl_int), matrixC, 0, NULL, NULL); if (compare(matrixC, matrixA, matrixB, heightA, widthA, widthB)) printf("Passed!\n"); else printf("Failed!\n"); /* Clean up */ for(i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); clReleaseContext(context); clReleaseMemObject(matrixAMemObj); clReleaseMemObject(matrixBMemObj); clReleaseMemObject(matrixCMemObj); } free(matrixA); free(matrixB); free(matrixC); }
int bpnn_train_kernel(BPNN *net, float *eo, float *eh) { int in, hid, out; float out_err, hid_err; in = net->input_n; hid = net->hidden_n; out = net->output_n; //int use_device = 0; // use CPU as device int use_device = 2; // use GPU as device //int use_device = 2; // use FPGA as device if(initialize(use_device)) return -1; int sourcesize = 1024*1024; char * source = (char *)calloc(sourcesize, sizeof(char)); if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * kernel_bp1 = "bpnn_layerforward_ocl"; char * kernel_bp2 = "bpnn_adjust_weights_ocl"; char * tempchar = "./backprop_kernel.cl"; char * krnl_file = "./binary/backprop_kernel_default.xclbin"; cl_int err = 0; cl_program prog; // create program from source if (use_device < 2 ) { FILE * fp = fopen(tempchar, "rb"); if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); // compile kernel err = 0; const char * slist[2] = { source, 0 }; prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } } // create program from binary else { char *krnl_bin; const size_t krnl_size = load_file_to_memory(krnl_file, &krnl_bin); err = 0; prog = clCreateProgramWithBinary(context, 1, &device_list[0], &krnl_size, (const unsigned char**) &krnl_bin, NULL, &err); if ((!prog) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); exit(EXIT_FAILURE); } } err = clBuildProgram(prog, 0, NULL, NULL, NULL, NULL); { // show warnings/errors //static char log[65536]; memset(log, 0, sizeof(log)); //cl_device_id device_id = 0; //err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL); //clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); //if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); } if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; } cl_kernel kernel1; cl_kernel kernel2; kernel1 = clCreateKernel(prog, kernel_bp1, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel1) 0 => %d\n", err); return -1; } kernel2 = clCreateKernel(prog, kernel_bp2, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel(kernel2) 0 => %d\n", err); return -1; } /* clReleaseProgram(prog); */ float *input_weights_one_dim; float *input_weights_prev_one_dim; float * partial_sum; float sum; float num_blocks = in / BLOCK_SIZE; input_weights_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float)); input_weights_prev_one_dim = (float *) malloc((in + 1)* (hid + 1) * sizeof(float)); partial_sum = (float *) malloc(num_blocks * WIDTH * sizeof(float)); // set global and local workitems size_t global_work[3] = { BLOCK_SIZE, BLOCK_SIZE * num_blocks, 1 }; size_t local_work[3] = { BLOCK_SIZE, BLOCK_SIZE, 1 }; // this preprocessing stage is temporarily added to correct the bug of wrong memcopy using two-dimensional net->inputweights // todo: fix mem allocation int m = 0; for (int k = 0; k <= in; k++) { for (int j = 0; j <= hid; j++) { input_weights_one_dim[m] = net->input_weights[k][j]; input_weights_prev_one_dim[m] = net-> input_prev_weights[k][j]; m++; } } cl_mem input_hidden_ocl; cl_mem input_ocl; cl_mem output_hidden_ocl; cl_mem hidden_partial_sum; cl_mem hidden_delta_ocl; cl_mem input_prev_weights_ocl; input_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_ocl\n"); return -1;} input_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_hidden_ocl\n"); return -1;} output_hidden_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_hidden_ocl\n"); return -1;} hidden_partial_sum = clCreateBuffer(context, CL_MEM_READ_WRITE, num_blocks * WIDTH * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_partial_sum\n"); return -1;} hidden_delta_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer hidden_delta_ocl\n"); return -1;} input_prev_weights_ocl = clCreateBuffer(context, CL_MEM_READ_WRITE, (in + 1) * (hid + 1) * sizeof(float), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_prev_weights_ocl\n"); return -1;} printf("Performing GPU computation\n"); //write buffers err = clEnqueueWriteBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_ocl\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; } clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &input_ocl); clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &output_hidden_ocl); clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &input_hidden_ocl); clSetKernelArg(kernel1, 3, sizeof(void *), (void*) &hidden_partial_sum ); clSetKernelArg(kernel1, 4, sizeof(float) * HEIGHT, (void*)NULL ); clSetKernelArg(kernel1, 5, sizeof(float ) * HEIGHT * WIDTH, (void*)NULL ); clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &in); clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &hid); err = clEnqueueNDRangeKernel(cmd_queue, kernel1, 3, NULL, global_work, local_work, 0, NULL, 0); if(err == CL_INVALID_KERNEL) {printf("Error is invalid kernel\n");} if(err != CL_SUCCESS) { printf("ERROR: 1 kernel1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err = clEnqueueReadBuffer(cmd_queue, hidden_partial_sum, 1, 0, num_blocks * WIDTH * sizeof(float), partial_sum, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: partial sum\n"); return -1; } for (int j = 1; j <= hid; j++) { sum = 0.0; for (int k = 0; k < num_blocks; k++) { sum += partial_sum[k * hid + j-1] ; } sum += net->input_weights[0][j]; net-> hidden_units[j] = float(1.0 / (1.0 + exp(-sum))); } bpnn_layerforward(net->hidden_units, net->output_units, net->hidden_weights, hid, out); bpnn_output_error(net->output_delta, net->target, net->output_units, out, &out_err); bpnn_hidden_error(net->hidden_delta, hid, net->output_delta, out, net->hidden_weights, net->hidden_units, &hid_err); bpnn_adjust_weights(net->output_delta, out, net->hidden_units, hid, net->hidden_weights, net->hidden_prev_weights); err = clEnqueueWriteBuffer(cmd_queue, hidden_delta_ocl, 1, 0, (hid + 1) * sizeof(float), net->hidden_delta, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer hidden_delta_ocl\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, input_prev_weights_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_prev_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_prev_weights_ocl\n"); return -1; } err = clEnqueueWriteBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer input_hidden_ocl\n"); return -1; } clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &hidden_delta_ocl); clSetKernelArg(kernel2, 1, sizeof(cl_int), (void*) &hid); clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &input_ocl); clSetKernelArg(kernel2, 3, sizeof(cl_int), (void*) &in); clSetKernelArg(kernel2, 4, sizeof(void *), (void*) &input_hidden_ocl); clSetKernelArg(kernel2, 5, sizeof(void *), (void*) &input_prev_weights_ocl ); err = clEnqueueNDRangeKernel(cmd_queue, kernel2, 2, NULL, global_work, local_work, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } err = clEnqueueReadBuffer(cmd_queue, input_ocl, 1, 0, (in + 1) * sizeof(float), net->input_units, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_ocl\n"); return -1; } err = clEnqueueReadBuffer(cmd_queue, input_hidden_ocl, 1, 0, (in + 1) * (hid + 1) * sizeof(float), input_weights_one_dim, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueReadBuffer: input_hidden_ocl\n"); return -1; } clReleaseMemObject(input_ocl); clReleaseMemObject(output_hidden_ocl); clReleaseMemObject(input_hidden_ocl); clReleaseMemObject(hidden_partial_sum); clReleaseMemObject(input_prev_weights_ocl); free(input_weights_prev_one_dim); free(partial_sum); free(input_weights_one_dim); }
void execute(float *grid, size_t gridSize, unsigned int width, unsigned int workGroupSize, unsigned int iterations, bool printResult) { cl_context context; cl_command_queue commandQueue; cl_program program; cl_kernel kernel; size_t dataBytes, kernelLength; cl_int errorCode; cl_mem gridBuffer; cl_device_id* devices; cl_device_id gpu; cl_uint numPlatforms; errorCode = clGetPlatformIDs(0, NULL, &numPlatforms); cl_platform_id platforms[numPlatforms]; errorCode = clGetPlatformIDs(numPlatforms, platforms, NULL); checkError(errorCode); cl_context_properties properties[] = {CL_CONTEXT_PLATFORM, (int) platforms[0], 0}; context = clCreateContextFromType(properties, CL_DEVICE_TYPE_ALL, 0, NULL, &errorCode); checkError(errorCode); errorCode = clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); devices = malloc(dataBytes); errorCode |= clGetContextInfo(context, CL_CONTEXT_DEVICES, dataBytes, devices, NULL); gpu = devices[0]; commandQueue = clCreateCommandQueue(context, gpu, 0, &errorCode); checkError(errorCode); gridBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, gridSize, grid, &errorCode); checkError(errorCode); const char* programBuffer = readFile("kernel.cl"); kernelLength = strlen(programBuffer); program = clCreateProgramWithSource(context, 1, (const char **)&programBuffer, &kernelLength, &errorCode); checkError(errorCode); errorCode = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (errorCode == CL_BUILD_PROGRAM_FAILURE) { // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, gpu, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log free(log); printf("%s\n", log); } checkError(errorCode); kernel = clCreateKernel(program, "diffuse", &errorCode); checkError(errorCode); size_t localWorkSize[2] = {workGroupSize, workGroupSize}, globalWorkSize[2] = {width, width}; errorCode |= clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&gridBuffer); errorCode |= clSetKernelArg(kernel, 1, sizeof(float) * workGroupSize * workGroupSize, NULL); errorCode |= clSetKernelArg(kernel, 2, sizeof(int), (void *)&width); errorCode |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&workGroupSize); errorCode |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&iterations); checkError(errorCode); errorCode = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkError(errorCode); errorCode = clEnqueueReadBuffer(commandQueue, gridBuffer, CL_TRUE, 0, gridSize, grid, 0, NULL, NULL); checkError(errorCode); free(devices); free((void *)programBuffer); clReleaseContext(context); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(commandQueue); }
int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_mxpa/histo_intermediates.cl", "src/opencl_mxpa/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {(((int)(histo_height*histo_width+(final_localWS[0]-1))) / (int)final_localWS[0])*(int)final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // __syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
int main(int argc, char **argv) { cl_int ret; /* * Command line */ char *binary_path; if (argc != 2) { printf("syntax: %s <binary>\n", argv[0]); exit(1); } binary_path = argv[1]; /* * Platform */ /* Get platform */ cl_platform_id platform; cl_uint num_platforms; ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: second call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n", platform_name); printf("\n"); /* * Device */ /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* * Context */ /* Create context */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } /* * Command Queue */ /* Create command queue */ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("\n"); /* * Program */ /* Program binary */ const unsigned char *binary; size_t binary_length; /* Read binary */ binary = read_buffer(binary_path, &binary_length); if (!binary) { printf("error: %s: cannot open binary\n", binary_path); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithBinary(context, 1, &device, &binary_length, &binary, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* * Kernel */ /* Create kernel */ cl_kernel kernel; kernel = clCreateKernel(program, "vector_add", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } printf("\n"); /* * Buffers */ /* Create and allocate host buffers */ size_t num_elem = 10; cl_int *src1_host_buffer; cl_int *src2_host_buffer; cl_int *dst_host_buffer; src1_host_buffer = malloc(num_elem * sizeof(cl_int)); src2_host_buffer = malloc(num_elem * sizeof(cl_int)); dst_host_buffer = malloc(num_elem * sizeof(cl_int)); /* Initialize host source buffer */ int i; for (i = 0; i < num_elem; i++) { src1_host_buffer[i] = i; src2_host_buffer[i] = 100; } /* Create device source buffers */ cl_mem src1_device_buffer; cl_mem src2_device_buffer; src1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); src2_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_int), NULL, NULL); if (!src1_device_buffer || !src2_device_buffer) { printf("error: could not create destination buffer\n"); exit(1); } /* Create device destination buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem * sizeof(cl_int), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create destination buffer\n"); exit(1); } /* Copy buffer */ ret = clEnqueueWriteBuffer(command_queue, src1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src1_host_buffer, 0, NULL, NULL); ret |= clEnqueueWriteBuffer(command_queue, src2_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), src2_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* * Kernel arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &src1_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src2_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* * Launch Kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; /* Launch the kernel */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* * Result */ /* Receive buffer */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_int), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Print result */ for (i = 0; i < num_elem; i++) printf("dst_host_buffer[%d] = %d\n", i, dst_host_buffer[i]); printf("\n"); return 0; }
int SieveBoth::Sieve(size_t n) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; cl_mem memObjects[2] = { 0, 0 }; cl_int errNum; int array_size = 10; // Create an OpenCL context on first available platform context = OpenCLFuncs::CreateContext(); if (context == NULL) { std::cerr << "Failed to create OpenCL context." << std::endl; system("pause"); return 1; } // Create a command-queue on the first device available // on the created context commandQueue = OpenCLFuncs::CreateCommandQueue(context, &device); if (commandQueue == NULL) { OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); system("pause"); return 2; } // Create OpenCL program from HelloWorld.cl kernel source program = OpenCLFuncs::CreateProgram(context, device, "Sieve.cl"); if (program == NULL) { OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); system("pause"); return 3; } // Create OpenCL kernel kernel = clCreateKernel(program, "main_kernel", NULL); if (kernel == NULL) { std::cerr << "Failed to create kernel" << std::endl; OpenCLFuncs::Cleanup(context, commandQueue, program, kernel, memObjects); system("pause"); return 4; } // Create memory objects that will be used as arguments to // kernel. First create host memory arrays that will be // used to store the arguments to the kernel //int result = 0; int limit = n; if (!OpenCLFuncs::CreateMemObjectsForSieve(context, memObjects, limit)) { OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); system("pause"); return 5; } // Set the kernel arguments (result, a, b) errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memObjects[0]); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &memObjects[1]); if (errNum != CL_SUCCESS) { std::cerr << "Error setting kernel arguments." << std::endl; OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); system("pause"); return 6; } size_t globalWorkSize[1] = { 1 }; size_t localWorkSize[1] = { 1 }; //timer.Start(); // Queue the kernel up for execution across the array errNum = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error queuing kernel for execution." << std::endl; OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); system("pause"); return 7; } int result = 0; //float *a = new float[array_size]; //float *b = new float[array_size]; // Read the output buffer back to the Host errNum = clEnqueueReadBuffer(commandQueue, memObjects[1], CL_TRUE, 0, sizeof(int), &result, 0, NULL, NULL); if (errNum != CL_SUCCESS) { std::cerr << "Error reading result buffer." << std::endl; OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); system("pause"); return 1; } //timer.End(); //if (timer.Diff(seconds, useconds)) // std::cerr << "Warning: timer returned negative difference!" << std::endl; //std::cout << "OpenCL ran in " << seconds << "." << useconds << " seconds" << std::endl << std::endl; OpenCLFuncs::CleanupSieve(context, commandQueue, program, kernel, memObjects); return result; }
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) { struct bench_args_t *args = (struct bench_args_t *)vargs; // Create device buffers // cl_mem obs_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->obs), NULL, NULL); cl_mem init_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->init), NULL, NULL); cl_mem transition_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->transition), NULL, NULL); cl_mem emission_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->emission), NULL, NULL); cl_mem path_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->path), NULL, NULL); if (!obs_buffer || !init_buffer || !transition_buffer || !emission_buffer || !path_buffer) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); exit(1); } // Write our data set into device buffers // int err; err = clEnqueueWriteBuffer(commands, obs_buffer, CL_TRUE, 0, sizeof(args->obs), args->obs, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, init_buffer, CL_TRUE, 0, sizeof(args->init), args->init, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, transition_buffer, CL_TRUE, 0, sizeof(args->transition), args->transition, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, emission_buffer, CL_TRUE, 0, sizeof(args->emission), args->emission, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to device memory!\n"); printf("Test failed\n"); exit(1); } // Set the arguments to our compute kernel // err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &obs_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &init_buffer); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &transition_buffer); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &emission_buffer); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &path_buffer); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else printf("Error: OpenCL kernel is not currently supported!\n"); exit(1); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); exit(1); } // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, path_buffer, CL_TRUE, 0, sizeof(args->path), args->path, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); exit(1); } }
int ScanLargeArrays::runCLKernels(void) { cl_int status; cl_int eventStatus = CL_QUEUED; cl_event writeEvt; // Enqueue write to seedsBuf status = clEnqueueWriteBuffer(commandQueue, inputBuffer, CL_FALSE, 0, length * sizeof(cl_float), input, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status,"clEnqueueWriteBuffer failed.(inputBuffer)"); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status,"clFlush failed."); status = sampleCommon->waitForEventAndRelease(&writeEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed"); // Do block-wise sum if(bScan(length, &inputBuffer, &outputBuffer[0], &blockSumBuffer[0])) return SDK_FAILURE; for(int i = 1; i < (int)pass; i++) { if(bScan((cl_uint)(length / pow((float)blockSize, (float)i)), &blockSumBuffer[i - 1], &outputBuffer[i], &blockSumBuffer[i])) { return SDK_FAILURE; } } int tempLength = (int)(length / pow((float)blockSize, (float)pass)); // Do scan to tempBuffer if(pScan(tempLength, &blockSumBuffer[pass - 1], &tempBuffer)) return SDK_FAILURE; // Do block-addition on outputBuffers if(bAddition((cl_uint)(length / pow((float)blockSize, (float)(pass - 1))), &tempBuffer, &outputBuffer[pass - 1])) { return SDK_FAILURE; } for(int i = pass - 1; i > 0; i--) { if(bAddition((cl_uint)(length / pow((float)blockSize, (float)(i - 1))), &outputBuffer[i], &outputBuffer[i - 1])) { return SDK_FAILURE; } } cl_event readEvt; // Enqueue the results to application pointe status = clEnqueueReadBuffer(commandQueue, outputBuffer[0], CL_FALSE, 0, length * sizeof(cl_float), output, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status,"clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status,"clFlush failed.(commandQueue)"); status = sampleCommon->waitForEventAndRelease(&readEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed"); return SDK_SUCCESS; }
/** * @brief Main principal * @param argc El número de argumentos del programa * @param argv Cadenas de argumentos del programa * @return Nada si es correcto o algún número negativo si es incorrecto */ int main( int argc, char** argv ) { if(argc != 2) return -1; // Medimos tiempo para el programa const double start_time = getCurrentTimestamp(); FILE *kernels; char *source_str; size_t source_size, work_items; // OpenCL runtime configuration unsigned num_devices; cl_platform_id platform_ids[3]; cl_uint ret_num_platforms; cl_device_id device_id; cl_context context = NULL; cl_command_queue command_queue; cl_program program = NULL; cl_int ret; cl_kernel kernelINIT; cl_event kernel_event, finish_event; cl_mem objPARTICULAS; // Abrimos el fichero que contiene el kernel fopen_s(&kernels, "initparticulasCPU.cl", "r"); if (!kernels) { fprintf(stderr, "Fallo al cargar el kernel\n"); exit(-1); } source_str = (char *) malloc(0x100000); source_size = fread(source_str, 1, 0x100000, kernels); fclose(kernels); // Obtenemos los IDs de las plataformas disponibles if( clGetPlatformIDs(3, platform_ids, &ret_num_platforms) != CL_SUCCESS) { printf("No se puede obtener id de la plataforma"); return -1; } // Intentamos obtener un dispositivo CPU soportado if( clGetDeviceIDs(platform_ids[1], CL_DEVICE_TYPE_CPU, 1, &device_id, &num_devices) != CL_SUCCESS) { printf("No se puede obtener id del dispositivo"); return -1; } clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &work_items, NULL); // Creación de un contexto OpenCL context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Creación de una cola de comandos command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); // Creación de un programa kernel desde un fichero de código program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: ¡Fallo al construir el programa ejecutable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s", buffer); exit(-1); } // Creación del kernel OpenCL kernelINIT = clCreateKernel(program, "calc_particles_init", &ret); // Creamos el buffer para las partículas y reservamos espacio ALINEADO para los datos size_t N = atoi(argv[1]); particle *particulas = (particle*) _aligned_malloc(N * sizeof(particle), 64); objPARTICULAS = clCreateBuffer(context, CL_MEM_WRITE_ONLY, N * sizeof(particle), NULL, &ret); const size_t global = 4; const size_t local_work_size = 1; // Transferimos el frame al dispositivo cl_event write_event; ret = clEnqueueWriteBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 0, NULL, &write_event); // Establecemos los argumentos del kernel ret = clSetKernelArg(kernelINIT, 0, sizeof(cl_mem), &objPARTICULAS); ret = clSetKernelArg(kernelINIT, 1, sizeof(int), &N); // Ejecutamos el kernel. Un work-item por cada work-group o unidad de cómputo ret = clEnqueueNDRangeKernel(command_queue, kernelINIT, 1, NULL, &global, &local_work_size, 1, &write_event, &kernel_event); // Leemos los resultados ret = clEnqueueReadBuffer(command_queue, objPARTICULAS, CL_FALSE, 0, N * sizeof(particle), particulas, 1, &kernel_event, &finish_event); // Esperamos a que termine de leer los resultados clWaitForEvents(1, &finish_event); // Obtenemos el tiempo del kernel y de las transferencias CPU-RAM cl_ulong totalKernel = getStartEndTime(kernel_event); cl_ulong totalRam = getStartEndTime(write_event) + getStartEndTime(finish_event); const double end_time = getCurrentTimestamp(); // Obtenemos el tiempo consumido por el programa, el kernel y las transferencias de memoria printf("\nTiempo total del programa: %0.3f ms\n", (end_time - start_time) * 1e3); printf("Tiempo total consumido por el kernel: %0.3f ms\n", double(totalKernel) * 1e-6); printf("Tiempo total consumido en transferencias CPU-RAM: %0.3f ms\n", double(totalRam) * 1e-6); // Liberamos todos los recursos usados (kernels y objetos OpenCL) clReleaseEvent(kernel_event); clReleaseEvent(finish_event); clReleaseEvent(write_event); clReleaseMemObject(objPARTICULAS); clReleaseKernel(kernelINIT); clReleaseCommandQueue(command_queue); clReleaseProgram(program); clReleaseContext(context); }
int SimpleConvolution::runCLKernels(void) { cl_int status; cl_event events[2]; status = this->setWorkGroupSize(); CHECK_ERROR(status, SDK_SUCCESS, "setWorkGroupSize() failed"); // Set appropriate arguments to the kernel status = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&outputBuffer); CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (outputBuffer)"); status = clSetKernelArg( kernel, 1, sizeof(cl_mem), (void *)&inputBuffer); CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (inputBuffer)"); status = clSetKernelArg( kernel, 2, sizeof(cl_mem), (void *)&maskBuffer); CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (maskBuffer)"); cl_uint2 inputDimensions = {width, height}; cl_uint2 maskDimensions = {maskWidth, maskHeight}; status = clSetKernelArg( kernel, 3, sizeof(cl_uint2), (void *)&inputDimensions); CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (inputDimensions)"); status = clSetKernelArg( kernel, 4, sizeof(cl_uint2), (void *)&maskDimensions); CHECK_OPENCL_ERROR( status, "clSetKernelArg failed. (maskDimensions)"); // Enqueue a kernel run call. status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &events[0]); CHECK_OPENCL_ERROR( status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status,"clFlush() failed"); status = sampleCommon->waitForEventAndRelease(&events[0]); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(events[0]) Failed"); // Enqueue readBuffer status = clEnqueueReadBuffer( commandQueue, outputBuffer, CL_TRUE, 0, width * height * sizeof(cl_uint), output, 0, NULL, &events[1]); CHECK_OPENCL_ERROR( status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status,"clFlush() failed"); status = sampleCommon->waitForEventAndRelease(&events[1]); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(events[1]) Failed"); return SDK_SUCCESS; }
int main(int argc, char *argv[]) { //FILE *fp; cl_platform_id platform_id[2]; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret_code; cl_mem image_in_mem = NULL; cl_mem image_out_mem = NULL; cl_mem twiddle_factors_mem = NULL; cl_float2 *image_in_host; cl_float2 *twiddle_factors_host; cl_kernel kernel_twiddle_factors; cl_kernel kernel_matriz_transpose; cl_kernel kernel_lowpass_filter; pgm_t ipgm; pgm_t opgm; image_file_t *image_filename; char *output_filename; FILE *fp; const char *kernel_filename = C_NOME_ARQ_KERNEL; size_t source_size; char *source_str; cl_int i, j,n ,m; cl_int raio = 0; size_t global_wg[2]; size_t local_wg[2]; float *image_amplitudes; size_t log_size; char *log_file; cl_event kernels_events_out_fft[4]; cl_ulong kernel_runtime = (cl_ulong) 0; cl_ulong kernel_start_time = (cl_ulong) 0; cl_ulong kernel_end_time = (cl_ulong) 0; cl_event write_host_dev_event; cl_ulong write_host_dev_start_time = (cl_ulong) 0; cl_ulong write_host_dev_end_time = (cl_ulong) 0; cl_ulong write_host_dev_run_time = (cl_ulong) 0; cl_event read_dev_host_event; cl_ulong read_dev_host_start_time = (cl_ulong) 0; cl_ulong read_dev_host_end_time = (cl_ulong) 0; cl_ulong read_dev_host_run_time = (cl_ulong) 0; unsigned __int64 image_tam; unsigned __int64 MEGA_BYTES = 1048576; // 1024*1024 double image_tam_MB; double tempo_total; struct event_in_fft_t *fft_events; //=== Timer count start ============================================================================== timer_reset(); timer_start(); //=================================================================================================== if (argc < 2) { printf("**Erro: O arquivo de entrada eh necessario.\n"); exit(EXIT_FAILURE); } image_filename = (image_file_t *) malloc(sizeof(image_file_t)); split_image_filename(image_filename, argv[1]); output_filename = (char *) malloc(40*sizeof(char)); sprintf(output_filename, "%d.%d.%s.%s.%s", image_filename->res, image_filename->num, ENV_TYPE, APP_TYPE, EXTENSAO); fp = fopen(kernel_filename, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(EXIT_FAILURE); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); //=================================================================================================== /* Abrindo imagem do arquivo para objeto de memoria local*/ if( ler_pgm(&ipgm, argv[1]) == -1) exit(EXIT_FAILURE); n = ipgm.width; raio = n/8; m = (cl_int)(log((double)n)/log(2.0)); image_in_host = (cl_float2 *)malloc((n*n)*sizeof(cl_float2)); twiddle_factors_host = (cl_float2 *)malloc(n / 2 * sizeof(cl_float2)); for (i = 0; i < n; i++) { for (j = 0; j < n; j++) { image_in_host[n*i + j].s[0] = (float)ipgm.buf[n*i + j]; image_in_host[n*i + j].s[1] = (float)0; } } fft_events = (struct event_in_fft_t *)malloc(MAX_CALL_FFT*sizeof(struct event_in_fft_t)); kernel_butter_events = (cl_event *)malloc(MAX_CALL_FFT*m*sizeof(cl_event)); //=================================================================================================== CL_CHECK(clGetPlatformIDs(MAX_PLATFORM_ID, platform_id, &ret_num_platforms)); if (ret_num_platforms == 0 ) { fprintf(stderr,"[Erro] Não existem plataformas OpenCL\n"); exit(2); } //=================================================================================================== CL_CHECK(clGetDeviceIDs( platform_id[0], CL_DEVICE_TYPE_GPU, 1, &device_id, &ret_num_devices)); //print_platform_info(&platform_id[1]); //=================================================================================================== context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret_code); //=================================================================================================== cmd_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret_code); //=================================================================================================== image_in_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); image_out_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, n*n*sizeof(cl_float2), NULL, &ret_code); twiddle_factors_mem = clCreateBuffer(context, CL_MEM_READ_WRITE, (n/2)*sizeof(cl_float2), NULL, &ret_code); //=================================================================================================== /* Transfer data to memory buffer */ CL_CHECK(clEnqueueWriteBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &write_host_dev_event)); image_tam = n*n*sizeof(cl_float2); //=================================================================================================== program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret_code); //=================================================================================================== ret_code = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); //=================================================================================================== if (ret_code != CL_SUCCESS) { // Determine the size of the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); //=================================================================================================== // Allocate memory for the log log_file = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log_file, NULL); printf("%s\n", log_file); system("pause"); exit(0); } kernel_twiddle_factors = clCreateKernel(program, "twiddle_factors", &ret_code); kernel_matriz_transpose = clCreateKernel(program, "matrix_trasponse", &ret_code); kernel_lowpass_filter = clCreateKernel(program, "lowpass_filter", &ret_code); /* Processa os fatores Wn*/ //=================================================================================================== CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 0, sizeof(cl_mem), (void *)&twiddle_factors_mem)); CL_CHECK(clSetKernelArg(kernel_twiddle_factors, 1, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n/2, 1); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_twiddle_factors, 1, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[0])); //=================================================================================================== /* Executa a FFT em N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[0]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[1])); //=================================================================================================== /* Executa a FFT N/2 */ fft_main(image_out_mem, image_in_mem, twiddle_factors_mem, m, direta, &fft_events[1]); //=================================================================================================== /* Processa o filtro passa baixa */ CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 1, sizeof(cl_int), (void *)&n)); CL_CHECK(clSetKernelArg(kernel_lowpass_filter, 2, sizeof(cl_int), (void *)&raio)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_lowpass_filter, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[2])); //=================================================================================================== /* Obtem a FFT inversa*/ fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[2]); //=================================================================================================== /* Realiza a transposta da Matriz (imagem) */ CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 0, sizeof(cl_mem), (void *)&image_out_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 1, sizeof(cl_mem), (void *)&image_in_mem)); CL_CHECK(clSetKernelArg(kernel_matriz_transpose, 2, sizeof(cl_int), (void *)&n)); config_workgroup_size(global_wg, local_wg, n, n); CL_CHECK(clEnqueueNDRangeKernel(cmd_queue, kernel_matriz_transpose, 2, NULL, global_wg, local_wg, 0, NULL, &kernels_events_out_fft[3])); //=================================================================================================== fft_main(image_in_mem, image_out_mem, twiddle_factors_mem, m, inversa, &fft_events[3]); //=================================================================================================== CL_CHECK(clEnqueueReadBuffer(cmd_queue, image_in_mem, CL_TRUE, 0, n*n*sizeof(cl_float2), image_in_host, 0, NULL, &read_dev_host_event)); //=================================================================================================== //== Total time elapsed ============================================================================ timer_stop(); tempo_total = get_elapsed_time(); //================================================================================================== //====== Get time of Profile Info ================================================================== // Write data time CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &write_host_dev_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(write_host_dev_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &write_host_dev_end_time, NULL)); // Read data time CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &read_dev_host_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(read_dev_host_event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &read_dev_host_end_time, NULL)); for (i = 0; i < MAX_CALL_FFT; i++) { kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernels_events_out_fft[i], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_bitsrev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; if (fft_events[i].kernel_normalize != NULL) { CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(fft_events[i].kernel_normalize, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } } for (j=0; j < MAX_CALL_FFT*m; j++){ kernel_start_time = (cl_long) 0; kernel_end_time = (cl_long) 0; CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &kernel_start_time, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_butter_events[j], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &kernel_end_time, NULL)); kernel_runtime += (kernel_end_time - kernel_start_time); } write_host_dev_run_time = write_host_dev_end_time - write_host_dev_start_time; read_dev_host_run_time = read_dev_host_end_time - read_dev_host_start_time; /* save_log_debug(write_host_dev_run_time,fp); save_log_debug(read_dev_host_run_time,fp); close_log_debug(fp); */ image_tam_MB = (double) (((double) image_tam)/(double) MEGA_BYTES); //================================================================================================== save_log_gpu(image_filename, kernel_runtime, (double) (image_tam_MB/( (double) read_dev_host_run_time/(double) NANOSECONDS)), (double) (image_tam_MB/ ((double) write_host_dev_run_time/ (double) NANOSECONDS)), tempo_total, LOG_NAME); //=================================================================================================== image_amplitudes = (float*)malloc(n*n*sizeof(float)); for (i=0; i < n; i++) { for (j=0; j < n; j++) { image_amplitudes[n*j + i] = (float) (AMP(((float*)image_in_host)[(2*n*j)+2*i], ((float*)image_in_host)[(2*n*j)+2*i+1])); } } //clFlush(cmd_queue); //clFinish(cmd_queue); opgm.width = n; opgm.height = n; normalizar_pgm(&opgm, image_amplitudes); escrever_pgm(&opgm, output_filename); //=================================================================================================== clFinish(cmd_queue); clReleaseKernel(kernel_twiddle_factors); clReleaseKernel(kernel_matriz_transpose); clReleaseKernel(kernel_lowpass_filter); clReleaseProgram(program); clReleaseMemObject(image_in_mem); clReleaseMemObject(image_out_mem); clReleaseMemObject(twiddle_factors_mem); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); clReleaseEvent(read_dev_host_event); clReleaseEvent(write_host_dev_event); clReleaseEvent(kernels_events_out_fft[0]); clReleaseEvent(kernels_events_out_fft[1]); clReleaseEvent(kernels_events_out_fft[2]); clReleaseEvent(kernels_events_out_fft[3]); destruir_pgm(&ipgm); destruir_pgm(&opgm); free(image_amplitudes); free(source_str); free(image_in_host); free(image_filename); free(twiddle_factors_host); free(output_filename); free(fft_events); free(kernel_butter_events); //_CrtDumpMemoryLeaks(); return 0; }
/// // main() for Convoloution example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context = NULL; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem inputSignalBuffer; cl_mem outputSignalBuffer; cl_mem maskBuffer; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca( sizeof(cl_platform_id) * numPlatforms); errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); // Iterate through the list of platforms until we find one that supports // a CPU device, otherwise fail with an error. deviceIDs = NULL; cl_uint i; for (i = 0; i < numPlatforms; i++) { errNum = clGetDeviceIDs( platformIDs[i], CL_DEVICE_TYPE_CPU, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND) { checkErr(errNum, "clGetDeviceIDs"); } else if (numDevices > 0) { deviceIDs = (cl_device_id *)alloca(sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[i], CL_DEVICE_TYPE_CPU, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); break; } } // Check to see if we found at least one CPU device, otherwise return if (deviceIDs == NULL) { std::cout << "No CPU device found" << std::endl; exit(-1); } // Next, create an OpenCL context on the selected platform. cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[i], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, &contextCallback, NULL, &errNum); checkErr(errNum, "clCreateContext"); std::ifstream srcFile("../convolution/Convolution.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading Convolution.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, NULL, NULL, NULL); if (errNum != CL_SUCCESS) { // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in kernel: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // Create kernel object kernel = clCreateKernel( program, "convolve", &errNum); checkErr(errNum, "clCreateKernel"); // Now allocate buffers inputSignalBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * inputSignalHeight * inputSignalWidth, static_cast<void *>(inputSignal), &errNum); checkErr(errNum, "clCreateBuffer(inputSignal)"); maskBuffer = clCreateBuffer( context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * maskHeight * maskWidth, static_cast<void *>(mask), &errNum); checkErr(errNum, "clCreateBuffer(mask)"); outputSignalBuffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * outputSignalHeight * outputSignalWidth, NULL, &errNum); checkErr(errNum, "clCreateBuffer(outputSignal)"); // Pick the first device and create command queue. queue = clCreateCommandQueue( context, deviceIDs[0], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); errNum = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputSignalBuffer); errNum |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &maskBuffer); errNum |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &outputSignalBuffer); errNum |= clSetKernelArg(kernel, 3, sizeof(cl_uint), &inputSignalWidth); errNum |= clSetKernelArg(kernel, 4, sizeof(cl_uint), &maskWidth); checkErr(errNum, "clSetKernelArg"); const size_t globalWorkSize[1] = { outputSignalWidth * outputSignalHeight }; const size_t localWorkSize[1] = { 1 }; // Queue the kernel up for execution across the array errNum = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); checkErr(errNum, "clEnqueueNDRangeKernel"); errNum = clEnqueueReadBuffer( queue, outputSignalBuffer, CL_TRUE, 0, sizeof(cl_uint) * outputSignalHeight * outputSignalHeight, outputSignal, 0, NULL, NULL); checkErr(errNum, "clEnqueueReadBuffer"); // Output the result buffer for (int y = 0; y < outputSignalHeight; y++) { for (int x = 0; x < outputSignalWidth; x++) { std::cout << outputSignal[x][y] << " "; } std::cout << std::endl; } std::cout << std::endl << "Executed program succesfully." << std::endl; return 0; }
double gpu_cgm_image(uint32_t* aList, uint32_t* bList, int aLength, int bLength, int keyLength, uint32_t** matches, char* clFile, int x, int y) { int gap = 0, myoffset = 0; cl_platform_id *platforms; cl_uint num_platforms = 0; cl_device_id *devices; cl_uint num_devices = 0; cl_context context; cl_command_queue command_queue; cl_image_format imgFormat; cl_mem aImg; cl_mem bImg; cl_mem res_buf; cl_program program; cl_kernel kernel; cl_uint *results; FILE *prgm_fptr; struct stat prgm_sbuf; char *prgm_data; size_t prgm_size; size_t offset; size_t count; const size_t global_work_size[] = { x, y }; const size_t origin[] = { 0, 0, 0 }; const size_t region[] = { aLength, 1, 1 }; cl_int ret; cl_uint i; cl_bool imageSupport; struct timeval t1, t2; double elapsedTime; results = malloc(sizeof(cl_uint) * aLength); imgFormat.image_channel_order = CL_RGBA; imgFormat.image_channel_data_type = CL_UNSIGNED_INT32; /* figure out how many CL platforms are available */ ret = clGetPlatformIDs(0, NULL, &num_platforms); if (CL_SUCCESS != ret) { print_error ("Error getting the number of platform IDs: %d", ret); exit(EXIT_FAILURE); } if (0 == num_platforms) { print_error ("No CL platforms were found."); exit(EXIT_FAILURE); } /* allocate space for each available platform ID */ if (NULL == (platforms = malloc((sizeof *platforms) * num_platforms))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get all of the platform IDs */ ret = clGetPlatformIDs(num_platforms, platforms, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting platform IDs: %d", ret); exit(EXIT_FAILURE); } /* find a platform that supports given device type */ // print_error ("Number of platforms found: %d", num_platforms); for (i = 0; i < num_platforms; i++) { ret = clGetDeviceIDs(platforms[i], getDeviceType(), 0, NULL, &num_devices); if (CL_SUCCESS != ret) continue; if (0 < num_devices) break; } /* make sure at least one device was found */ if (num_devices == 0) { print_error ("No CL device found that supports device type: %s.", ((getDeviceType() == CL_DEVICE_TYPE_CPU) ? "CPU" : "GPU")); exit(EXIT_FAILURE); } /* only one device is necessary... */ num_devices = 1; if (NULL == (devices = malloc((sizeof *devices) * num_devices))) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* get one device id */ ret = clGetDeviceIDs(platforms[i], getDeviceType(), num_devices, devices, NULL); if (CL_SUCCESS != ret) { print_error ("Error getting device IDs: %d", ret); exit(EXIT_FAILURE); } ret = clGetDeviceInfo(*devices, CL_DEVICE_IMAGE_SUPPORT, sizeof(cl_bool), &imageSupport, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to get Device Info: %d", ret); exit(EXIT_FAILURE); } if(imageSupport == CL_FALSE) { print_error ("Failure: Images are not supported!"); exit(EXIT_FAILURE); } /* create a context for the CPU device that was found earlier */ context = clCreateContext(NULL, num_devices, devices, NULL, NULL, &ret); if (NULL == context || CL_SUCCESS != ret) { print_error ("Failed to create context: %d", ret); exit(EXIT_FAILURE); } /* create a command queue for the CPU device */ command_queue = clCreateCommandQueue(context, devices[0], 0, &ret); if (NULL == command_queue || CL_SUCCESS != ret) { print_error ("Failed to create a command queue: %d", ret); exit(EXIT_FAILURE); } /* create buffers on the CL device */ aImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == aImg || CL_SUCCESS != ret) { print_error ("Failed to create a image: %d", ret); exit(EXIT_FAILURE); } bImg = clCreateImage2D(context, CL_MEM_READ_ONLY, &imgFormat, aLength, 1, 0, NULL, &ret); if (NULL == bImg || CL_SUCCESS != ret) { print_error ("Failed to create b image: %d", ret); exit(EXIT_FAILURE); } int res_bufSize = aLength; res_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uint) * res_bufSize, NULL, &ret); if (NULL == res_buf || CL_SUCCESS != ret) { print_error ("Failed to create b buffer: %d", ret); exit(EXIT_FAILURE); } /* read the opencl program code into a string */ prgm_fptr = fopen(clFile, "r"); if (NULL == prgm_fptr) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } if (0 != stat(clFile, &prgm_sbuf)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } prgm_size = prgm_sbuf.st_size; prgm_data = malloc(prgm_size); if (NULL == prgm_data) { print_error ("Out of memory"); exit(EXIT_FAILURE); } /* make sure all data is read from the file (just in case fread returns * short) */ offset = 0; while (prgm_size - offset != (count = fread(prgm_data + offset, 1, prgm_size - offset, prgm_fptr))) offset += count; if (0 != fclose(prgm_fptr)) { print_error ("%s", strerror (errno)); exit(EXIT_FAILURE); } /* create a 'program' from the source */ program = clCreateProgramWithSource(context, 1, (const char **) &prgm_data, &prgm_size, &ret); if (NULL == program || CL_SUCCESS != ret) { print_error ("Failed to create program with source: %d", ret); exit(EXIT_FAILURE); } /* compile the program.. (it uses llvm or something) */ ret = clBuildProgram(program, num_devices, devices, NULL, NULL, NULL); if (CL_SUCCESS != ret) { size_t size; char *log = calloc(1, 4000); if (NULL == log) { print_error ("Out of memory"); exit(EXIT_FAILURE); } print_error ("Failed to build program: %d", ret); ret = clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 4096, log, &size); if (CL_SUCCESS != ret) { print_error ("Failed to get program build info: %d", ret); exit(EXIT_FAILURE); } fprintf(stderr, "Begin log:\n%s\nEnd log.\n", log); exit(EXIT_FAILURE); } /* pull out a reference to your kernel */ kernel = clCreateKernel(program, "cgm_kernel", &ret); if (NULL == kernel || CL_SUCCESS != ret) { print_error ("Failed to create kernel: %d", ret); exit(EXIT_FAILURE); } gettimeofday(&t1, NULL); /* write data to these buffers */ clEnqueueWriteImage(command_queue, aImg, CL_FALSE, origin, region, 0, 0, (void*) aImg, 0, NULL, NULL); clEnqueueWriteImage(command_queue, bImg, CL_FALSE, origin, region, 0, 0, (void*) bImg, 0, NULL, NULL); /* set your kernel's arguments */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), &aImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), &bImg); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 4, sizeof(int), &gap); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 5, sizeof(int), &myoffset); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 6, sizeof(int), &keyLength); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } ret = clSetKernelArg(kernel, 7, sizeof(cl_mem), &res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to set kernel argument: %d", ret); exit(EXIT_FAILURE); } /* make sure buffers have been written before executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* enque this kernel for execution... */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 2, NULL, global_work_size, NULL, 0, NULL, NULL); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue kernel: %d", ret); exit(EXIT_FAILURE); } /* wait for the kernel to finish executing */ ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* copy the contents of dev_buf from the CL device to the host (CPU) */ ret = clEnqueueReadBuffer(command_queue, res_buf, true, 0, sizeof(cl_uint) * aLength, results, 0, NULL, NULL); gettimeofday(&t2, NULL); elapsedTime = (t2.tv_sec - t1.tv_sec) * 1000.0; // sec to ms elapsedTime += (t2.tv_usec - t1.tv_usec) / 1000.0; // us to ms if (CL_SUCCESS != ret) { print_error ("Failed to copy data from device to host: %d", ret); exit(EXIT_FAILURE); } ret = clEnqueueBarrier(command_queue); if (CL_SUCCESS != ret) { print_error ("Failed to enqueue barrier: %d", ret); exit(EXIT_FAILURE); } /* make sure the content of the buffer are what we expect */ //for (i = 0; i < aLength; i++) // printf("%d\n", results[i]); /* free up resources */ ret = clReleaseKernel(kernel); if (CL_SUCCESS != ret) { print_error ("Failed to release kernel: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseProgram(program); if (CL_SUCCESS != ret) { print_error ("Failed to release program: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(aImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(bImg); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } ret = clReleaseMemObject(res_buf); if (CL_SUCCESS != ret) { print_error ("Failed to release memory object: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseCommandQueue(command_queue))) { print_error ("Failed to release command queue: %d", ret); exit(EXIT_FAILURE); } if (CL_SUCCESS != (ret = clReleaseContext(context))) { print_error ("Failed to release context: %d", ret); exit(EXIT_FAILURE); } matches = &results; return elapsedTime; }
static void clrpc_client_test2(void) { int err; int size = 1024; cl_uint nplatforms = 0; cl_platform_id* platforms = 0; cl_uint nplatforms_ret; clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); printf( "after call one i get nplatforms_ret = %d", nplatforms_ret); if (nplatforms_ret == 0) exit(1); nplatforms = nplatforms_ret; platforms = (cl_platform_id*)calloc(nplatforms,sizeof(cl_platform_id)); clGetPlatformIDs(nplatforms,platforms,&nplatforms_ret); int i; for(i=0;i<nplatforms;i++) { clrpc_dptr* tmp = ((_xobj_t*)platforms[i])->obj; int is_rpc; if ( clGetPlatformInfo(platforms[i],999,sizeof(cl_int),&is_rpc,0)==CL_SUCCESS) { printf( "platforms[%d] local=%p remote=%p\n", i,(void*)tmp->local, (void*)tmp->remote); } else { printf( "platforms[%d] not RPC\n",i); } } char buffer[1024]; size_t sz; cl_platform_id rpc_platform = 0; for(i=0;i<nplatforms;i++) { clGetPlatformInfo(platforms[i],CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } int iplat; for(iplat=0;iplat<nplatforms;iplat++) { printf("\n******************\nTEST PLATFORM %d\n*************\n\n",iplat); cl_uint ndevices = 0; cl_device_id* devices = 0; cl_uint ndevices_ret; clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); printf( "after call one i get ndevices_ret = %d\n", ndevices_ret); if (ndevices_ret > 10) exit(-1); ndevices = ndevices_ret; devices = (cl_device_id*)calloc(ndevices,sizeof(cl_device_id)); clGetDeviceIDs(platforms[iplat],CL_DEVICE_TYPE_ALL, ndevices,devices,&ndevices_ret); if (!ndevices_ret) { //printf("no devices, stopping.\n"); //exit(1); printf("no devices, skipping.\n"); continue; } for(i=0;i<ndevices;i++) { clrpc_dptr* tmp = ((_xobj_t*)devices[i])->obj; clGetDeviceInfo(devices[i],CL_DEVICE_NAME,1023,buffer,&sz); printf( "CL_DEVICE_NAME |%s|\n",buffer); cl_platform_id tmpid; clGetDeviceInfo(devices[i],CL_DEVICE_PLATFORM,sizeof(tmpid),&tmpid,&sz); printf("%p\n",platforms[iplat]); fflush(stdout); printf("%p\n",tmpid); fflush(stdout); clGetPlatformInfo(tmpid,CL_PLATFORM_NAME,1023,buffer,&sz); printf( "\n [%d] CL_PLATFORM_NAME|%ld:%s|\n",i,sz,buffer); } cl_context_properties ctxprop[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[iplat], 0 }; printf("i am setting this: prop[%d] %p\n",iplat,platforms[iplat]); cl_context ctx = clCreateContext(ctxprop,ndevices,devices, 0,0,&err); cl_command_queue* cmdq = (cl_command_queue*) calloc(ndevices,sizeof(cl_command_queue)); for(i=0;i<ndevices;i++) { cmdq[i] = clCreateCommandQueue(ctx,devices[i],0,&err); printf( "cmdq %d %p",i,cmdq[i]); } cl_mem a_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem b_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem c_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); cl_mem d_buf = clCreateBuffer(ctx,CL_MEM_READ_WRITE,size*sizeof(int), 0,&err); int* a = (int*)malloc(1024*sizeof(int)); int* b = (int*)malloc(1024*sizeof(int)); int* c = (int*)malloc(1024*sizeof(int)); int* d = (int*)malloc(1024*sizeof(int)); char* prgsrc[] = { "__kernel void my_kern( int n, __global int* a, __global int* b )\n" " { int i = get_global_id(0); int tmp = 0; int j; for(j=0;j<n;j++) tmp += a[i] * a[j]; b[i] = tmp; }\n" }; size_t prgsrc_sz = strlen(prgsrc[0]) + 1; cl_program prg = clCreateProgramWithSource(ctx,1, (const char**)prgsrc,&prgsrc_sz,&err); clBuildProgram(prg,ndevices,devices,0,0,0); cl_kernel krn = clCreateKernel(prg,"my_kern",&err); int idev; for(idev=0;idev<ndevices;idev++) { printf("\n******************\nTEST DEVICE %d(%d)\n*************\n\n",idev,iplat); for(i=0;i<size;i++) a[i] = i*10; for(i=0;i<size;i++) b[i] = i*10+1; for(i=0;i<size;i++) c[i] = 0; for(i=0;i<size;i++) d[i] = 0; cl_event ev[8]; for(i=0;i<32;i++) printf("%d/",a[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",b[i]); printf("\n"); clEnqueueWriteBuffer(cmdq[idev],a_buf,CL_FALSE,0,size*sizeof(int),a, 0,0,&ev[0]); clEnqueueWriteBuffer(cmdq[idev],b_buf,CL_FALSE,0,size*sizeof(int),b, 1,ev,&ev[1]); clEnqueueWriteBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 2,ev,&ev[2]); clEnqueueWriteBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 3,ev,&ev[3]); size_t offset = 0; size_t gwsz = 128; size_t lwsz = 16; clSetKernelArg(krn,0,sizeof(int),&size); clSetKernelArg(krn,1,sizeof(cl_mem),&a_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&c_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,4,ev,&ev[4]); clSetKernelArg(krn,1,sizeof(cl_mem),&b_buf); clSetKernelArg(krn,2,sizeof(cl_mem),&d_buf); clEnqueueNDRangeKernel(cmdq[idev],krn,1,&offset,&gwsz,&lwsz,5,ev,&ev[5]); clEnqueueReadBuffer(cmdq[idev],c_buf,CL_FALSE,0,size*sizeof(int),c, 6,ev,&ev[6]); clEnqueueReadBuffer(cmdq[idev],d_buf,CL_FALSE,0,size*sizeof(int),d, 7,ev,&ev[7]); clFlush(cmdq[idev]); clWaitForEvents(8,ev); for(i=0;i<32;i++) printf("%d/",c[i]); printf("\n"); for(i=0;i<32;i++) printf("%d/",d[i]); printf("\n"); for(i=0;i<8;i++) clReleaseEvent(ev[i]); } clReleaseKernel(krn); clReleaseProgram(prg); clReleaseMemObject(a_buf); clReleaseMemObject(b_buf); clReleaseMemObject(c_buf); clReleaseMemObject(d_buf); clReleaseCommandQueue(cmdq[0]); clReleaseContext(ctx); // printf("sleeping ...\n"); // sleep(1); } // clrpc_final(); }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY, bufParam; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); int lenParam = 5; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); bufParam = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenParam*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufParam, CL_TRUE, 0, (lenParam*sizeof(cl_float)), SPARAM, 0, NULL, NULL); /* Call clblas function. */ err = clblasSrotm(N, bufX, 0, incx, bufY, 0, incy, bufParam, 0, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSrotm() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROTM placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); clReleaseMemObject(bufParam); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int CommandGenerate::execute(const std::vector<std::string>& p_args) { if(p_args.size() < 10) { help(); return -1; } unsigned int platformId = atol(p_args[1].c_str()); unsigned int deviceId = atol(p_args[2].c_str()); unsigned int staggerSize = atol(p_args[3].c_str()); unsigned int threadsNumber = atol(p_args[4].c_str()); unsigned int hashesNumber = atol(p_args[5].c_str()); unsigned int nonceSize = PLOT_SIZE * staggerSize; std::cerr << "Threads number: " << threadsNumber << std::endl; std::cerr << "Hashes number: " << hashesNumber << std::endl; unsigned int numjobs = (p_args.size() - 5)/4; std::cerr << numjobs << " plot(s) to do." << std::endl; unsigned int staggerMbSize = staggerSize / 4; std::cerr << "Non-GPU memory usage: " << staggerMbSize*numjobs << "MB" << std::endl; std::vector<std::string> paths(numjobs); std::vector<std::ofstream *> out_files(numjobs); std::vector<unsigned long long> addresses(numjobs); std::vector<unsigned long long> startNonces(numjobs); std::vector<unsigned long long> endNonces(numjobs); std::vector<unsigned int> noncesNumbers(numjobs); std::vector<unsigned char*> buffersCpu(numjobs); std::vector<bool> saving_thread_flags(numjobs); std::vector<std::future<void>> save_threads(numjobs); unsigned long long maxNonceNumber = 0; unsigned long long totalNonces = 0; int returnCode = 0; try { for (unsigned int i = 0; i < numjobs; i++) { std::cerr << "----" << std::endl; std::cerr << "Job number " << i << std::endl; unsigned int argstart = 6 + i*4; paths[i] = std::string(p_args[argstart]); addresses[i] = strtoull(p_args[argstart+1].c_str(), NULL, 10); startNonces[i] = strtoull(p_args[argstart+2].c_str(), NULL, 10); noncesNumbers[i] = atol(p_args[argstart+3].c_str()); maxNonceNumber = std::max(maxNonceNumber, (long long unsigned int)noncesNumbers[i]); totalNonces += noncesNumbers[i]; std::ostringstream outFile; outFile << paths[i] << "/" << addresses[i] << "_" << startNonces[i] << "_" << \ noncesNumbers[i] << "_" << staggerSize; std::ios_base::openmode file_mode = std::ios::out | std::ios::binary | std::ios::trunc; out_files[i] = new std::ofstream(outFile.str(), file_mode); assert(out_files[i]); if(noncesNumbers[i] % staggerSize != 0) { noncesNumbers[i] -= noncesNumbers[i] % staggerSize; noncesNumbers[i] += staggerSize; } endNonces[i] = startNonces[i] + noncesNumbers[i]; unsigned int noncesGbSize = noncesNumbers[i] / 4 / 1024; std::cerr << "Path: " << outFile.str() << std::endl; std::cerr << "Nonces: " << startNonces[i] << " to " << endNonces[i] << " (" << noncesGbSize << " GB)" << std::endl; std::cerr << "Creating CPU buffer" << std::endl; buffersCpu[i] = new unsigned char[nonceSize]; if(!buffersCpu[i]) { throw std::runtime_error("Unable to create the CPU buffer (probably out of host memory.)"); } saving_thread_flags[i] = false; std::cerr << "----" << std::endl; } cl_platform_id platforms[4]; cl_uint platformsNumber; cl_device_id devices[32]; cl_uint devicesNumber; cl_context context = 0; cl_command_queue commandQueue = 0; cl_mem bufferGpuGen = 0; cl_mem bufferGpuScoops = 0; cl_program program = 0; cl_kernel kernelStep1 = 0; cl_kernel kernelStep2 = 0; cl_kernel kernelStep3 = 0; int error; std::cerr << "Retrieving OpenCL platforms" << std::endl; error = clGetPlatformIDs(4, platforms, &platformsNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL platforms"); } if(platformId >= platformsNumber) { throw std::runtime_error("No platform found with the provided id"); } std::cerr << "Retrieving OpenCL GPU devices" << std::endl; error = clGetDeviceIDs(platforms[platformId], CL_DEVICE_TYPE_CPU | CL_DEVICE_TYPE_GPU, 32, devices, &devicesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to retrieve the OpenCL devices"); } if(deviceId >= devicesNumber) { throw std::runtime_error("No device found with the provided id"); } std::cerr << "Creating OpenCL context" << std::endl; context = clCreateContext(0, 1, &devices[deviceId], NULL, NULL, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL context"); } std::cerr << "Creating OpenCL command queue" << std::endl; commandQueue = clCreateCommandQueue(context, devices[deviceId], 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL command queue"); } std::cerr << "Creating OpenCL GPU generation buffer" << std::endl; bufferGpuGen = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uchar) * GEN_SIZE * staggerSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU generation buffer"); } std::cerr << "Creating OpenCL GPU scoops buffer" << std::endl; bufferGpuScoops = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_uchar) * nonceSize, 0, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL GPU scoops buffer"); } std::cerr << "Creating OpenCL program" << std::endl; std::string source = loadSource("kernel/nonce.cl"); const char* sources[] = {source.c_str()}; size_t sourcesLength[] = {source.length()}; program = clCreateProgramWithSource(context, 1, sources, sourcesLength, &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL program"); } std::cerr << "Building OpenCL program" << std::endl; error = clBuildProgram(program, 1, &devices[deviceId], "-I kernel", 0, 0); if(error != CL_SUCCESS) { size_t logSize; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, 0, 0, &logSize); char* log = new char[logSize]; clGetProgramBuildInfo(program, devices[deviceId], CL_PROGRAM_BUILD_LOG, logSize, (void*)log, 0); std::cerr << log << std::endl; delete[] log; throw OpenclError(error, "Unable to build the OpenCL program"); } std::cerr << "Creating OpenCL step1 kernel" << std::endl; kernelStep1 = clCreateKernel(program, "nonce_step1", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step1 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep1, 2, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step2 kernel" << std::endl; kernelStep2 = clCreateKernel(program, "nonce_step2", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step2 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep2, 1, sizeof(cl_mem), (void*)&bufferGpuGen); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } std::cerr << "Creating OpenCL step3 kernel" << std::endl; kernelStep3 = clCreateKernel(program, "nonce_step3", &error); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to create the OpenCL kernel"); } std::cerr << "Setting OpenCL step3 kernel static arguments" << std::endl; error = clSetKernelArg(kernelStep3, 0, sizeof(cl_uint), (void*)&staggerSize); error = clSetKernelArg(kernelStep3, 1, sizeof(cl_mem), (void*)&bufferGpuGen); error = clSetKernelArg(kernelStep3, 2, sizeof(cl_mem), (void*)&bufferGpuScoops); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL kernel arguments"); } size_t globalWorkSize = staggerSize; size_t localWorkSize = (staggerSize < threadsNumber) ? staggerSize : threadsNumber; time_t startTime = time(0); unsigned int totalNoncesCompleted = 0; for (unsigned long long nonce_ordinal = 0; nonce_ordinal < maxNonceNumber; nonce_ordinal += staggerSize) { for (unsigned int jobnum = 0; jobnum < paths.size(); jobnum += 1) { unsigned long long nonce = startNonces[jobnum] + nonce_ordinal; if (nonce > endNonces[jobnum]) { break; } std::cout << "Running with start nonce " << nonce << std::endl; // Is a cl_ulong always an unsigned long long? unsigned int error = 0; error = clSetKernelArg(kernelStep1, 0, sizeof(cl_ulong), (void*)&addresses[jobnum]); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clSetKernelArg(kernelStep1, 1, sizeof(cl_ulong), (void*)&nonce); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step1 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep1, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step1 kernel launch"); } unsigned int hashesSize = hashesNumber * HASH_SIZE; for(int hashesOffset = PLOT_SIZE ; hashesOffset > 0 ; hashesOffset -= hashesSize) { error = clSetKernelArg(kernelStep2, 0, sizeof(cl_ulong), (void*)&nonce); error = clSetKernelArg(kernelStep2, 2, sizeof(cl_uint), (void*)&hashesOffset); error = clSetKernelArg(kernelStep2, 3, sizeof(cl_uint), (void*)&hashesNumber); if(error != CL_SUCCESS) { throw OpenclError(error, "Unable to set the OpenCL step2 kernel arguments"); } error = clEnqueueNDRangeKernel(commandQueue, kernelStep2, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel launch"); } error = clFinish(commandQueue); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step2 kernel finish"); } } totalNoncesCompleted += staggerSize; double percent = 100.0 * (double)totalNoncesCompleted / totalNonces; time_t currentTime = time(0); double speed = (double)totalNoncesCompleted / difftime(currentTime, startTime) * 60.0; double estimatedTime = (double)(totalNonces - totalNoncesCompleted) / speed; std::cerr << "\r" << percent << "% (" << totalNoncesCompleted << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", ETA: " << ((int)estimatedTime / 60) << "h" << ((int)estimatedTime % 60) << "m" << ((int)(estimatedTime * 60.0) % 60) << "s"; std::cerr << "... "; error = clEnqueueNDRangeKernel(commandQueue, kernelStep3, 1, 0, &globalWorkSize, &localWorkSize, 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in step3 kernel launch"); } if (saving_thread_flags[jobnum]) { save_threads[jobnum].wait(); // Wait for last job to finish saving_thread_flags[jobnum] = false; } error = clEnqueueReadBuffer(commandQueue, bufferGpuScoops, CL_TRUE, 0, sizeof(cl_uchar) * nonceSize, buffersCpu[jobnum], 0, 0, 0); if(error != CL_SUCCESS) { throw OpenclError(error, "Error in synchronous read"); } saving_thread_flags[jobnum] = true; save_threads[jobnum] = std::async(std::launch::async, save_nonces, nonceSize, out_files[jobnum], buffersCpu[jobnum]); } } //Clean up for (unsigned int i = 0; i < paths.size(); i += 1) { if (saving_thread_flags[i]) { std::cerr << "waiting for final save to " << paths[i] << " to finish" << std::endl; save_threads[i].wait(); saving_thread_flags[i] = false; std::cerr << "done waiting for final save" << std::endl; if (buffersCpu[i]) { delete[] buffersCpu[i]; } } } if(kernelStep3) { clReleaseKernel(kernelStep3); } if(kernelStep2) { clReleaseKernel(kernelStep2); } if(kernelStep1) { clReleaseKernel(kernelStep1); } if(program) { clReleaseProgram(program); } if(bufferGpuGen) { clReleaseMemObject(bufferGpuGen); } if(bufferGpuScoops) { clReleaseMemObject(bufferGpuScoops); } if(commandQueue) { clReleaseCommandQueue(commandQueue); } if(context) { clReleaseContext(context); } time_t currentTime = time(0); double elapsedTime = difftime(currentTime, startTime) / 60.0; double speed = (double)totalNonces / elapsedTime; std::cerr << "\r100% (" << totalNonces << "/" << totalNonces << " nonces)"; std::cerr << ", " << speed << " nonces/minutes"; std::cerr << ", " << ((int)elapsedTime / 60) << "h" << ((int)elapsedTime % 60) << "m" << ((int)(elapsedTime * 60.0) % 60) << "s"; std::cerr << " " << std::endl; } catch(const OpenclError& ex) { std::cerr << "[ERROR] [" << ex.getCode() << "] " << ex.what() << std::endl; returnCode = -1; } catch(const std::exception& ex) { std::cerr << "[ERROR] " << ex.what() << std::endl; returnCode = -1; } return returnCode; }
void test_variable_opencl_func(void *buffers[], void *args) { STARPU_SKIP_IF_VALGRIND; int id, devid, ret; int factor = *(int *) args; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); cl_mem val = (cl_mem)STARPU_VARIABLE_GET_PTR(buffers[0]); cl_context context; id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); starpu_opencl_get_context(devid, &context); cl_mem fail = clCreateBuffer(context, CL_MEM_COPY_HOST_PTR, sizeof(int), &variable_config.copy_failed, &err); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_program, "variable_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(val), &val); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(fail), &fail); if (err) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(factor), &factor); if (err) STARPU_OPENCL_REPORT_ERROR(err); { size_t global = 1; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } err = clEnqueueReadBuffer(queue, fail, CL_TRUE, 0, sizeof(int), &variable_config.copy_failed, 0, NULL, NULL); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); ret = starpu_opencl_unload_opencl(&opencl_program); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl"); return; }
int main(int argc, char **argv) { printf("enter demo main\n"); fflush(stdout); putenv("POCL_VERBOSE=1"); putenv("POCL_DEVICES=basic"); putenv("POCL_LEAVE_TEMP_DIRS=1"); putenv("POCL_LEAVE_KERNEL_COMPILER_TEMP_FILES=1"); putenv("POCL_TEMP_DIR=pocl"); putenv("POCL_CACHE_DIR=pocl"); putenv("POCL_WORK_GROUP_METHOD=spmd"); if(argc >= 2){ printf("argv[1]:%s:\n",argv[1]); if(!strcmp(argv[1], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[1], "c")) putenv("POCL_CROSS_COMPILE=1"); } if(argc >= 3){ printf("argv[2]:%s:\n",argv[2]); if(!strcmp(argv[2], "h")) putenv("POCL_WORK_GROUP_METHOD=spmd"); if(!strcmp(argv[2], "c")) putenv("POCL_CROSS_COMPILE=1"); } //putenv("LD_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //putenv("LTDL_LIBRARY_PATH=/scratch/colins/build/linux/fs/lib"); //lt_dlsetsearchpath("/scratch/colins/build/linux/fs/lib"); //printf("SEARCH_PATH:%s\n",lt_dlgetsearchpath()); cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } if (platforms_n == 0) return 1; cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", devices_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices+1, &pfn_notify, NULL, &_err)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[1], CL_QUEUE_PROFILING_ENABLE, &_err)); cl_kernel kernel = 0; cl_mem memObjects[2] = {0,0}; // Create OpenCL program - first attempt to load cached binary. // If that is not available, then create the program from source // and store the binary for future use. std::cout << "Attempting to create program from binary..." << std::endl; cl_program program = CreateProgramFromBinary(context, devices[1], "kernel.cl.bin"); if (program == NULL) { std::cout << "Binary not loaded, create from source..." << std::endl; program = CreateProgram(context, devices[1], "kernel.cl"); if (program == NULL) { Cleanup(context, queue, program, kernel, memObjects); return 1; } std::cout << "Save program binary for future run..." << std::endl; if (SaveProgramBinary(program, devices[1], "kernel.cl.bin") == false) { std::cerr << "Failed to write program binary" << std::endl; Cleanup(context, queue, program, kernel, memObjects); return 1; } } else { std::cout << "Read program from binary." << std::endl; } printf("attempting to create input buffer\n"); fflush(stdout); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); printf("attempting to create output buffer\n"); fflush(stdout); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double)*NUM_DATA, NULL, &_err)); memObjects[0] = input_buffer; memObjects[1] = output_buffer; double factor = ((double)rand()/(double)(RAND_MAX)) * 100.0;; printf("attempting to create kernel\n"); fflush(stdout); kernel = CL_CHECK_ERR(clCreateKernel(program, "daxpy", &_err)); printf("setting up kernel args cl_mem:%lx \n",input_buffer); fflush(stdout); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); printf("attempting to enqueue write buffer\n"); fflush(stdout); for (int i=0; i<NUM_DATA; i++) { double in = ((double)rand()/(double)(RAND_MAX)) * 100.0;; CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(double), 8, &in, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; printf("attempting to enqueue kernel\n"); fflush(stdout); CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); printf("Enqueue'd kerenel\n"); fflush(stdout); cl_ulong time_start, time_end; CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL)); CL_CHECK(clGetEventProfilingInfo(kernel_completion, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL)); double elapsed = time_end - time_start; printf("time(ns):%lg\n",elapsed); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { double data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(double), 8, &data, 0, NULL, NULL)); //printf(" %lg", data); } printf("\n"); CL_CHECK(clReleaseMemObject(memObjects[0])); CL_CHECK(clReleaseMemObject(memObjects[1])); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
/* uint32_t run (in uint32_t rank, [array, size_is (rank)] in uint32_t shape, [array, size_is (rank), optional] in uint32_t tile); */ NS_IMETHODIMP dpoCKernel::Run(uint32_t rank, uint32_t *shape, uint32_t *tile, uint32_t *_retval) { cl_int err_code; cl_event runEvent, readEvent, writeEvent; size_t *global_work_size; size_t *local_work_size; const int zero = 0; DEBUG_LOG_STATUS("Run", "preparing execution of kernel"); if (sizeof(size_t) == sizeof(uint32_t)) { global_work_size = (size_t *) shape; } else { global_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t)); if (global_work_size == NULL) { DEBUG_LOG_STATUS("Run", "allocation of global_work_size failed"); return NS_ERROR_OUT_OF_MEMORY; } for (uint32_t cnt = 0; cnt < rank; cnt++) { global_work_size[cnt] = shape[cnt]; } } #ifdef USE_LOCAL_WORKSIZE if (tile == NULL) { local_work_size = NULL; } else { if ((sizeof(size_t) == sizeof(uint32_t))) { local_work_size = (size_t *) tile; } else { local_work_size = (size_t *) nsMemory::Alloc(rank * sizeof(size_t)); if (local_work_size == NULL) { DEBUG_LOG_STATUS("Run", "allocation of local_work_size failed"); return NS_ERROR_OUT_OF_MEMORY; } for (int cnt = 0; cnt < rank; cnt++) { local_work_size[cnt] = (size_t) tile[cnt]; } } } #else /* USE_LOCAL_WORKSIZE */ local_work_size = NULL; #endif /* USE_LOCAL_WORKSIZE */ DEBUG_LOG_STATUS("Run", "setting failure code to 0"); err_code = clEnqueueWriteBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), &zero, 0, NULL, &writeEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "enqueing execution of kernel"); #ifdef WINDOWS_ROUNDTRIP dpoCContext::RecordBeginOfRoundTrip(parent); #endif /* WINDOWS_ROUNDTRIP */ err_code = clEnqueueNDRangeKernel(cmdQueue, kernel, rank, NULL, global_work_size, NULL, 1, &writeEvent, &runEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "reading failure code"); err_code = clEnqueueReadBuffer(cmdQueue, failureMem, CL_FALSE, 0, sizeof(int), _retval, 1, &runEvent, &readEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "waiting for execution to finish"); // For now we always wait for the run to complete. // In the long run, we may want to interleave this with JS execution and only sync on result read. err_code = clWaitForEvents( 1, &readEvent); DEBUG_LOG_STATUS("Run", "first event fired"); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } #ifdef WINDOWS_ROUNDTRIP dpoCContext::RecordEndOfRoundTrip(parent); #endif /* WINDOWS_ROUNDTRIP */ #ifdef CLPROFILE #ifdef CLPROFILE_ASYNC err_code = clSetEventCallback( readEvent, CL_COMPLETE, &dpoCContext::CollectTimings, parent); DEBUG_LOG_STATUS("Run", "second event fired"); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } #else /* CLPROFILE_ASYNC */ dpoCContext::CollectTimings(runEvent,CL_COMPLETE,parent); #endif /* CLPROFILE_ASYNC */ #endif /* CLPROFILE */ DEBUG_LOG_STATUS("Run", "execution completed successfully, start cleanup"); if (global_work_size != (size_t *) shape) { nsMemory::Free(global_work_size); } #ifdef USE_LOCAL_WORKSIZE if (local_work_size != (size_t *) tile) { nsMemory::Free(local_work_size); } #endif /* USE_LOCAL_WORKSIZE */ err_code = clReleaseEvent(readEvent); err_code = clReleaseEvent(runEvent); err_code = clReleaseEvent(writeEvent); if (err_code != CL_SUCCESS) { DEBUG_LOG_ERROR("Run", err_code); return NS_ERROR_ABORT; } DEBUG_LOG_STATUS("Run", "cleanup complete"); return NS_OK; }
//////////////////////////////////////////////////////////////////////////////// // Main program //////////////////////////////////////////////////////////////////////////////// int main(int argc, char **argv) { cl_platform_id cpPlatform; //OpenCL platform cl_device_id cdDevice; //OpenCL device cl_context cxGPUContext; //OpenCL context cl_command_queue cqCommandQueue; //OpenCL command que cl_mem d_Input, d_Output; //OpenCL memory buffer objects cl_int ciErrNum; float *h_Input, *h_OutputCPU, *h_OutputGPU; const uint imageW = 2048, imageH = 2048, stride = 2048; const int dir = DCT_FORWARD; shrQAStart(argc, argv); // set logfile name and start logs shrSetLogFileName ("oclDCT8x8.txt"); shrLog("%s Starting...\n\n", argv[0]); shrLog("Allocating and initializing host memory...\n"); h_Input = (float *)malloc(imageH * stride * sizeof(float)); h_OutputCPU = (float *)malloc(imageH * stride * sizeof(float)); h_OutputGPU = (float *)malloc(imageH * stride * sizeof(float)); srand(2009); for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++) h_Input[i * stride + j] = (float)rand() / (float)RAND_MAX; shrLog("Initializing OpenCL...\n"); //Get the NVIDIA platform ciErrNum = oclGetPlatformID(&cpPlatform); oclCheckError(ciErrNum, CL_SUCCESS); //Get a GPU device ciErrNum = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); oclCheckError(ciErrNum, CL_SUCCESS); //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); //Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Initializing OpenCL DCT 8x8...\n"); initDCT8x8(cxGPUContext, cqCommandQueue, (const char **)argv); shrLog("Creating OpenCL memory objects...\n"); d_Input = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, imageH * stride * sizeof(cl_float), h_Input, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); d_Output = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, imageH * stride * sizeof(cl_float), NULL, &ciErrNum); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Performing DCT8x8 of %u x %u image...\n\n", imageH, imageW); //Just a single iteration or a warmup iteration DCT8x8( cqCommandQueue, d_Output, d_Input, stride, imageH, imageW, dir ); #define GPU_PROFILING 1 #ifdef GPU_PROFILING const int numIterations = 16; cl_event startMark, endMark; ciErrNum = clEnqueueMarker(cqCommandQueue, &startMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); shrDeltaT(0); for(int iter = 0; iter < numIterations; iter++) DCT8x8( NULL, d_Output, d_Input, stride, imageH, imageW, dir ); ciErrNum = clEnqueueMarker(cqCommandQueue, &endMark); ciErrNum |= clFinish(cqCommandQueue); shrCheckError(ciErrNum, CL_SUCCESS); //Calculate performance metrics by wallclock time double gpuTime = shrDeltaT(0) / (double)numIterations; shrLogEx(LOGBOTH | MASTER, 0, "oclDCT8x8, Throughput = %.4f MPixels/s, Time = %.5f s, Size = %u Pixels, NumDevsUsed = %i, Workgroup = %u\n", (1.0e-6 * (double)(imageW * imageH)/ gpuTime), gpuTime, (imageW * imageH), 1, 0); //Get profiler time cl_ulong startTime = 0, endTime = 0; ciErrNum = clGetEventProfilingInfo(startMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &startTime, NULL); ciErrNum |= clGetEventProfilingInfo(endMark, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog("\nOpenCL time: %.5f s\n\n", 1.0e-9 * ((double)endTime - (double)startTime) / (double)numIterations); #endif shrLog("Reading back OpenCL results...\n"); ciErrNum = clEnqueueReadBuffer(cqCommandQueue, d_Output, CL_TRUE, 0, imageH * stride * sizeof(cl_float), h_OutputGPU, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); shrLog("Comparing against Host/C++ computation...\n"); DCT8x8CPU(h_OutputCPU, h_Input, stride, imageH, imageW, dir); double sum = 0, delta = 0; double L2norm; for(uint i = 0; i < imageH; i++) for(uint j = 0; j < imageW; j++){ sum += h_OutputCPU[i * stride + j] * h_OutputCPU[i * stride + j]; delta += (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]) * (h_OutputGPU[i * stride + j] - h_OutputCPU[i * stride + j]); } L2norm = sqrt(delta / sum); shrLog("Relative L2 norm: %.3e\n\n", L2norm); shrLog("Shutting down...\n"); //Release kernels and program closeDCT8x8(); //Release other OpenCL objects ciErrNum = clReleaseMemObject(d_Output); ciErrNum |= clReleaseMemObject(d_Input); ciErrNum |= clReleaseCommandQueue(cqCommandQueue); ciErrNum |= clReleaseContext(cxGPUContext); oclCheckError(ciErrNum, CL_SUCCESS); //Release host buffers free(h_OutputGPU); free(h_OutputCPU); free(h_Input); //Finish shrQAFinishExit(argc, (const char **)argv, (L2norm < 1E-3) ? QA_PASSED : QA_FAILED); }
int MemoryOptimizations::copy(cl_kernel& kernel, int vectorSize) { cl_int status; cl_event events[2]; /* Check group size against kernelWorkGroupSize */ status = clGetKernelWorkGroupInfo(kernel, devices[deviceId], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetKernelWorkGroupInfo failed.")) { return SDK_FAILURE; } if(localThreads[0] * localThreads[1] > kernelWorkGroupSize) { std::cout << "\nDevice doesn't support required work-group size!\n"; return SDK_SUCCESS; } /*** Set appropriate arguments to the kernel ***/ status = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(inputBuffer)")) return SDK_FAILURE; status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed.(outputBuffer)")) return SDK_FAILURE; double nsec = 0; // Reduce the iterations if verification is enabled. if(verify) Iterations = 1; /* Run the kernel for a number of iterations */ for(int i = 0; i < Iterations; i++) { /*Enqueue a kernel run call */ status = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) return SDK_FAILURE; /* wait for the kernel call to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; /* Calculate performance */ cl_ulong startTime; cl_ulong endTime; /* Get kernel profiling info */ status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(startTime)")) return SDK_FAILURE; status = clGetEventProfilingInfo(events[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetEventProfilingInfo failed.(endTime)")) return SDK_FAILURE; /* Cumulate time for each iteration */ nsec += endTime - startTime; } /* Copy bytes */ int numThreads = (int)(globalThreads[0] * globalThreads[1]); double bytes = (double)(Iterations * 2 * vectorSize * sizeof(cl_float)); double perf = (bytes / nsec) * numThreads; std::cout << ": " << perf << " GB/s" << std::endl; if(verify) { /* Enqueue readBuffer*/ status = clEnqueueReadBuffer(commandQueue, outputBuffer, CL_TRUE, 0, length * sizeof(cl_float4), output, 0, NULL, 0); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; /* Verify data */ if(!memcmp(input, output, vectorSize * sizeof(cl_float) * length)) { std::cout << "Passed!\n"; return SDK_SUCCESS; } else { std::cout << "Failed!\n"; return SDK_FAILURE; } } return SDK_SUCCESS; }
int main() { // START:context cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); cl_context context = clCreateContext(NULL, 1, &device, NULL, NULL, NULL); // END:context // START:queue cl_command_queue queue = clCreateCommandQueue(context, device, 0, NULL); // END:queue // START:kernel char* source = read_source("multiply_arrays.cl"); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); free(source); clBuildProgram(program, 0, NULL, NULL, NULL, NULL); cl_kernel kernel = clCreateKernel(program, "multiply_arrays", NULL); // END:kernel // START:buffers cl_float a[NUM_ELEMENTS], b[NUM_ELEMENTS]; random_fill(a, NUM_ELEMENTS); random_fill(b, NUM_ELEMENTS); cl_mem inputA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, a, NULL); cl_mem inputB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float) * NUM_ELEMENTS, b, NULL); cl_mem output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * NUM_ELEMENTS, NULL, NULL); // END:buffers // START:execute clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputA); clSetKernelArg(kernel, 1, sizeof(cl_mem), &inputB); clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); size_t work_units = NUM_ELEMENTS; clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &work_units, NULL, 0, NULL, NULL); // END:execute // START:results cl_float results[NUM_ELEMENTS]; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, sizeof(cl_float) * NUM_ELEMENTS, results, 0, NULL, NULL); // END:results // START:cleanup clReleaseMemObject(inputA); clReleaseMemObject(inputB); clReleaseMemObject(output); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); // END:cleanup for (int i = 0; i < NUM_ELEMENTS; ++i) { printf("%f * %f = %f\n", a[i], b[i], results[i]); } return 0; }
int main(int argc, char *argv[]) { int iGlobalSize = 1; int iCheck1, iCheck2, iCheck3, iCheck4; size_t iGlobalWorkSize = -1; size_t iLocalWorkSize = -1; if (argc > 1) // Size of input vector { iCheck1 = atoi(argv[1]); if (iCheck1 != 0) { iGlobalSize = iCheck1; } } int iNoReps = 100; // Number of repetitions. if (argc > 2) { iCheck2 = atoi(argv[2]); if (iCheck2 != 0) { iNoReps = iCheck2; } } /* if (argc > 3) // Global work size { iCheck3 = atoi(argv[3]); if (iCheck3 != 0) { iGlobalWorkSize = iCheck3; } } if (argc > 4) // Local work size { iCheck4 = atoi(argv[4]); if (iCheck4 != 0) { iLocalWorkSize = iCheck4; } } */ int bPrint = 0; if (argc > 3) // Originally 5. { bPrint = 1; } // printf("The global size is %d, the global work size is %ld, and the local work size is %ld. \n", iGlobalSize, iGlobalWorkSize, iLocalWorkSize); /* size_t * ipGlobalWorkParam = NULL; if (iGlobalWorkSize != -1) { ipGlobalWorkParam = &iGlobalWorkSize; } size_t * ipLocalWorkParam = NULL; if (iLocalWorkSize != -1) { ipLocalWorkParam = &iLocalWorkSize; } */ GCAQ * TheGCAQ = GCAQSetup(); if (TheGCAQ == NULL) { return 1; } #if BIGFLOAT const char *szFloatOpt = "-DBIGFLOAT"; #else const char *szFloatOpt = NULL; #endif const int iNoKernels = 1; char *ourKernelStrings[6] = { szDotProduct, szReduce, szDotProduct2, szReduce2, szDotProduct4, szReduce4}; GPAK *TheGPAK = GPAKSetup(TheGCAQ, iNoKernels, ourKernelStrings, szFloatOpt); if (TheGPAK == NULL) { GCAQShutdown(TheGCAQ); return 2; } INTG iTypicalWorkgroupNo = TheGPAK->TheMaxWorkGroupSizes[0]; INTG iExpOutputSize = ioutsize(iGlobalSize, iTypicalWorkgroupNo); FLPT * fExpDotProdResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT)); FLPT * fExpReduceResult = (FLPT *) malloc(iExpOutputSize * sizeof(FLPT)); fdotprodexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpDotProdResult); freduceexpresult(iGlobalSize, iTypicalWorkgroupNo, fExpReduceResult); // printvector("dot prod", iExpOutputSize, fExpDotProdResult); // printvector("reduce", iExpOutputSize, fExpReduceResult); FLPT* inputDataF = (FLPT *) malloc(iGlobalSize * sizeof(FLPT)); SetFIncrease(iGlobalSize, inputDataF); // For the dot product. FLPT* outputDataD = (FLPT *) malloc(iGlobalSize * sizeof(FLPT)); SetFNull(iGlobalSize, outputDataD); // For the reduction. FLPT* outputDataR = (FLPT *) malloc(iGlobalSize * sizeof(FLPT)); SetFNull(iGlobalSize, outputDataR); struct timespec start[iNoKernels]; struct timespec end[iNoKernels]; // create buffers for the input and ouput int err; cl_mem inputF, outputF, outputAll; inputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_READ_ONLY, iGlobalSize * sizeof(FLPT), NULL, &err); if (err != CL_SUCCESS) { printf("Error allocating for F"); return 3; } outputF = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("Error allocating for output 7"); return 9; } outputAll = clCreateBuffer(TheGCAQ->TheContext, CL_MEM_WRITE_ONLY, iGlobalSize * sizeof(float), NULL, &err); if (err != CL_SUCCESS) { printf("Error allocating for output 8"); return 9; } clEnqueueWriteBuffer(TheGCAQ->TheQueue, inputF, CL_TRUE, 0, iGlobalSize * sizeof(FLPT), inputDataF, 0, NULL, NULL); int iRep; int iKernel; int i; int iLengthTotal = iGlobalSize; size_t iGlobalWorkThing = iGlobalSize; int iSomething = 1; for (iKernel = 0; iKernel < iNoKernels; iKernel++) { for (i = 0; i < iLengthTotal; i++) { outputDataD[i] = 0.0; outputDataR[i] = 0.0; } clock_gettime(CLOCK_MONOTONIC, &(start[iKernel])); for (iRep = 0; iRep < iNoReps; iRep++) { clSetKernelArg(TheGPAK->TheKernels[iKernel], 0, sizeof(int), &iLengthTotal); clSetKernelArg(TheGPAK->TheKernels[iKernel], 1, sizeof(cl_mem), &inputF); clSetKernelArg(TheGPAK->TheKernels[iKernel], 2, iSomething * iLocalWorkSize * sizeof(float), NULL); // Was 3 clSetKernelArg(TheGPAK->TheKernels[iKernel], 3, sizeof(cl_mem), &outputAll); // Was 4 clEnqueueNDRangeKernel(TheGCAQ->TheQueue, TheGPAK->TheKernels[iKernel], 1, NULL, &iGlobalWorkThing, &(TheGPAK->TheMaxWorkGroupSizes[iKernel]), 0, NULL, NULL); clFinish(TheGCAQ->TheQueue); // copy the results from out of the output buffer if (iKernel % 2 == 0) { clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataD, 0, NULL, NULL); } else { clEnqueueReadBuffer(TheGCAQ->TheQueue, outputAll, CL_TRUE, 0, iExpOutputSize * sizeof(float), outputDataR, 0, NULL, NULL); } } clock_gettime(CLOCK_MONOTONIC, &(end[iKernel])); if (bPrint) { for (i = 0; i < iExpOutputSize; i++) { if (iKernel % 2 == 0) { if (outputDataD[i] != fExpDotProdResult[i]) { printf ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n", iKernel, i, outputDataD[i], fExpDotProdResult[i]); break; } } else { if (outputDataR[i] != fExpReduceResult[i]) { printf ("A problem at kernel %d and iteration %d for actual value %f but expected value %f!\n", iKernel, i, outputDataR[i], fExpReduceResult[i]); break; } } } } // if ((iKernel % 2) == 1) // { // iLengthTotal = iLengthTotal / 2; // iSomething = iSomething * 2; // iGlobalWorkThing = iGlobalWorkThing / 2; // } } clReleaseMemObject(inputF); clReleaseMemObject(outputF); clReleaseMemObject(outputAll); // print the results // if (bPrint) // { // printf("output %d: \n", iGlobalSize); // for(i=0;i<iExpOutputSize; i++) // { // printf("%d - %f - %f\n", i, outputDataD[i], outputDataR[i]); // } // } // cleanup - release OpenCL resources free(inputDataF); free(outputDataD); free(outputDataR); GPAKShutdown(TheGPAK); GCAQShutdown (TheGCAQ); printf("%d - ", iGlobalSize); for (iKernel = 0; iKernel < iNoKernels; iKernel++) { printf("%f - ", (1.0 * TLPERS * iGlobalSize * iNoReps) / (MEGAHERTZ * timespecDiff(&(end[iKernel]), &(start[iKernel])))); } printf("\n"); return 0; }
OPENCL_EXPERIMENTS_EXPORT cl_int opencl_plugin_voxelize_meshes(opencl_plugin plugin, float inv_element_size, float corner_x, float corner_y, float corner_z, cl_int x_cell_length, cl_int y_cell_length, cl_int z_cell_length, cl_int mesh_data_count, mesh_data *mesh_data_list, cl_uchar *voxel_grid_out) { cl_int err = CL_SUCCESS; cl_int i; cl_int next_row_offset, next_slice_offset; size_t local_work_size; cl_int num_voxels; clock_t t1; clock_t t2; clock_t t3; assert(plugin != NULL); assert(inv_element_size >= 0); assert(x_cell_length >= 0); assert(y_cell_length >= 0); assert(z_cell_length >= 0); assert(mesh_data_count >= 0); assert(mesh_data_list != NULL); t1 = clock(); /* (Re-)allocate buffer for voxel grid */ num_voxels = x_cell_length * y_cell_length * z_cell_length; if (opencl_plugin_init_voxel_buffer(plugin, num_voxels)) goto error; /* (Re-)allocate buffers for mesh data */ if (opencl_plugin_init_mesh_buffers(plugin, mesh_data_count, mesh_data_list)) goto error; err = clGetKernelWorkGroupInfo( plugin->voxelize_kernel, plugin->selected_device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local_work_size), &local_work_size, NULL); CHECK_CL_ERROR(err); if (enqueue_zero_buffer(plugin->queue, plugin->voxel_grid_buffer, plugin->voxel_grid_buffer_capacity, 0, NULL, NULL, &err)) goto error; err = clFinish(plugin->queue); CHECK_CL_ERROR(err); t1 = clock() - t1; t2 = clock(); next_row_offset = x_cell_length; next_slice_offset = x_cell_length * y_cell_length; err |= clSetKernelArg(plugin->voxelize_kernel, 0, sizeof(cl_mem), &plugin->voxel_grid_buffer); err |= clSetKernelArg(plugin->voxelize_kernel, 1, sizeof(float), &inv_element_size); err |= clSetKernelArg(plugin->voxelize_kernel, 2, sizeof(float), &corner_x); err |= clSetKernelArg(plugin->voxelize_kernel, 3, sizeof(float), &corner_y); err |= clSetKernelArg(plugin->voxelize_kernel, 4, sizeof(float), &corner_z); err |= clSetKernelArg(plugin->voxelize_kernel, 5, sizeof(cl_int), &next_row_offset); err |= clSetKernelArg(plugin->voxelize_kernel, 6, sizeof(cl_int), &next_slice_offset); err |= clSetKernelArg(plugin->voxelize_kernel, 7, sizeof(cl_int), &x_cell_length); err |= clSetKernelArg(plugin->voxelize_kernel, 8, sizeof(cl_int), &y_cell_length); err |= clSetKernelArg(plugin->voxelize_kernel, 9, sizeof(cl_int), &z_cell_length); CHECK_CL_ERROR(err); for (i = 0; i < mesh_data_count; i++) { size_t global_work_size; cl_uint vertex_buffer_base_idx = mesh_data_list[i].vertex_buffer_base_idx; cl_uint triangle_buffer_base_idx = mesh_data_list[i].triangle_buffer_base_idx; err |= clSetKernelArg(plugin->voxelize_kernel, 10, sizeof(cl_mem), &plugin->vertex_buffer); err |= clSetKernelArg(plugin->voxelize_kernel, 11, sizeof(cl_mem), &plugin->triangle_buffer); err |= clSetKernelArg(plugin->voxelize_kernel, 12, sizeof(cl_int), &mesh_data_list[i].num_triangles); err |= clSetKernelArg(plugin->voxelize_kernel, 13, sizeof(cl_uint), &vertex_buffer_base_idx); err |= clSetKernelArg(plugin->voxelize_kernel, 14, sizeof(cl_uint), &triangle_buffer_base_idx); CHECK_CL_ERROR(err); /* As per the OpenCL spec, global_work_size must divide evenly by * local_work_size */ global_work_size = mesh_data_list[i].num_triangles / local_work_size; global_work_size *= local_work_size; if (global_work_size < (size_t)mesh_data_list[i].num_triangles) global_work_size += local_work_size; err = clEnqueueNDRangeKernel( plugin->queues[i % plugin->num_queues], plugin->voxelize_kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); CHECK_CL_ERROR_MSG(err, "clEnqueueNDRangeKernel failed on mesh %d/%d", i + 1, mesh_data_count); err = clFinish(plugin->queue); CHECK_CL_ERROR_MSG(err, "clFinish failed on mesh %d/%d", i + 1, mesh_data_count); } err = clFinish(plugin->queue); CHECK_CL_ERROR(err); for (i = 0; i < plugin->num_queues; i++) { err = clFinish(plugin->queues[i]); CHECK_CL_ERROR(err); } t2 = clock() - t2; t3 = clock(); err = clEnqueueReadBuffer( plugin->queue, plugin->voxel_grid_buffer, CL_TRUE, 0, num_voxels, voxel_grid_out, 0, NULL, NULL); CHECK_CL_ERROR(err); t3 = clock() - t3; TRACE("Clock T1: %f", ((float)t1 * 1000.0f) / CLOCKS_PER_SEC); TRACE("Clock T2: %f", ((float)t2 * 1000.0f) / CLOCKS_PER_SEC); TRACE("Clock T3: %f", ((float)t3 * 1000.0f) / CLOCKS_PER_SEC); return 0; error: return -1; }
int FastWalshTransform::runCLKernels(void) { cl_int status; size_t globalThreads[1]; size_t localThreads[1]; // Enqueue write input to inputBuffer cl_event writeEvt; status = clEnqueueWriteBuffer( commandQueue, inputBuffer, CL_FALSE, 0, length * sizeof(cl_float), input, 0, NULL, &writeEvt); CHECK_OPENCL_ERROR(status, "clEnqueueWriteBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&writeEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(writeEvt) Failed"); /* * The kernel performs a butterfly operation and it runs for half the * total number of input elements in the array. * In each pass of the kernel two corresponding elements are found using * the butterfly operation on an array of numbers and their sum and difference * is stored in the same locations as the numbers */ globalThreads[0] = length / 2; localThreads[0] = 256; // Check group size against kernelWorkGroupSize status = kernelInfo.setKernelWorkGroupInfo(kernel, devices[sampleArgs->deviceId]); CHECK_OPENCL_ERROR(status, "kernelInfo.setKernelWorkGroupInfo failed."); if((cl_uint)(localThreads[0]) > kernelInfo.kernelWorkGroupSize) { if(!sampleArgs->quiet) { std::cout << "Out of Resources!" << std::endl; std::cout << "Group Size specified : " << localThreads[0] << std::endl; std::cout << "Max Group Size supported on the kernel : " << kernelInfo.kernelWorkGroupSize << std::endl; std::cout<<"Changing the group size to " << kernelInfo.kernelWorkGroupSize << std::endl; } localThreads[0] = kernelInfo.kernelWorkGroupSize; } // Set appropriate arguments to the kernel // the input array - also acts as output status = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&inputBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (inputBuffer)"); for(cl_int step = 1; step < length; step <<= 1) { // stage of the algorithm status = clSetKernelArg( kernel, 1, sizeof(cl_int), (void *)&step); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed. (step)"); // Enqueue a kernel run call cl_event ndrEvt; status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&ndrEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(ndrEvt) Failed"); } // Enqueue readBuffer cl_event readEvt; status = clEnqueueReadBuffer( commandQueue, inputBuffer, CL_FALSE, 0, length * sizeof(cl_float), output, 0, NULL, &readEvt); CHECK_OPENCL_ERROR(status, "clEnqueueReadBuffer failed."); status = clFlush(commandQueue); CHECK_OPENCL_ERROR(status, "clFlush failed.(commandQueue)"); status = waitForEventAndRelease(&readEvt); CHECK_ERROR(status, SDK_SUCCESS, "WaitForEventAndRelease(readEvt) Failed"); return SDK_SUCCESS; }
void nrm2CorrectnessTest(TestParams *params) { cl_int err; T1 *blasX; T2 *clblasNRM2, *blasNRM2; cl_mem bufX, bufNRM2, scratchBuff; clMath::BlasBase *base; cl_event *events; cl_double deltaForType = 0.0; base = clMath::BlasBase::getInstance(); if ((typeid(T1) == typeid(cl_double) || typeid(T1) == typeid(DoubleComplex)) && !base->isDevSupportDoublePrecision()) { std::cerr << ">> WARNING: The target device doesn't support native " "double precision floating point arithmetic" << std::endl << ">> Test skipped" << std::endl; SUCCEED(); return; } printf("number of command queues : %d\n\n", params->numCommandQueues); events = new cl_event[params->numCommandQueues]; memset(events, 0, params->numCommandQueues * sizeof(cl_event)); size_t lengthX = (1 + ((params->N -1) * abs(params->incx))); blasX = new T1[lengthX + params->offBX ]; blasNRM2 = new T2[1]; clblasNRM2 = new T2[1 + params->offa]; if((blasX == NULL) || (clblasNRM2 == NULL) || (blasNRM2 == NULL)) { ::std::cerr << "Cannot allocate memory on host side\n" << "!!!!!!!!!!!!Test skipped.!!!!!!!!!!!!" << ::std::endl; deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; SUCCEED(); return; } srand(params->seed); randomVectors<T1>(params->N, (blasX + params->offBX), params->incx, (T1*)NULL, 0, true); // Allocate buffers bufX = base->createEnqueueBuffer(blasX, (lengthX + params->offBX)* sizeof(*blasX), 0, CL_MEM_READ_WRITE); bufNRM2 = base->createEnqueueBuffer(NULL, (1 + params->offa) * sizeof(T2), 0, CL_MEM_READ_WRITE); scratchBuff = base->createEnqueueBuffer(NULL, (lengthX * 2 * sizeof(T1)), 0, CL_MEM_READ_WRITE); *blasNRM2 = ::clMath::blas::nrm2( params->N, blasX, params->offBX, params->incx); if ((bufX == NULL) || (bufNRM2 == NULL) || (scratchBuff == NULL)) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; ::std::cerr << ">> Failed to create/enqueue buffer for a matrix." << ::std::endl << ">> Can't execute the test, because data is not transfered to GPU." << ::std::endl << ">> Test skipped." << ::std::endl; SUCCEED(); return; } DataType type; type = ( typeid(T1) == typeid(cl_float))? TYPE_FLOAT : ( typeid(T1) == typeid(cl_double))? TYPE_DOUBLE: ( typeid(T1) == typeid(cl_float2))? TYPE_COMPLEX_FLOAT:TYPE_COMPLEX_DOUBLE; err = (cl_int)::clMath::clblas::nrm2( type, params->N, bufNRM2, params->offa, bufX, params->offBX, params->incx, scratchBuff, params->numCommandQueues, base->commandQueues(), 0, NULL, events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "::clMath::clblas::NRM2() failed"; } err = waitForSuccessfulFinish(params->numCommandQueues, base->commandQueues(), events); if (err != CL_SUCCESS) { releaseMemObjects(bufX, bufNRM2, scratchBuff); deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; ASSERT_EQ(CL_SUCCESS, err) << "waitForSuccessfulFinish()"; } err = clEnqueueReadBuffer(base->commandQueues()[0], bufNRM2, CL_TRUE, 0, (1 + params->offa) * sizeof(*clblasNRM2), clblasNRM2, 0, NULL, NULL); if (err != CL_SUCCESS) { ::std::cerr << "NRM2: Reading results failed...." << std::endl; } releaseMemObjects(bufX, bufNRM2, scratchBuff); deltaForType = DELTA_0<T1>(); // Since every element of X encounters a division, delta would be sum of deltas for every element in X cl_double delta = 0; for(unsigned int i=0; i<(params->N); i++) { delta += deltaForType * returnMax<T1>(blasX[params->offBX + i]); } compareValues<T2>( (blasNRM2), (clblasNRM2+params->offa), delta); if (::testing::Test::HasFailure()) { printTestParams(params->N, params->offBX, params->incx); ::std::cerr << "offNRM2 = " << params->offa << ::std::endl; ::std::cerr << "queues = " << params->numCommandQueues << ::std::endl; } deleteBuffers<T1>(blasX); deleteBuffers<T2>(blasNRM2, clblasNRM2); delete[] events; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * lda * sizeof(cl_float), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * sizeof(cl_float), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasStbsv(order, uplo, trans, diag, N, K, bufA, 0, lda, bufX, 0, incx, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasStbsv() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float), X, 0, NULL, NULL); /* At this point you will get the result of STBSV placed in X array. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
T profileReduce(ReduceType datatype, cl_int n, int numThreads, int numBlocks, int maxThreads, int maxBlocks, int whichKernel, int testIterations, bool cpuFinalReduction, int cpuFinalThreshold, double* dTotalTime, T* h_odata, cl_mem d_idata, cl_mem d_odata) { T gpu_result = 0; bool needReadBack = true; cl_kernel finalReductionKernel[10]; int finalReductionIterations=0; //shrLog("Profile Kernel %d\n", whichKernel); cl_kernel reductionKernel = getReductionKernel(datatype, whichKernel, numThreads, isPow2(n) ); clSetKernelArg(reductionKernel, 0, sizeof(cl_mem), (void *) &d_idata); clSetKernelArg(reductionKernel, 1, sizeof(cl_mem), (void *) &d_odata); clSetKernelArg(reductionKernel, 2, sizeof(cl_int), &n); clSetKernelArg(reductionKernel, 3, sizeof(T) * numThreads, NULL); if( !cpuFinalReduction ) { int s=numBlocks; int threads = 0, blocks = 0; int kernel = (whichKernel == 6) ? 5 : whichKernel; while(s > cpuFinalThreshold) { getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); finalReductionKernel[finalReductionIterations] = getReductionKernel(datatype, kernel, threads, isPow2(s) ); clSetKernelArg(finalReductionKernel[finalReductionIterations], 0, sizeof(cl_mem), (void *) &d_odata); clSetKernelArg(finalReductionKernel[finalReductionIterations], 1, sizeof(cl_mem), (void *) &d_odata); clSetKernelArg(finalReductionKernel[finalReductionIterations], 2, sizeof(cl_int), &n); clSetKernelArg(finalReductionKernel[finalReductionIterations], 3, sizeof(T) * numThreads, NULL); if (kernel < 3) s = (s + threads - 1) / threads; else s = (s + (threads*2-1)) / (threads*2); finalReductionIterations++; } } size_t globalWorkSize[1]; size_t localWorkSize[1]; for (int i = 0; i < testIterations; ++i) { gpu_result = 0; clFinish(cqCommandQueue); if(i>0) shrDeltaT(1); // execute the kernel globalWorkSize[0] = numBlocks * numThreads; localWorkSize[0] = numThreads; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue,reductionKernel, 1, 0, globalWorkSize, localWorkSize, 0, NULL, NULL); // check if kernel execution generated an error oclCheckError(ciErrNum, CL_SUCCESS); if (cpuFinalReduction) { // sum partial sums from each block on CPU // copy result from device to host clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, numBlocks * sizeof(T), h_odata, 0, NULL, NULL); for(int i=0; i<numBlocks; i++) { gpu_result += h_odata[i]; } needReadBack = false; } else { // sum partial block sums on GPU int s=numBlocks; int kernel = (whichKernel == 6) ? 5 : whichKernel; int it = 0; while(s > cpuFinalThreshold) { int threads = 0, blocks = 0; getNumBlocksAndThreads(kernel, s, maxBlocks, maxThreads, blocks, threads); globalWorkSize[0] = threads * blocks; localWorkSize[0] = threads; ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, finalReductionKernel[it], 1, 0, globalWorkSize, localWorkSize, 0, NULL, NULL); oclCheckError(ciErrNum, CL_SUCCESS); if (kernel < 3) s = (s + threads - 1) / threads; else s = (s + (threads*2-1)) / (threads*2); it++; } if (s > 1) { // copy result from device to host clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, s * sizeof(T), h_odata, 0, NULL, NULL); for(int i=0; i < s; i++) { gpu_result += h_odata[i]; } needReadBack = false; } } clFinish(cqCommandQueue); if(i>0) *dTotalTime += shrDeltaT(1); } if (needReadBack) { // copy final sum from device to host clEnqueueReadBuffer(cqCommandQueue, d_odata, CL_TRUE, 0, sizeof(T), &gpu_result, 0, NULL, NULL); } // Release the kernels clReleaseKernel(reductionKernel); if( !cpuFinalReduction ) { for(int it=0; it<finalReductionIterations; ++it) { clReleaseKernel(finalReductionKernel[it]); } } return gpu_result; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'remainder_uchar8uchar8.cl' */ source_code = read_buffer("remainder_uchar8uchar8.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "remainder_uchar8uchar8", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_uchar8 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_uchar8)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_uchar8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uchar8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uchar8), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_uchar8 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_uchar8)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_uchar8){{2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_uchar8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_uchar8), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_uchar8 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_uchar8)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_uchar8)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_uchar8), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_uchar8), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_uchar8)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main(void) { cl_int err; cl_platform_id platforms[MAX_PLATFORMS]; cl_uint nplatforms; cl_device_id devices[MAX_DEVICES]; cl_uint ndevices; cl_uint i, j; err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &nplatforms); if (err != CL_SUCCESS) return EXIT_FAILURE; for (i = 0; i < nplatforms; i++) { err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, MAX_DEVICES, devices, &ndevices); if (err != CL_SUCCESS) return EXIT_FAILURE; for (j = 0; j < ndevices; j++) { cl_context context = clCreateContext(NULL, 1, &devices[j], NULL, NULL, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; cl_command_queue queue = clCreateCommandQueue(context, devices[j], 0, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; const int buf_size = 1024; cl_int host_buf[buf_size]; cl_mem buf = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int) * buf_size, NULL, &err); if (err != CL_SUCCESS) return EXIT_FAILURE; cl_event buf_event; if (clEnqueueReadBuffer(queue, buf, CL_TRUE, 0, sizeof(cl_int) * buf_size, &host_buf, 0, NULL, &buf_event) != CL_SUCCESS) return EXIT_FAILURE; clFinish(queue); cl_command_queue event_command_queue; size_t param_val_size_ret; if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_QUEUE, sizeof(cl_command_queue), &event_command_queue, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_command_queue) || event_command_queue != queue) return EXIT_FAILURE; cl_command_type command_type; if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_TYPE, sizeof(cl_command_type), &command_type, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_command_type) || command_type != CL_COMMAND_READ_BUFFER) return EXIT_FAILURE; cl_int execution_status; if (clGetEventInfo(buf_event, CL_EVENT_COMMAND_EXECUTION_STATUS, sizeof(cl_int), &execution_status, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_int) || execution_status != CL_COMPLETE) return EXIT_FAILURE; cl_uint ref_count; if (clGetEventInfo(buf_event, CL_EVENT_REFERENCE_COUNT, sizeof(cl_uint), &ref_count, ¶m_val_size_ret) != CL_SUCCESS) return EXIT_FAILURE; if (param_val_size_ret != sizeof(cl_uint) || ref_count != 1) { printf("FAIL: expected refcount 1, got %d\n", ref_count); return EXIT_FAILURE; } clReleaseEvent(buf_event); clReleaseMemObject(buf); clReleaseCommandQueue(queue); } } return EXIT_SUCCESS; }