int main(int argc, char **argv) { if(argc < 2) { usage(); return -1; } //init the filter array float filter[49] = {-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 49, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1}; //operate the params of cmd const char* inputFileName; const char* outputFileName; inputFileName = (argv[1]); outputFileName = (argv[2]); //the image height and width int imageHeight, imageWidth; int filterWidth = 7; //read the bmp image to the memory float* inputImage = readBmpImage(inputFileName, &imageWidth, &imageHeight); //to check the read is succ printf("the width of the image is %d, the height of the image is %d\n", imageWidth, imageHeight); //calculate the datasize int dataSize = imageHeight * imageWidth * sizeof(float); int filterSize = sizeof(float) * filterWidth * filterWidth; //output image float *outputImage = NULL; outputImage = (float*)malloc(dataSize); //set up the OpenCL environment cl_int status; //Discovery platform cl_platform_id platforms[2]; cl_platform_id platform; status = clGetPlatformIDs(2, platforms, NULL); check(status, "clGetPlatformIDs"); platform = platforms[PLATFORM_TO_USE]; //Discovery device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); check(status, "clGetDeviceIDs"); //create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, &status); check(status, "clCreateContext"); //create command queue cl_command_queue queue; queue = clCreateCommandQueue(context, device, 0, &status); check(status, "clCreateCommandQueue"); //create the input and output buffers cl_mem d_input, d_output, d_filter; d_input = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &status); check(status, "clCreateBuffer"); d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, filterSize, NULL, &status); check(status, "clCreateBuffer"); // Copy the input image to the device d_output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &status); check(status, "clCreateBuffer"); status = clEnqueueWriteBuffer(queue, d_input, CL_TRUE, 0, dataSize, inputImage, 0, NULL, NULL); check(status, "clEnqueueWriteBuffer"); status = clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, filterSize, filter, 0, NULL, NULL); check(status, "clEnqueueWriteBuffer"); const char* source = readSource(kernelPath); //create a program object with source and build it cl_program program; program = clCreateProgramWithSource(context, 1, &source, NULL, NULL); check(status, "clCreateProgramWithSource"); status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); size_t log_size; char *program_log; if(status < 0) { clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*)malloc(log_size + 1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size + 1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } check(status, "clBuildProgram"); //create the kernel object cl_kernel kernel; kernel = clCreateKernel(program, "sharpen", &status); check(status, "clCreateKernel"); //set the kernel arguments status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_output); status |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_input); status |= clSetKernelArg(kernel, 2, sizeof(int), &imageWidth); status |= clSetKernelArg(kernel, 3, sizeof(int), &imageHeight); status |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_filter); status |= clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); check(status, "clSetKernelArg"); // Set the work item dimensions size_t globalSize[2] = {imageWidth, imageHeight}; status = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, NULL, 0, NULL, NULL); check(status, "clEnqueueNDRange"); // Read the image back to the host status = clEnqueueReadBuffer(queue, d_output, CL_TRUE, 0, dataSize, outputImage, 0, NULL, NULL); check(status, "clEnqueueReadBuffer"); // Write the output image to file storeBmpImage(outputImage, outputFileName, imageHeight, imageWidth, inputFileName); //free opencl resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseMemObject(d_input); clReleaseMemObject(d_output); clReleaseMemObject(d_filter); clReleaseContext(context); //free host resources free(inputImage); free(outputImage); }
int main(int argc, char **argv){ printf("WG size of kernel = %d \n", BLOCK_SIZE); int max_rows, max_cols, penalty; char * tempchar; // the lengths of the two sequences should be able to divided by 16. // And at current stage max_rows needs to equal max_cols if (argc == 4) { max_rows = atoi(argv[1]); max_cols = atoi(argv[1]); penalty = atoi(argv[2]); tempchar = argv[3]; } else{ usage(argc, argv); } if(atoi(argv[1])%16!=0){ fprintf(stderr,"The dimension values must be a multiple of 16\n"); exit(1); } max_rows = max_rows + 1; max_cols = max_cols + 1; int *reference; int *input_itemsets; int *output_itemsets; reference = (int *)malloc( max_rows * max_cols * sizeof(int) ); input_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); output_itemsets = (int *)malloc( max_rows * max_cols * sizeof(int) ); srand(7); //initialization for (int i = 0 ; i < max_cols; i++){ for (int j = 0 ; j < max_rows; j++){ input_itemsets[i*max_cols+j] = 0; } } for( int i=1; i< max_rows ; i++){ //initialize the cols input_itemsets[i*max_cols] = rand() % 10 + 1; } for( int j=1; j< max_cols ; j++){ //initialize the rows input_itemsets[j] = rand() % 10 + 1; } for (int i = 1 ; i < max_cols; i++){ for (int j = 1 ; j < max_rows; j++){ reference[i*max_cols+j] = blosum62[input_itemsets[i*max_cols]][input_itemsets[j]]; } } for( int i = 1; i< max_rows ; i++) input_itemsets[i*max_cols] = -i * penalty; for( int j = 1; j< max_cols ; j++) input_itemsets[j] = -j * penalty; int sourcesize = 1024*1024; char * source = (char *)calloc(sourcesize, sizeof(char)); if(!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * kernel_nw1 = "nw_kernel1"; char * kernel_nw2 = "nw_kernel2"; FILE * fp = fopen(tempchar, "rb"); if(!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); int nworkitems, workgroupsize = 0; nworkitems = BLOCK_SIZE; if(nworkitems < 1 || workgroupsize < 0){ printf("ERROR: invalid or missing <num_work_items>[/<work_group_size>]\n"); return -1; } // set global and local workitems size_t local_work[3] = { (workgroupsize>0)?workgroupsize:1, 1, 1 }; size_t global_work[3] = { nworkitems, 1, 1 }; //nworkitems = no. of GPU threads int use_gpu = 1; // OpenCL initialization if(initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = { source, 0 }; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } char clOptions[110]; // sprintf(clOptions,"-I../../src"); sprintf(clOptions," "); #ifdef BLOCK_SIZE sprintf(clOptions + strlen(clOptions), " -DBLOCK_SIZE=%d", BLOCK_SIZE); #endif err = DIVIDEND_CL_WRAP(clBuildProgram)(prog, 0, NULL, clOptions, NULL, NULL); /*{ // show warnings/errors static char log[65536]; memset(log, 0, sizeof(log)); cl_device_id device_id = 0; err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof(device_id), &device_id, NULL); clGetProgramBuildInfo(prog, device_id, CL_PROGRAM_BUILD_LOG, sizeof(log)-1, log, NULL); if(err || strstr(log,"warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); }*/ if(err != CL_SUCCESS) { printf("ERROR: clBuildProgram() => %d\n", err); return -1; } cl_kernel kernel1; cl_kernel kernel2; kernel1 = clCreateKernel(prog, kernel_nw1, &err); kernel2 = clCreateKernel(prog, kernel_nw2, &err); if(err != CL_SUCCESS) { printf("ERROR: clCreateKernel() 0 => %d\n", err); return -1; } clReleaseProgram(prog); // creat buffers cl_mem input_itemsets_d; cl_mem output_itemsets_d; cl_mem reference_d; input_itemsets_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer input_item_set (size:%d) => %d\n", max_cols * max_rows, err); return -1;} reference_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer reference (size:%d) => %d\n", max_cols * max_rows, err); return -1;} output_itemsets_d = clCreateBuffer(context, CL_MEM_READ_WRITE, max_cols * max_rows * sizeof(int), NULL, &err ); if(err != CL_SUCCESS) { printf("ERROR: clCreateBuffer output_item_set (size:%d) => %d\n", max_cols * max_rows, err); return -1;} //write buffers err = clEnqueueWriteBuffer(cmd_queue, input_itemsets_d, 1, 0, max_cols * max_rows * sizeof(int), input_itemsets, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer bufIn1 (size:%d) => %d\n", max_cols * max_rows, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, reference_d, 1, 0, max_cols * max_rows * sizeof(int), reference, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer bufIn2 (size:%d) => %d\n", max_cols * max_rows, err); return -1; } int worksize = max_cols - 1; printf("worksize = %d\n", worksize); //these two parameters are for extension use, don't worry about it. int offset_r = 0, offset_c = 0; int block_width = worksize/BLOCK_SIZE ; clSetKernelArg(kernel1, 0, sizeof(void *), (void*) &reference_d); clSetKernelArg(kernel1, 1, sizeof(void *), (void*) &input_itemsets_d); clSetKernelArg(kernel1, 2, sizeof(void *), (void*) &output_itemsets_d); clSetKernelArg(kernel1, 3, sizeof(cl_int) * (BLOCK_SIZE + 1) *(BLOCK_SIZE+1), (void*)NULL ); clSetKernelArg(kernel1, 4, sizeof(cl_int) * BLOCK_SIZE * BLOCK_SIZE, (void*)NULL ); clSetKernelArg(kernel1, 5, sizeof(cl_int), (void*) &max_cols); clSetKernelArg(kernel1, 6, sizeof(cl_int), (void*) &penalty); clSetKernelArg(kernel1, 8, sizeof(cl_int), (void*) &block_width); clSetKernelArg(kernel1, 9, sizeof(cl_int), (void*) &worksize); clSetKernelArg(kernel1, 10, sizeof(cl_int), (void*) &offset_r); clSetKernelArg(kernel1, 11, sizeof(cl_int), (void*) &offset_c); clSetKernelArg(kernel2, 0, sizeof(void *), (void*) &reference_d); clSetKernelArg(kernel2, 1, sizeof(void *), (void*) &input_itemsets_d); clSetKernelArg(kernel2, 2, sizeof(void *), (void*) &output_itemsets_d); clSetKernelArg(kernel2, 3, sizeof(cl_int) * (BLOCK_SIZE + 1) *(BLOCK_SIZE+1), (void*)NULL ); clSetKernelArg(kernel2, 4, sizeof(cl_int) * BLOCK_SIZE *BLOCK_SIZE, (void*)NULL ); clSetKernelArg(kernel2, 5, sizeof(cl_int), (void*) &max_cols); clSetKernelArg(kernel2, 6, sizeof(cl_int), (void*) &penalty); clSetKernelArg(kernel2, 8, sizeof(cl_int), (void*) &block_width); clSetKernelArg(kernel2, 9, sizeof(cl_int), (void*) &worksize); clSetKernelArg(kernel2, 10, sizeof(cl_int), (void*) &offset_r); clSetKernelArg(kernel2, 11, sizeof(cl_int), (void*) &offset_c); printf("Processing upper-left matrix\n"); for( int blk = 1 ; blk <= worksize/BLOCK_SIZE ; blk++){ global_work[0] = BLOCK_SIZE * blk; local_work[0] = BLOCK_SIZE; clSetKernelArg(kernel1, 7, sizeof(cl_int), (void*) &blk); #pragma dividend local_work_group_size local_work dim 2 dim1(2:1024:2:32) dim2(1:1:2:1) //This lws will be used to profile the OpenCL kernel with id 1 size_t _dividend_lws_local_work_k1[3]; { _dividend_lws_local_work_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL); _dividend_lws_local_work_k1[1] = getLWSValue("DIVIDEND_LWS1_D1",DIVIDEND_LWS1_D1_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_local_work_k1[2] = 1; } err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel1, 2, NULL, global_work, _dividend_lws_local_work_k1, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 1 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } } printf("BLOCK_SIZE:%d\n", BLOCK_SIZE); printf("Processing lower-right matrix\n"); for( int blk = worksize/BLOCK_SIZE - 1 ; blk >= 1 ; blk--){ global_work[0] = BLOCK_SIZE * blk; local_work[0] = BLOCK_SIZE; clSetKernelArg(kernel2, 7, sizeof(cl_int), (void*) &blk); #pragma dividend local_work_group_size local_work dim 2 dim1(2:1024:2:32) dim2(1:1:2:1) //This lws will be used to profile the OpenCL kernel with id 2 size_t _dividend_lws_local_work_k2[3]; { _dividend_lws_local_work_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL); _dividend_lws_local_work_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_local_work_k2[2] = 2; } err = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)(cmd_queue, kernel2, 2, NULL, global_work, _dividend_lws_local_work_k2, 0, 0, 0); if(err != CL_SUCCESS) { printf("ERROR: 2 clEnqueueNDRangeKernel()=>%d failed\n", err); return -1; } } // Lingjie Zhang modified at Nov 1, 2015 // clFinish(cmd_queue); // fflush(stdout); //end Lingjie Zhang modification //DIVIDEND_CL_WRAP(clFinish)(cmd_queue); err = clEnqueueReadBuffer(cmd_queue, input_itemsets_d, 1, 0, max_cols * max_rows * sizeof(int), output_itemsets, 0, 0, 0); DIVIDEND_CL_WRAP(clFinish)(cmd_queue); //#define TRACEBACK #ifdef TRACEBACK FILE *fpo = fopen("result.txt","w"); fprintf(fpo, "print traceback value GPU:\n"); for (int i = max_rows - 2, j = max_rows - 2; i>=0, j>=0;){ int nw, n, w, traceback; if ( i == max_rows - 2 && j == max_rows - 2 ) fprintf(fpo, "%d ", output_itemsets[ i * max_cols + j]); //print the first element if ( i == 0 && j == 0 ) break; if ( i > 0 && j > 0 ){ nw = output_itemsets[(i - 1) * max_cols + j - 1]; w = output_itemsets[ i * max_cols + j - 1 ]; n = output_itemsets[(i - 1) * max_cols + j]; } else if ( i == 0 ){ nw = n = LIMIT; w = output_itemsets[ i * max_cols + j - 1 ]; } else if ( j == 0 ){ nw = w = LIMIT; n = output_itemsets[(i - 1) * max_cols + j]; } else{ } //traceback = maximum(nw, w, n); int new_nw, new_w, new_n; new_nw = nw + reference[i * max_cols + j]; new_w = w - penalty; new_n = n - penalty; traceback = maximum(new_nw, new_w, new_n); if(traceback == new_nw) traceback = nw; if(traceback == new_w) traceback = w; if(traceback == new_n) traceback = n; fprintf(fpo, "%d ", traceback); if(traceback == nw ) {i--; j--; continue;} else if(traceback == w ) {j--; continue;} else if(traceback == n ) {i--; continue;} else ; } fclose(fpo); #endif printf("Computation Done\n"); // OpenCL shutdown if(shutdown()) return -1; clReleaseMemObject(input_itemsets_d); clReleaseMemObject(output_itemsets_d); clReleaseMemObject(reference_d); free(reference); free(input_itemsets); free(output_itemsets); }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'abs_ulong4.cl' */ source_code = read_buffer("abs_ulong4.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "abs_ulong4", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_ulong4 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_ulong4)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_ulong4){{2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_ulong4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_ulong4), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_ulong4 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_ulong4)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_ulong4)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_ulong4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_ulong4), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_ulong4)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
static bool opencl_thread_init(struct thr_info *thr) { const int thr_id = thr->id; struct cgpu_info *gpu = thr->cgpu; struct opencl_thread_data *thrdata; _clState *clState = clStates[thr_id]; cl_int status = 0; thrdata = calloc(1, sizeof(*thrdata)); thr->cgpu_data = thrdata; int buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE; if (opt_neoscrypt) { buffersize = opt_neoscrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE; } if (!thrdata) { applog(LOG_ERR, "Failed to calloc in opencl_thread_init"); return false; } switch (clState->chosen_kernel) { case KL_POCLBM: thrdata->queue_kernel_parameters = &queue_poclbm_kernel; break; case KL_PHATK: thrdata->queue_kernel_parameters = &queue_phatk_kernel; break; case KL_DIAKGCN: thrdata->queue_kernel_parameters = &queue_diakgcn_kernel; break; #ifdef USE_SCRYPT case KL_SCRYPT: thrdata->queue_kernel_parameters = &queue_scrypt_kernel; break; #endif #ifdef USE_NEOSCRYPT case KL_NEOSCRYPT: thrdata->queue_kernel_parameters = &queue_neoscrypt_kernel; break; #endif #ifdef USE_KECCAK case KL_KECCAK: thrdata->queue_kernel_parameters = &queue_keccak_kernel; break; #endif default: case KL_DIABLO: thrdata->queue_kernel_parameters = &queue_diablo_kernel; break; } thrdata->res = calloc(buffersize, 1); if (!thrdata->res) { free(thrdata); applog(LOG_ERR, "Failed to calloc in opencl_thread_init"); return false; } status |= clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_TRUE, 0, buffersize, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); return false; } gpu->status = LIFE_WELL; gpu->device_last_well = time(NULL); return true; }
int main(int argc, char *argv[]) { std::string vvadd_kernel_str; /* Provide names of the OpenCL kernels * and cl file that they're kept in */ std::string vvadd_name_str = std::string("vvadd"); std::string vvadd_kernel_file = std::string("vvadd.cl"); cl_vars_t cv; cl_kernel vvadd; /* Read OpenCL file into STL string */ readFile(vvadd_kernel_file, vvadd_kernel_str); /* Initialize the OpenCL runtime * Source in clhelp.cpp */ initialize_ocl(cv); /* Compile all OpenCL kernels */ compile_ocl_program(vvadd, cv, vvadd_kernel_str.c_str(), vvadd_name_str.c_str()); /* Arrays on the host (CPU) */ float *h_A, *h_B, *h_Y; /* Arrays on the device (GPU) */ cl_mem g_A, g_B, g_Y; /* Allocate arrays on the host * and fill with random data */ int n = (1<<20); h_A = new float[n]; h_B = new float[n]; h_Y = new float[n]; bzero(h_Y, sizeof(float)*n); for(int i = 0; i < n; i++) { h_A[i] = (float)drand48(); h_B[i] = (float)drand48(); } /* CS194: Allocate memory for arrays on * the GPU */ cl_int err = CL_SUCCESS; /* CS194: Here's something to get you started */ // creates memory on the device to hold the A and B source arrays, plus the results array Y. g_Y = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); g_A = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); g_B = clCreateBuffer(cv.context,CL_MEM_READ_WRITE,sizeof(float)*n,NULL,&err); CHK_ERR(err); /* CS194: Copy data from host CPU to GPU */ // copies the host array A and B to the device. err = clEnqueueWriteBuffer(cv.commands, g_A, true, 0, sizeof(float)*n, h_A, 0, NULL, NULL); CHK_ERR(err); err = clEnqueueWriteBuffer(cv.commands, g_B, true, 0, sizeof(float)*n, h_B, 0, NULL, NULL); CHK_ERR(err); /* CS194: Define the global and local workgroup sizes */ size_t global_work_size[1] = {n}; size_t local_work_size[1] = {128}; /* CS194: Set Kernel Arguments */ err = clSetKernelArg(vvadd, 0, sizeof(cl_mem), &g_Y); CHK_ERR(err); err = clSetKernelArg(vvadd, 1, sizeof(cl_mem), &g_A); CHK_ERR(err); err = clSetKernelArg(vvadd, 2, sizeof(cl_mem), &g_B); CHK_ERR(err); err = clSetKernelArg(vvadd, 3, sizeof(int), &n); CHK_ERR(err); /* CS194: Call kernel on the GPU */ err = clEnqueueNDRangeKernel(cv.commands, vvadd, 1,//work_dim, NULL, //global_work_offset global_work_size, //global_work_size local_work_size, //local_work_size 0, //num_events_in_wait_list NULL, //event_wait_list NULL // ); CHK_ERR(err); /* Read result of GPU on host CPU */ // copies the result array Y from the device back to the host Y. err = clEnqueueReadBuffer(cv.commands, g_Y, true, 0, sizeof(float)*n, h_Y, 0, NULL, NULL); CHK_ERR(err); /* Check answer */ for(int i = 0; i < n; i++) { float d = h_A[i] + h_B[i]; if(h_Y[i] != d) { printf("error at %d :(\n", i); break; } } /* Shut down the OpenCL runtime */ uninitialize_ocl(cv); delete [] h_A; delete [] h_B; delete [] h_Y; // frees memory allocated on device clReleaseMemObject(g_A); clReleaseMemObject(g_B); clReleaseMemObject(g_Y); return 0; }
int main(int argc, char* argv[]) { int ciErrNum = 0; printf("press a key to start\n"); getchar(); const char* vendorSDK = btOpenCLUtils::getSdkVendorName(); printf("This program was compiled using the %s OpenCL SDK\n",vendorSDK); cl_device_type deviceType = CL_DEVICE_TYPE_GPU;//CL_DEVICE_TYPE_ALL void* glCtx=0; void* glDC = 0; printf("Initialize OpenCL using btOpenCLUtils::createContextFromType for CL_DEVICE_TYPE_GPU\n"); g_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, glCtx, glDC); oclCHECKERROR(ciErrNum, CL_SUCCESS); int numDev = btOpenCLUtils::getNumDevices(g_cxMainContext); if (numDev>0) { int deviceIndex=0; cl_device_id device; device = btOpenCLUtils::getDevice(g_cxMainContext,deviceIndex); btOpenCLDeviceInfo clInfo; btOpenCLUtils::getDeviceInfo(device,clInfo); btOpenCLUtils::printDeviceInfo(device); const char* globalAtomicsKernelStringPatched = globalAtomicsKernelString; if (!strstr(clInfo.m_deviceExtensions,"cl_ext_atomic_counters_32")) { globalAtomicsKernelStringPatched = findAndReplace(globalAtomicsKernelString,"counter32_t", "volatile __global int*"); } // create a command-queue g_cqCommandQue = clCreateCommandQueue(g_cxMainContext, device, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); cl_mem counterBuffer = clCreateBuffer(g_cxMainContext, CL_MEM_READ_WRITE, sizeof(int), NULL, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); char* kernelMethods[] = { "globalAtomicKernelOpenCL1_1", "counterAtomicKernelExt", "globalAtomicKernelExt", "globalAtomicKernelCounters32Broken" }; int numKernelMethods = sizeof(kernelMethods)/sizeof(char*); for (int i=0;i<numKernelMethods;i++) { int myCounter = 0; //write to counterBuffer int deviceOffset=0; int hostOffset=0; ciErrNum = clEnqueueWriteBuffer(g_cqCommandQue, counterBuffer,CL_FALSE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); g_atomicsKernel = btOpenCLUtils::compileCLKernelFromString(g_cxMainContext,device,globalAtomicsKernelStringPatched,kernelMethods[i], &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); ciErrNum = clSetKernelArg(g_atomicsKernel, 0, sizeof(cl_mem),(void*)&counterBuffer); oclCHECKERROR(ciErrNum, CL_SUCCESS); size_t numWorkItems = workGroupSize*((NUM_OBJECTS + (workGroupSize-1)) / workGroupSize); ciErrNum = clEnqueueNDRangeKernel(g_cqCommandQue, g_atomicsKernel, 1, NULL, &numWorkItems, &workGroupSize,0 ,0 ,0); oclCHECKERROR(ciErrNum, CL_SUCCESS); clFinish(g_cqCommandQue); oclCHECKERROR(ciErrNum, CL_SUCCESS); //read from counterBuffer ciErrNum = clEnqueueReadBuffer(g_cqCommandQue, counterBuffer, CL_TRUE, deviceOffset, sizeof(int), &myCounter, 0, NULL, NULL); oclCHECKERROR(ciErrNum, CL_SUCCESS); if (myCounter != NUM_OBJECTS) { printf("%s is broken, expected %d got %d\n",kernelMethods[i],NUM_OBJECTS,myCounter); } else { printf("%s success, got %d\n",kernelMethods[i],myCounter); } } clReleaseCommandQueue(g_cqCommandQue); oclCHECKERROR(ciErrNum, CL_SUCCESS); } clReleaseContext(g_cxMainContext); printf("press a key to end\n"); getchar(); return 0; }
void cl_launch_kernel() { double t_start, t_end; int m = M; int n = N; DATA_TYPE float_n = FLOAT_N; DATA_TYPE eps = EPS; DATA_TYPE val = 1.0; size_t localWorkSize_Kernel1[2], globalWorkSize_Kernel1[2]; size_t localWorkSize_Kernel2[2], globalWorkSize_Kernel2[2]; size_t localWorkSize_Kernel3[2], globalWorkSize_Kernel3[2]; size_t localWorkSize_Kernel4[2], globalWorkSize_Kernel4[2]; localWorkSize_Kernel1[0] = LWS_KERNEL_1_X; localWorkSize_Kernel1[1] = LWS_KERNEL_1_Y; globalWorkSize_Kernel1[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_1_X)) * LWS_KERNEL_1_X; globalWorkSize_Kernel1[1] = 1; localWorkSize_Kernel2[0] = LWS_KERNEL_2_X; localWorkSize_Kernel2[1] = LWS_KERNEL_2_Y; globalWorkSize_Kernel2[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_2_X)) * LWS_KERNEL_2_X; globalWorkSize_Kernel2[1] = 1; localWorkSize_Kernel3[0] = LWS_KERNEL_3_X; localWorkSize_Kernel3[1] = LWS_KERNEL_3_Y; globalWorkSize_Kernel3[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_3_X)) * LWS_KERNEL_3_X; globalWorkSize_Kernel3[1] = (size_t)ceil(((float)N) / ((float)LWS_KERNEL_3_Y)) * LWS_KERNEL_3_Y; localWorkSize_Kernel4[0] = LWS_KERNEL_4_X; localWorkSize_Kernel4[1] = LWS_KERNEL_4_Y; globalWorkSize_Kernel4[0] = (size_t)ceil(((float)M) / ((float)LWS_KERNEL_4_X)) * LWS_KERNEL_4_X; globalWorkSize_Kernel4[1] = 1; // t_start = rtclock(); // Set the arguments of the kernel err_code = clSetKernelArg(clKernel_mean, 0, sizeof(cl_mem), (void *)&mean_mem_obj); err_code |= clSetKernelArg(clKernel_mean, 1, sizeof(cl_mem), (void *)&data_mem_obj); err_code |= clSetKernelArg(clKernel_mean, 2, sizeof(DATA_TYPE), (void *)&float_n); err_code |= clSetKernelArg(clKernel_mean, 3, sizeof(int), (void *)&m); err_code |= clSetKernelArg(clKernel_mean, 4, sizeof(int), (void *)&n); if(err_code != CL_SUCCESS) { printf("Error in seting arguments1\n"); exit(1); } // Execute the OpenCL kernel err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_mean, 1, NULL, globalWorkSize_Kernel1, localWorkSize_Kernel1, 0, NULL, NULL); if(err_code != CL_SUCCESS) { printf("Error in launching kernel1\n"); exit(1); } clEnqueueBarrier(clCommandQue); // Set the arguments of the kernel err_code = clSetKernelArg(clKernel_std, 0, sizeof(cl_mem), (void *)&mean_mem_obj); err_code = clSetKernelArg(clKernel_std, 1, sizeof(cl_mem), (void *)&stddev_mem_obj); err_code |= clSetKernelArg(clKernel_std, 2, sizeof(cl_mem), (void *)&data_mem_obj); err_code |= clSetKernelArg(clKernel_std, 3, sizeof(DATA_TYPE), (void *)&float_n); err_code |= clSetKernelArg(clKernel_std, 4, sizeof(DATA_TYPE), (void *)&eps); err_code |= clSetKernelArg(clKernel_std, 5, sizeof(int), (void *)&m); err_code |= clSetKernelArg(clKernel_std, 6, sizeof(int), (void *)&n); if(err_code != CL_SUCCESS) { printf("Error in seting arguments2\n"); exit(1); } // Execute the OpenCL kernel err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_std, 1, NULL, globalWorkSize_Kernel2, localWorkSize_Kernel2, 0, NULL, NULL); if(err_code != CL_SUCCESS) { printf("Error in launching kernel2\n"); exit(1); } clEnqueueBarrier(clCommandQue); // Set the arguments of the kernel err_code = clSetKernelArg(clKernel_reduce, 0, sizeof(cl_mem), (void *)&mean_mem_obj); err_code = clSetKernelArg(clKernel_reduce, 1, sizeof(cl_mem), (void *)&stddev_mem_obj); err_code |= clSetKernelArg(clKernel_reduce, 2, sizeof(cl_mem), (void *)&data_mem_obj); err_code |= clSetKernelArg(clKernel_reduce, 3, sizeof(DATA_TYPE), (void *)&float_n); err_code |= clSetKernelArg(clKernel_reduce, 4, sizeof(int), (void *)&m); err_code |= clSetKernelArg(clKernel_reduce, 5, sizeof(int), (void *)&n); if(err_code != CL_SUCCESS) { printf("Error in seting arguments3\n"); exit(1); } // Execute the OpenCL kernel err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_reduce, 2, NULL, globalWorkSize_Kernel3, localWorkSize_Kernel3, 0, NULL, NULL); if(err_code != CL_SUCCESS) { printf("Error in launching kernel3\n"); exit(1); } clEnqueueBarrier(clCommandQue); // Set the arguments of the kernel err_code = clSetKernelArg(clKernel_corr, 0, sizeof(cl_mem), (void *)&symmat_mem_obj); err_code |= clSetKernelArg(clKernel_corr, 1, sizeof(cl_mem), (void *)&data_mem_obj); err_code |= clSetKernelArg(clKernel_corr, 2, sizeof(int), (void *)&m); err_code |= clSetKernelArg(clKernel_corr, 3, sizeof(int), (void *)&n); if(err_code != CL_SUCCESS) { printf("Error in seting arguments4\n"); exit(1); } // Execute the OpenCL kernel err_code = clEnqueueNDRangeKernel(clCommandQue, clKernel_corr, 1, NULL, globalWorkSize_Kernel4, localWorkSize_Kernel4, 0, NULL, NULL); if(err_code != CL_SUCCESS) { printf("Error in launching kernel4\n"); exit(1); } clEnqueueBarrier(clCommandQue); clEnqueueWriteBuffer(clCommandQue, symmat_mem_obj, CL_TRUE, ((M)*(M+1) + (M))*sizeof(DATA_TYPE), sizeof(DATA_TYPE), &val, 0, NULL, NULL); clFinish(clCommandQue); // t_end = rtclock(); // fprintf(stdout, "GPU Runtime: %0.6lfs\n", t_end - t_start); }
compute::buffer cape::fighter_to_fixed_vec(vec3f p1, vec3f p2, vec3f p3, vec3f rot) { vec3f rotation = rot; vec3f diff = p3 - p1; float shrink = 0.12f; diff = diff * shrink; p3 = p3 - diff; p1 = p1 + diff; vec3f lpos = p1; vec3f rpos = p3; ///approximation ///could also use body scaling float ldepth = (p3 - p1).length() / 3.f; float rdepth = ldepth; ///we should move perpendicularly away, not zdistance away vec2f ldir = {p3.v[0], p3.v[2]}; ldir = ldir - (vec2f){p1.v[0], p1.v[2]}; vec2f perp = perpendicular(ldir.norm()); vec3f perp3 = {perp.v[0], 0.f, perp.v[1]}; lpos = lpos + perp3 * ldepth; rpos = rpos + perp3 * ldepth; lpos.v[1] += bodypart::scale / 4; rpos.v[1] += bodypart::scale / 4; ///dir could also just be (p3 - p1).rot ??? vec3f dir = rpos - lpos; int len = width; vec3f step = dir / (float)(len - 1); vec3f current = lpos; compute::buffer buf = compute::buffer(cl::context, sizeof(float)*width*3, CL_MEM_READ_WRITE, nullptr); if(!cape_init) { gpu_cape.resize(width * 3); cape_init = true; } //cl_float* mem_map = (cl_float*) clEnqueueMapBuffer(cl::cqueue.get(), buf.get(), CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, sizeof(cl_float)*width*3, 0, NULL, NULL, NULL); float sag = bodypart::scale/32.f; //sag = 0; for(int i=0; i<len; i++) { float xf = (float)i / len; float yval = 4 * xf * (xf - 1) * sag + sin(xf * 30); /*mem_map[i*3 + 0] = current.v[0]; mem_map[i*3 + 1] = current.v[1] + yval; mem_map[i*3 + 2] = current.v[2];*/ gpu_cape[i*3 + 0] = current.v[0]; gpu_cape[i*3 + 1] = current.v[1] + yval; gpu_cape[i*3 + 2] = current.v[2]; current = current + step; } clEnqueueWriteBuffer(cl::cqueue.get(), buf.get(), CL_FALSE, 0, sizeof(cl_float) * width * 3, gpu_cape.data(), 0, NULL, NULL); //clEnqueueUnmapMemObject(cl::cqueue.get(), buf.get(), mem_map, 0, NULL, NULL); return buf; }
void init_cl_radix_sort( int nkeys){ cl_int err; cl_int status; /**/ nkeys_rounded=nkeys; // check some conditions assert(_TOTALBITS % _BITS == 0); assert(nkeys % (_GROUPS * _ITEMS) == 0); assert( (_GROUPS * _ITEMS * _RADIX) % _HISTOSPLIT == 0); assert(pow(2,(int) log2(_GROUPS)) == _GROUPS); assert(pow(2,(int) log2(_ITEMS)) == _ITEMS); // init the timers histo_time=0; scan_time=0; reorder_time=0; transpose_time=0; //printf("Construct the random list\n"); // construction of a random list uint maxint=_MAXINT; assert(_MAXINT != 0); h_checkKeys = (uint*)malloc(sizeof(uint)*nkeys); h_Permut = (uint*)malloc(sizeof(uint)*nkeys); // construction of the initial permutation for(uint i = 0; i < nkeys; i++){ //printf("%d, ",i); h_Permut[i] = i; h_checkKeys[i]=h_keys[i]; } printf("Send to the GPU\n"); // copy on the GPU d_inKeys = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* nkeys , NULL, &err); assert(err == CL_SUCCESS); d_outKeys = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* nkeys , NULL, &err); assert(err == CL_SUCCESS); d_inPermut = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* nkeys , NULL, &err); assert(err == CL_SUCCESS); d_outPermut = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* nkeys , NULL, &err); assert(err == CL_SUCCESS); //////////////////////////////////////////////////////////////////////////////// //copy the two previous vectors to the device //cl_radix_host2gpu(); //////////////////////////////////////////////////////////////////////////////// status = clEnqueueWriteBuffer( command_que, d_inKeys, CL_TRUE, 0, sizeof(uint) * nkeys, h_keys, 0, NULL, NULL ); if(status == CL_INVALID_COMMAND_QUEUE){ printf("if command_queue is not a valid command-queue.1 \n"); }else if(status == CL_INVALID_CONTEXT){ printf("if command_queue is not a valid command-queue.2 \n"); }else if(status == CL_INVALID_MEM_OBJECT){ printf("if command_queue is not a valid command-queue.3 \n"); }else if(status == CL_INVALID_VALUE){ printf("if command_queue is not a valid command-queue.4 \n"); }else if(status == CL_INVALID_EVENT_WAIT_LIST){ printf("if command_queue is not a valid command-queue.5 \n"); }else if(status == CL_MISALIGNED_SUB_BUFFER_OFFSET){ printf("if command_queue is not a valid command-queue. 6\n"); }else if(status == CL_EXEC_STATUS_ERROR_FOR_EVENTS_IN_WAIT_LIST){ printf("if command_queue is not a valid command-queue.7 \n"); }else if(status == CL_MEM_OBJECT_ALLOCATION_FAILURE){ printf("if command_queue is not a valid command-queue.8 \n"); }else if(status == CL_OUT_OF_RESOURCES){ printf("if command_queue is not a valid command-queue. 9\n"); }else if(status == CL_OUT_OF_HOST_MEMORY){ printf("if command_queue is not a valid command-queue.10 \n"); } assert (status == CL_SUCCESS); clFinish(command_que); // wait end of read status = clEnqueueWriteBuffer( command_que, d_inPermut, CL_TRUE, 0, sizeof(uint) * nkeys, h_Permut, 0, NULL, NULL ); assert (status == CL_SUCCESS); clFinish(command_que); // wait end of read //////////////////////////////////////////////////////////////////////////////// //////////////////////////////////////////////////////////////////////////////// // allocate the histogram on the GPU d_Histograms = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* _RADIX * _GROUPS * _ITEMS, NULL, &err); assert(err == CL_SUCCESS); // allocate the auxiliary histogram on GPU d_globsum = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* _HISTOSPLIT, NULL, &err); assert(err == CL_SUCCESS); // temporary vector when the sum is not needed d_temp = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(uint)* _HISTOSPLIT, NULL, &err); assert(err == CL_SUCCESS); cl_radix_resize(nkeys); // we set here the fixed arguments of the OpenCL kernels // the changing arguments are modified elsewhere in the class //void histogram(const __global int* d_Keys,__global int* d_Histograms, // const int pass, __local int* loc_histo, const int n) err = clSetKernelArg(ckHistogram, 1, sizeof(cl_mem), &d_Histograms); assert(err == CL_SUCCESS); err = clSetKernelArg(ckHistogram, 3, sizeof(uint)*_RADIX*_ITEMS, NULL); assert(err == CL_SUCCESS); // err = clSetKernelArg(ckHistogram, 3, sizeof(uint)*_ITEMS, NULL); // assert(err == CL_SUCCESS); err = clSetKernelArg(ckPasteHistogram, 0, sizeof(cl_mem), &d_Histograms); assert(err == CL_SUCCESS); err = clSetKernelArg(ckPasteHistogram, 1, sizeof(cl_mem), &d_globsum); assert(err == CL_SUCCESS); err = clSetKernelArg(ckReorder, 2, sizeof(cl_mem), &d_Histograms); assert(err == CL_SUCCESS); err = clSetKernelArg(ckReorder, 6, sizeof(uint)* _RADIX * _ITEMS , NULL); // mem cache assert(err == CL_SUCCESS); }
void * materializeCol(struct materializeNode * mn, struct clContext * context, struct statistic * pp){ struct timespec start,end; clock_gettime(CLOCK_REALTIME,&start); cl_event ndrEvt; cl_ulong startTime, endTime; struct tableNode *tn = mn->table; char * res; cl_mem gpuResult; cl_mem gpuAttrSize; long totalSize = tn->tupleNum * tn->tupleSize; cl_int error = 0; cl_mem gpuContent = clCreateBuffer(context->context, CL_MEM_READ_ONLY, totalSize, NULL, &error); gpuResult = clCreateBuffer(context->context, CL_MEM_READ_WRITE, totalSize, NULL, &error); gpuAttrSize = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(int)*tn->totalAttr,NULL,&error); clEnqueueWriteBuffer(context->queue,gpuAttrSize,CL_TRUE,0,sizeof(int)*tn->totalAttr,tn->attrSize,0,0,&ndrEvt); clWaitForEvents(1, &ndrEvt); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0); pp->pcie += 1e-6 * (endTime - startTime); res = (char *) malloc(totalSize); long offset = 0; long *colOffset = (long*)malloc(sizeof(long)*tn->totalAttr); for(int i=0;i<tn->totalAttr;i++){ colOffset[i] = offset; int size = tn->tupleNum * tn->attrSize[i]; if(tn->dataPos[i] == MEM){ clEnqueueWriteBuffer(context->queue,gpuContent,CL_TRUE,offset,size,tn->content[i],0,0,&ndrEvt); clWaitForEvents(1, &ndrEvt); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0); pp->pcie += 1e-6 * (endTime - startTime); }else clEnqueueCopyBuffer(context->queue,(cl_mem)tn->content[i],gpuContent,0,offset,size,0,0,0); offset += size; } cl_mem gpuColOffset = clCreateBuffer(context->context, CL_MEM_READ_ONLY, sizeof(long)*tn->totalAttr,NULL,&error); clEnqueueWriteBuffer(context->queue,gpuColOffset,CL_TRUE,0,sizeof(long)*tn->totalAttr,colOffset,0,0,&ndrEvt); clWaitForEvents(1, &ndrEvt); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0); pp->pcie += 1e-6 * (endTime - startTime); size_t globalSize = 512; size_t localSize = 128; context->kernel = clCreateKernel(context->program,"materialize",0); clSetKernelArg(context->kernel,0,sizeof(cl_mem), (void*)&gpuContent); clSetKernelArg(context->kernel,1,sizeof(cl_mem), (void*)&gpuColOffset); clSetKernelArg(context->kernel,2,sizeof(int), (void*)&tn->totalAttr); clSetKernelArg(context->kernel,3,sizeof(cl_mem), (void*)&gpuAttrSize); clSetKernelArg(context->kernel,4,sizeof(long), (void*)&tn->tupleNum); clSetKernelArg(context->kernel,5,sizeof(int), (void*)&tn->tupleSize); clSetKernelArg(context->kernel,6,sizeof(cl_mem), (void*)&gpuResult); clEnqueueNDRangeKernel(context->queue, context->kernel, 1, 0, &globalSize,&localSize,0,0,0); clEnqueueReadBuffer(context->queue,gpuResult,CL_TRUE,0,totalSize,res,0,0,&ndrEvt); clWaitForEvents(1, &ndrEvt); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_START,sizeof(cl_ulong),&startTime,0); clGetEventProfilingInfo(ndrEvt,CL_PROFILING_COMMAND_END,sizeof(cl_ulong),&endTime,0); pp->pcie += 1e-6 * (endTime - startTime); free(colOffset); clFinish(context->queue); clReleaseMemObject(gpuColOffset); clReleaseMemObject(gpuContent); clReleaseMemObject(gpuAttrSize); clReleaseMemObject(gpuResult); clock_gettime(CLOCK_REALTIME,&end); double timeE = (end.tv_sec - start.tv_sec)* BILLION + end.tv_nsec - start.tv_nsec; printf("Materialization Time: %lf\n", timeE/(1000*1000)); return res; }
int main(int argc, char **argv) { cl_int err = 0; cl_context context = 0; cl_device_id * devices = NULL; cl_command_queue queue = 0; cl_program program = 0; cl_mem cl_a = 0, cl_b = 0, cl_res = 0; cl_kernel adder = 0; cl_event event; // The iteration variable int i; // Define our data set cl_float a[DATA_SIZE], b[DATA_SIZE], res[DATA_SIZE]; // Initialize array srand(time(0)); for (i = 0; i < DATA_SIZE; i++) { a[i] = (rand() % 100) / 100.0; b[i] = (rand() % 100) / 100.0; res[i] = 0; } check_release(get_cl_context(&context, &devices, 0) == false, "Fail to create context"); // Specify the queue to be profile-able queue = clCreateCommandQueue(context, devices[0], CL_QUEUE_PROFILING_ENABLE, 0); check_release(queue == NULL, "Can't create command queue"); program = load_program(context, devices[0], "shader.cl"); check_release(program == NULL, "Fail to build program"); cl_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL); cl_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL); cl_res = clCreateBuffer( context, CL_MEM_WRITE_ONLY, sizeof(cl_float) * DATA_SIZE, NULL, NULL); if (cl_a == 0 || cl_b == 0 || cl_res == 0) { printf("Can't create OpenCL buffer\n"); goto release; } check_release(clEnqueueWriteBuffer( queue, cl_a, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, a, 0, 0, 0), "Write Buffer 1"); check_release(clEnqueueWriteBuffer( queue, cl_b, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, b, 0, 0, 0), "Write Buffer 2"); adder = clCreateKernel(program, "adder", &err); if (err == CL_INVALID_KERNEL_NAME) printf("CL_INVALID_KERNEL_NAME\n"); check_release(adder == NULL, "Can't load kernel"); clSetKernelArg(adder, 0, sizeof(cl_mem), &cl_a); clSetKernelArg(adder, 1, sizeof(cl_mem), &cl_b); clSetKernelArg(adder, 2, sizeof(cl_mem), &cl_res); size_t work_size = DATA_SIZE; check_release(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event), "Can't enqueue kernel"); check_release( clEnqueueReadBuffer( queue, cl_res, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, res, 0, 0, 0), "Can't enqueue read buffer"); clWaitForEvents(1, &event); printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event)); // Make sure everything is done before we do anything clFinish(queue); err = 0; for (i = 0; i < DATA_SIZE; i++) { if (res[i] != a[i] + b[i]) { printf("%f + %f = %f(answer %f)\n", a[i], b[i], res[i], a[i] + b[i]); err++; } } if (err == 0) printf("Validation passed\n"); else printf("Validation failed\n"); printf("------\n"); //-------------------------------- // Second test for (i = 0; i < DATA_SIZE; i++) { a[i] = i; b[i] = i; res[i] = 0; } check_err(clEnqueueWriteBuffer( queue, cl_a, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, a, 0, 0, 0), "Write Buffer 1"); check_err(clEnqueueWriteBuffer( queue, cl_b, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, b, 0, 0, 0), "Write Buffer 2"); check_err(clEnqueueNDRangeKernel(queue, adder, 1, 0, &work_size, 0, 0, 0, &event), "Can't enqueue kernel"); check_err(clEnqueueReadBuffer( queue, cl_res, CL_TRUE, 0, sizeof(cl_float) * DATA_SIZE, res, 0, 0, 0), "Can't enqueue read buffer"); clWaitForEvents(1, &event); printf("Execution Time: %.04lf ms\n\n", get_event_exec_time(event)); // Make sure everything is done before we do anything clFinish(queue); err = 0; for (i = 0; i < DATA_SIZE; i++) { if (res[i] != a[i] + b[i]) { printf("%f + %f = %f(answer %f)\n", a[i], b[i], res[i], a[i] + b[i]); err++; } } if (err == 0) printf("Validation passed\n"); else printf("Validation failed\n"); release: clReleaseKernel(adder); clReleaseProgram(program); clReleaseMemObject(cl_a); clReleaseMemObject(cl_b); clReleaseMemObject(cl_res); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }
// Main function // ********************************************************************* int main(const int argc, const char** argv) { // start logs shrSetLogFileName ("oclDXTCompression.txt"); shrLog(LOGBOTH, 0, "%s Starting...\n\n", argv[0]); cl_context cxGPUContext; cl_command_queue cqCommandQueue; cl_program cpProgram; cl_kernel ckKernel; cl_mem cmMemObjs[3]; size_t szGlobalWorkSize[1]; size_t szLocalWorkSize[1]; cl_int ciErrNum; // Get the path of the filename char *filename; if (shrGetCmdLineArgumentstr(argc, argv, "image", &filename)) { image_filename = filename; } // load image const char* image_path = shrFindFilePath(image_filename, argv[0]); shrCheckError(image_path != NULL, shrTRUE); shrLoadPPM4ub(image_path, (unsigned char **)&h_img, &width, &height); shrCheckError(h_img != NULL, shrTRUE); shrLog(LOGBOTH, 0, "Loaded '%s', %d x %d pixels\n", image_path, width, height); // Convert linear image to block linear. uint * block_image = (uint *) malloc(width * height * 4); // Convert linear image to block linear. for(uint by = 0; by < height/4; by++) { for(uint bx = 0; bx < width/4; bx++) { for (int i = 0; i < 16; i++) { const int x = i & 3; const int y = i / 4; block_image[(by * width/4 + bx) * 16 + i] = ((uint *)h_img)[(by * 4 + y) * 4 * (width/4) + bx * 4 + x]; } } } // create the OpenCL context on a GPU device cxGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // get and log device cl_device_id device; if( shrCheckCmdLineFlag(argc, argv, "device") ) { int device_nr = 0; shrGetCmdLineArgumenti(argc, argv, "device", &device_nr); device = oclGetDev(cxGPUContext, device_nr); } else { device = oclGetMaxFlopsDev(cxGPUContext); } oclPrintDevInfo(LOGBOTH, device); // create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, device, 0, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Memory Setup // Compute permutations. cl_uint permutations[1024]; computePermutations(permutations); // Upload permutations. cmMemObjs[0] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_uint) * 1024, permutations, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Image cmMemObjs[1] = clCreateBuffer(cxGPUContext, CL_MEM_READ_ONLY , sizeof(cl_uint) * width * height, NULL, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // Result const uint compressedSize = (width / 4) * (height / 4) * 8; cmMemObjs[2] = clCreateBuffer(cxGPUContext, CL_MEM_WRITE_ONLY, compressedSize, NULL , &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); unsigned int * h_result = (uint *)malloc(compressedSize); // Program Setup size_t program_length; const char* source_path = shrFindFilePath("DXTCompression.cl", argv[0]); shrCheckError(source_path != NULL, shrTRUE); char *source = oclLoadProgSource(source_path, "", &program_length); shrCheckError(source != NULL, shrTRUE); // create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **) &source, &program_length, &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // build the program ciErrNum = clBuildProgram(cpProgram, 0, NULL, "-cl-mad-enable", NULL, NULL); if (ciErrNum != CL_SUCCESS) { // write out standard error, Build Log and PTX, then cleanup and exit shrLog(LOGBOTH | ERRORMSG, ciErrNum, STDERROR); oclLogBuildInfo(cpProgram, oclGetFirstDev(cxGPUContext)); oclLogPtx(cpProgram, oclGetFirstDev(cxGPUContext), "oclDXTCompression.ptx"); shrCheckError(ciErrNum, CL_SUCCESS); } // create the kernel ckKernel = clCreateKernel(cpProgram, "compress", &ciErrNum); shrCheckError(ciErrNum, CL_SUCCESS); // set the args values ciErrNum = clSetKernelArg(ckKernel, 0, sizeof(cl_mem), (void *) &cmMemObjs[0]); ciErrNum |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void *) &cmMemObjs[1]); ciErrNum |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void *) &cmMemObjs[2]); ciErrNum |= clSetKernelArg(ckKernel, 3, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 4, sizeof(float) * 4 * 16, NULL); ciErrNum |= clSetKernelArg(ckKernel, 5, sizeof(int) * 64, NULL); ciErrNum |= clSetKernelArg(ckKernel, 6, sizeof(float) * 16 * 6, NULL); ciErrNum |= clSetKernelArg(ckKernel, 7, sizeof(unsigned int) * 160, NULL); ciErrNum |= clSetKernelArg(ckKernel, 8, sizeof(int) * 16, NULL); shrCheckError(ciErrNum, CL_SUCCESS); shrLog(LOGBOTH, 0, "Running DXT Compression on %u x %u image...\n\n", width, height); // Upload the image clEnqueueWriteBuffer(cqCommandQueue, cmMemObjs[1], CL_FALSE, 0, sizeof(cl_uint) * width * height, block_image, 0,0,0); // set work-item dimensions szGlobalWorkSize[0] = width * height * (NUM_THREADS/16); szLocalWorkSize[0]= NUM_THREADS; #ifdef GPU_PROFILING int numIterations = 100; for (int i = -1; i < numIterations; ++i) { if (i == 0) { // start timing only after the first warmup iteration clFinish(cqCommandQueue); // flush command queue shrDeltaT(0); // start timer } #endif // execute kernel ciErrNum = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, szGlobalWorkSize, szLocalWorkSize, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); #ifdef GPU_PROFILING } clFinish(cqCommandQueue); double dAvgTime = shrDeltaT(0) / (double)numIterations; shrLog(LOGBOTH | MASTER, 0, "oclDXTCompression, Throughput = %.4f, Time = %.5f, Size = %u, NumDevsUsed = %i\n", (1.0e-6 * (double)(width * height)/ dAvgTime), dAvgTime, (width * height), 1); #endif // blocking read output ciErrNum = clEnqueueReadBuffer(cqCommandQueue, cmMemObjs[2], CL_TRUE, 0, compressedSize, h_result, 0, NULL, NULL); shrCheckError(ciErrNum, CL_SUCCESS); // Write DDS file. FILE* fp = NULL; char output_filename[1024]; #ifdef WIN32 strcpy_s(output_filename, 1024, image_path); strcpy_s(output_filename + strlen(image_path) - 3, 1024 - strlen(image_path) + 3, "dds"); fopen_s(&fp, output_filename, "wb"); #else strcpy(output_filename, image_path); strcpy(output_filename + strlen(image_path) - 3, "dds"); fp = fopen(output_filename, "wb"); #endif shrCheckError(fp != NULL, shrTRUE); DDSHeader header; header.fourcc = FOURCC_DDS; header.size = 124; header.flags = (DDSD_WIDTH|DDSD_HEIGHT|DDSD_CAPS|DDSD_PIXELFORMAT|DDSD_LINEARSIZE); header.height = height; header.width = width; header.pitch = compressedSize; header.depth = 0; header.mipmapcount = 0; memset(header.reserved, 0, sizeof(header.reserved)); header.pf.size = 32; header.pf.flags = DDPF_FOURCC; header.pf.fourcc = FOURCC_DXT1; header.pf.bitcount = 0; header.pf.rmask = 0; header.pf.gmask = 0; header.pf.bmask = 0; header.pf.amask = 0; header.caps.caps1 = DDSCAPS_TEXTURE; header.caps.caps2 = 0; header.caps.caps3 = 0; header.caps.caps4 = 0; header.notused = 0; fwrite(&header, sizeof(DDSHeader), 1, fp); fwrite(h_result, compressedSize, 1, fp); fclose(fp); // Make sure the generated image matches the reference image (regression check) shrLog(LOGBOTH, 0, "\nComparing against Host/C++ computation...\n"); const char* reference_image_path = shrFindFilePath(refimage_filename, argv[0]); shrCheckError(reference_image_path != NULL, shrTRUE); // read in the reference image from file #ifdef WIN32 fopen_s(&fp, reference_image_path, "rb"); #else fp = fopen(reference_image_path, "rb"); #endif shrCheckError(fp != NULL, shrTRUE); fseek(fp, sizeof(DDSHeader), SEEK_SET); uint referenceSize = (width / 4) * (height / 4) * 8; uint * reference = (uint *)malloc(referenceSize); fread(reference, referenceSize, 1, fp); fclose(fp); // compare the reference image data to the sample/generated image float rms = 0; for (uint y = 0; y < height; y += 4) { for (uint x = 0; x < width; x += 4) { // binary comparison of data uint referenceBlockIdx = ((y/4) * (width/4) + (x/4)); uint resultBlockIdx = ((y/4) * (width/4) + (x/4)); int cmp = compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); // log deviations, if any if (cmp != 0.0f) { compareBlock(((BlockDXT1 *)h_result) + resultBlockIdx, ((BlockDXT1 *)reference) + referenceBlockIdx); shrLog(LOGBOTH, 0, "Deviation at (%d, %d):\t%f rms\n", x/4, y/4, float(cmp)/16/3); } rms += cmp; } } rms /= width * height * 3; shrLog(LOGBOTH, 0, "RMS(reference, result) = %f\n\n", rms); shrLog(LOGBOTH, 0, "TEST %s\n\n", (rms <= ERROR_THRESHOLD) ? "PASSED" : "FAILED !!!"); // Free OpenCL resources oclDeleteMemObjs(cmMemObjs, 3); clReleaseKernel(ckKernel); clReleaseProgram(cpProgram); clReleaseCommandQueue(cqCommandQueue); clReleaseContext(cxGPUContext); // Free host memory free(source); free(h_img); // finish shrEXIT(argc, argv); }
int main(int argc, char** argv) { double serial_time, openCL_time, start_time; cl_int err; cl_platform_id* platforms = NULL; char platform_name[1024]; cl_device_id device_id = NULL; cl_uint num_of_platforms = 0; cl_uint num_of_devices = 0; cl_context context; cl_kernel kernel; cl_command_queue command_queue; cl_program program; cl_mem input1, input2, input3, output; float **A, **B, **C, **serialC; // matrices int d1, d2, d3; // dimensions of matrices /* print user instruction */ if (argc != 4) { printf("Matrix multiplication: C = A x B\n"); printf("Usage: %s <NumRowA> <NumColA> <NumColB>\n", argv[0]); return 0; } /* read user input */ d1 = 1000; // rows of A and C d2 = 1000; // cols of A and rows of B d3 = 1000; // cols of B and C int d[4] = { 0, d1, d2, d3 }; size_t global[1] = { (size_t)d1*d3 }; printf("Matrix sizes C[%d][%d] = A[%d][%d] x B[%d][%d]\n", d1, d3, d1, d2, d2, d3); /* prepare matrices */ A = alloc_mat(d1, d2); init_mat(A, d1, d2); B = alloc_mat(d2, d3); init_mat(B, d2, d3); C = alloc_mat(d1, d3); serialC = alloc_mat(d1, d3); err = clGetPlatformIDs(0, NULL, &num_of_platforms); if (err != CL_SUCCESS) { printf("No platforms found. Error: %d\n", err); return 0; } platforms = (cl_platform_id *)malloc(num_of_platforms); err = clGetPlatformIDs(num_of_platforms, platforms, NULL); if (err != CL_SUCCESS) { printf("No platforms found. Error: %d\n", err); return 0; } else { int nvidia_platform = 0; for (unsigned int i = 0; i<num_of_platforms; i++) { clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (err != CL_SUCCESS) { printf("Could not get information about platform. Error: %d\n", err); return 0; } if (strstr(platform_name, "NVIDIA") != NULL) { nvidia_platform = i; break; } } err = clGetDeviceIDs(platforms[nvidia_platform], CL_DEVICE_TYPE_GPU, 1, &device_id, &num_of_devices); if (err != CL_SUCCESS) { printf("Could not get device in platform. Error: %d\n", err); return 0; } } context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (err != CL_SUCCESS) { printf("Unable to create context. Error: %d\n", err); return 0; } command_queue = clCreateCommandQueue(context, device_id, 0, &err); if (err != CL_SUCCESS) { printf("Unable to create command queue. Error: %d\n", err); return 0; } program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (err != CL_SUCCESS) { printf("Unable to create program. Error: %d\n", err); return 0; } if (clBuildProgram(program, 0, NULL, NULL, NULL, NULL) != CL_SUCCESS) { char *log; size_t size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &size); // 1. Länge des Logbuches? log = (char *)malloc(size + 1); if (log) { clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, size, log, NULL); // 2. Hole das Logbuch ab log[size] = '\0'; printf("%s", log); free(log); } return 1; } err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { printf("Error building program. Error: %d\n", err); return 0; } kernel = clCreateKernel(program, "matmult_ocl", &err); if (err != CL_SUCCESS) { printf("Error setting kernel. Error: %d\n", err); return 0; } input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, d1*d2*sizeof(float), NULL, &err); input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, d2*d3*sizeof(float), NULL, &err); input3 = clCreateBuffer(context, CL_MEM_READ_ONLY, 4 * sizeof(int), NULL, &err); output = clCreateBuffer(context, CL_MEM_READ_WRITE, d1*d3*sizeof(float), NULL, &err); start_time = omp_get_wtime(); clEnqueueWriteBuffer(command_queue, input1, CL_TRUE, 0, d1*d2*sizeof(float), *A, 0, NULL, NULL); clEnqueueWriteBuffer(command_queue, input2, CL_TRUE, 0, d2*d3*sizeof(float), *B, 0, NULL, NULL); clEnqueueWriteBuffer(command_queue, input3, CL_TRUE, 0, 4 * sizeof(int), d, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1); clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2); clSetKernelArg(kernel, 2, sizeof(cl_mem), &input3); clSetKernelArg(kernel, 3, sizeof(cl_mem), &output); clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global, NULL, 0, NULL, NULL); clFinish(command_queue); clEnqueueReadBuffer(command_queue, output, CL_TRUE, 0, d1*d3*sizeof(float), *C, 0, NULL, NULL); // for (unsigned int i = 0; i < (unsigned int) d1*d3; i++) // printf("%f\n", C[0][i]); openCL_time = omp_get_wtime() - start_time; clReleaseMemObject(input1); clReleaseMemObject(input2); clReleaseMemObject(input3); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(command_queue); clReleaseContext(context); printf("Running serial algorithm...\n"); start_time = omp_get_wtime(); serialC = mult_mat(A, B, d1, d2, d3); serial_time = omp_get_wtime() - start_time; printf("Checking results... "); is_correct(C, serialC, d1, d3); printf("Showing stats...\n"); printf(" serial runtime = %f\n", serial_time); printf(" OpenCL runtime = %f\n", openCL_time); printf(" Speedup = %f\n", serial_time / openCL_time); return 0; }
int main(int argc, char *argv[]) { cl_int err; cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem d_a, d_b, d_c; float *h_a, *h_b, *h_c; size_t N = 1024; if (argc > 1) { N = atoi(argv[1]); } size_t global = N; if (argc > 2) { global = atoi(argv[2]); } if (!N || !global) { printf("Usage: ./vecadd N [GLOBAL_SIZE]\n"); exit(1); } // Get list of platforms cl_uint numPlatforms = 0; cl_platform_id platforms[MAX_PLATFORMS]; err = clGetPlatformIDs(MAX_PLATFORMS, platforms, &numPlatforms); checkError(err, "getting platforms"); // Find Oclgrind platform = NULL; for (int i = 0; i < numPlatforms; i++) { char name[256]; err = clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 256, name, NULL); checkError(err, "getting platform name"); if (!strcmp(name, "Oclgrind")) { platform = platforms[i]; break; } } if (!platform) { fprintf(stderr, "Unable to find Oclgrind platform\n"); exit(1); } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); checkError(err, "getting device"); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); checkError(err, "creating context"); queue = clCreateCommandQueue(context, device, 0, &err); checkError(err, "creating command queue"); program = clCreateProgramWithSource(context, 1, &KERNEL_SOURCE, NULL, &err); checkError(err, "creating program"); err = clBuildProgram(program, 1, &device, "", NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { size_t sz; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(size_t), NULL, &sz); char *buildLog = malloc(++sz); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sz, buildLog, NULL); fprintf(stderr, "%s\n", buildLog); } checkError(err, "building program"); kernel = clCreateKernel(program, "vecadd", &err); checkError(err, "creating kernel"); size_t dataSize = N*sizeof(cl_float); // Initialise host data srand(0); h_a = malloc(dataSize); h_b = malloc(dataSize); h_c = malloc(dataSize); for (int i = 0; i < N; i++) { h_a[i] = rand()/(float)RAND_MAX; h_b[i] = rand()/(float)RAND_MAX; h_c[i] = 0; } d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err); checkError(err, "creating d_a buffer"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, dataSize, NULL, &err); checkError(err, "creating d_b buffer"); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, dataSize, NULL, &err); checkError(err, "creating d_c buffer"); err = clEnqueueWriteBuffer(queue, d_a, CL_FALSE, 0, dataSize, h_a, 0, NULL, NULL); checkError(err, "writing d_a data"); err = clEnqueueWriteBuffer(queue, d_b, CL_FALSE, 0, dataSize, h_b, 0, NULL, NULL); checkError(err, "writing d_b data"); err = clEnqueueWriteBuffer(queue, d_c, CL_FALSE, 0, dataSize, h_c, 0, NULL, NULL); checkError(err, "writing d_c data"); err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); checkError(err, "setting kernel args"); err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "enqueuing kernel"); err = clFinish(queue); checkError(err, "running kernel"); err = clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, dataSize, h_c, 0, NULL, NULL); checkError(err, "reading d_c data"); // Check results int errors = 0; for (int i = 0; i < N; i++) { float ref = h_a[i] + h_b[i]; if (fabs(ref - h_c[i]) > TOL) { if (errors < MAX_ERRORS) { fprintf(stderr, "%4d: %.4f != %.4f\n", i, h_c[i], ref); } errors++; } } printf("%d errors detected\n", errors); free(h_a); free(h_b); free(h_c); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); return (errors != 0); }
int main() { //This code executes on the OpenCL host //Host data int * A = NULL; //Input array int * B = NULL; //Input array int * C = NULL; //Output array //Elements in each array const int elements = 2048; //Compute the size of data size_t datasize = sizeof(int) * elements; //Allocate space for input/output data A = (int *)malloc(datasize); B = (int *)malloc(datasize); C = (int *)malloc(datasize); puts ("After allocation"); //Initialize the input data int i; for (i = 0; i < elements; i++) { A[i] = i; B[i] = i; } puts ("After for"); //Use this check the output of each API call cl_int status; /******************************************************************/ /* PLATFORM */ /******************************************************************/ //Retrieve the number of platforms cl_uint numPlatforms = 0; puts ("Before get platform."); status = clGetPlatformIDs(0, NULL, &numPlatforms); //Allocate enough space for each platform cl_platform_id * platforms = NULL; printf ("Total platform: %d\n", numPlatforms); platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id)); //Fill in the platforms puts ("Before fill platform"); status = clGetPlatformIDs(numPlatforms, platforms, NULL); /******************************************************************/ /* DEVICE ID */ /******************************************************************/ cl_uint numDevices = 0; puts ("Before get devices"); status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); //Alocate enough space for each device cl_device_id * devices; devices = (cl_device_id *) malloc(numDevices * sizeof(cl_device_id)); //Fill in the devices puts ("Before alloc get devices"); status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); printf ("total devices: %d\n", numDevices); printf ("devices: %p\n", devices); /******************************************************************/ /* CONTEXT */ /******************************************************************/ cl_context context; puts ("Before context"); context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); /******************************************************************/ /* COMMAND QUEUE */ /******************************************************************/ cl_command_queue cmdQueue; puts ("Before clCreateCommandQueue"); cmdQueue = clCreateCommandQueue (context, devices[0], 0, &status); /******************************************************************/ /* BUFFER OBJECT */ /******************************************************************/ cl_mem bufA; puts ("Before clCreateBuffer A."); bufA = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); cl_mem bufB; puts ("Before clCreateBuffer A."); bufB = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will hold the output cl_mem bufC; puts ("Before clCreateBuffer A."); bufC = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); //Write input array A to the device bufferA puts ("Before clEnqueueWriteBuffer A."); status = clEnqueueWriteBuffer(cmdQueue, bufA, CL_FALSE, 0, datasize, A, 0, NULL, NULL); //Write input array B to the device bufferB puts ("Before clEnqueueWriteBuffer B."); status = clEnqueueWriteBuffer(cmdQueue, bufB, CL_FALSE, 0, datasize, B, 0, NULL, NULL); /******************************************************************/ /*Create a program with source code*/ /******************************************************************/ puts ("Before clCreateProgramWithSource."); cl_program program = clCreateProgramWithSource(context, 1, (const char **)&programSource, NULL, &status); status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); //Create the vector addition kernel cl_kernel kernel; kernel = clCreateKernel(program, "vecadd", &status); //Associate the input and output buffer with the kernel status = clSetKernelArg(kernel, 0, sizeof (cl_mem), &bufA); status = clSetKernelArg(kernel, 1, sizeof (cl_mem), &bufB); status = clSetKernelArg(kernel, 2, sizeof (cl_mem), &bufC); //Define an index space size_t globalWorkSize[1]; // Execute the kernel for execution status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, bufC, CL_TRUE, 0, datasize, C, 0, NULL, NULL); //Verify the output int result = 1; for (i = 0; i < elements; i++) { if (C[i] != i + i) { result = 0; break; } } if (result) { printf("Output is correct\n"); } else { printf("Output is wrong\n"); } //Free OpenCL resoureces clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(bufA); clReleaseMemObject(bufB); clReleaseMemObject(bufC); clReleaseContext(context); //Free host resources free(A); free(B); free(C); free(platforms); free(devices); return 0; }
int main(int argc, char** argv) { // Set up the data on the host clock_t start, start0; start0 = clock(); start = clock(); // Rows and columns in the input image int imageHeight; int imageWidth; const char* inputFile = "input.bmp"; const char* outputFile = "output.bmp"; // Homegrown function to read a BMP from file float* inputImage = readImage(inputFile, &imageWidth, &imageHeight); // Size of the input and output images on the host int dataSize = imageHeight*imageWidth*sizeof(float); // Pad the number of columns #ifdef NON_OPTIMIZED int deviceWidth = imageWidth; #else // READ_ALIGNED || READ4 int deviceWidth = roundUp(imageWidth, WGX); #endif int deviceHeight = imageHeight; // Size of the input and output images on the device int deviceDataSize = imageHeight*deviceWidth*sizeof(float); // Output image on the host float* outputImage = NULL; outputImage = (float*)malloc(dataSize); int i, j; for(i = 0; i < imageHeight; i++) { for(j = 0; j < imageWidth; j++) { outputImage[i*imageWidth+j] = 0; } } // 45 degree motion blur float filter[49] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; int filterWidth = 7; int paddingPixels = (int)(filterWidth/2) * 2; stoptime(start, "set up input, output."); start = clock(); // Set up the OpenCL environment // Discovery platform cl_platform_id platform; clGetPlatformIDs(1, &platform, NULL); // Discover device cl_device_id device; clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 1, &device, NULL); size_t time_res; clGetDeviceInfo(device, CL_DEVICE_PROFILING_TIMER_RESOLUTION, sizeof(time_res), &time_res, NULL); printf("Device profiling timer resolution: %zu ns.\n", time_res); // Create context cl_context_properties props[3] = {CL_CONTEXT_PLATFORM, (cl_context_properties)(platform), 0}; cl_context context; context = clCreateContext(props, 1, &device, NULL, NULL, NULL); // Create command queue cl_ulong time_start, time_end, exec_time; cl_event timing_event; cl_command_queue queue; queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, NULL); // Create memory buffers cl_mem d_inputImage; cl_mem d_outputImage; cl_mem d_filter; d_inputImage = clCreateBuffer(context, CL_MEM_READ_ONLY, deviceDataSize, NULL, NULL); d_outputImage = clCreateBuffer(context, CL_MEM_WRITE_ONLY, deviceDataSize, NULL, NULL); d_filter = clCreateBuffer(context, CL_MEM_READ_ONLY, 49*sizeof(float),NULL, NULL); // Write input data to the device #ifdef NON_OPTIMIZED clEnqueueWriteBuffer(queue, d_inputImage, CL_TRUE, 0, deviceDataSize, inputImage, 0, NULL, NULL); #else // READ_ALIGNED || READ4 size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), imageHeight, 1}; clEnqueueWriteBufferRect(queue, d_inputImage, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, inputImage, 0, NULL, NULL); #endif // Write the filter to the device clEnqueueWriteBuffer(queue, d_filter, CL_TRUE, 0, 49*sizeof(float), filter, 0, NULL, NULL); // Read in the program from file char* source = readSource("convolution.cl"); // Create the program cl_program program; // Create and compile the program program = clCreateProgramWithSource(context, 1, (const char**)&source, NULL, NULL); cl_int build_status; build_status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); // Create the kernel cl_kernel kernel; #if defined NON_OPTIMIZED || defined READ_ALIGNED // Only the host-side code differs for the aligned reads kernel = clCreateKernel(program, "convolution", NULL); #else // READ4 kernel = clCreateKernel(program, "convolution_read4", NULL); #endif // Selected work group size is 16x16 int wgWidth = WGX; int wgHeight = WGY; // When computing the total number of work items, the // padding work items do not need to be considered int totalWorkItemsX = roundUp(imageWidth-paddingPixels, wgWidth); int totalWorkItemsY = roundUp(imageHeight-paddingPixels, wgHeight); // Size of a work group size_t localSize[2] = {wgWidth, wgHeight}; // Size of the NDRange size_t globalSize[2] = {totalWorkItemsX, totalWorkItemsY}; // The amount of local data that is cached is the size of the // work groups plus the padding pixels #if defined NON_OPTIMIZED || defined READ_ALIGNED int localWidth = localSize[0] + paddingPixels; #else // READ4 // Round the local width up to 4 for the read4 kernel int localWidth = roundUp(localSize[0]+paddingPixels, 4); #endif int localHeight = localSize[1] + paddingPixels; // Compute the size of local memory (needed for dynamic // allocation) size_t localMemSize = (localWidth * localHeight * sizeof(float)); // Set the kernel arguments clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_inputImage); clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_outputImage); clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_filter); clSetKernelArg(kernel, 3, sizeof(int), &deviceHeight); clSetKernelArg(kernel, 4, sizeof(int), &deviceWidth); clSetKernelArg(kernel, 5, sizeof(int), &filterWidth); clSetKernelArg(kernel, 6, localMemSize, NULL); clSetKernelArg(kernel, 7, sizeof(int), &localHeight); clSetKernelArg(kernel, 8, sizeof(int), &localWidth); stoptime(start, "set up kernel"); start = clock(); // Execute the kernel clEnqueueNDRangeKernel(queue, kernel, 2, NULL, globalSize, localSize, 0, NULL, &timing_event); // Wait for kernel to complete clFinish(queue); stoptime(start, "run kernel"); clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(timing_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); exec_time = time_end-time_start; printf("Profile execution time = %.3lf sec.\n", (double) exec_time/1000000000); // Read back the output image #ifdef NON_OPTIMIZED clEnqueueReadBuffer(queue, d_outputImage, CL_TRUE, 0, deviceDataSize, outputImage, 0, NULL, NULL); #else // READ_ALIGNED || READ4 // Begin reading output from (3,3) on the device // (for 7x7 filter with radius 3) buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; // Read data into (3,3) on the host host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; // Region is image size minus padding pixels region[0] = (imageWidth-paddingPixels)*sizeof(float); region[1] = (imageHeight-paddingPixels); region[2] = 1; // Perform the read clEnqueueReadBufferRect(queue, d_outputImage, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, imageWidth*sizeof(float), 0, outputImage, 0, NULL, NULL); #endif // Homegrown function to write the image to file storeImage(outputImage, outputFile, imageHeight, imageWidth, inputFile); // Free OpenCL objects clReleaseMemObject(d_inputImage); clReleaseMemObject(d_outputImage); clReleaseMemObject(d_filter); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }
int NBody::setupCL() { cl_int status = CL_SUCCESS; cl_device_type dType; if(deviceType.compare("cpu") == 0) { dType = CL_DEVICE_TYPE_CPU; } else //deviceType = "gpu" { dType = CL_DEVICE_TYPE_GPU; } /* * Have a look at the available platforms and pick either * the AMD one if available or a reasonable default. */ cl_uint numPlatforms; cl_platform_id platform = NULL; status = clGetPlatformIDs(0, NULL, &numPlatforms); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } if (0 < numPlatforms) { cl_platform_id* platforms = new cl_platform_id[numPlatforms]; status = clGetPlatformIDs(numPlatforms, platforms, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformIDs failed.")) { return SDK_FAILURE; } for (unsigned i = 0; i < numPlatforms; ++i) { char pbuf[100]; status = clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, sizeof(pbuf), pbuf, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetPlatformInfo failed.")) { return SDK_FAILURE; } platform = platforms[i]; if (!strcmp(pbuf, "Advanced Micro Devices, Inc.")) { break; } } delete[] platforms; } if(NULL == platform) { sampleCommon->error("NULL platform found so Exiting Application."); return SDK_FAILURE; } // Display available devices. if(!sampleCommon->displayDevices(platform, dType)) { sampleCommon->error("sampleCommon::displayDevices() failed"); return SDK_FAILURE; } /* * If we could find our platform, use it. Otherwise use just available platform. */ cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; context = clCreateContextFromType( cps, dType, NULL, NULL, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateContextFromType failed.")) { return SDK_FAILURE; } size_t deviceListSize; /* First, get the size of device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; int deviceCount = (int)(deviceListSize / sizeof(cl_device_id)); if(!sampleCommon->validateDeviceId(deviceId, deviceCount)) { sampleCommon->error("sampleCommon::validateDeviceId() failed"); return SDK_FAILURE; } /* Now allocate memory for device list based on the size we got earlier */ devices = (cl_device_id*)malloc(deviceListSize); if(devices == NULL) { sampleCommon->error("Failed to allocate memory (devices)."); return SDK_FAILURE; } /* Now, get the device list data */ status = clGetContextInfo( context, CL_CONTEXT_DEVICES, deviceListSize, devices, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetContextInfo failed.")) return SDK_FAILURE; /* Create command queue */ commandQueue = clCreateCommandQueue( context, devices[deviceId], 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateCommandQueue failed.")) { return SDK_FAILURE; } /* Get Device specific Information */ status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void*)&maxWorkGroupSize, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_GROUP_SIZE failed.")) return SDK_FAILURE; status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), (void*)&maxDimensions, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS failed.")) return SDK_FAILURE; maxWorkItemSizes = (size_t*)malloc(maxDimensions * sizeof(size_t)); status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(size_t) * maxDimensions, (void*)maxWorkItemSizes, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_MAX_WORK_ITEM_SIZES failed.")) return SDK_FAILURE; status = clGetDeviceInfo( devices[deviceId], CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), (void *)&totalLocalMemory, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clGetDeviceInfo CL_DEVICE_LOCAL_MEM_SIZE failed.")) return SDK_FAILURE; /* * Create and initialize memory objects */ /* Create memory objects for position */ currPos = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (oldPos)")) { return SDK_FAILURE; } /* Initialize position buffer */ status = clEnqueueWriteBuffer(commandQueue, currPos, 1, 0, numBodies * sizeof(cl_float4), pos, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (oldPos)")) { return SDK_FAILURE; } /* Create memory objects for position */ newPos = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (newPos)")) { return SDK_FAILURE; } /* Create memory objects for velocity */ currVel = clCreateBuffer( context, CL_MEM_READ_WRITE, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (oldVel)")) { return SDK_FAILURE; } /* Initialize velocity buffer */ status = clEnqueueWriteBuffer(commandQueue, currVel, 1, 0, numBodies * sizeof(cl_float4), vel, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueWriteBuffer failed. (oldVel)")) { return SDK_FAILURE; } /* Create memory objects for velocity */ newVel = clCreateBuffer( context, CL_MEM_READ_ONLY, numBodies * sizeof(cl_float4), 0, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateBuffer failed. (newVel)")) { return SDK_FAILURE; } /* create a CL program using the kernel source */ streamsdk::SDKFile kernelFile; std::string kernelPath = sampleCommon->getPath(); if(isLoadBinaryEnabled()) { kernelPath.append(loadBinary.c_str()); if(!kernelFile.readBinaryFromFile(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * binary = kernelFile.source().c_str(); size_t binarySize = kernelFile.source().size(); program = clCreateProgramWithBinary(context, 1, &devices[deviceId], (const size_t *)&binarySize, (const unsigned char**)&binary, NULL, &status); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clCreateProgramWithBinary failed.")) { return SDK_FAILURE; } } else { kernelPath.append("NBody_Kernels.cl"); if(!kernelFile.open(kernelPath.c_str())) { std::cout << "Failed to load kernel file : " << kernelPath << std::endl; return SDK_FAILURE; } const char * source = kernelFile.source().c_str(); size_t sourceSize[] = { strlen(source) }; program = clCreateProgramWithSource(context, 1, &source, sourceSize, &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateProgramWithSource failed.")) return SDK_FAILURE; } /* create a cl program executable for all the devices specified */ status = clBuildProgram( program, 1, &devices[deviceId], NULL, NULL, NULL); if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { cl_int logStatus; char * buildLog = NULL; size_t buildLogSize = 0; logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, &buildLogSize); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) return SDK_FAILURE; buildLog = (char*)malloc(buildLogSize); if(buildLog == NULL) { sampleCommon->error("Failed to allocate host memory. (buildLog)"); return SDK_FAILURE; } memset(buildLog, 0, buildLogSize); logStatus = clGetProgramBuildInfo (program, devices[deviceId], CL_PROGRAM_BUILD_LOG, buildLogSize, buildLog, NULL); if(!sampleCommon->checkVal( logStatus, CL_SUCCESS, "clGetProgramBuildInfo failed.")) { free(buildLog); return SDK_FAILURE; } std::cout << " \n\t\t\tBUILD LOG\n"; std::cout << " ************************************************\n"; std::cout << buildLog << std::endl; std::cout << " ************************************************\n"; free(buildLog); } if(!sampleCommon->checkVal( status, CL_SUCCESS, "clBuildProgram failed.")) return SDK_FAILURE; } /* get a kernel object handle for a kernel with the given name */ kernel = clCreateKernel( program, "nbody_sim", &status); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clCreateKernel failed.")) { return SDK_FAILURE; } return SDK_SUCCESS; }
int main( int argc, char **argv ) { int i, iteration; double timecounter; FILE *fp; cl_int ecode; if (argc == 1) { fprintf(stderr, "Usage: %s <kernel directory>\n", argv[0]); exit(-1); } /* Initialize timers */ timer_on = 0; if ((fp = fopen("timer.flag", "r")) != NULL) { fclose(fp); timer_on = 1; } timer_clear( 0 ); if (timer_on) { timer_clear( 1 ); timer_clear( 2 ); timer_clear( 3 ); } if (timer_on) timer_start( 3 ); /* Initialize the verification arrays if a valid class */ for( i=0; i<TEST_ARRAY_SIZE; i++ ) switch( CLASS ) { case 'S': test_index_array[i] = S_test_index_array[i]; test_rank_array[i] = S_test_rank_array[i]; break; case 'A': test_index_array[i] = A_test_index_array[i]; test_rank_array[i] = A_test_rank_array[i]; break; case 'W': test_index_array[i] = W_test_index_array[i]; test_rank_array[i] = W_test_rank_array[i]; break; case 'B': test_index_array[i] = B_test_index_array[i]; test_rank_array[i] = B_test_rank_array[i]; break; case 'C': test_index_array[i] = C_test_index_array[i]; test_rank_array[i] = C_test_rank_array[i]; break; case 'D': test_index_array[i] = D_test_index_array[i]; test_rank_array[i] = D_test_rank_array[i]; break; }; /* set up the OpenCL environment. */ setup_opencl(argc, argv); /* Printout initial NPB info */ printf( "\n\n NAS Parallel Benchmarks (NPB3.3-OCL) - IS Benchmark\n\n" ); printf( " Size: %ld (class %c)\n", (long)TOTAL_KEYS, CLASS ); printf( " Iterations: %d\n", MAX_ITERATIONS ); if (timer_on) timer_start( 1 ); /* Generate random number sequence and subsequent keys on all procs */ create_seq( 314159265.00, /* Random number gen seed */ 1220703125.00 ); /* Random number gen mult */ if (timer_on) timer_stop( 1 ); /* Do one interation for free (i.e., untimed) to guarantee initialization of all data and code pages and respective tables */ rank( 1 ); /* Start verification counter */ passed_verification = 0; DTIMER_START(T_BUFFER_WRITE); ecode = clEnqueueWriteBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_WRITE); if( CLASS != 'S' ) printf( "\n iteration\n" ); /* Start timer */ timer_start( 0 ); /* This is the main iteration */ for( iteration=1; iteration<=MAX_ITERATIONS; iteration++ ) { if( CLASS != 'S' ) printf( " %d\n", iteration ); rank( iteration ); } DTIMER_START(T_BUFFER_READ); ecode = clEnqueueReadBuffer(cmd_queue, m_passed_verification, CL_TRUE, 0, sizeof(cl_int), &passed_verification, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueReadBuffer() for m_passed_verification"); DTIMER_STOP(T_BUFFER_READ); /* End of timing, obtain maximum time of all processors */ timer_stop( 0 ); timecounter = timer_read( 0 ); /* This tests that keys are in sequence: sorting of last ranked key seq occurs here, but is an untimed operation */ if (timer_on) timer_start( 2 ); full_verify(); if (timer_on) timer_stop( 2 ); if (timer_on) timer_stop( 3 ); /* The final printout */ if( passed_verification != 5*MAX_ITERATIONS + 1 ) passed_verification = 0; c_print_results( "IS", CLASS, (int)(TOTAL_KEYS/64), 64, 0, MAX_ITERATIONS, timecounter, ((double) (MAX_ITERATIONS*TOTAL_KEYS)) /timecounter/1000000., "keys ranked", passed_verification, NPBVERSION, COMPILETIME, CC, CLINK, C_LIB, C_INC, CFLAGS, CLINKFLAGS, "", clu_GetDeviceTypeName(device_type), device_name); /* Print additional timers */ if (timer_on) { double t_total, t_percent; t_total = timer_read( 3 ); printf("\nAdditional timers -\n"); printf(" Total execution: %8.3f\n", t_total); if (t_total == 0.0) t_total = 1.0; timecounter = timer_read(1); t_percent = timecounter/t_total * 100.; printf(" Initialization : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(0); t_percent = timecounter/t_total * 100.; printf(" Benchmarking : %8.3f (%5.2f%%)\n", timecounter, t_percent); timecounter = timer_read(2); t_percent = timecounter/t_total * 100.; printf(" Sorting : %8.3f (%5.2f%%)\n", timecounter, t_percent); } release_opencl(); fflush(stdout); return 0; /**************************/ } /* E N D P R O G R A M */
template <typename ElemType> nano_time_t Syr2PerformanceTest<ElemType>::clblasPerfSingle(void) { nano_time_t time; cl_event event; cl_int status; cl_command_queue queue = base_->commandQueues()[0]; status = clEnqueueWriteBuffer(queue, mobjA_, CL_TRUE, 0, ((params_.N * params_.lda) + params_.offa) * sizeof(ElemType), backA_, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "Matrix A buffer object enqueuing error, status = " << status << endl; return NANOTIME_ERR; } status = clWaitForEvents(1, &event); if (status != CL_SUCCESS) { cout << "Wait on event failed, status = " << status << endl; return NANOTIME_ERR; } event = NULL; #define TIMING #ifdef TIMING clFinish( queue); time = getCurrentTime(); int iter = 100; for ( int i = 1; i <= iter; i++) { #endif status = (cl_int)clMath::clblas::syr2(params_.order, params_.uplo, params_.N, alpha_, mobjX_, params_.offBX, params_.incx, mobjY_, params_.offCY, params_.incy, mobjA_, params_.offa, params_.lda, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { cerr << "The CLBLAS SYR2 function failed, status = " << status << endl; return NANOTIME_ERR; } #ifdef TIMING } // iter loop clFinish( queue); time = getCurrentTime() - time; time /= iter; #else status = flushAll(1, &queue); if (status != CL_SUCCESS) { cerr << "clFlush() failed, status = " << status << endl; return NANOTIME_ERR; } time = getCurrentTime(); status = waitForSuccessfulFinish(1, &queue, &event); if (status == CL_SUCCESS) { time = getCurrentTime() - time; } else { cerr << "Waiting for completion of commands to the queue failed, " "status = " << status << endl; time = NANOTIME_ERR; } #endif return time; }
int main() { int i,j,k; // nb of operations: const int dsize = 512; int nthreads = 1; int nbOfAverages = 1e2; int opsMAC = 2; // operations per MAC cl_short4 *in, *out; cl_half *ck; double tops; //total ops #define NQUEUES 1 cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queues[NQUEUES]; cl_mem bufin, bufck, bufout; cl_event event = NULL; cl_program program; cl_kernel kernel; size_t global[2], local[2]; size_t param[5]; char version[300]; // allocate matrices in = (cl_short4 *) calloc(dsize*dsize, sizeof(*in)); out = (cl_short4 *) calloc(dsize*dsize, sizeof(*out)); ck = (cl_half *) calloc(9*9, sizeof(*ck)); in[0].x = 0x3c00; in[1].x = 0x4000; in[dsize].x = 0x4100; ck[0] = 0x3c00; ck[1] = 0x4000; ck[9] = 0x3000; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL ); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); for(i = 0; i < NQUEUES; i++) queues[i] = clCreateCommandQueue( ctx, device, 0, &err ); // Print some info about the system clGetDeviceInfo(device, CL_DEVICE_VERSION, sizeof(version), version, NULL); printf("CL_DEVICE_VERSION=%s\n", version); clGetDeviceInfo(device, CL_DRIVER_VERSION, sizeof(version), version, NULL); printf("CL_DRIVER_VERSION=%s\n", version); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); clGetDeviceInfo(device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE=%d\n", (int)param[0]); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS=%d\n", (int)param[0]); j = param[0]; clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_ITEM_SIZES, sizeof(param[0])*j, param, NULL); printf("CL_DEVICE_MAX_WORK_ITEM_SIZES="); for(i = 0; i < j; i++) printf("%d ", (int)param[i]); printf("\n"); clGetDeviceInfo(device, CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE, sizeof(param[0]), param, NULL); printf("CL_DEVICE_MAX_CONSTANT_BUFFER_SIZE=%d\n", (int)param[0]); program = clCreateProgramWithSource(ctx, 1, (const char **)&source, NULL, &err); if(!program) { printf("Error creating program\n"); return -1; } err = clBuildProgram(program, 0, 0, 0, 0, 0); if(err != CL_SUCCESS) { char buffer[20000]; size_t len; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); puts(buffer); return -1; } kernel = clCreateKernel(program, "conv9x9", &err); if(!kernel || err != CL_SUCCESS) { printf("Error creating kernel\n"); return -1; } /* Prepare OpenCL memory objects and place matrices inside them. */ cl_image_format fmt = {CL_RGBA, CL_HALF_FLOAT}; cl_int rc; bufin = clCreateImage2D(ctx, CL_MEM_READ_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufout = clCreateImage2D(ctx, CL_MEM_WRITE_ONLY, &fmt, dsize, dsize, 0, 0, &rc); bufck = clCreateBuffer( ctx, CL_MEM_READ_ONLY, 9 * 9 * sizeof(*ck), NULL, &err ); size_t origin[3] = {0,0,0}; size_t region[3] = {dsize, dsize, 1}; err = clEnqueueWriteImage(queues[0], bufin, CL_TRUE, origin, region, dsize * sizeof(*in), 0, in, 0, NULL, NULL ); err = clEnqueueWriteBuffer( queues[0], bufck, CL_TRUE, 0, 9 * 9 * sizeof( *ck ), ck, 0, NULL, NULL ); clSetKernelArg(kernel, 0, sizeof(int), &dsize); clSetKernelArg(kernel, 1, sizeof(cl_mem), &bufin); clSetKernelArg(kernel, 2, sizeof(cl_mem), &bufck); clSetKernelArg(kernel, 3, sizeof(cl_mem), &bufout); local[0] = 8; local[1] = 8; global[0] = global[1] = dsize-32; usleep(100000); struct timeval start,end; gettimeofday(&start, NULL); for (k=0; k<nthreads; k++) { //printf("Hello from thread %d, nthreads %d\n", omp_get_thread_num(), omp_get_num_threads()); for(i=0;i<nbOfAverages;i++) { // do the 2D convolution err = clEnqueueNDRangeKernel(queues[0], kernel, 2, NULL, global, local, 0, NULL, NULL); if(err != CL_SUCCESS) { printf("clEnqueueNDRangeKernel error %d\n", err); return -1; } } } clFinish(queues[0]); gettimeofday(&end, NULL); double t = ((double) (end.tv_sec - start.tv_sec)) + ((double) (end.tv_usec - start.tv_usec)) / 1e6; //reports time in [s] - verified! /* Wait for calculations to be finished. */ /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadImage(queues[0], bufout, CL_TRUE, origin, region, dsize * sizeof(*out), 0, out, 0, NULL, NULL ); clFinish(queues[0]); printf("%x %x %x %x\n", out[0].x, out[1].x, out[dsize].x, out[dsize+1].x); /* Release OpenCL memory objects. */ clReleaseMemObject( bufin ); clReleaseMemObject( bufck ); clReleaseMemObject( bufout ); /* Release OpenCL working objects. */ for(i = 0; i < NQUEUES; i++) clReleaseCommandQueue( queues[i] ); clReleaseContext( ctx ); // report performance: tops = 4 * nthreads * opsMAC * (dsize-32)*(dsize-32)*9*9; // total ops printf("Total M ops = %.0lf, # of threads = %d", nbOfAverages*tops*1e-6, nthreads); printf("\nTime in s: %lf:", t); printf("\nTest performance [G OP/s] %lf:", tops*nbOfAverages/t*1e-9); printf("\n"); return(0); }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufAsum, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasSasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSasum() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float), &asum, 0, NULL, NULL); printf("Result : %f\n", asum); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAsum); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufAP, bufX, bufY; cl_event event = NULL; int ret = 0, numElementsAP; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } numElementsAP = (N * (N+1)) / 2; // To get number of elements in a packed matrix /* Prepare OpenCL memory objects and place matrices inside them. */ bufAP = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (numElementsAP * sizeof(cl_double2)), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_double2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufAP, CL_TRUE, 0, numElementsAP * sizeof(cl_double2), AP, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_double2), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, N * sizeof(cl_double2), Y, 0, NULL, NULL); err = clblasZhpr2(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufY, 0 /*offy*/, incy, bufAP, 0 /*offa */, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasZhpr2() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAP, CL_TRUE, 0, (numElementsAP * sizeof(cl_double2)), AP, 0, NULL, NULL); /* At this point you will get the result of ZHPR2 placed in A array. */ printResult(); } /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAP); clReleaseMemObject(bufY); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
static int64_t opencl_scanhash(struct thr_info *thr, struct work *work, int64_t __maybe_unused max_nonce) { const int thr_id = thr->id; struct opencl_thread_data *thrdata = thr->cgpu_data; struct cgpu_info *gpu = thr->cgpu; _clState *clState = clStates[thr_id]; const cl_kernel *kernel = &clState->kernel; const int dynamic_us = opt_dynamic_interval * 1000; cl_int status; size_t globalThreads[1]; size_t localThreads[1] = { clState->wsize }; int64_t hashes; int found = opt_scrypt ? SCRYPT_FOUND : FOUND; int buffersize = opt_scrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE; if (opt_neoscrypt) { found = opt_neoscrypt ? SCRYPT_FOUND : FOUND; buffersize = opt_neoscrypt ? SCRYPT_BUFFERSIZE : BUFFERSIZE; } /* Windows' timer resolution is only 15ms so oversample 5x */ if (gpu->dynamic && (++gpu->intervals * dynamic_us) > 70000) { struct timeval tv_gpuend; double gpu_us; cgtime(&tv_gpuend); gpu_us = us_tdiff(&tv_gpuend, &gpu->tv_gpustart) / gpu->intervals; if (gpu_us > dynamic_us) { if (gpu->intensity > MIN_INTENSITY) --gpu->intensity; } else if (gpu_us < dynamic_us / 2) { if (gpu->intensity < MAX_INTENSITY) ++gpu->intensity; } memcpy(&(gpu->tv_gpustart), &tv_gpuend, sizeof(struct timeval)); gpu->intervals = 0; } set_threads_hashes(clState->vwidth, &hashes, globalThreads, localThreads[0], &gpu->intensity); if (hashes > gpu->max_hashes) gpu->max_hashes = hashes; status = thrdata->queue_kernel_parameters(clState, &work->blk, globalThreads[0]); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clSetKernelArg of all params failed."); return -1; } if (clState->goffset) { size_t global_work_offset[1]; global_work_offset[0] = work->blk.nonce; status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, global_work_offset, globalThreads, localThreads, 0, NULL, NULL); } else status = clEnqueueNDRangeKernel(clState->commandQueue, *kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Enqueueing kernel onto command queue. (clEnqueueNDRangeKernel)", status); return -1; } status = clEnqueueReadBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, buffersize, thrdata->res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueReadBuffer failed error %d. (clEnqueueReadBuffer)", status); return -1; } /* The amount of work scanned can fluctuate when intensity changes * and since we do this one cycle behind, we increment the work more * than enough to prevent repeating work */ work->blk.nonce += gpu->max_hashes; /* This finish flushes the readbuffer set with CL_FALSE in clEnqueueReadBuffer */ clFinish(clState->commandQueue); /* FOUND entry is used as a counter to say how many nonces exist */ if (thrdata->res[found]) { /* Clear the buffer again */ status = clEnqueueWriteBuffer(clState->commandQueue, clState->outputBuffer, CL_FALSE, 0, buffersize, blank_res, 0, NULL, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error: clEnqueueWriteBuffer failed."); return -1; } applog(LOG_DEBUG, "GPU %d found something?", gpu->device_id); postcalc_hash_async(thr, work, thrdata->res); memset(thrdata->res, 0, buffersize); /* This finish flushes the writebuffer set with CL_FALSE in clEnqueueWriteBuffer */ clFinish(clState->commandQueue); } return hashes; }
void run_benchmark( void *vargs, cl_context& context, cl_command_queue& commands, cl_program& program, cl_kernel& kernel ) { struct bench_args_t *args = (struct bench_args_t *)vargs; int size = 1 << 26; uint8_t *data = (uint8_t *)malloc(size); for (int i=0; i<size; i+=sizeof(args->buf)) memcpy(data + i, args->buf, sizeof(args->buf)); // 0th: initialize the timer at the beginning of the program timespec timer = tic(); // Create device buffers // cl_mem key_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->k), NULL, NULL); cl_mem value_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, NULL); //cl_mem value_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(args->buf), NULL, NULL); if (!key_buffer || !value_buffer) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); exit(1); } // 1st: time of buffer allocation toc(&timer, "buffer allocation"); // Write our data set into device buffers // int err; err = clEnqueueWriteBuffer(commands, key_buffer, CL_TRUE, 0, sizeof(args->k), args->k, 0, NULL, NULL); err |= clEnqueueWriteBuffer(commands, value_buffer, CL_TRUE, 0, size, data, 0, NULL, NULL); //err |= clEnqueueWriteBuffer(commands, value_buffer, CL_TRUE, 0, sizeof(args->buf), args->buf, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to device memory!\n"); printf("Test failed\n"); exit(1); } // 2nd: time of pageable-pinned memory copy toc(&timer, "memory copy"); // Set the arguments to our compute kernel // err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &key_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &value_buffer); err |= clSetKernelArg(kernel, 2, sizeof(int), &size); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); exit(1); } // 3rd: time of setting arguments toc(&timer, "set arguments"); // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else printf("Error: OpenCL kernel is not currently supported!\n"); exit(1); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); exit(1); } // 4th: time of kernel execution clFinish(commands); toc(&timer, "kernel execution"); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, value_buffer, CL_TRUE, 0, size, data, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); exit(1); } // 5th: time of data retrieving (PCIe + memcpy) toc(&timer, "data retrieving"); memcpy(args->buf, data, sizeof(args->buf)); free(data); }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array int i; int use_gpu = 1; for(i = 0; i < argc && argv; i++) { if(!argv[i]) continue; if(strstr(argv[i], "cpu")) use_gpu = 0; else if(strstr(argv[i], "gpu")) use_gpu = 1; } printf("Parameter detect %s device\n",use_gpu==1?"GPU":"CPU"); // Fill our data set with random float values // unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // err = clGetDeviceIDs(NULL, use_gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { #ifdef __EMSCRIPTEN__ if ((results[i] - (data[i] * data[i])) < MIN_ERROR) correct++; #else if(results[i] == data[i] * data[i]) correct++; #endif } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
void call_kernel(float *data1,float *data2,int count,char * cl_name,float *results) { FILE* programHandle; size_t programSize, KernelSourceSize; char *programBuffer, *KernelSource; size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input1; // device memory used for the input array cl_mem input2; // device memory used for the input array cl_mem output; // device memory used for the output array int err; int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); commands = clCreateCommandQueue(context, device_id, 0, &err); //---------------------------------------------------------------------------- // get size of kernel source programHandle = fopen(cl_name, "r"); fseek(programHandle, 0, SEEK_END); programSize = ftell(programHandle); rewind(programHandle); programBuffer = (char*) malloc(programSize + 1); programBuffer[programSize] = '\0'; fread(programBuffer, sizeof(char), programSize, programHandle); fclose(programHandle); // create program from buffer program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL); free(programBuffer); // read kernel source back in from program to check clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize); KernelSource = (char*) malloc(KernelSourceSize); clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL); program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); kernel = clCreateKernel(program, "square", &err); //---------------------------------------------------------------------------- input1 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); input2 = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); err = clEnqueueWriteBuffer(commands, input1, CL_TRUE, 0, sizeof(float) * count, data1, 0, NULL, NULL); err = clEnqueueWriteBuffer(commands, input2, CL_TRUE, 0, sizeof(float) * count, data2, 0, NULL, NULL); err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input1); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input2); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 3, sizeof(int), &count); printf("*********************%d\n", local); err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); printf("*********************%d\n", local); global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); clReleaseMemObject(input1); clReleaseMemObject(input2); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); //printf("nKernel source:\n\n %s \n", KernelSource); free(KernelSource); }
void set_constants() { ce[0][0] = 2.0; ce[0][1] = 0.0; ce[0][2] = 0.0; ce[0][3] = 4.0; ce[0][4] = 5.0; ce[0][5] = 3.0; ce[0][6] = 0.5; ce[0][7] = 0.02; ce[0][8] = 0.01; ce[0][9] = 0.03; ce[0][10] = 0.5; ce[0][11] = 0.4; ce[0][12] = 0.3; ce[1][0] = 1.0; ce[1][1] = 0.0; ce[1][2] = 0.0; ce[1][3] = 0.0; ce[1][4] = 1.0; ce[1][5] = 2.0; ce[1][6] = 3.0; ce[1][7] = 0.01; ce[1][8] = 0.03; ce[1][9] = 0.02; ce[1][10] = 0.4; ce[1][11] = 0.3; ce[1][12] = 0.5; ce[2][0] = 2.0; ce[2][1] = 2.0; ce[2][2] = 0.0; ce[2][3] = 0.0; ce[2][4] = 0.0; ce[2][5] = 2.0; ce[2][6] = 3.0; ce[2][7] = 0.04; ce[2][8] = 0.03; ce[2][9] = 0.05; ce[2][10] = 0.3; ce[2][11] = 0.5; ce[2][12] = 0.4; ce[3][0] = 2.0; ce[3][1] = 2.0; ce[3][2] = 0.0; ce[3][3] = 0.0; ce[3][4] = 0.0; ce[3][5] = 2.0; ce[3][6] = 3.0; ce[3][7] = 0.03; ce[3][8] = 0.05; ce[3][9] = 0.04; ce[3][10] = 0.2; ce[3][11] = 0.1; ce[3][12] = 0.3; ce[4][0] = 5.0; ce[4][1] = 4.0; ce[4][2] = 3.0; ce[4][3] = 2.0; ce[4][4] = 0.1; ce[4][5] = 0.4; ce[4][6] = 0.3; ce[4][7] = 0.05; ce[4][8] = 0.04; ce[4][9] = 0.03; ce[4][10] = 0.1; ce[4][11] = 0.3; ce[4][12] = 0.2; c1 = 1.4; c2 = 0.4; c3 = 0.1; c4 = 1.0; c5 = 1.4; dnxm1 = 1.0 / (double)(grid_points[0]-1); dnym1 = 1.0 / (double)(grid_points[1]-1); dnzm1 = 1.0 / (double)(grid_points[2]-1); c1c2 = c1 * c2; c1c5 = c1 * c5; c3c4 = c3 * c4; c1345 = c1c5 * c3c4; conz1 = (1.0-c1c5); tx1 = 1.0 / (dnxm1 * dnxm1); tx2 = 1.0 / (2.0 * dnxm1); tx3 = 1.0 / dnxm1; ty1 = 1.0 / (dnym1 * dnym1); ty2 = 1.0 / (2.0 * dnym1); ty3 = 1.0 / dnym1; tz1 = 1.0 / (dnzm1 * dnzm1); tz2 = 1.0 / (2.0 * dnzm1); tz3 = 1.0 / dnzm1; dx1 = 0.75; dx2 = 0.75; dx3 = 0.75; dx4 = 0.75; dx5 = 0.75; dy1 = 0.75; dy2 = 0.75; dy3 = 0.75; dy4 = 0.75; dy5 = 0.75; dz1 = 1.0; dz2 = 1.0; dz3 = 1.0; dz4 = 1.0; dz5 = 1.0; dxmax = max(dx3, dx4); dymax = max(dy2, dy4); dzmax = max(dz2, dz3); dssp = 0.25 * max(dx1, max(dy1, dz1) ); c4dssp = 4.0 * dssp; c5dssp = 5.0 * dssp; dttx1 = dt*tx1; dttx2 = dt*tx2; dtty1 = dt*ty1; dtty2 = dt*ty2; dttz1 = dt*tz1; dttz2 = dt*tz2; c2dttx1 = 2.0*dttx1; c2dtty1 = 2.0*dtty1; c2dttz1 = 2.0*dttz1; dtdssp = dt*dssp; comz1 = dtdssp; comz4 = 4.0*dtdssp; comz5 = 5.0*dtdssp; comz6 = 6.0*dtdssp; c3c4tx3 = c3c4*tx3; c3c4ty3 = c3c4*ty3; c3c4tz3 = c3c4*tz3; dx1tx1 = dx1*tx1; dx2tx1 = dx2*tx1; dx3tx1 = dx3*tx1; dx4tx1 = dx4*tx1; dx5tx1 = dx5*tx1; dy1ty1 = dy1*ty1; dy2ty1 = dy2*ty1; dy3ty1 = dy3*ty1; dy4ty1 = dy4*ty1; dy5ty1 = dy5*ty1; dz1tz1 = dz1*tz1; dz2tz1 = dz2*tz1; dz3tz1 = dz3*tz1; dz4tz1 = dz4*tz1; dz5tz1 = dz5*tz1; c2iv = 2.5; con43 = 4.0/3.0; con16 = 1.0/6.0; xxcon1 = c3c4tx3*con43*tx3; xxcon2 = c3c4tx3*tx3; xxcon3 = c3c4tx3*conz1*tx3; xxcon4 = c3c4tx3*con16*tx3; xxcon5 = c3c4tx3*c1c5*tx3; yycon1 = c3c4ty3*con43*ty3; yycon2 = c3c4ty3*ty3; yycon3 = c3c4ty3*conz1*ty3; yycon4 = c3c4ty3*con16*ty3; yycon5 = c3c4ty3*c1c5*ty3; zzcon1 = c3c4tz3*con43*tz3; zzcon2 = c3c4tz3*tz3; zzcon3 = c3c4tz3*conz1*tz3; zzcon4 = c3c4tz3*con16*tz3; zzcon5 = c3c4tz3*c1c5*tz3; //------------------------------------------------------------------------ cl_int ecode; int i; for (i = 0; i < num_devices; i++) { ecode = clEnqueueWriteBuffer(cmd_queue[i], m_ce[i], CL_TRUE, 0, sizeof(double)*5*13, ce, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueWriteBuffer() for m_ce"); } //------------------------------------------------------------------------ }
int main( int argc, char* argv[] ) { //unsigned int n; // Length of vectors int m = atoi(argv[4]); //struct timespec start, finish; unsigned int n=(256*m); // Host input vectors int *h_a; int *h_b; // Host output vector int *h_c; double elapsed; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_kernel kernel; cl_platform_id* cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context //cl_command_queue* queue; // command queue //cl_command_queue queue; // command queue cl_program program; // program cl_platform_id* platforms; // platform id, // differnt for all the device we have in the system cl_uint platformCount; //keeps the divice count // Size, in bytes, of each vector size_t bytes = n*sizeof(int); // Allocate memory for each vector on host h_a = (int*)malloc(bytes); h_b = (int*)malloc(bytes); h_c = (int*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = i; h_b[i] = i; // printf("%d ",h_a[i]); } size_t globalSize, localSize; //similar to cuda cl_int err;//for errors int workgrp; int wrkitm; int num_ker; num_ker=atoi(argv[2]); wrkitm=atoi(argv[3]);// i have tried automating lots of data, //u can check my bash script // Number of work items in each local work group localSize = wrkitm ; //n=atoi(argv[1]); // Number of total work items - localSize must be devisor globalSize = n; //mallocing for array of queues (break through) cl_command_queue * queue = (cl_command_queue *)malloc(num_ker * sizeof(cl_command_queue)); //defining platform clGetPlatformIDs(0, NULL, &platformCount); cpPlatform = (cl_platform_id*) malloc(sizeof(cl_platform_id) * platformCount); clGetPlatformIDs(platformCount, cpPlatform, NULL);//what ever is returned from last step will be used here int choice = atoi(argv[1]); if(choice ==1) { // we can have CL_DEVICE_GPU or ACCELERATOR or ALL as an option here //depending what device are we working on // we can these multiple times depending on requirements err = clGetDeviceIDs(cpPlatform[0],CL_DEVICE_TYPE_CPU , 1, &device_id, NULL); if (err != CL_SUCCESS) printf("Error: Failed to create a device group!\n"); } else { // Get ID for the device err = clGetDeviceIDs(cpPlatform[1], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); } } context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); for(i=0;i<num_ker;++i) { queue[i] = clCreateCommandQueue(context, device_id, 0, &err); } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); //clock_gettime(CLOCK_MONOTONIC, &start); //struct timeval tim; // double t1,t2; // gettimeofday(&tim, NULL); // t1=tim.tv_sec+(tim.tv_usec/1000000.0); /* gettimeofday(&tim, NULL); t1=tim.tv_sec+(tim.tv_usec/1000000.0); */ // Write our data set into the input array in device memory for(i=0;i<num_ker;++i) { err = clEnqueueWriteBuffer(queue[i], d_a, CL_TRUE, 0,bytes, h_a, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue[i], d_b, CL_TRUE, 0,bytes, h_b, 0, NULL, NULL); } //clFinish(queue); // i know.. way to many APIs to be called in OpenCL // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err = clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Get the maximum work group size for executing the kernel on the device //localSize=256; // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(localSize), &localSize, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // timer for my evalutation //clock_t start=clock(); // clock_gettime(CLOCK_MONOTONIC, &start); // kernel part // Execute the kernel over the entire range of the data set // timing function struct timeval tim; double t1,t2; // gettimeofday(&tim, NULL); // t1=tim.tv_sec+(tim.tv_usec/1000000.0); gettimeofday(&tim, NULL); t1=tim.tv_sec+(tim.tv_usec/1000000.0); //printf("err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize,0, NULL, NULL\n"); for(i=0;i<num_ker;i++) { err = clEnqueueNDRangeKernel(queue[i], kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); } // Wait for the command queue to get serviced before reading back results //clock_gettime(CLOCK_MONOTONIC, &finish); //elapsed = (finish.tv_sec - start.tv_sec); //elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000.0; //clock_t finish =clock(); // Read the results from the device for(i=0;i<num_ker;++i) { clFinish(queue[i]); } gettimeofday(&tim, NULL); t2=tim.tv_sec+(tim.tv_usec/1000000.0); printf("%.6lf\t",(t2-t1)); for(i=0;i<num_ker;++i) { clEnqueueReadBuffer(queue[i], d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); } //clock_gettime(CLOCK_MONOTONIC, &finish); //elapsed = (finish.tv_nsec - start.tv_nsec); // elapsed += (finish.tv_nsec - start.tv_nsec)/ 1000000.0; for(i=0;i<num_ker;++i) { clFinish(queue[i]); }/* gettimeofday(&tim, NULL); t2=tim.tv_sec+(tim.tv_usec/1000000.0); printf(" %.4lf\t",(t2-t1)); */ //Sum up vector c and print result divided by n, this should equal 1 within error //int threads=globalSize/localSize; //double sum = 0; // for(i=0; i<n; i++) // printf("%d ", h_c[i]); //elapsed=(start-finish)/CLOCKS_PER_SEC; //printf("%d",globalSize); //printf("/%d ",localSize); //printf("threads = %d \n",threads); // printf("Time taken by GPU in MicroSec = %.6le\n ",elapsed); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); for(i=0;i<num_ker;++i) clReleaseCommandQueue(queue[i]); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
int mri( float* img, float complex* f, float* mask, float lambda, int N1, int N2) { int i, j; // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // Create a buffer object that will contain the data // from the host array A float complex* f0 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dx_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dy_new = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildex = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* dtildey = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u_fft2 = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* u = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* fftmul = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* Lap = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex* diff = (float complex*) calloc(N1*N2,sizeof(float complex)); float complex *w1 = (float complex*)malloc(((N2-1)*(N2-1)+1)*sizeof(float complex)); float complex *w2 = (float complex*)malloc(((N1-1)*(N1-1)+1)*sizeof(float complex)); float complex *buff = (float complex*)malloc(N2*N1*sizeof(float complex)); Lap(N1-1, N2-1) = 0.f; Lap(N1-1, 0) = 1.f; Lap(N1-1, 1) = 0.f; Lap(0, N2-1) = 1.f; Lap(0, 0) = -4.f; Lap(0, 1) = 1.f; Lap(1, N2-1) = 0.f; Lap(1, 0) = 1.f; Lap(1, 1) = 0.f; cl_mem cl_img = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_mask = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(float), NULL, &status); cl_mem cl_f = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_f0 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dx_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dy_new = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildex = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_dtildey = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u_fft2 = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_u = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_fftmul = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_Lap = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_diff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); cl_mem cl_w1 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N2*N2)*sizeof(cl_float2), NULL, &status); cl_mem cl_w2 = clCreateBuffer(context, CL_MEM_READ_WRITE, (N1*N1)*sizeof(cl_float2), NULL, &status); cl_mem cl_buff = clCreateBuffer(context, CL_MEM_READ_WRITE, N1*N2*sizeof(cl_float2), NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, cl_mask, CL_FALSE, 0, N1*N2*sizeof(float), mask, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_f, CL_FALSE, 0, N1*N2*sizeof(cl_float2), f, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_Lap, CL_FALSE, 0, N1*N2*sizeof(cl_float2), Lap, 0, NULL, NULL); cl_program program = clCreateProgramWithSource(context, 1, (const char**)&kernel, NULL, &status); status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); cl_kernel ker; size_t globalWorkSize[2]={N1,N2}; float sum = 0; for(i=0; i<N1; i++) for(j=0; j<N2; j++) sum += (SQR(crealf(f(i,j))/N1) + SQR(cimagf(f(i,j))/N1)); float normFactor = 1.f/sqrtf(sum); float scale = sqrtf(N1*N2); ker = clCreateKernel(program, "loop1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_float2), &normFactor); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); w1[0] = 1; w2[0] = 1; dft_init(&w1, &w2, &buff, N1, N2); status = clEnqueueWriteBuffer(cmdQueue, cl_w1, CL_FALSE, 0, ((N2-1)*(N2-1)+1)*sizeof(cl_float2), w1, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_w2, CL_FALSE, 0, ((N1-1)*(N1-1)+1)*sizeof(cl_float2), w2, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, cl_buff, CL_FALSE, 0, N1*N2*sizeof(cl_float2), buff, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "loop2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_Lap); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(float), &lambda); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker,2, NULL, globalWorkSize, NULL, 0, NULL, NULL); float complex *tmp = (float complex*)malloc(N2*N1*sizeof(float complex)); float complex *tmp2 = (float complex*)malloc(N2*N1*sizeof(float complex)); int OuterIter,iter; for(OuterIter= 0; OuterIter<MaxOutIter; OuterIter++) { for(iter = 0; iter<MaxIter; iter++) { ker = clCreateKernel(program, "loop3", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 3, sizeof(int), &N1); status = clSetKernelArg(ker, 4, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "dft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); ker = clCreateKernel(program, "dft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); //dft(diff, diff, w1, w2, buff, N1, N2); ker = clCreateKernel(program, "loop4", &status); int more = (iter == MaxIter - 1); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_fftmul); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_diff); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(int), &N1); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clSetKernelArg(ker, 6, sizeof(float), &scale); status = clSetKernelArg(ker, 7, sizeof(float), &lambda); status= clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft1", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "idft2", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_w1); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_w2); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_buff); status = clSetKernelArg(ker, 5, sizeof(int), &N1); status = clSetKernelArg(ker, 6, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); ker = clCreateKernel(program, "loop5", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_dx); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_dy); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_dtildex); status = clSetKernelArg(ker, 4, sizeof(cl_mem), &cl_dtildey); status = clSetKernelArg(ker, 5, sizeof(cl_mem), &cl_dx_new); status = clSetKernelArg(ker, 6, sizeof(cl_mem), &cl_dy_new); status = clSetKernelArg(ker, 7, sizeof(int), &N1); status = clSetKernelArg(ker, 8, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); } /* ker = clCreateKernel(program, "last_loop", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_f); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_f0); status = clSetKernelArg(ker, 2, sizeof(cl_mem), &cl_mask); status = clSetKernelArg(ker, 3, sizeof(cl_mem), &cl_u_fft2); status = clSetKernelArg(ker, 4, sizeof(float), &scale); status = clSetKernelArg(ker, 5, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) printf("error: %d\n", status); */ clEnqueueReadBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_f0, CL_TRUE, 0, N1*N2*sizeof(float), f0, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_u_fft2, CL_TRUE, 0, N1*N2*sizeof(float), u_fft2, 0, NULL, NULL); for(i=0;i<N1;i++) { for(j=0;j<N2;j++) { f(i,j) += f0(i,j) - mask(i,j)*u_fft2(i,j)/scale; } } clEnqueueWriteBuffer(cmdQueue, cl_f, CL_TRUE, 0, N1*N2*sizeof(float), f, 0, NULL, NULL); } ker = clCreateKernel(program, "loop7", &status); status = clSetKernelArg(ker, 0, sizeof(cl_mem), &cl_img); status = clSetKernelArg(ker, 1, sizeof(cl_mem), &cl_u); status = clSetKernelArg(ker, 2, sizeof(int), &N1); status = clSetKernelArg(ker, 3, sizeof(int), &N2); status = clEnqueueNDRangeKernel(cmdQueue, ker, 2, NULL, globalWorkSize, NULL, 0, NULL, NULL); clEnqueueReadBuffer(cmdQueue, cl_img, CL_TRUE, 0, N1*N2*sizeof(float), img, 0, NULL, NULL); clReleaseKernel(ker); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(cl_img); clReleaseMemObject(cl_mask); clReleaseMemObject(cl_f); clReleaseMemObject(cl_f0); clReleaseMemObject(cl_dx); clReleaseMemObject(cl_dy); clReleaseMemObject(cl_dx_new); clReleaseMemObject(cl_dy_new); clReleaseMemObject(cl_dtildex); clReleaseMemObject(cl_dtildey); clReleaseMemObject(cl_u_fft2); clReleaseMemObject(cl_u); clReleaseMemObject(cl_fftmul); clReleaseMemObject(cl_Lap); clReleaseMemObject(cl_diff); clReleaseMemObject(cl_w1); clReleaseMemObject(cl_w2); clReleaseMemObject(cl_buff); clReleaseContext(context); free(platforms); free(devices); free(w1); free(w2); free(buff); return 0; }
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; size_t N = 16; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_1D; size_t clLengths[1] = {N}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ X = (float *)malloc(N * 2 * sizeof(*X)); /* print input array */ printf("\nPerforming fft on an one dimensional array of size N = %ld\n", N); int print_iter = 0; while(print_iter<N) { float x = (float)print_iter; float y = (float)print_iter*3; X[2*print_iter ] = x; X[2*print_iter+1] = y; printf("(%f, %f) ", x, y); print_iter++; } printf("\n\nfft result: \n"); /* Prepare OpenCL memory objects and place data inside them. */ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, N * 2 * sizeof(*X), NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. */ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, N * 2 * sizeof( *X ), X, 0, NULL, NULL ); /* print output array */ print_iter = 0; while(print_iter<N) { printf("(%f, %f) ", X[2*print_iter], X[2*print_iter+1]); print_iter++; } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }