/* void jdsmv(int height, int len, float* value, int* perm, int* jds_ptr, int* col_index, float* vector, float* result){ int i; int col,row; int row_index =0; int prem_indicator=0; for (i=0; i<len; i++){ if (i>=jds_ptr[prem_indicator+1]){ prem_indicator++; row_index=0; } if (row_index<height){ col = col_index[i]; row = perm[row_index]; result[row]+=value[i]*vector[col]; } row_index++; } return; } */ int main(int argc, char** argv) { struct pb_TimerSet timers; struct pb_Parameters *parameters; printf("CPU-based sparse matrix vector multiplication****\n"); printf("Original version by Li-Wen Chang <*****@*****.**> and Shengzhao Wu<*****@*****.**>\n"); printf("This version maintained by Chris Rodrigues ***********\n"); parameters = pb_ReadParameters(&argc, argv); if ((parameters->inpFiles[0] == NULL) || (parameters->inpFiles[1] == NULL)) { fprintf(stderr, "Expecting two input filenames\n"); exit(-1); } pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); //parameters declaration int len; int depth; int dim; int pad=1; int nzcnt_len; //host memory allocation //matrix float *h_data; int *h_indices; int *h_ptr; int *h_perm; int *h_nzcnt; //vector float *h_Ax_vector; float *h_x_vector; //load matrix from files pb_SwitchToTimer(&timers, pb_TimerID_IO); //inputData(parameters->inpFiles[0], &len, &depth, &dim,&nzcnt_len,&pad, // &h_data, &h_indices, &h_ptr, // &h_perm, &h_nzcnt); int col_count; coo_to_jds( parameters->inpFiles[0], // bcsstk32.mtx, fidapm05.mtx, jgl009.mtx 1, // row padding pad, // warp size 1, // pack size 1, // is mirrored? 0, // binary matrix 1, // debug level [0:2] &h_data, &h_ptr, &h_nzcnt, &h_indices, &h_perm, &col_count, &dim, &len, &nzcnt_len, &depth ); h_Ax_vector=(float*)malloc(sizeof(float)*dim); h_x_vector=(float*)malloc(sizeof(float)*dim); // generate_vector(h_x_vector, dim); input_vec( parameters->inpFiles[1],h_x_vector,dim); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int p, i; //main execution for(p=0;p<50;p++) { #pragma omp parallel for for (i = 0; i < dim; i++) { int k; float sum = 0.0f; //int bound = h_nzcnt[i / 32]; int bound = h_nzcnt[i]; for(k=0;k<bound;k++ ) { int j = h_ptr[k] + i; int in = h_indices[j]; float d = h_data[j]; float t = h_x_vector[in]; sum += d*t; } // #pragma omp critical h_Ax_vector[h_perm[i]] = sum; } } if (parameters->outFile) { pb_SwitchToTimer(&timers, pb_TimerID_IO); outputData(parameters->outFile,h_Ax_vector,dim); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free (h_data); free (h_indices); free (h_ptr); free (h_perm); free (h_nzcnt); free (h_Ax_vector); free (h_x_vector); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); return 0; }
int main( int argc, char **argv ) { int n_bytes; int N, B; struct pb_TimerSet timers; struct pb_Parameters *params; params = pb_ReadParameters(&argc, argv); if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } int err = 0; if(argc != 3) err |= 1; else { char* numend; N = strtol(argv[1], &numend, 10); if(numend == argv[1]) err |= 2; B = strtol(argv[2], &numend, 10); if(numend == argv[2]) err |= 4; } if(err) { fprintf(stderr, "Expecting two integers for N and B\n"); exit(-1); } n_bytes = N*B*sizeof(float2); pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); float2 *source = (float2 *)malloc( n_bytes ); float2 *result = (float2 *)calloc( N*B, sizeof(float2) ); inputData(params->inpFiles[0],(float*)source,N*B*2); // OpenCL Code cl_int clErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(params); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram; cl_kernel fft_kernel; cl_mem d_source, d_work;//float2 *d_source, *d_work; cl_mem *data0, *data1; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); const char *source_path = "src/opencl_base/fft_kernel.cl"; char *sourceCode; sourceCode = readFile(source_path); if (sourceCode == NULL) { fprintf(stderr, "Could not load program source of '%s'\n", source_path); exit(1); } clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&sourceCode, NULL, &clErrNum); OCL_ERRCK_VAR(clErrNum); free(sourceCode); /* char compileOptions[1024]; // -cl-nv-verbose // Provides register info for NVIDIA devices // Set all Macros referenced by kernels sprintf(compileOptions, "\ -D PRESCAN_THREADS=%u\ -D KB=%u -D UNROLL=%u\ -D BINS_PER_BLOCK=%u -D BLOCK_X=%u", prescanThreads, lmemKB, UNROLL, bins_per_block, blockX ); */ OCL_ERRCK_RETVAL ( clBuildProgram(clProgram, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) ); char *build_log; size_t ret_val_size; OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) ); build_log = (char *)malloc(ret_val_size+1); OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) ); // to be careful, terminate with \0 build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); fft_kernel = clCreateKernel(clProgram, "GPU_FFT_Global", &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); // allocate & copy device memory d_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, source, &clErrNum); OCL_ERRCK_VAR(clErrNum); //result is initially zero'd out d_work = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, result, &clErrNum); OCL_ERRCK_VAR(clErrNum); size_t block[1] = { N/R }; size_t grid[1] = { B*block[0] }; OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 3, sizeof(int), &N) ); data0 = &d_source; data1 = &d_work; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int Ns = 1; Ns < N; Ns *= R) { OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 0, sizeof(int), &Ns) ); OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 1, sizeof(cl_mem), (void *)data0) ); OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 2, sizeof(cl_mem), (void *)data1) ); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, fft_kernel, 1, 0, grid, block, 0, 0, 0) ); cl_mem *tmp = data0; data0 = data1; data1 = tmp; } pb_SwitchToTimer(&timers, pb_TimerID_COPY); // copy device memory to host //cudaMemcpy(result, d_source, n_bytes,cudaMemcpyDeviceToHost); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, *data0, CL_TRUE, 0, // Offset in bytes n_bytes, // Size of data to read result, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(d_source) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(d_work) ); if (params->outFile) { /* Write result to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); outputData(params->outFile, (float*)result, N*B*2); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); } free(source); free(result); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); pb_DestroyTimerSet(&timers); pb_FreeParameters(params); return 0; }
int main (int argc, char* argv[]){ struct pb_Parameters* prms; struct pb_TimerSet timers; prms = pb_ReadParameters(&argc,argv); pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_NONE); char uksdata[250]; parameters params; FILE* uksfile_f = NULL; FILE* uksdata_f = NULL; strcpy(uksdata,prms->inpFiles[0]); strcat(uksdata,".data"); uksfile_f = fopen(prms->inpFiles[0],"r"); if (uksfile_f == NULL){ printf("ERROR: Could not open %s\n",prms->inpFiles[0]); exit(1); } printf("\nReading parameters\n"); if (argc >= 2){ params.binsize = atoi(argv[1]); } else { //default binsize value; params.binsize = 128; } setParameters(uksfile_f, ¶ms); pb_SwitchToTimer(&timers, pb_TimerID_IO); ReconstructionSample* samples; //Input Data // cl_mem samplesPin; float* LUT; //use look-up table for faster execution on CPU (intermediate data) unsigned int sizeLUT; //set in the function calculateLUT (intermediate data) cmplx* gridData; //Output Data float* sampleDensity; //Output Data // cl_mem gridDataPin; // cl_mem sampleDensityPin; cmplx* gridData_gold; //Gold Output Data float* sampleDensity_gold; //Gold Output Data cl_int ciErrNum; cl_platform_id clPlatform; cl_device_type deviceType = CL_DEVICE_TYPE_GPU; cl_device_id clDevice; cl_context clContext; int deviceFound = getOpenCLDevice(&clPlatform, &clDevice, &deviceType, 0); size_t max_alloc_size = 0; (void) clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(size_t), &max_alloc_size, 0); size_t global_mem_size = 0; (void) clGetDeviceInfo(clDevice, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(size_t), &global_mem_size, 0); size_t samples_size = params.numSamples*sizeof(ReconstructionSample); int gridNumElems = params.gridSize[0] * params.gridSize[1] * params.gridSize[2]; size_t output_size = gridNumElems*sizeof(cmplx); if ( (deviceFound < 0) || ((samples_size+output_size) > global_mem_size) || (samples_size > max_alloc_size) || (output_size > max_alloc_size ) ) { fprintf(stderr, "No suitable device was found\n"); if(deviceFound >= 0) { fprintf(stderr, "Memory requirements for this dataset exceed device capabilities\n"); } exit(1); } cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties) clPlatform, 0}; clContext = clCreateContextFromType(cps, deviceType, NULL, NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); cl_command_queue clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); pb_SetOpenCL(&clContext, &clCommandQueue); /* samplesPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, params.numSamples*sizeof(ReconstructionSample), NULL, &ciErrNum); */ samples = (ReconstructionSample *) malloc ( params.numSamples*sizeof(ReconstructionSample) ); /*(ReconstructionSample *) clEnqueueMapBuffer(clCommandQueue, samplesPin, CL_TRUE, CL_MAP_WRITE, 0, params.numSamples*sizeof(ReconstructionSample), 0, NULL, NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); */ if (samples == NULL){ printf("ERROR: Unable to allocate and map memory for input data\n"); exit(1); } uksdata_f = fopen(uksdata,"rb"); if(uksdata_f == NULL){ printf("ERROR: Could not open data file\n"); exit(1); } printf("Reading input data from files\n"); unsigned int n = readSampleData(params, uksdata_f, samples); fclose(uksdata_f); if (params.useLUT){ printf("Generating Look-Up Table\n"); float beta = PI * sqrt(4*params.kernelWidth*params.kernelWidth/(params.oversample*params.oversample) * (params.oversample-.5)*(params.oversample-.5)-.8); calculateLUT(beta, params.kernelWidth, &LUT, &sizeLUT); } pb_SwitchToTimer(&timers, pb_TimerID_NONE); gridData_gold = (cmplx*) calloc (gridNumElems, sizeof(cmplx)); sampleDensity_gold = (float*) calloc (gridNumElems, sizeof(float)); if (sampleDensity_gold == NULL || gridData_gold == NULL){ printf("ERROR: Unable to allocate memory for output data\n"); exit(1); } printf("Running gold version\n"); gridding_Gold(n, params, samples, LUT, sizeLUT, gridData_gold, sampleDensity_gold); printf("Running OpenCL version\n"); pb_SwitchToTimer(&timers, pb_TimerID_COPY); /* OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, samplesPin, CL_TRUE, 0, // Offset in bytes n*sizeof(ReconstructionSample), // Size of data to write samples, // Host Source 0, NULL, NULL) );*/ // OCL_ERRCK_RETVAL ( clFinish(clCommandQueue) ); /* gridDataPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, gridNumElems*sizeof(cmplx), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); */ gridData = (cmplx *) malloc ( gridNumElems*sizeof(cmplx) ); if (gridData == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); } /*(cmplx *) clEnqueueMapBuffer(clCommandQueue, gridDataPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(cmplx), 0, NULL, NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); */ /* sampleDensityPin = clCreateBuffer(clContext, CL_MEM_ALLOC_HOST_PTR, gridNumElems*sizeof(float), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); */ sampleDensity = (float *) malloc ( gridNumElems*sizeof(float) ); if (sampleDensity == NULL) { fprintf(stderr, "Could not allocate memory on host! (%s: %d)\n", __FILE__, __LINE__); exit(1); } /*(float *) clEnqueueMapBuffer(clCommandQueue, sampleDensityPin, CL_TRUE, CL_MAP_READ, 0, gridNumElems*sizeof(float), 0, NULL, NULL, &ciErrNum); */ OCL_ERRCK_VAR(ciErrNum); OCL_ERRCK_VAR(ciErrNum); if (sampleDensity == NULL || gridData == NULL){ printf("ERROR: Unable to allocate memory for output data\n"); exit(1); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); //Interface function to GPU implementation of gridding OpenCL_interface(&timers, n, params, samples, LUT, sizeLUT, gridData, sampleDensity, clContext, clCommandQueue, clDevice, workItemSizes); pb_SwitchToTimer(&timers, pb_TimerID_NONE); int passed=1; for (int i=0; i<gridNumElems; i++){ if(sampleDensity[i] != sampleDensity_gold[i]) { passed=0; break; } } //(passed) ? printf("Comparing GPU and Gold results... PASSED\n"):printf("Comparing GPU and Gold results... FAILED\n"); pb_SwitchToTimer(&timers, pb_TimerID_IO); FILE* outfile; if(!(outfile=fopen(prms->outFile,"w"))) { printf("Cannot open output file!\n"); } else { fwrite(&passed,sizeof(int),1,outfile); fclose(outfile); } pb_SwitchToTimer(&timers, pb_TimerID_NONE); if (params.useLUT){ free(LUT); } /* OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, samplesPin, samples, 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, gridDataPin, gridData, 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clEnqueueUnmapMemObject(clCommandQueue, sampleDensityPin, sampleDensity, 0, NULL, NULL) ); clReleaseMemObject(samplesPin); clReleaseMemObject(gridDataPin); clReleaseMemObject(sampleDensityPin); */ free(samples); free(gridData); free(sampleDensity); free(gridData_gold); free(sampleDensity_gold); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(prms); return 0; }
int main(int argc, char *argv[]) { Atoms *atom; LatticeDim lattice_dim; Lattice *cpu_lattice; Vec3 min_ext, max_ext; /* Bounding box of atoms */ Vec3 lo, hi; /* Bounding box with padding */ float h = 0.5f; /* Lattice spacing */ float cutoff = 12.f; /* Cutoff radius */ float exclcutoff = 1.f; /* Radius for exclusion */ float padding = 0.5f; /* Bounding box padding distance */ int n; struct pb_Parameters *parameters; struct pb_TimerSet timers; /* Read input parameters */ parameters = pb_ReadParameters(&argc, argv); if (parameters == NULL) { exit(1); } /* Expect one input file */ if (pb_Parameters_CountInputs(parameters) != 1) { fprintf(stderr, "Expecting one input file\n"); exit(1); } pb_InitializeTimerSet(&timers); pb_SwitchToTimer(&timers, pb_TimerID_IO); { const char *pqrfilename = parameters->inpFiles[0]; if (!(atom = read_atom_file(pqrfilename))) { fprintf(stderr, "read_atom_file() failed\n"); exit(1); } printf("read %d atoms from file '%s'\n", atom->size, pqrfilename); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); /* find extent of domain */ get_atom_extent(&min_ext, &max_ext, atom); printf("extent of domain is:\n"); printf(" minimum %g %g %g\n", min_ext.x, min_ext.y, min_ext.z); printf(" maximum %g %g %g\n", max_ext.x, max_ext.y, max_ext.z); printf("padding domain by %g Angstroms\n", padding); lo = (Vec3) {min_ext.x - padding, min_ext.y - padding, min_ext.z - padding}; hi = (Vec3) {max_ext.x + padding, max_ext.y + padding, max_ext.z + padding}; printf("domain lengths are %g by %g by %g\n", hi.x-lo.x, hi.y-lo.y, hi.z-lo.z); lattice_dim = lattice_from_bounding_box(lo, hi, h); cpu_lattice = create_lattice(lattice_dim); printf("\n"); /* * CPU kernel */ if (cpu_compute_cutoff_potential_lattice(cpu_lattice, cutoff, atom)) { fprintf(stderr, "Computation failed\n"); exit(1); } /* * Zero the lattice points that are too close to an atom. This is * necessary for numerical stability. */ if (remove_exclusions(cpu_lattice, exclcutoff, atom)) { fprintf(stderr, "remove_exclusions() failed for cpu lattice\n"); exit(1); } /* Print output */ pb_SwitchToTimer(&timers, pb_TimerID_IO); if (parameters->outFile) { write_lattice_summary(parameters->outFile, cpu_lattice); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); /* Cleanup */ destroy_lattice(cpu_lattice); free_atom(atom); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); return 0; }
int main(int argc, char* argv[]) { struct pb_TimerSet timers; struct pb_Parameters *parameters; printf("Base implementation of histogramming.\n"); printf("Maintained by Nady Obeid <*****@*****.**>\n"); parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } pb_InitializeTimerSet(&timers); char *inputStr = "Input"; char *outputStr = "Output"; pb_AddSubTimer(&timers, inputStr, pb_TimerID_IO); pb_AddSubTimer(&timers, outputStr, pb_TimerID_IO); pb_SwitchToSubTimer(&timers, inputStr, pb_TimerID_IO); unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); pb_SwitchToSubTimer(&timers, "Input", pb_TimerID_IO); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); int iter; for (iter = 0; iter < numIterations; iter++){ memset(histo,0,histo_height*histo_width*sizeof(unsigned char)); unsigned int i; for (i = 0; i < img_width*img_height; ++i) { const unsigned int value = img[i]; if (histo[value] < UINT8_MAX) { ++histo[value]; } } } // pb_SwitchToTimer(&timers, pb_TimerID_IO); pb_SwitchToSubTimer(&timers, outputStr, pb_TimerID_IO); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); return 0; }
int main(int argc, char* argv[]) { struct pb_TimerSet timers; struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } char oclOverhead[] = "OCL Overhead"; char prescans[] = "PreScanKernel"; char postpremems[] = "PostPreMems"; char intermediates[] = "IntermediatesKernel"; char mains[] = "MainKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, prescans, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, postpremems, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, mains, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; unsigned int lmemKB; unsigned int nThreads; unsigned int bins_per_block; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(parameters); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_int clStatus; cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram[4]; cl_kernel histo_prescan_kernel; cl_kernel histo_intermediates_kernel; cl_kernel histo_main_kernel; cl_kernel histo_final_kernel; int even_width = ((img_width+1)/2)*2; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_histo; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); long unsigned int lmemSize = 0; OCL_ERRCK_RETVAL ( clGetDeviceInfo(clDevice, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), &lmemSize, NULL) ); // lmemKB = lmemSize / 1024; // Should be valid, but not taken into consideration for initial programming if (lmemSize >= 48*1024) { lmemKB = 48; } else if (lmemSize >= 24*1024) { lmemKB = 24; } else { lmemKB = 8; } lmemKB = 24; bins_per_block = lmemKB * 1024; switch (lmemKB) { case 48: nThreads = 1024; break; case 24: nThreads = 768; break; default: nThreads = 512; break; } size_t program_length[4]; const char *source_path[4] = { "src/opencl_nvidia/histo_prescan.cl", "src/opencl_nvidia/histo_intermediates.cl", "src/opencl_nvidia/histo_main.cl","src/opencl_nvidia/histo_final.cl"}; char *source[4]; for (int i = 0; i < 4; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } char compileOptions[1024]; // -cl-nv-verbose // Provides register info for NVIDIA devices // Set all Macros referenced by kernels sprintf(compileOptions, "\ -D PRESCAN_THREADS=%u\ -D KB=%u -D UNROLL=%u\ -D BINS_PER_BLOCK=%u -D BLOCK_X=%u", PRESCAN_THREADS, lmemKB, UNROLL, bins_per_block, BLOCK_X ); for (int i = 0; i < 4; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, compileOptions, NULL, NULL) ); /* char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); */ } histo_prescan_kernel = clCreateKernel(clProgram[0], "histo_prescan_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_intermediates_kernel = clCreateKernel(clProgram[1], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_main_kernel = clCreateKernel(clProgram[2], "histo_main_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[3], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_IO); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, even_width*(((img_height+UNROLL)/UNROLL)*UNROLL)*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned short), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) malloc(sizeof(unsigned int) *img_width*histo_height); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } memset(zeroData, 0, img_width*histo_height*sizeof(unsigned int)); for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_FALSE, y*even_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 1, sizeof(unsigned int), &img_dim) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_prescan_kernel, 2, sizeof(cl_mem), (void *)&ranges) ); unsigned int half_width = (img_width+1)/2; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 3, sizeof(unsigned int), &half_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 4, sizeof(cl_mem), (void *)&sm_mappings) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 0, sizeof(cl_mem), (void *)&sm_mappings) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 1, sizeof(unsigned int), &img_dim) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 4, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 5, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 6, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 7, sizeof(cl_mem), (void *)&global_histo) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 8, sizeof(cl_mem), (void *)&global_overflow) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 4, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 5, sizeof(cl_mem), (void *)&global_histo) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 6, sizeof(cl_mem), (void *)&global_overflow) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 7, sizeof(cl_mem), (void *)&final_histo) ); size_t prescan_localWS[1] = {PRESCAN_THREADS}; size_t prescan_globalWS[1] = {PRESCAN_BLOCKS_X*prescan_localWS[0]}; size_t inter_localWS[1] = {(img_width+1)/2}; size_t inter_globalWS[1] = {((img_height + UNROLL-1)/UNROLL) * inter_localWS[0]}; size_t main_localWS[2] = {nThreads, 1}; size_t main_globalWS[2]; main_globalWS[0] = BLOCK_X * main_localWS[0]; size_t final_localWS[1] = {512}; size_t final_globalWS[1] = {BLOCK_X*3 * final_localWS[0]}; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX/2, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // __syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, prescans , pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_prescan_kernel, 1, 0, prescan_globalWS, prescan_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, postpremems , pb_TimerID_KERNEL); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to read ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes img_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); main_globalWS[1] = ranges_h[1]-ranges_h[0]+1; OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 2, sizeof(unsigned int), &ranges_h[0]) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_main_kernel, 3, sizeof(unsigned int), &ranges_h[1]) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &ranges_h[0]) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &ranges_h[1]) ); pb_SwitchToSubTimer(&timers, mains, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_main_kernel, 2, 0, main_globalWS, main_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_prescan_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_main_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[2]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[3]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_histo) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); pb_DestroyTimerSet(&timers); sleep(1); return 0; }
int main( int argc, char **argv ) { struct pb_TimerSet timers; struct pb_Parameters *params; int rf, k, nbins, npd, * npr; float *binb, w; long long *DD, *RRS, *DRS; size_t memsize; struct cartesian *data, *random; FILE *outfile; int offset = 0; Triolet_init(&argc, &argv); pb_InitializeTimerSet( &timers ); params = pb_ReadParameters( &argc, argv ); options args; parse_args( argc, argv, &args ); pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); nbins = (int)floor(bins_per_dec * (log10(max_arcmin) - log10(min_arcmin))); memsize = (nbins+2)*sizeof(long long); // memory for bin boundaries binb = (float *)malloc((nbins+1)*sizeof(float)); if (binb == NULL) { fprintf(stderr, "Unable to allocate memory\n"); exit(-1); } for (k = 0; k < nbins+1; k++) { binb[k] = cos(pow(10, log10(min_arcmin) + k*1.0/bins_per_dec) / 60.0*D2R); printf("%.10f\n", binb[k]); } // memory for DD DD = (long long*)malloc(memsize); if (DD == NULL) { fprintf(stderr, "Unable to allocate memory\n"); exit(-1); } bzero(DD, memsize); // memory for RR RRS = (long long*)malloc(memsize); if (RRS == NULL) { fprintf(stderr, "Unable to allocate memory\n"); exit(-1); } bzero(RRS, memsize); // memory for DR DRS = (long long*)malloc(memsize); if (DRS == NULL) { fprintf(stderr, "Unable to allocate memory\n"); exit(-1); } bzero(DRS, memsize); // memory for input data data = (struct cartesian*)malloc (args.npoints* sizeof(struct cartesian)); if (data == NULL) { fprintf(stderr, "Unable to allocate memory for % data points (#1)\n", args.npoints); return(0); } random = (struct cartesian*)malloc (args.npoints*sizeof(struct cartesian)); if (random == NULL) { fprintf(stderr, "Unable to allocate memory for % data points (#2)\n", args.npoints); return(0); } printf("Min distance: %f arcmin\n", min_arcmin); printf("Max distance: %f arcmin\n", max_arcmin); printf("Bins per dec: %i\n", bins_per_dec); printf("Total bins : %i\n", nbins); // read data file pb_SwitchToTimer( &timers, pb_TimerID_IO ); npd = readdatafile(params->inpFiles[0], data, args.npoints); pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); if (npd != args.npoints) { fprintf(stderr, "Error: read %i data points out of %i\n", npd, args.npoints); return(0); } // Marshal to Pyon tri_cartesian_dataset pyon_data = cartesian_to_arrays(data, npd, 1); // compute DD doComputeSelf(pyon_data, DD, nbins, 1, binb, &timers); npr = (int *) malloc(sizeof(int) * args.npoints); assert(npr != NULL); free(random); random = (struct cartesian *) malloc(sizeof(struct cartesian) * args.npoints * args.random_count); // loop through random data files for (rf = 0; rf < args.random_count; rf++) { // read random file pb_SwitchToTimer( &timers, pb_TimerID_IO ); npr[rf] = readdatafile(params->inpFiles[rf+1], &random[offset], args.npoints); pb_SwitchToTimer( &timers, pb_TimerID_COMPUTE ); offset += npr[rf]; if (npr[rf] != args.npoints) { fprintf(stderr, "Error: read %i random points out of %i in file %s\n", npr[rf], args.npoints, params->inpFiles[rf+1]); return(0); } } // compute RR tri_cartesian_dataset pyon_random = cartesian_to_arrays(random, npr[0], args.random_count); doComputeSelf(pyon_random, RRS, nbins, args.random_count, binb, &timers); // compute DR doComputeCross(pyon_data, pyon_random, DRS, nbins, args.random_count, binb, &timers); // compute and output results if ((outfile = fopen(params->outFile, "w")) == NULL) { fprintf(stderr, "Unable to open output file %s for writing, assuming stdout\n", params->outFile); outfile = stdout; } pb_SwitchToTimer( &timers, pb_TimerID_IO ); for (k = 1; k < nbins+1; k++) { fprintf(outfile, "%lld\n%lld\n%lld\n", DD[k], DRS[k], RRS[k]); } if(outfile != stdout) fclose(outfile); // free memory free(data); free(random); free(binb); free(DD); free(RRS); free(DRS); free(npr); pb_SwitchToTimer( &timers, pb_TimerID_NONE ); pb_PrintTimerSet( &timers ); pb_FreeParameters( params ); }
struct pb_Parameters * pb_ReadParameters(int *_argc, char **argv) { char *err_message; struct argparse ap; struct pb_Parameters *ret = (struct pb_Parameters *)malloc(sizeof(struct pb_Parameters)); /* Initialize the parameters structure */ ret->outFile = NULL; ret->inpFiles = (char **)malloc(sizeof(char *)); ret->inpFiles[0] = NULL; /* Each argument */ initialize_argparse(&ap, *_argc, argv); while(!is_end_of_arguments(&ap)) { char *arg = get_argument(&ap); /* Single-character flag */ if ((arg[0] == '-') && (arg[1] != 0) && (arg[2] == 0)) { delete_argument(&ap); /* This argument is consumed here */ switch(arg[1]) { case 'o': /* Output file name */ if (is_end_of_arguments(&ap)) { err_message = "Expecting file name after '-o'\n"; goto error; } free(ret->outFile); ret->outFile = strdup(consume_argument(&ap)); break; case 'i': /* Input file name */ if (is_end_of_arguments(&ap)) { err_message = "Expecting file name after '-i'\n"; goto error; } ret->inpFiles = read_string_array(consume_argument(&ap)); break; case '-': /* End of options */ goto end_of_options; default: err_message = "Unexpected command-line parameter\n"; goto error; } } else { /* Other parameters are ignored */ next_argument(&ap); } } /* end for each argument */ end_of_options: *_argc = ap.argc; /* Save the modified argc value */ finalize_argparse(&ap); return ret; error: fputs(err_message, stderr); pb_FreeParameters(ret); return NULL; }
int main(int argc, char* argv[]) { struct pb_Parameters *parameters; parameters = pb_ReadParameters(&argc, argv); if (!parameters) return -1; if(!parameters->inpFiles[0]){ fputs("Input file expected\n", stderr); return -1; } struct pb_TimerSet timers; char oclOverhead[] = "OCL Overhead"; char intermediates[] = "IntermediatesKernel"; char finals[] = "FinalKernel"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, intermediates, pb_TimerID_KERNEL); pb_AddSubTimer(&timers, finals, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_IO); int numIterations; if (argc >= 2){ numIterations = atoi(argv[1]); } else { fputs("Expected at least one command line argument\n", stderr); return -1; } unsigned int img_width, img_height; unsigned int histo_width, histo_height; FILE* f = fopen(parameters->inpFiles[0],"rb"); int result = 0; result += fread(&img_width, sizeof(unsigned int), 1, f); result += fread(&img_height, sizeof(unsigned int), 1, f); result += fread(&histo_width, sizeof(unsigned int), 1, f); result += fread(&histo_height, sizeof(unsigned int), 1, f); if (result != 4){ fputs("Error reading input and output dimensions from file\n", stderr); return -1; } unsigned int* img = (unsigned int*) malloc (img_width*img_height*sizeof(unsigned int)); unsigned char* histo = (unsigned char*) calloc (histo_width*histo_height, sizeof(unsigned char)); result = fread(img, sizeof(unsigned int), img_width*img_height, f); fclose(f); if (result != img_width*img_height){ fputs("Error reading input array from file\n", stderr); return -1; } cl_int ciErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_int clStatus; cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram[2]; cl_kernel histo_intermediates_kernel; cl_kernel histo_final_kernel; cl_mem input; cl_mem ranges; cl_mem sm_mappings; cl_mem global_subhisto; cl_mem global_overflow; cl_mem final_histo; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); cl_uint workItemDimensions; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_DIMENSIONS, sizeof(cl_uint), &workItemDimensions, NULL) ); size_t workItemSizes[workItemDimensions]; OCL_ERRCK_RETVAL( clGetDeviceInfo(clDevice, CL_DEVICE_MAX_WORK_ITEM_SIZES, workItemDimensions*sizeof(size_t), workItemSizes, NULL) ); size_t program_length[2]; const char *source_path[2] = { "src/opencl_naive/histo_intermediates.cl", "src/opencl_naive/histo_final.cl"}; char *source[4]; for (int i = 0; i < 2; ++i) { // Dynamically allocate buffer for source source[i] = oclLoadProgSource(source_path[i], "", &program_length[i]); if(!source[i]) { fprintf(stderr, "Could not load program source\n"); exit(1); } clProgram[i] = clCreateProgramWithSource(clContext, 1, (const char **)&source[i], &program_length[i], &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source[i]); } for (int i = 0; i < 2; ++i) { //fprintf(stderr, "Building Program #%d...\n", i); OCL_ERRCK_RETVAL ( clBuildProgram(clProgram[i], 1, &clDevice, NULL, NULL, NULL) ); #if 1 char *build_log; size_t ret_val_size; ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); OCL_ERRCK_VAR(ciErrNum); build_log = (char *)malloc(ret_val_size+1); ciErrNum = clGetProgramBuildInfo(clProgram[i], clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); OCL_ERRCK_VAR(ciErrNum); // to be carefully, terminate with \0 // there's no information in the reference whether the string is 0 terminated or not build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); #endif } histo_intermediates_kernel = clCreateKernel(clProgram[0], "histo_intermediates_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); histo_final_kernel = clCreateKernel(clProgram[1], "histo_final_kernel", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); input = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); ranges = clCreateBuffer(clContext, CL_MEM_READ_WRITE, 2*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); sm_mappings = clCreateBuffer(clContext, CL_MEM_READ_WRITE, img_width*img_height*4*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_subhisto = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); global_overflow = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned int), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); final_histo = clCreateBuffer(clContext, CL_MEM_READ_WRITE, histo_width*histo_height*sizeof(unsigned char), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); // Must dynamically allocate. Too large for stack unsigned int *zeroData; zeroData = (unsigned int *) calloc(img_width*histo_height, sizeof(unsigned int)); if (zeroData == NULL) { fprintf(stderr, "Failed to allocate %ld bytes of memory on host!\n", sizeof(unsigned int) * img_width * histo_height); exit(1); } for (int y=0; y < img_height; y++){ OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, input, CL_TRUE, y*img_width*sizeof(unsigned int), // Offset in bytes img_width*sizeof(unsigned int), // Size of data to write &img[y*img_width], // Host Source 0, NULL, NULL) ); } pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); unsigned int img_dim = img_height*img_width; OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 0, sizeof(cl_mem), (void *)&input) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 1, sizeof(unsigned int), &img_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_intermediates_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 0, sizeof(unsigned int), &histo_height) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 1, sizeof(unsigned int), &histo_width) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 2, sizeof(cl_mem), (void *)&global_subhisto) ); OCL_ERRCK_RETVAL( clSetKernelArg(histo_final_kernel, 3, sizeof(cl_mem), (void *)&final_histo) ); size_t inter_localWS[1] = { workItemSizes[0] }; size_t inter_globalWS[1] = { img_height * inter_localWS[0] }; size_t final_localWS[1] = { workItemSizes[0] }; size_t final_globalWS[1] = {((histo_height*histo_width+(final_localWS[0]-1)) / final_localWS[0])*final_localWS[0] }; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); for (int iter = 0; iter < numIterations; iter++) { unsigned int ranges_h[2] = {UINT32_MAX, 0}; // how about something like // __global__ unsigned int ranges[2]; // ...kernel // __shared__ unsigned int s_ranges[2]; // if (threadIdx.x == 0) {s_ranges[0] = ranges[0]; s_ranges[1] = ranges[1];} // __syncthreads(); // Although then removing the blocking cudaMemcpy's might cause something about // concurrent kernel execution. // If kernel launches are synchronous, then how can 2 kernels run concurrently? different host threads? OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, ranges, CL_TRUE, 0, // Offset in bytes 2*sizeof(unsigned int), // Size of data to write ranges_h, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, global_subhisto, CL_TRUE, 0, // Offset in bytes histo_width*histo_height*sizeof(unsigned int), // Size of data to write zeroData, // Host Source 0, NULL, NULL) ); pb_SwitchToSubTimer(&timers, intermediates, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_intermediates_kernel /*histo_intermediates_kernel*/, 1, 0, inter_globalWS, inter_localWS, 0, 0, 0) ); pb_SwitchToSubTimer(&timers, finals, pb_TimerID_KERNEL); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, histo_final_kernel, 1, 0, final_globalWS, final_localWS, 0, 0, 0) ); } pb_SwitchToTimer(&timers, pb_TimerID_IO); OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, final_histo, CL_TRUE, 0, // Offset in bytes histo_height*histo_width*sizeof(unsigned char), // Size of data to read histo, // Host Source 0, NULL, NULL) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_intermediates_kernel) ); OCL_ERRCK_RETVAL ( clReleaseKernel(histo_final_kernel) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[0]) ); OCL_ERRCK_RETVAL ( clReleaseProgram(clProgram[1]) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(input) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(ranges) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(sm_mappings) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_subhisto) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(global_overflow) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(final_histo) ); if (parameters->outFile) { dump_histo_img(histo, histo_height, histo_width, parameters->outFile); } pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(zeroData); free(img); free(histo); pb_SwitchToTimer(&timers, pb_TimerID_NONE); printf("\n"); pb_PrintTimerSet(&timers); pb_FreeParameters(parameters); pb_DestroyTimerSet(&timers); OCL_ERRCK_RETVAL ( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL ( clReleaseContext(clContext) ); return 0; }
int main(int argc, char **argv) { struct image_i16 *ref_image; struct image_i16 *cur_image; unsigned short *sads_computed; /* SADs generated by the program */ int image_size_bytes; int image_width_macroblocks, image_height_macroblocks; int image_size_macroblocks; struct pb_TimerSet timers; struct pb_Parameters *params; char oclOverhead[]= "OpenCL Overhead"; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); params = pb_ReadParameters(&argc, argv); if (pb_Parameters_CountInputs(params) != 2) { fprintf(stderr, "Expecting two input filenames\n"); exit(-1); } /* Read input files */ pb_SwitchToTimer(&timers, pb_TimerID_IO); ref_image = load_image(params->inpFiles[0]); cur_image = load_image(params->inpFiles[1]); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); if ((ref_image->width != cur_image->width) || (ref_image->height != cur_image->height)) { fprintf(stderr, "Input images must be the same size\n"); exit(-1); } if ((ref_image->width % 16) || (ref_image->height % 16)) { fprintf(stderr, "Input image size must be an integral multiple of 16\n"); exit(-1); } /* Compute parameters, allocate memory */ image_size_bytes = ref_image->width * ref_image->height * sizeof(short); image_width_macroblocks = ref_image->width >> 4; image_height_macroblocks = ref_image->height >> 4; image_size_macroblocks = image_width_macroblocks * image_height_macroblocks; sads_computed = (unsigned short *) malloc(41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(short)); // Run the kernel code // ************************************************************************ cl_int ciErrNum; cl_command_queue clCommandQueue; cl_kernel mb_sad_calc; cl_kernel larger_sad_calc_8; cl_kernel larger_sad_calc_16; cl_mem imgRef; /* Reference image on the device */ cl_mem d_cur_image; /* Current image on the device */ cl_mem d_sads; /* SADs on the device */ // x : image_width_macroblocks // y : image_height_macroblocks pb_Context* pb_context; pb_context = pb_InitOpenCLContext(params); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_int clStatus; cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); // Read Source Code File size_t program_length; const char* source_path = "src/opencl_base/kernel.cl"; char* source = oclLoadProgSource(source_path, "", &program_length); if(!source) { fprintf(stderr, "Could not load program source\n"); exit(1); } cl_program clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&source, &program_length, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(source); // JIT Compilation Options char compileOptions[1024]; // -cl-nv-verbose sprintf(compileOptions, "\ -D MAX_POS=%u -D CEIL_POS=%u\ -D POS_PER_THREAD=%u -D MAX_POS_PADDED=%u\ -D THREADS_W=%u -D THREADS_H=%u\ -D SEARCH_RANGE=%u -D SEARCH_DIMENSION=%u\ \0", MAX_POS, CEIL(MAX_POS, POS_PER_THREAD), POS_PER_THREAD, MAX_POS_PADDED, THREADS_W, THREADS_H, SEARCH_RANGE, SEARCH_DIMENSION ); printf ("options = %s\n", compileOptions); OCL_ERRCK_RETVAL( clBuildProgram(clProgram, 1, &clDevice, compileOptions, NULL, NULL) ); /* char *build_log; size_t ret_val_size; OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) ); build_log = (char *)malloc(ret_val_size+1); OCL_ERRCK_RETVAL( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) ); // Null terminate (original writer wasn't sure) build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); */ mb_sad_calc = clCreateKernel(clProgram, "mb_sad_calc", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); larger_sad_calc_8 = clCreateKernel(clProgram, "larger_sad_calc_8", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); larger_sad_calc_16 = clCreateKernel(clProgram, "larger_sad_calc_16", &ciErrNum); OCL_ERRCK_VAR(ciErrNum); size_t wgSize; size_t comp_wgSize[3]; cl_ulong localMemSize; size_t prefwgSizeMult; cl_ulong privateMemSize; pb_SwitchToTimer(&timers, pb_TimerID_COPY); #if 0 cl_image_format img_format; img_format.image_channel_order = CL_R; img_format.image_channel_data_type = CL_UNSIGNED_INT16; /* Transfer reference image to device */ imgRef = clCreateImage2D(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, &img_format, ref_image->width /** sizeof(unsigned short)*/, // width ref_image->height, // height ref_image->width * sizeof(unsigned short), // row_pitch ref_image->data, &ciErrNum); #endif #if 1 imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY, ref_image->width * ref_image->height * sizeof(unsigned short), NULL, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); OCL_ERRCK_RETVAL( clEnqueueWriteBuffer(clCommandQueue, imgRef, CL_TRUE, 0, ref_image->width * ref_image->height * sizeof(unsigned short), ref_image->data, 0, NULL, NULL) ); #else imgRef = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, ref_image->width * ref_image->height * sizeof(unsigned short), ref_image->data, &ciErrNum); printf ("Allocating %d bytes\n", ref_image->width * ref_image->height * sizeof(unsigned short)); #endif OCL_ERRCK_VAR(ciErrNum); /* Allocate SAD data on the device */ unsigned short *tmpZero = (unsigned short *)calloc(41 * MAX_POS_PADDED * image_size_macroblocks, sizeof(unsigned short)); /* size_t max_alloc_size = 0; clGetDeviceInfo(clDevice, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(max_alloc_size), &max_alloc_size, NULL); if (max_alloc_size < (41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short))) { fprintf(stderr, "Can't allocate sad buffer: max alloc size is %dMB\n", (int) (max_alloc_size >> 20)); exit(-1); } */ d_sads = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), tmpZero, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); free(tmpZero); d_cur_image = clCreateBuffer(clContext, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, image_size_bytes, cur_image->data, &ciErrNum); OCL_ERRCK_VAR(ciErrNum); /* Set Kernel Parameters */ OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 0, sizeof(cl_mem), (void *)&d_sads) ); OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 1, sizeof(cl_mem), (void *)&d_cur_image) ); OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 2, sizeof(int), &image_width_macroblocks) ); OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 3, sizeof(int), &image_height_macroblocks) ); OCL_ERRCK_RETVAL( clSetKernelArg(mb_sad_calc, 4, sizeof(cl_mem), (void *)&imgRef) ); OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 0, sizeof(cl_mem), (void *)&d_sads) ); OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 1, sizeof(int), &image_width_macroblocks) ); OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_8, 2, sizeof(int), &image_height_macroblocks) ); OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 0, sizeof(cl_mem), (void *)&d_sads) ); OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 1, sizeof(int), &image_width_macroblocks) ); OCL_ERRCK_RETVAL( clSetKernelArg(larger_sad_calc_16, 2, sizeof(int), &image_height_macroblocks) ); size_t mb_sad_calc_localWorkSize[2] = { CEIL(MAX_POS, POS_PER_THREAD) * THREADS_W * THREADS_H, 1 }; size_t mb_sad_calc_globalWorkSize[2] = { mb_sad_calc_localWorkSize[0] * CEIL(ref_image->width / 4, THREADS_W), mb_sad_calc_localWorkSize[1] * CEIL(ref_image->height / 4, THREADS_H) }; size_t larger_sad_calc_8_localWorkSize[2] = {32,4}; size_t larger_sad_calc_8_globalWorkSize[2] = {image_width_macroblocks * 32, image_height_macroblocks * 4}; size_t larger_sad_calc_16_localWorkSize[2] = {32, 1}; size_t larger_sad_calc_16_globalWorkSize[2] = {image_width_macroblocks * 32, image_height_macroblocks * 1}; pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); /* Run the 4x4 kernel */ printf ("DBlock = %dx%d\n", mb_sad_calc_localWorkSize[1], mb_sad_calc_localWorkSize[0]); OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, mb_sad_calc, 2, 0, mb_sad_calc_globalWorkSize, mb_sad_calc_localWorkSize, 0, 0, 0) ); /* Run the larger-blocks kernels */ OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_8, 2, 0, larger_sad_calc_8_globalWorkSize, larger_sad_calc_8_localWorkSize, 0, 0, 0) ); OCL_ERRCK_RETVAL( clEnqueueNDRangeKernel(clCommandQueue, larger_sad_calc_16, 2, 0, larger_sad_calc_16_globalWorkSize, larger_sad_calc_16_localWorkSize, 0, 0, 0) ); OCL_ERRCK_RETVAL( clFinish(clCommandQueue) ); pb_SwitchToTimer(&timers, pb_TimerID_COPY); /* Transfer SAD data to the host */ OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_sads, CL_TRUE, 0, 41 * MAX_POS_PADDED * image_size_macroblocks * sizeof(unsigned short), sads_computed, 0, NULL, NULL) ); /* Free GPU memory */ OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_8) ); OCL_ERRCK_RETVAL( clReleaseKernel(larger_sad_calc_16) ); OCL_ERRCK_RETVAL( clReleaseProgram(clProgram) ); OCL_ERRCK_RETVAL( clReleaseMemObject(d_sads) ); OCL_ERRCK_RETVAL( clReleaseMemObject(imgRef) ); OCL_ERRCK_RETVAL( clReleaseMemObject(d_cur_image) ); OCL_ERRCK_RETVAL( clFinish(clCommandQueue) ); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); // ************************************************************************ // End GPU Code /* Print output */ if (params->outFile) { pb_SwitchToTimer(&timers, pb_TimerID_IO); write_sads(params->outFile, image_width_macroblocks, image_height_macroblocks, sads_computed); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); } #if 0 /* Debugging */ print_test_sads(sads_computed, image_size_macroblocks); write_sads_directly("sad-debug.bin", ref_image->width / 16, ref_image->height / 16, sads_computed); #endif /* Free memory */ free(sads_computed); free_image(ref_image); free_image(cur_image); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); pb_FreeParameters(params); OCL_ERRCK_RETVAL( clReleaseCommandQueue(clCommandQueue) ); OCL_ERRCK_RETVAL( clReleaseContext(clContext) ); pb_DestroyTimerSet(&timers); return 0; }