CLWProgram CLWProgram::CreateFromSource(char const* sourcecode, size_t sourcesize, char const** headers, char const** headernames, size_t* headersizes, int numheaders, CLWContext context) { cl_int status = CL_SUCCESS; std::vector<cl_device_id> deviceIds(context.GetDeviceCount()); for(unsigned int i = 0; i < context.GetDeviceCount(); ++i) { deviceIds[i] = context.GetDevice(i); } char const* buildopts = #if defined(__APPLE__) "-D APPLE -cl-mad-enable -cl-fast-relaxed-math -cl-std=CL1.2 -I ." #elif defined(_WIN32) || defined (WIN32) "-D WIN32 -cl-mad-enable -cl-fast-relaxed-math -cl-std=CL1.2 -I." #elif defined(__linux__) "-D __linux__ -I." #else nullptr #endif ; std::vector<cl_program> headerPrograms(numheaders); for (int i=0; i<numheaders; ++i) { size_t sourceSize = headersizes[i]; char const* tempPtr = headers[i]; headerPrograms[i] = clCreateProgramWithSource(context, 1, (const char**)&tempPtr, &sourceSize, &status); ThrowIf(status != CL_SUCCESS, status, "clCreateProgramWithSource failed"); } cl_program program = clCreateProgramWithSource(context, 1, (const char**)&sourcecode, &sourcesize, &status); ThrowIf(status != CL_SUCCESS, status, "clCreateProgramWithSource failed"); status = clCompileProgram(program, context.GetDeviceCount(), &deviceIds[0], buildopts, numheaders, &headerPrograms[0], headernames, nullptr, nullptr); if(status != CL_SUCCESS) { std::vector<char> buildLog; size_t logSize; clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); buildLog.resize(logSize); clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], nullptr); #ifdef _DEBUG std::cout << &buildLog[0] << "\n"; #endif throw CLWException(status, std::string(&buildLog[0])); } status = clBuildProgram(program, context.GetDeviceCount(), &deviceIds[0], buildopts, nullptr, nullptr); if(status != CL_SUCCESS) { std::vector<char> buildLog; size_t logSize; clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, 0, nullptr, &logSize); buildLog.resize(logSize); clGetProgramBuildInfo(program, deviceIds[0], CL_PROGRAM_BUILD_LOG, logSize, &buildLog[0], nullptr); #ifdef _DEBUG std::cout << &buildLog[0] << "\n"; #endif throw CLWException(status, std::string(&buildLog[0])); } CLWProgram prg(program); clReleaseProgram(program); return prg; }
void buildOpenCLKernels_calc_dt_kernel_print(int xdim0, int xdim1, int xdim2, int xdim3, int xdim4, int xdim5) { //int ocl_fma = OCL_FMA; if(!isbuilt_calc_dt_kernel_print) { buildOpenCLKernels(); //clSafeCall( clUnloadCompiler() ); cl_int ret; char* source_filename[1] = {"./OpenCL/calc_dt_kernel_print.cl"}; // Load the kernel source code into the array source_str FILE *fid; char *source_str[1]; size_t source_size[1]; for(int i=0; i<1; i++) { fid = fopen(source_filename[i], "r"); if (!fid) { fprintf(stderr, "Can't open the kernel source file!\n"); exit(1); } source_str[i] = (char*)malloc(4*0x1000000); source_size[i] = fread(source_str[i], 1, 4*0x1000000, fid); if(source_size[i] != 4*0x1000000) { if (ferror(fid)) { printf ("Error while reading kernel source file %s\n", source_filename[i]); exit(-1); } if (feof(fid)) printf ("Kernel source file %s succesfuly read.\n", source_filename[i]); //printf("%s\n",source_str[i]); } fclose(fid); } printf("Compiling calc_dt_kernel_print %d source -- start \n",OCL_FMA); // Create a program from the source OPS_opencl_core.program = clCreateProgramWithSource(OPS_opencl_core.context, 1, (const char **) &source_str, (const size_t *) &source_size, &ret); clSafeCall( ret ); // Build the program char buildOpts[255*7]; char* pPath = NULL; pPath = getenv ("OPS_INSTALL_PATH"); if (pPath!=NULL) if(OCL_FMA) sprintf(buildOpts,"-cl-mad-enable -DOCL_FMA -I%s/include -DOPS_WARPSIZE=%d -Dxdim0_calc_dt_kernel_print=%d -Dxdim1_calc_dt_kernel_print=%d -Dxdim2_calc_dt_kernel_print=%d -Dxdim3_calc_dt_kernel_print=%d -Dxdim4_calc_dt_kernel_print=%d -Dxdim5_calc_dt_kernel_print=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); else sprintf(buildOpts,"-cl-mad-enable -I%s/include -DOPS_WARPSIZE=%d -Dxdim0_calc_dt_kernel_print=%d -Dxdim1_calc_dt_kernel_print=%d -Dxdim2_calc_dt_kernel_print=%d -Dxdim3_calc_dt_kernel_print=%d -Dxdim4_calc_dt_kernel_print=%d -Dxdim5_calc_dt_kernel_print=%d ", pPath, 32,xdim0,xdim1,xdim2,xdim3,xdim4,xdim5); else { sprintf("Incorrect OPS_INSTALL_PATH %s\n",pPath); exit(EXIT_FAILURE); } ret = clBuildProgram(OPS_opencl_core.program, 1, &OPS_opencl_core.device_id, buildOpts, NULL, NULL); if(ret != CL_SUCCESS) { char* build_log; size_t log_size; clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size) ); build_log = (char*) malloc(log_size+1); clSafeCall( clGetProgramBuildInfo(OPS_opencl_core.program, OPS_opencl_core.device_id, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL) ); build_log[log_size] = '\0'; fprintf(stderr, "=============== OpenCL Program Build Info ================\n\n%s", build_log); fprintf(stderr, "\n========================================================= \n"); free(build_log); exit(EXIT_FAILURE); } printf("compiling calc_dt_kernel_print -- done\n"); // Create the OpenCL kernel OPS_opencl_core.kernel[30] = clCreateKernel(OPS_opencl_core.program, "ops_calc_dt_kernel_print", &ret); clSafeCall( ret ); isbuilt_calc_dt_kernel_print = true; } }
/** * The implementation of the particle filter using OpenMP for many frames * @see http://openmp.org/wp/ * @note This function is designed to work with a video of several frames. In addition, it references a provided MATLAB function which takes the video, the objxy matrix and the x and y arrays as arguments and returns the likelihoods * @param I The video to be run * @param IszX The x dimension of the video * @param IszY The y dimension of the video * @param Nfr The number of frames * @param seed The seed array used for random number generation * @param Nparticles The number of particles to be used */ int particleFilter(unsigned char * I, int IszX, int IszY, int Nfr, int * seed, int Nparticles) { int max_size = IszX * IszY*Nfr; //original particle centroid double xe = roundDouble(IszY / 2.0); double ye = roundDouble(IszX / 2.0); //expected object locations, compared to center int radius = 5; int diameter = radius * 2 - 1; int * disk = (int*) calloc(diameter * diameter, sizeof (int)); strelDisk(disk, radius); int countOnes = 0; int x, y; for (x = 0; x < diameter; x++) { for (y = 0; y < diameter; y++) { if (disk[x * diameter + y] == 1) countOnes++; } } int * objxy = (int *) calloc(countOnes * 2, sizeof(int)); getneighbors(disk, countOnes, objxy, radius); //initial weights are all equal (1/Nparticles) double * weights = (double *) calloc(Nparticles, sizeof(double)); for (x = 0; x < Nparticles; x++) { weights[x] = 1 / ((double) (Nparticles)); } /**************************************************************** ************** B E G I N A L L O C A T E ******************* ****************************************************************/ /***** kernel variables ******/ cl_kernel kernel_likelihood; cl_kernel kernel_sum; cl_kernel kernel_normalize_weights; cl_kernel kernel_find_index; int sourcesize = 2048 * 2048; char * source = (char *) calloc(sourcesize, sizeof (char)); if (!source) { printf("ERROR: calloc(%d) failed\n", sourcesize); return -1; } // read the kernel core source char * tempchar = "./particle_double.cl"; FILE * fp = fopen(tempchar, "rb"); if (!fp) { printf("ERROR: unable to open '%s'\n", tempchar); return -1; } fread(source + strlen(source), sourcesize, 1, fp); fclose(fp); // OpenCL initialization int use_gpu = 1; if (initialize(use_gpu)) return -1; // compile kernel cl_int err = 0; const char * slist[2] = {source, 0}; cl_program prog = clCreateProgramWithSource(context, 1, slist, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateProgramWithSource() => %d\n", err); return -1; } err = clBuildProgram(prog, 1, device_list, "-cl-fast-relaxed-math", NULL, NULL); if (err != CL_SUCCESS) { if (err == CL_INVALID_PROGRAM) printf("CL_INVALID_PROGRAM\n"); else if (err == CL_INVALID_VALUE) printf("CL_INVALID_VALUE\n"); else if (err == CL_INVALID_DEVICE) printf("CL_INVALID_DEVICE\n"); else if (err == CL_INVALID_BINARY) printf("CL_INVALID_BINARY\n"); else if (err == CL_INVALID_BUILD_OPTIONS) printf("CL_INVALID_BUILD_OPTIONS\n"); else if (err == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n"); else if (err == CL_COMPILER_NOT_AVAILABLE) printf("CL_COMPILER_NOT_AVAILABLE\n"); else if (err == CL_BUILD_PROGRAM_FAILURE) printf("CL_BUILD_PROGRAM_FAILURE\n"); else if (err == CL_INVALID_OPERATION) printf("CL_INVALID_OPERATION\n"); else if (err == CL_OUT_OF_RESOURCES) printf("CL_OUT_OF_RESOURCES\n"); else if (err == CL_OUT_OF_HOST_MEMORY) printf("CL_OUT_OF_HOST_MEMORY\n"); printf("ERROR: clBuildProgram() => %d\n", err); static char log[65536]; memset(log, 0, sizeof (log)); err = clGetProgramBuildInfo(prog, device_list[0], CL_PROGRAM_BUILD_LOG, sizeof (log) - 1, log, NULL); if (err != CL_SUCCESS) { printf("ERROR: clGetProgramBuildInfo() => %d\n", err); } if (strstr(log, "warning:") || strstr(log, "error:")) printf("<<<<\n%s\n>>>>\n", log); } // { // show warnings/errors // static char log[65536]; // memset(log, 0, sizeof (log)); // cl_device_id device_id[2] = {0}; // err = clGetContextInfo(context, CL_CONTEXT_DEVICES, sizeof (device_id), device_id, NULL); // if (err != CL_SUCCESS) { // if (err == CL_INVALID_CONTEXT) // printf("ERROR: clGetContextInfo() => CL_INVALID_CONTEXT\n"); // if (err == CL_INVALID_VALUE) // printf("ERROR: clGetContextInfo() => CL_INVALID_VALUE\n"); // } // }//*/ char * s_likelihood_kernel = "likelihood_kernel"; char * s_sum_kernel = "sum_kernel"; char * s_normalize_weights_kernel = "normalize_weights_kernel"; char * s_find_index_kernel = "find_index_kernel"; kernel_likelihood = clCreateKernel(prog, s_likelihood_kernel, &err); if (err != CL_SUCCESS) { if (err == CL_INVALID_PROGRAM) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM %d\n", err); if (err == CL_INVALID_PROGRAM_EXECUTABLE) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID PROGRAM EXECUTABLE %d\n", err); if (err == CL_INVALID_KERNEL_NAME) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL NAME %d\n", err); if (err == CL_INVALID_KERNEL_DEFINITION) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID KERNEL DEFINITION %d\n", err); if (err == CL_INVALID_VALUE) printf("ERROR: clCreateKernel(likelihood_kernel) 0 => INVALID CL_INVALID_VALUE %d\n", err); printf("ERROR: clCreateKernel(likelihood_kernel) failed.\n"); return -1; } kernel_sum = clCreateKernel(prog, s_sum_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(sum_kernel) 0 => %d\n", err); return -1; } kernel_normalize_weights = clCreateKernel(prog, s_normalize_weights_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(normalize_weights_kernel) 0 => %d\n", err); return -1; } kernel_find_index = clCreateKernel(prog, s_find_index_kernel, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateKernel(find_index_kernel) 0 => %d\n", err); return -1; } //initial likelihood to 0.0 double * likelihood = (double *) calloc(Nparticles + 1, sizeof (double)); double * arrayX = (double *) calloc(Nparticles, sizeof (double)); double * arrayY = (double *) calloc(Nparticles, sizeof (double)); double * xj = (double *) calloc(Nparticles, sizeof (double)); double * yj = (double *) calloc(Nparticles, sizeof (double)); double * CDF = (double *) calloc(Nparticles, sizeof(double)); //GPU copies of arrays cl_mem arrayX_GPU; cl_mem arrayY_GPU; cl_mem xj_GPU; cl_mem yj_GPU; cl_mem CDF_GPU; cl_mem likelihood_GPU; cl_mem I_GPU; cl_mem weights_GPU; cl_mem objxy_GPU; int * ind = (int*) calloc(countOnes, sizeof(int)); cl_mem ind_GPU; double * u = (double *) calloc(Nparticles, sizeof(double)); cl_mem u_GPU; cl_mem seed_GPU; cl_mem partial_sums; //OpenCL memory allocation arrayX_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err); return -1; } arrayY_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err); return -1; } xj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer xj_GPU (size:%d) => %d\n", Nparticles, err); return -1; } yj_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer yj_GPU (size:%d) => %d\n", Nparticles, err); return -1; } CDF_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) * Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer CDF_GPU (size:%d) => %d\n", Nparticles, err); return -1; } u_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer u_GPU (size:%d) => %d\n", Nparticles, err); return -1; } likelihood_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer likelihood_GPU (size:%d) => %d\n", Nparticles, err); return -1; } weights_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (double) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer weights_GPU (size:%d) => %d\n", Nparticles, err); return -1; } I_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (unsigned char) *IszX * IszY * Nfr, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer I_GPU (size:%d) => %d\n", IszX * IszY * Nfr, err); return -1; } objxy_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, 2*sizeof (int) *countOnes, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer objxy_GPU (size:%d) => %d\n", countOnes, err); return -1; } ind_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *countOnes * Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer ind_GPU (size:%d) => %d\n", countOnes * Nparticles, err); return -1; } seed_GPU = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof (int) *Nparticles, NULL, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer seed_GPU (size:%d) => %d\n", Nparticles, err); return -1; } partial_sums = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof (double) * Nparticles + 1, likelihood, &err); if (err != CL_SUCCESS) { printf("ERROR: clCreateBuffer partial_sums (size:%d) => %d\n", Nparticles, err); return -1; } //Donnie - this loop is different because in this kernel, arrayX and arrayY // are set equal to xj before every iteration, so effectively, arrayX and // arrayY will be set to xe and ye before the first iteration. for (x = 0; x < Nparticles; x++) { xj[x] = xe; yj[x] = ye; } int k; //double * Ik = (double *)calloc(IszX*IszY, sizeof(double)); int indX, indY; //start send long long send_start = get_time(); //OpenCL memory copy err = clEnqueueWriteBuffer(cmd_queue, I_GPU, 1, 0, sizeof (unsigned char) *IszX * IszY*Nfr, I, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer I_GPU (size:%d) => %d\n", IszX * IszY*Nfr, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, objxy_GPU, 1, 0, 2*sizeof (int) *countOnes, objxy, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer objxy_GPU (size:%d) => %d\n", countOnes, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer weights_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, xj_GPU, 1, 0, sizeof (double) *Nparticles, xj, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer arrayX_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, yj_GPU, 1, 0, sizeof (double) *Nparticles, yj, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer arrayY_GPU (size:%d) => %d\n", Nparticles, err); return -1; } err = clEnqueueWriteBuffer(cmd_queue, seed_GPU, 1, 0, sizeof (int) *Nparticles, seed, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueWriteBuffer seed_GPU (size:%d) => %d\n", Nparticles, err); return -1; } /********************************************************************** *********** E N D A L L O C A T E ******************************** *********************************************************************/ long long send_end = get_time(); printf("TIME TO SEND TO GPU: %f\n", elapsed_time(send_start, send_end)); int num_blocks = ceil((double) Nparticles / (double) threads_per_block); printf("threads_per_block=%d \n",threads_per_block); size_t local_work[3] = {threads_per_block, 1, 1}; size_t global_work[3] = {num_blocks*threads_per_block, 1, 1}; for (k = 1; k < Nfr; k++) { /****************** L I K E L I H O O D ************************************/ clSetKernelArg(kernel_likelihood, 0, sizeof (void *), (void*) &arrayX_GPU); clSetKernelArg(kernel_likelihood, 1, sizeof (void *), (void*) &arrayY_GPU); clSetKernelArg(kernel_likelihood, 2, sizeof (void *), (void*) &xj_GPU); clSetKernelArg(kernel_likelihood, 3, sizeof (void *), (void*) &yj_GPU); clSetKernelArg(kernel_likelihood, 4, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_likelihood, 5, sizeof (void *), (void*) &ind_GPU); clSetKernelArg(kernel_likelihood, 6, sizeof (void *), (void*) &objxy_GPU); clSetKernelArg(kernel_likelihood, 7, sizeof (void *), (void*) &likelihood_GPU); clSetKernelArg(kernel_likelihood, 8, sizeof (void *), (void*) &I_GPU); clSetKernelArg(kernel_likelihood, 9, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_likelihood, 10, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_likelihood, 11, sizeof (cl_int), (void*) &Nparticles); clSetKernelArg(kernel_likelihood, 12, sizeof (cl_int), (void*) &countOnes); clSetKernelArg(kernel_likelihood, 13, sizeof (cl_int), (void*) &max_size); clSetKernelArg(kernel_likelihood, 14, sizeof (cl_int), (void*) &k); clSetKernelArg(kernel_likelihood, 15, sizeof (cl_int), (void*) &IszY); clSetKernelArg(kernel_likelihood, 16, sizeof (cl_int), (void*) &Nfr); clSetKernelArg(kernel_likelihood, 17, sizeof (void *), (void*) &seed_GPU); clSetKernelArg(kernel_likelihood, 18, sizeof (void *), (void*) &partial_sums); clSetKernelArg(kernel_likelihood, 19, threads_per_block * sizeof (double), NULL); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_likelihood, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(kernel_likelihood)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /****************** E N D L I K E L I H O O D **********************/ /*************************** S U M ************************************/ clSetKernelArg(kernel_sum, 0, sizeof (void *), (void*) &partial_sums); clSetKernelArg(kernel_sum, 1, sizeof (cl_int), (void*) &Nparticles); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_sum, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(kernel_sum)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; }/*************************** E N D S U M ****************************/ /**************** N O R M A L I Z E W E I G H T S *****************/ clSetKernelArg(kernel_normalize_weights, 0, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_normalize_weights, 1, sizeof (cl_int), (void*) &Nparticles); clSetKernelArg(kernel_normalize_weights, 2, sizeof (void *), (void*) &partial_sums); //*/ clSetKernelArg(kernel_normalize_weights, 3, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_normalize_weights, 4, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_normalize_weights, 5, sizeof (void *), (void*) &seed_GPU); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_normalize_weights, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(normalize_weights)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /************* E N D N O R M A L I Z E W E I G H T S ***********/ // ocl_print_double_array(cmd_queue, partial_sums, 40); // /********* I N T E R M E D I A T E R E S U L T S ***************/ // //OpenCL memory copying back from GPU to CPU memory err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0); err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0); err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); xe = 0; ye = 0; double total=0.0; // estimate the object location by expected values for (x = 0; x < Nparticles; x++) { // if( 0.0000000 < arrayX[x]*weights[x]) printf("arrayX[%d]:%f, arrayY[%d]:%f, weights[%d]:%0.10f\n",x,arrayX[x], x, arrayY[x], x, weights[x]); // printf("arrayX[%d]:%f | arrayY[%d]:%f | weights[%d]:%f\n", // x, arrayX[x], x, arrayY[x], x, weights[x]); xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; total+= weights[x]; } printf("total weight: %lf\n", total); printf("XE: %lf\n", xe); printf("YE: %lf\n", ye); double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2)); printf("%lf\n", distance); // /********* E N D I N T E R M E D I A T E R E S U L T S ***************/ /******************** F I N D I N D E X ****************************/ //Set number of threads clSetKernelArg(kernel_find_index, 0, sizeof (void *), (void*) &arrayX_GPU); clSetKernelArg(kernel_find_index, 1, sizeof (void *), (void*) &arrayY_GPU); clSetKernelArg(kernel_find_index, 2, sizeof (void *), (void*) &CDF_GPU); clSetKernelArg(kernel_find_index, 3, sizeof (void *), (void*) &u_GPU); clSetKernelArg(kernel_find_index, 4, sizeof (void *), (void*) &xj_GPU); clSetKernelArg(kernel_find_index, 5, sizeof (void *), (void*) &yj_GPU); clSetKernelArg(kernel_find_index, 6, sizeof (void *), (void*) &weights_GPU); clSetKernelArg(kernel_find_index, 7, sizeof (cl_int), (void*) &Nparticles); //KERNEL FUNCTION CALL err = clEnqueueNDRangeKernel(cmd_queue, kernel_find_index, 1, NULL, global_work, local_work, 0, 0, 0); clFinish(cmd_queue); if (err != CL_SUCCESS) { printf("ERROR: clEnqueueNDRangeKernel(find_index)=>%d failed\n", err); //check_error(err, __FILE__, __LINE__); return -1; } /******************* E N D F I N D I N D E X ********************/ }//end loop //block till kernels are finished //clFinish(cmd_queue); long long back_time = get_time(); //OpenCL freeing of memory clReleaseProgram(prog); clReleaseMemObject(u_GPU); clReleaseMemObject(CDF_GPU); clReleaseMemObject(yj_GPU); clReleaseMemObject(xj_GPU); clReleaseMemObject(likelihood_GPU); clReleaseMemObject(I_GPU); clReleaseMemObject(objxy_GPU); clReleaseMemObject(ind_GPU); clReleaseMemObject(seed_GPU); clReleaseMemObject(partial_sums); long long free_time = get_time(); //OpenCL memory copying back from GPU to CPU memory err = clEnqueueReadBuffer(cmd_queue, arrayX_GPU, 1, 0, sizeof (double) *Nparticles, arrayX, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long arrayX_time = get_time(); err = clEnqueueReadBuffer(cmd_queue, arrayY_GPU, 1, 0, sizeof (double) *Nparticles, arrayY, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long arrayY_time = get_time(); err = clEnqueueReadBuffer(cmd_queue, weights_GPU, 1, 0, sizeof (double) *Nparticles, weights, 0, 0, 0); if (err != CL_SUCCESS) { printf("ERROR: Memcopy Out\n"); return -1; } long long back_end_time = get_time(); printf("GPU Execution: %lf\n", elapsed_time(send_end, back_time)); printf("FREE TIME: %lf\n", elapsed_time(back_time, free_time)); printf("SEND TO SEND BACK: %lf\n", elapsed_time(back_time, back_end_time)); printf("SEND ARRAY X BACK: %lf\n", elapsed_time(free_time, arrayX_time)); printf("SEND ARRAY Y BACK: %lf\n", elapsed_time(arrayX_time, arrayY_time)); printf("SEND WEIGHTS BACK: %lf\n", elapsed_time(arrayY_time, back_end_time)); xe = 0; ye = 0; // estimate the object location by expected values for (x = 0; x < Nparticles; x++) { xe += arrayX[x] * weights[x]; ye += arrayY[x] * weights[x]; } double distance = sqrt(pow((double) (xe - (int) roundDouble(IszY / 2.0)), 2) + pow((double) (ye - (int) roundDouble(IszX / 2.0)), 2)); //Output results FILE *fid; fid=fopen("output.txt", "w+"); if( fid == NULL ){ printf( "The file was not opened for writing\n" ); return -1; } fprintf(fid, "XE: %lf\n", xe); fprintf(fid, "YE: %lf\n", ye); fprintf(fid, "distance: %lf\n", distance); fclose(fid); //OpenCL freeing of memory clReleaseMemObject(weights_GPU); clReleaseMemObject(arrayY_GPU); clReleaseMemObject(arrayX_GPU); //free regular memory free(likelihood); free(arrayX); free(arrayY); free(xj); free(yj); free(CDF); free(ind); free(u); }
void btParticlesDynamicsWorld::initCLKernels(int argc, char** argv) { cl_int ciErrNum; if (!m_cxMainContext) { cl_device_type deviceType = CL_DEVICE_TYPE_ALL; m_cxMainContext = btOpenCLUtils::createContextFromType(deviceType, &ciErrNum, 0, 0); int numDev = btOpenCLUtils::getNumDevices(m_cxMainContext); if (!numDev) { btAssert(0); exit(0);//this is just a demo, exit now } m_cdDevice = btOpenCLUtils::getDevice(m_cxMainContext,0); oclCHECKERROR(ciErrNum, CL_SUCCESS); btOpenCLDeviceInfo clInfo; btOpenCLUtils::getDeviceInfo(m_cdDevice,clInfo); btOpenCLUtils::printDeviceInfo(m_cdDevice); // create a command-queue m_cqCommandQue = clCreateCommandQueue(m_cxMainContext, m_cdDevice, 0, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); } // Program Setup size_t program_length; #ifdef LOAD_FROM_MEMORY program_length = strlen(source); printf("OpenCL compiles ParticlesOCL.cl ... "); #else const char* fileName = "ParticlesOCL.cl"; FILE * fp = fopen(fileName, "rb"); char newFileName[512]; if (fp == NULL) { sprintf(newFileName,"..//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; } if (fp == NULL) { sprintf(newFileName,"..//..//..//..//..//Demos//ParticlesOpenCL//%s",fileName); fp = fopen(newFileName, "rb"); if (fp) fileName = newFileName; else { printf("cannot find %s\n",newFileName); exit(0); } } // char *source = oclLoadProgSource(".//Demos//SpheresGrid//SpheresGrid.cl", "", &program_length); //char *source = btOclLoadProgSource(".//Demos//SpheresOpenCL//Shared//SpheresGrid.cl", "", &program_length); char *source = btOclLoadProgSource(fileName, "", &program_length); if(source == NULL) { printf("ERROR : OpenCL can't load file %s\n", fileName); } // oclCHECKERROR (source == NULL, oclFALSE); btAssert(source != NULL); // create the program printf("OpenCL compiles %s ...", fileName); #endif //LOAD_FROM_MEMORY //printf("%s\n", source); m_cpProgram = clCreateProgramWithSource(m_cxMainContext, 1, (const char**)&source, &program_length, &ciErrNum); oclCHECKERROR(ciErrNum, CL_SUCCESS); #ifndef LOAD_FROM_MEMORY free(source); #endif //LOAD_FROM_MEMORY //#define LOCAL_SIZE_LIMIT 1024U #define LOCAL_SIZE_MAX 1024U // Build the program with 'mad' Optimization option #ifdef MAC const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -cl-mad-enable -DMAC -DGUID_ARG"; #else const char* flags = "-I. -DLOCAL_SIZE_MAX=1024U -DGUID_ARG= "; #endif // build the program ciErrNum = clBuildProgram(m_cpProgram, 0, NULL, flags, NULL, NULL); if(ciErrNum != CL_SUCCESS) { // write out standard error // oclLog(LOGBOTH | ERRORMSG, (double)ciErrNum, STDERROR); // write out the build log and ptx, then exit char cBuildLog[10240]; // char* cPtx; // size_t szPtxLength; clGetProgramBuildInfo(m_cpProgram, m_cdDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL ); // oclGetProgBinary(m_cpProgram, oclGetFirstDev(m_cxMainContext), &cPtx, &szPtxLength); // oclLog(LOGBOTH | CLOSELOG, 0.0, "\n\nLog:\n%s\n\n\n\n\nPtx:\n%s\n\n\n", cBuildLog, cPtx); printf("\n\n%s\n\n\n", cBuildLog); printf("Press ENTER key to terminate the program\n"); getchar(); exit(-1); } printf("OK\n"); // create the kernels postInitDeviceData(); initKernel(PARTICLES_KERNEL_COMPUTE_CELL_ID, "kComputeCellId"); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 2, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_COMPUTE_CELL_ID].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_INTEGRATE_MOTION, "kIntegrateMotion"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 1, sizeof(cl_mem), (void *) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 2, sizeof(cl_mem), (void *) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_INTEGRATE_MOTION].m_kernel, 3, sizeof(cl_mem), (void *) &m_dSimParams); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_CLEAR_CELL_START, "kClearCellStart"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 0, sizeof(int), (void *) &m_numGridCells); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_CLEAR_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dCellStart); initKernel(PARTICLES_KERNEL_FIND_CELL_START, "kFindCellStart"); // ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 0, sizeof(int), (void*) &m_numParticles); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 1, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 2, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 3, sizeof(cl_mem), (void*) &m_dPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 4, sizeof(cl_mem), (void*) &m_dVel); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 5, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum |= clSetKernelArg(m_kernels[PARTICLES_KERNEL_FIND_CELL_START].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSortedVel); oclCHECKERROR(ciErrNum, CL_SUCCESS); initKernel(PARTICLES_KERNEL_COLLIDE_PARTICLES, "kCollideParticles"); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 1, sizeof(cl_mem), (void*) &m_dVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 2, sizeof(cl_mem), (void*) &m_dSortedPos); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 3, sizeof(cl_mem), (void*) &m_dSortedVel); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 4, sizeof(cl_mem), (void*) &m_dPosHash); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 5, sizeof(cl_mem), (void*) &m_dCellStart); ciErrNum = clSetKernelArg(m_kernels[PARTICLES_KERNEL_COLLIDE_PARTICLES].m_kernel, 6, sizeof(cl_mem), (void*) &m_dSimParams); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL, "kBitonicSortCellIdLocal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_LOCAL_1, "kBitonicSortCellIdLocal1"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_GLOBAL, "kBitonicSortCellIdMergeGlobal"); initKernel(PARTICLES_KERNEL_BITONIC_SORT_CELL_ID_MERGE_LOCAL, "kBitonicSortCellIdMergeLocal"); }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main(int argc, char** argv) { int rank, size; // MPI rank & size int err; // error code returned from OpenCL calls float h_a[LENGTH]; // a vector float h_b[LENGTH]; // b vector float h_c[LENGTH]; // c vector (a+b) returned from the compute device (local per task) float _h_c[LENGTH]; // c vector (a+b) returned from the compute device (global for master) unsigned int correct; // number of correct results size_t global; // global domain size size_t local; // local domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector int mycount, i; err = MPI_Init (&argc, &argv); if (err != MPI_SUCCESS) { printf ("MPI_Init failed!\n"); exit (-1); } err = MPI_Comm_rank (MPI_COMM_WORLD, &rank); if (err != MPI_SUCCESS) { printf ("MPI_Comm_rank failed!\n"); exit (-1); } err = MPI_Comm_size (MPI_COMM_WORLD, &size); if (err != MPI_SUCCESS) { printf ("MPI_Comm_size failed\n"); exit (-1); } if (LENGTH % size != 0) { printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH); exit (-1); } mycount = LENGTH / size; if (rank == 0) { for (i = 0; i < LENGTH; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_a[i] = i; h_b[i] = i*2; } err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_b\n"); exit (-1); } } else { err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_b\n"); exit (-1); } } // Set up platform cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n"); return EXIT_FAILURE; } // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } else { if (output_device_info (rank, device_id) != CL_SUCCESS) return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); if (!ko_vadd || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input (a, b) and output (c) arrays in device memory d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL); if (!d_a || !d_b || !d_c) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write a and b vectors into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_a to source array!\n"); exit(1); } err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_b to source array!\n"); exit(1); } // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device global = LENGTH; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the commands to complete before reading back results clFinish(commands); // Read back the results from the compute device err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Gather failed receiving h_c\n"); exit (-1); } if (rank == 0) { // Test the results correct = 0; float tmp; for(i = 0; i < LENGTH; i++) { tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp tmp -= _h_c[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]); } // summarize results printf("C = A+B: %d out of %d results were correct.\n", correct, LENGTH); } // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); err = MPI_Finalize (); if (err != MPI_SUCCESS) { printf ("MPI_Finalize failed!\n"); exit (-1); } return 0; }
void initopencl(void) { int i; // Get Platform and Device Info CL_CHECK(clGetPlatformIDs(1, &platform_id, &num_platforms)); // Currently this program only runs on a SINGLE GPU. CL_CHECK(clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices)); printf("=== %d OpenCL platform(s) found: ===\n", num_platforms); printf("=== %d OpenCL device(s) found on platform:\n", num_devices); char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(device_id, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); if (num_devices == 0) { fprintf(stderr, "No Devices found that can run OpenCL."); exit(0); } // Create OpenCL context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating context: Function returned %d \n\n", ret); exit(1); } // Create Command Queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating command Queue: Function returned %d \n\n", ret); exit(1); } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; fp = fopen("integrate.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Create a program from the kernel source program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a program for integration3D. %d \n\n", (int)ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, "-DUSE_DOUBLE=1", NULL, NULL); if (ret != CL_SUCCESS) { size_t length; char buffer[10240]; clGetProgramBuildInfo(program, 1, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &length); fprintf(stderr, "Error returned %d. \n\n", (int)ret); printf("Error Log: \n\n %s \n\n", buffer); exit(0); } /* // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } */ // Create the OpenCL kernel (check_int) kernel2 = clCreateKernel(program, "check_int", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for check_int. %d \n\n", (int)ret); exit(1); } // Create the OpenCL kernel (compute_points_Unstructure3D_1) kernel1 = clCreateKernel(program, "compute_points_Unstructure3D_1", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for compute_points_Unstructure3D_1. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel3 = clCreateKernel(program, "initialize_timestep3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for initialize_timestep3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel4 = clCreateKernel(program, "LocalSearch3D", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } // Create the OpenCL kernel (initialize_timestep3D) kernel5 = clCreateKernel(program, "compute_points_Unstructure3D_2", &ret); if (ret != CL_SUCCESS) { fprintf(stderr, "Error creating a kernel for LocalSearch3D. \n\n"); exit(1); } printf("\n\n"); }
void OpenCLWaveSimulation::initOCL() { const unsigned int vboSize = m_gridWidth * m_gridHeight * VERTEX_SIZE * sizeof(float); // first get number of available platt forms cl_uint numPlattforms = 0; clGetPlatformIDs(0, NULL, &numPlattforms); // then allocate enough memory for all IDs cl_platform_id* plattforms = new cl_platform_id[numPlattforms]; clGetPlatformIDs(numPlattforms, plattforms, NULL); cl_device_type deviceType = (m_argc <= 1) ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU; // now try to find the right plattform for given device type for (unsigned int i = 0; i < numPlattforms; ++i) { clGetDeviceIDs(plattforms[i], deviceType, 1, &m_device, NULL); if (m_device) { m_platform = plattforms[i]; break; } } delete plattforms; // ocl context must be tied to the ogl context # ifdef _WIN32 HGLRC glCtx = wglGetCurrentContext(); # else GLXContext glCtx = glXGetCurrentContext(); # endif cl_context_properties props[] = {CL_CONTEXT_PLATFORM, (cl_context_properties)m_platform, # ifdef _WIN32 CL_WGL_HDC_KHR, (intptr_t) wglGetCurrentDC(), # else CL_GLX_DISPLAY_KHR, (intptr_t) glXGetCurrentDisplay(); # endif CL_GL_CONTEXT_KHR, (intptr_t) glCtx, 0 }; // create context and queue m_context = clCreateContext(props, 1, &m_device, NULL, NULL, NULL); m_queue = clCreateCommandQueue(m_context, m_device, 0, NULL); // create buffers int errCode; m_clPositionInteropBuffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, m_positionVBO, &errCode); if(errCode != CL_SUCCESS) { std::cerr << "Failed creating cl_mem position buffer from gl buffer\n"; } m_clNormalInteropBuffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, m_normalVBO, &errCode); if(errCode != CL_SUCCESS) { std::cerr << "Failed creating cl_mem normal buffer from gl buffer\n"; } m_clTangentInteropBuffer = clCreateFromGLBuffer(m_context, CL_MEM_WRITE_ONLY, m_tangentVBO, &errCode); if(errCode != CL_SUCCESS) { std::cerr << "Failed creating cl_mem tangent buffer from gl buffer\n"; } m_clPing = clCreateBuffer(m_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vboSize, reinterpret_cast<float*>(m_waves.getVertices()), &errCode); if(errCode != CL_SUCCESS) { std::cerr << "Failed creating cl_mem read write buffer\n"; } m_clPong = clCreateBuffer(m_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vboSize, reinterpret_cast<float*>(m_waves.getVertices()), &errCode); if(errCode != CL_SUCCESS) { std::cerr << "Failed creating cl_mem read write buffer\n"; } // load source file std::ifstream file("WaveSimulation.cl"); std::string prog(std::istreambuf_iterator<char>(file), (std::istreambuf_iterator<char>())); file.close(); const char* source = prog.c_str(); const size_t kernelsize = prog.length() + 1; m_program = clCreateProgramWithSource(m_context, 1, (const char**)&source, &kernelsize, NULL); // build program int err = clBuildProgram(m_program, 0, NULL, NULL, NULL, NULL); if(err != CL_SUCCESS) { size_t len; char buffer[2048]; std::cerr << "Error: Failed to build program executable!" << std::endl; clGetProgramBuildInfo(m_program, m_device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); std::cerr << buffer << std::endl; exit(1); } // create the compute kernels in the program m_vertexDisplacementKernel = clCreateKernel(m_program, "compute_vertex_displacement", &err); if(!m_vertexDisplacementKernel || err != CL_SUCCESS) { std::cerr << "Error: Failed to create compute kernel: compute_vertex_displacement!" << std::endl; exit(1); } m_finiteDifferenceSchemeKernel = clCreateKernel(m_program, "compute_finite_difference_scheme", &err); if(!m_finiteDifferenceSchemeKernel || err != CL_SUCCESS) { std::cerr << "Error: Failed to create compute kernel: compute_finite_difference_scheme!" << std::endl; exit(1); } m_disturbKernel = clCreateKernel(m_program, "disturb_grid", &err); if(!m_disturbKernel || err != CL_SUCCESS) { std::cerr << "Error: Failed to create compute kernel: disturb_grid!" << std::endl; exit(1); } m_glGridInitKernel = clCreateKernel(m_program, "initialize_gl_grid", &err); if(!m_glGridInitKernel || err != CL_SUCCESS) { std::cerr << "Error: Failed to create compute kernel: initialize_gl_grid!" << std::endl; exit(1); } initGLBuffer(); }
void BurstSort::parallelSort(std::ofstream& file){ char* buffer = NULL; char* tmp; int* posArray = NULL; int entryLength = KEY_LENGTH + sizeof(char*); buffer = (char*) malloc(sizeof(char) * size * entryLength); posArray = (int*) malloc(sizeof(int) * (NODE_SIZE + 1)); int pos = 0; posArray[0] = 0; for(int i = 0; i < NODE_SIZE; i++){ for(int j = 0; j < nodes[i].used; j++){ memcpy(buffer + pos * entryLength, nodes[i].entries[j], KEY_LENGTH * sizeof(char)); memcpy(buffer + pos * entryLength + KEY_LENGTH, &nodes[i].entries[j], sizeof(char*)); pos += sizeof(char); } posArray[i+1] = pos; } // OpenCL // Use this to check the output of each API call cl_int status; cl_int numDevices = 1; // Connect to first platform cl_platform_id platform; status = clGetPlatformIDs(1, &platform, NULL); if (status != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); return -1; } char cBuffer[1024]; clGetPlatformInfo(platform, CL_PLATFORM_VENDOR, sizeof(cBuffer), cBuffer, NULL); printf("CL_PLATFORM_VENDOR %s\n", cBuffer); clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(cBuffer), cBuffer, NULL); printf("CL_PLATFORM_NAME %s\n", cBuffer); cl_device_id device; status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ACCELERATOR, 1, &device, NULL); if (status != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return -1; } cl_long maxBufferSize = 0; status = clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(cl_long), &maxBufferSize, NULL); printf("max buffer size: %lld\n", maxBufferSize); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, &device, NULL, NULL, &status); if (status != CL_SUCCESS) { printf("Error in creating context, code %d\n", status); return -1; } // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, device, 0, &status); if (status != CL_SUCCESS) { printf("Error in creating command queue for a device, code %d\n", status); return -1; } // Load binary from disk unsigned char *kernelbinary; char *xclbin = "sort_xiaohui.xclbin"; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("ERROR: failed to load kernel from xclbin: %s\n", xclbin); return -1; } size_t n_bit = n_i; // Create the compute program from offline cl_program program = clCreateProgramWithBinary(context, 1, &device, &n_bit, (const unsigned char **) &kernelbinary, NULL, &status); if ((!program) || (status != CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", status); return -1; } // Build the program executable status = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (status != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return -1; } // Create the vector addition kernel cl_kernel kernel; kernel = clCreateKernel(program, "sort", &status); cl_mem clPosArray; cl_mem clBuffer; clBuffer = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(char) * size * entryLength, NULL, &status); clPosArray = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int) * (NODE_SIZE + 1), NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, clPosArray, CL_FALSE, 0, sizeof(int) * (NODE_SIZE + 1),posArray, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, clBuffer, CL_FALSE, 0, sizeof(char) * size * entryLength, buffer, 0, NULL, NULL); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &clBuffer); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &clPosArray); int nodeSize = NODE_SIZE; status = clSetKernelArg(kernel, 2, sizeof(int), (void *)&nodeSize); status = clSetKernelArg(kernel, 3, sizeof(int), (void *)&entryLength); size_t globalWorkSize[1]; globalWorkSize[0] = NODE_SIZE; gettimeofday(&t1, NULL); // Execute the kernel for execution status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); if (status != CL_SUCCESS) { printf("Error in clEnqueue, code %d\n", status); return -1; } // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, clBuffer, CL_TRUE, 0, sizeof(char) * size * entryLength, buffer, 0, NULL, NULL); // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(clBuffer); clReleaseMemObject(clPosArray); clReleaseContext(context); //print result for(int i = 0; i < size; i+= sizeof(char)){ memcpy(&tmp,buffer + i * entryLength + KEY_LENGTH,sizeof(char*)); file << tmp; } // Free host resources free(buffer); free(posArray); free(platforms); free(devices); }
int deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree, unsigned tree_len, unsigned char* output, unsigned* out_len) { #define SDACCEL_WRAPPER #ifdef SDACCEL_WRAPPER int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel char cl_platform_vendor[1001]; char cl_platform_name[1001]; err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return EXIT_FAILURE; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return EXIT_FAILURE; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char xclbin[]="deflate1.xclbin"; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return EXIT_FAILURE; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return EXIT_FAILURE; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "deflate259", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Create the input and output arrays in device memory for our calculation // void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree, // unsigned tree_len, unsigned char* output, unsigned* out_len) cl_mem input_arg, in_len_arg, tree_arg, tree_len_arg, output_arg, out_len_arg; input_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, CHUNK, NULL, NULL); in_len_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL); tree_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, 512, NULL, NULL); tree_len_arg = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(unsigned), NULL, NULL); output_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, CHUNK*2, NULL, NULL); out_len_arg = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(unsigned), NULL, NULL); if (!input_arg || !in_len_arg || !tree_arg || !tree_len_arg || !output_arg || !out_len_arg) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, input_arg, CL_TRUE, 0, in_len, input, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array input!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, in_len_arg, CL_TRUE, 0, sizeof(unsigned), &in_len, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array &in_len!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, tree_arg, CL_TRUE, 0, 512, tree, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array tree!\n"); printf("Test failed\n"); return EXIT_FAILURE; } err = clEnqueueWriteBuffer(commands, tree_len_arg, CL_TRUE, 0, sizeof(unsigned), &tree_len, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array &tree_len!\n"); printf("Test failed\n"); return EXIT_FAILURE; } // Set the arguments to our compute kernel //void deflate259_opencl(unsigned char* input, unsigned in_len, unsigned char* tree, // unsigned tree_len, unsigned char* output, unsigned* out_len) err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_arg); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &in_len_arg); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &tree_arg); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &tree_len_arg); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &output_arg); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &out_len_arg); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // #ifdef C_KERNEL err = clEnqueueTask(commands, kernel, 0, NULL, NULL); #else size_t global[1]; size_t local[1]; global[0] = 1; local[0] = 1; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, (size_t*)&global, (size_t*)&local, 0, NULL, NULL); #endif if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } // Read back the results from the device to verify the output // cl_event readevent; unsigned out_len_b; err = clEnqueueReadBuffer( commands, out_len_arg, CL_TRUE, 0, sizeof(unsigned), &out_len_b, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output length! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); *out_len = out_len_b; printf("Read final output length: %d\n", out_len_b); err = clEnqueueReadBuffer( commands, output_arg, CL_TRUE, 0, out_len_b, output, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output data! %d\n", err); printf("Test failed\n"); return EXIT_FAILURE; } clWaitForEvents(1, &readevent); #endif }
_clState *initCl(unsigned int gpu, char *name, size_t nameSize) { _clState *clState = calloc(1, sizeof(_clState)); bool patchbfi = false, prog_built = false; struct cgpu_info *cgpu = &gpus[gpu]; cl_platform_id platform = NULL; char pbuff[256], vbuff[255]; cl_platform_id* platforms; cl_uint preferred_vwidth; cl_device_id *devices; cl_uint numPlatforms; cl_uint numDevices; cl_int status; status = clGetPlatformIDs(0, NULL, &numPlatforms); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platforms. (clGetPlatformsIDs)", status); return NULL; } platforms = (cl_platform_id *)alloca(numPlatforms*sizeof(cl_platform_id)); status = clGetPlatformIDs(numPlatforms, platforms, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Ids. (clGetPlatformsIDs)", status); return NULL; } if (opt_platform_id >= (int)numPlatforms) { applog(LOG_ERR, "Specified platform that does not exist"); return NULL; } status = clGetPlatformInfo(platforms[opt_platform_id], CL_PLATFORM_VENDOR, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Platform Info. (clGetPlatformInfo)", status); return NULL; } platform = platforms[opt_platform_id]; if (platform == NULL) { perror("NULL platform found!\n"); return NULL; } applog(LOG_INFO, "CL Platform vendor: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(pbuff), pbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform name: %s", pbuff); status = clGetPlatformInfo(platform, CL_PLATFORM_VERSION, sizeof(vbuff), vbuff, NULL); if (status == CL_SUCCESS) applog(LOG_INFO, "CL Platform version: %s", vbuff); status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, NULL, &numDevices); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (num)", status); return NULL; } if (numDevices > 0 ) { devices = (cl_device_id *)malloc(numDevices*sizeof(cl_device_id)); /* Now, get the device list data */ status = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numDevices, devices, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device IDs (list)", status); return NULL; } applog(LOG_INFO, "List of devices:"); unsigned int i; for (i = 0; i < numDevices; i++) { status = clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "\t%i\t%s", i, pbuff); } if (gpu < numDevices) { status = clGetDeviceInfo(devices[gpu], CL_DEVICE_NAME, sizeof(pbuff), pbuff, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Getting Device Info", status); return NULL; } applog(LOG_INFO, "Selected %i: %s", gpu, pbuff); strncpy(name, pbuff, nameSize); } else { applog(LOG_ERR, "Invalid GPU %i", gpu); return NULL; } } else return NULL; cl_context_properties cps[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platform, 0 }; clState->context = clCreateContextFromType(cps, CL_DEVICE_TYPE_GPU, NULL, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Context. (clCreateContextFromType)", status); return NULL; } ///////////////////////////////////////////////////////////////// // Create an OpenCL command queue ///////////////////////////////////////////////////////////////// clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE, &status); if (status != CL_SUCCESS) /* Try again without OOE enable */ clState->commandQueue = clCreateCommandQueue(clState->context, devices[gpu], 0 , &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Command Queue. (clCreateCommandQueue)", status); return NULL; } /* Check for BFI INT support. Hopefully people don't mix devices with * and without it! */ char * extensions = malloc(1024); const char * camo = "cl_amd_media_ops"; char *find; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_EXTENSIONS, 1024, (void *)extensions, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_EXTENSIONS", status); return NULL; } find = strstr(extensions, camo); if (find) clState->hasBitAlign = true; /* Check for OpenCL >= 1.0 support, needed for global offset parameter usage. */ char * devoclver = malloc(1024); const char * ocl10 = "OpenCL 1.0"; const char * ocl11 = "OpenCL 1.1"; status = clGetDeviceInfo(devices[gpu], CL_DEVICE_VERSION, 1024, (void *)devoclver, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_VERSION", status); return NULL; } find = strstr(devoclver, ocl10); if (!find) { clState->hasOpenCL11plus = true; find = strstr(devoclver, ocl11); if (!find) clState->hasOpenCL12plus = true; } status = clGetDeviceInfo(devices[gpu], CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT, sizeof(cl_uint), (void *)&preferred_vwidth, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_PREFERRED_VECTOR_WIDTH_INT", status); return NULL; } applog(LOG_DEBUG, "Preferred vector width reported %d", preferred_vwidth); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), (void *)&clState->max_work_size, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_WORK_GROUP_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max work group size reported %d", (int)(clState->max_work_size)); status = clGetDeviceInfo(devices[gpu], CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), (void *)&cgpu->max_alloc, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Failed to clGetDeviceInfo when trying to get CL_DEVICE_MAX_MEM_ALLOC_SIZE", status); return NULL; } applog(LOG_DEBUG, "Max mem alloc size is %lu", (long unsigned int)(cgpu->max_alloc)); /* Create binary filename based on parameters passed to opencl * compiler to ensure we only load a binary that matches what would * have otherwise created. The filename is: * name + kernelname +/- g(offset) + v + vectors + w + work_size + l + sizeof(long) + .bin * For scrypt the filename is: * name + kernelname + g + lg + lookup_gap + tc + thread_concurrency + w + work_size + l + sizeof(long) + .bin */ char binaryfilename[255]; char filename[255]; char numbuf[16]; if (cgpu->kernel == KL_NONE) { if (opt_scrypt) { applog(LOG_INFO, "Selecting scrypt kernel"); clState->chosen_kernel = KL_SCRYPT; } else if (opt_blake256) { applog(LOG_INFO, "Selecting blake256 kernel"); clState->chosen_kernel = KL_BLAKE256; } else if (!strstr(name, "Tahiti") && /* Detect all 2.6 SDKs not with Tahiti and use diablo kernel */ (strstr(vbuff, "844.4") || // Linux 64 bit ATI 2.6 SDK strstr(vbuff, "851.4") || // Windows 64 bit "" strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || // 12.2 driver SDK strstr(vbuff, "923.1") || // 12.4 strstr(vbuff, "938.2") || // SDK 2.7 strstr(vbuff, "1113.2"))) {// SDK 2.8 applog(LOG_INFO, "Selecting diablo kernel"); clState->chosen_kernel = KL_DIABLO; /* Detect all 7970s, older ATI and NVIDIA and use poclbm */ } else if (strstr(name, "Tahiti") || !clState->hasBitAlign) { applog(LOG_INFO, "Selecting poclbm kernel"); clState->chosen_kernel = KL_POCLBM; /* Use phatk for the rest R5xxx R6xxx */ } else { applog(LOG_INFO, "Selecting phatk kernel"); clState->chosen_kernel = KL_PHATK; } cgpu->kernel = clState->chosen_kernel; } else { clState->chosen_kernel = cgpu->kernel; if (clState->chosen_kernel == KL_PHATK && (strstr(vbuff, "844.4") || strstr(vbuff, "851.4") || strstr(vbuff, "831.4") || strstr(vbuff, "898.1") || strstr(vbuff, "923.1") || strstr(vbuff, "938.2") || strstr(vbuff, "1113.2"))) { applog(LOG_WARNING, "WARNING: You have selected the phatk kernel."); applog(LOG_WARNING, "You are running SDK 2.6+ which performs poorly with this kernel."); applog(LOG_WARNING, "Downgrade your SDK and delete any .bin files before starting again."); applog(LOG_WARNING, "Or allow cgminer to automatically choose a more suitable kernel."); } } /* For some reason 2 vectors is still better even if the card says * otherwise, and many cards lie about their max so use 256 as max * unless explicitly set on the command line. Tahiti prefers 1 */ if (strstr(name, "Tahiti")) preferred_vwidth = 1; else if (preferred_vwidth > 2) preferred_vwidth = 2; switch (clState->chosen_kernel) { case KL_POCLBM: strcpy(filename, POCLBM_KERNNAME".cl"); strcpy(binaryfilename, POCLBM_KERNNAME); break; case KL_PHATK: strcpy(filename, PHATK_KERNNAME".cl"); strcpy(binaryfilename, PHATK_KERNNAME); break; case KL_DIAKGCN: strcpy(filename, DIAKGCN_KERNNAME".cl"); strcpy(binaryfilename, DIAKGCN_KERNNAME); break; case KL_SCRYPT: strcpy(filename, SCRYPT_KERNNAME".cl"); strcpy(binaryfilename, SCRYPT_KERNNAME); /* Scrypt only supports vector 1 */ cgpu->vwidth = 1; break; case KL_BLAKE256: strcpy(filename, BLAKE256_KERNNAME".cl"); strcpy(binaryfilename, BLAKE256_KERNNAME); break; case KL_NONE: /* Shouldn't happen */ case KL_DIABLO: strcpy(filename, DIABLO_KERNNAME".cl"); strcpy(binaryfilename, DIABLO_KERNNAME); break; } if (cgpu->vwidth) clState->vwidth = cgpu->vwidth; else { clState->vwidth = preferred_vwidth; cgpu->vwidth = preferred_vwidth; } if (((clState->chosen_kernel == KL_POCLBM || clState->chosen_kernel == KL_DIABLO || clState->chosen_kernel == KL_DIAKGCN) && clState->vwidth == 1 && clState->hasOpenCL11plus) || opt_scrypt || opt_blake256) clState->goffset = true; if (cgpu->work_size && cgpu->work_size <= clState->max_work_size) clState->wsize = cgpu->work_size; else if (opt_scrypt) clState->wsize = 256; else if (strstr(name, "Tahiti")) clState->wsize = 64; else clState->wsize = (clState->max_work_size <= 256 ? clState->max_work_size : 256) / clState->vwidth; cgpu->work_size = clState->wsize; #ifdef USE_SCRYPT if (opt_scrypt) { if (!cgpu->opt_lg) { applog(LOG_DEBUG, "GPU %d: selecting lookup gap of 2", gpu); cgpu->lookup_gap = 2; } else cgpu->lookup_gap = cgpu->opt_lg; if (!cgpu->opt_tc) { unsigned int sixtyfours; sixtyfours = cgpu->max_alloc / 131072 / 64 - 1; cgpu->thread_concurrency = sixtyfours * 64; if (cgpu->shaders && cgpu->thread_concurrency > cgpu->shaders) { cgpu->thread_concurrency -= cgpu->thread_concurrency % cgpu->shaders; if (cgpu->thread_concurrency > cgpu->shaders * 5) cgpu->thread_concurrency = cgpu->shaders * 5; } applog(LOG_DEBUG, "GPU %d: selecting thread concurrency of %d", gpu, (int)(cgpu->thread_concurrency)); } else cgpu->thread_concurrency = cgpu->opt_tc; } #endif FILE *binaryfile; size_t *binary_sizes; char **binaries; int pl; char *source = file_contents(filename, &pl); size_t sourceSize[] = {(size_t)pl}; cl_uint slot, cpnd; slot = cpnd = 0; if (!source) return NULL; binary_sizes = calloc(sizeof(size_t) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binary_sizes)) { applog(LOG_ERR, "Unable to calloc binary_sizes"); return NULL; } binaries = calloc(sizeof(char *) * MAX_GPUDEVICES * 4, 1); if (unlikely(!binaries)) { applog(LOG_ERR, "Unable to calloc binaries"); return NULL; } strcat(binaryfilename, name); if (clState->goffset) strcat(binaryfilename, "g"); if (opt_scrypt) { #ifdef USE_SCRYPT sprintf(numbuf, "lg%utc%u", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency); strcat(binaryfilename, numbuf); #endif } else { sprintf(numbuf, "v%d", clState->vwidth); strcat(binaryfilename, numbuf); } sprintf(numbuf, "w%d", (int)clState->wsize); strcat(binaryfilename, numbuf); sprintf(numbuf, "l%d", (int)sizeof(long)); strcat(binaryfilename, numbuf); strcat(binaryfilename, ".bin"); binaryfile = fopen(binaryfilename, "rb"); if (!binaryfile) { applog(LOG_DEBUG, "No binary found, generating from source"); } else { struct stat binary_stat; if (unlikely(stat(binaryfilename, &binary_stat))) { applog(LOG_DEBUG, "Unable to stat binary, generating from source"); fclose(binaryfile); goto build; } if (!binary_stat.st_size) goto build; binary_sizes[slot] = binary_stat.st_size; binaries[slot] = (char *)calloc(binary_sizes[slot], 1); if (unlikely(!binaries[slot])) { applog(LOG_ERR, "Unable to calloc binaries"); fclose(binaryfile); return NULL; } if (fread(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot]) { applog(LOG_ERR, "Unable to fread binaries"); fclose(binaryfile); free(binaries[slot]); goto build; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)binaries, &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); fclose(binaryfile); free(binaries[slot]); goto build; } fclose(binaryfile); applog(LOG_DEBUG, "Loaded binary image %s", binaryfilename); goto built; } ///////////////////////////////////////////////////////////////// // Load CL file, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// build: clState->program = clCreateProgramWithSource(clState->context, 1, (const char **)&source, sourceSize, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithSource)", status); return NULL; } /* create a cl program executable for all the devices specified */ char *CompilerOptions = calloc(1, 256); #ifdef USE_SCRYPT if (opt_scrypt) sprintf(CompilerOptions, "-D LOOKUP_GAP=%d -D CONCURRENT_THREADS=%d -D WORKSIZE=%d", cgpu->lookup_gap, (unsigned int)cgpu->thread_concurrency, (int)clState->wsize); else #endif { sprintf(CompilerOptions, "-D WORKSIZE=%d -D VECTORS%d -D WORKVEC=%d", (int)clState->wsize, clState->vwidth, (int)clState->wsize * clState->vwidth); } applog(LOG_DEBUG, "Setting worksize to %d", (int)(clState->wsize)); if (clState->vwidth > 1) applog(LOG_DEBUG, "Patched source to suit %d vectors", clState->vwidth); if (clState->hasBitAlign) { strcat(CompilerOptions, " -D BITALIGN"); applog(LOG_DEBUG, "cl_amd_media_ops found, setting BITALIGN"); if (!clState->hasOpenCL12plus && (strstr(name, "Cedar") || strstr(name, "Redwood") || strstr(name, "Juniper") || strstr(name, "Cypress" ) || strstr(name, "Hemlock" ) || strstr(name, "Caicos" ) || strstr(name, "Turks" ) || strstr(name, "Barts" ) || strstr(name, "Cayman" ) || strstr(name, "Antilles" ) || strstr(name, "Wrestler" ) || strstr(name, "Zacate" ) || strstr(name, "WinterPark" ))) patchbfi = true; } else applog(LOG_DEBUG, "cl_amd_media_ops not found, will not set BITALIGN"); if (patchbfi) { strcat(CompilerOptions, " -D BFI_INT"); applog(LOG_DEBUG, "BFI_INT patch requiring device found, patched source with BFI_INT"); } else applog(LOG_DEBUG, "BFI_INT patch requiring device not found, will not BFI_INT patch"); if (clState->goffset) strcat(CompilerOptions, " -D GOFFSET"); if (!clState->hasOpenCL11plus) strcat(CompilerOptions, " -D OCL1"); applog(LOG_DEBUG, "CompilerOptions: %s", CompilerOptions); status = clBuildProgram(clState->program, 1, &devices[gpu], CompilerOptions , NULL, NULL); free(CompilerOptions); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } prog_built = true; #ifdef __APPLE__ /* OSX OpenCL breaks reading off binaries with >1 GPU so always build * from source. */ goto built; #endif status = clGetProgramInfo(clState->program, CL_PROGRAM_NUM_DEVICES, sizeof(cl_uint), &cpnd, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_NUM_DEVICES. (clGetProgramInfo)", status); return NULL; } status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t)*cpnd, binary_sizes, NULL); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info CL_PROGRAM_BINARY_SIZES. (clGetProgramInfo)", status); return NULL; } /* The actual compiled binary ends up in a RANDOM slot! Grr, so we have * to iterate over all the binary slots and find where the real program * is. What the heck is this!? */ for (slot = 0; slot < cpnd; slot++) if (binary_sizes[slot]) break; /* copy over all of the generated binaries. */ applog(LOG_DEBUG, "Binary size for gpu %d found in binary slot %d: %d", gpu, slot, (int)(binary_sizes[slot])); if (!binary_sizes[slot]) { applog(LOG_ERR, "OpenCL compiler generated a zero sized binary, FAIL!"); return NULL; } binaries[slot] = calloc(sizeof(char) * binary_sizes[slot], 1); status = clGetProgramInfo(clState->program, CL_PROGRAM_BINARIES, sizeof(char *) * cpnd, binaries, NULL ); if (unlikely(status != CL_SUCCESS)) { applog(LOG_ERR, "Error %d: Getting program info. CL_PROGRAM_BINARIES (clGetProgramInfo)", status); return NULL; } /* Patch the kernel if the hardware supports BFI_INT but it needs to * be hacked in */ if (patchbfi) { unsigned remaining = binary_sizes[slot]; char *w = binaries[slot]; unsigned int start, length; /* Find 2nd incidence of .text, and copy the program's * position and length at a fixed offset from that. Then go * back and find the 2nd incidence of \x7ELF (rewind by one * from ELF) and then patch the opcocdes */ if (!advance(&w, &remaining, ".text")) goto build; w++; remaining--; if (!advance(&w, &remaining, ".text")) { /* 32 bit builds only one ELF */ w--; remaining++; } memcpy(&start, w + 285, 4); memcpy(&length, w + 289, 4); w = binaries[slot]; remaining = binary_sizes[slot]; if (!advance(&w, &remaining, "ELF")) goto build; w++; remaining--; if (!advance(&w, &remaining, "ELF")) { /* 32 bit builds only one ELF */ w--; remaining++; } w--; remaining++; w += start; remaining -= start; applog(LOG_DEBUG, "At %p (%u rem. bytes), to begin patching", w, remaining); patch_opcodes(w, length); status = clReleaseProgram(clState->program); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Releasing program. (clReleaseProgram)", status); return NULL; } clState->program = clCreateProgramWithBinary(clState->context, 1, &devices[gpu], &binary_sizes[slot], (const unsigned char **)&binaries[slot], &status, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Loading Binary into cl_program (clCreateProgramWithBinary)", status); return NULL; } /* Program needs to be rebuilt */ prog_built = false; } free(source); /* Save the binary to be loaded next time */ binaryfile = fopen(binaryfilename, "wb"); if (!binaryfile) { /* Not a fatal problem, just means we build it again next time */ applog(LOG_DEBUG, "Unable to create file %s", binaryfilename); } else { if (unlikely(fwrite(binaries[slot], 1, binary_sizes[slot], binaryfile) != binary_sizes[slot])) { applog(LOG_ERR, "Unable to fwrite to binaryfile"); return NULL; } fclose(binaryfile); } built: if (binaries[slot]) free(binaries[slot]); free(binaries); free(binary_sizes); applog(LOG_INFO, "Initialising kernel %s with%s bitalign, %d vectors and worksize %d", filename, clState->hasBitAlign ? "" : "out", clState->vwidth, (int)(clState->wsize)); if (!prog_built) { /* create a cl program executable for all the devices specified */ status = clBuildProgram(clState->program, 1, &devices[gpu], NULL, NULL, NULL); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Building Program (clBuildProgram)", status); size_t logSize; status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); char *log = malloc(logSize); status = clGetProgramBuildInfo(clState->program, devices[gpu], CL_PROGRAM_BUILD_LOG, logSize, log, NULL); applog(LOG_ERR, "%s", log); return NULL; } } /* get a kernel object handle for a kernel with the given name */ clState->kernel = clCreateKernel(clState->program, "search", &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: Creating Kernel from program. (clCreateKernel)", status); return NULL; } #ifdef USE_SCRYPT if (opt_scrypt) { size_t ipt = (1024 / cgpu->lookup_gap + (1024 % cgpu->lookup_gap > 0)); size_t bufsize = 128 * ipt * cgpu->thread_concurrency; /* Use the max alloc value which has been rounded to a power of * 2 greater >= required amount earlier */ if (bufsize > cgpu->max_alloc) { applog(LOG_WARNING, "Maximum buffer memory device %d supports says %lu", gpu, (long unsigned int)(cgpu->max_alloc)); applog(LOG_WARNING, "Your scrypt settings come to %d", (int)bufsize); } applog(LOG_DEBUG, "Creating scrypt buffer sized %d", (int)bufsize); clState->padbufsize = bufsize; /* This buffer is weird and might work to some degree even if * the create buffer call has apparently failed, so check if we * get anything back before we call it a failure. */ clState->padbuffer8 = NULL; clState->padbuffer8 = clCreateBuffer(clState->context, CL_MEM_READ_WRITE, bufsize, NULL, &status); if (status != CL_SUCCESS && !clState->padbuffer8) { applog(LOG_ERR, "Error %d: clCreateBuffer (padbuffer8), decrease TC or increase LG", status); return NULL; } clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); return NULL; } clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, SCRYPT_BUFFERSIZE, NULL, &status); } else #endif if (opt_blake256) { clState->CLbuffer0 = clCreateBuffer(clState->context, CL_MEM_READ_ONLY, 128, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (CLbuffer0)", status); return NULL; } clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, SCRYPT_BUFFERSIZE, NULL, &status); } else clState->outputBuffer = clCreateBuffer(clState->context, CL_MEM_WRITE_ONLY, BUFFERSIZE, NULL, &status); if (status != CL_SUCCESS) { applog(LOG_ERR, "Error %d: clCreateBuffer (outputBuffer)", status); return NULL; } return clState; }
static int init_cladsyn(CSOUND *csound, CLADSYN *p){ int asize, ipsize, fpsize, err; cl_device_id device_ids[32], device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel1, kernel2; cl_uint num = 0, nump = 0; cl_platform_id platforms[16]; uint i; if(p->fsig->overlap > 1024) return csound->InitError(csound, "overlap is too large\n"); err = clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 32, device_ids, &num); if (err != CL_SUCCESS){ clGetPlatformIDs(16, platforms, &nump); int devs = 0; for(i=0; i < nump && devs < 32; i++){ char name[128]; clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 128, name, NULL); csound->Message(csound, "available platform[%d] %s\n",i, name); err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 32-devs, &device_ids[devs], &num); if (err != CL_SUCCESS) csound->InitError(csound, "failed to find an OpenCL device! %s \n", cl_error_string(err)); } devs += num; } for(i=0; i < num; i++){ char name[128]; cl_device_type type; clGetDeviceInfo(device_ids[i], CL_DEVICE_NAME, 128, name, NULL); clGetDeviceInfo(device_ids[i], CL_DEVICE_TYPE, sizeof(cl_device_type), &type, NULL); if(type & CL_DEVICE_TYPE_CPU) csound->Message(csound, "available CPU[device %d] %s\n",i, name); else if(type & CL_DEVICE_TYPE_GPU) csound->Message(csound, "available GPU[device %d] %s\n",i, name); else if(type & CL_DEVICE_TYPE_ACCELERATOR) csound->Message(csound, "available ACCELLERATOR[device %d] %s\n",i, name); else csound->Message(csound, "available generic [device %d] %s\n",i, name);; } // SELECT THE GPU HERE if(*p->idev < num) device_id = device_ids[(int)*p->idev]; else device_id = device_ids[num-1]; context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) return csound->InitError(csound, "Failed to create a compute context! %s\n", cl_error_string(err)); // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) return csound->InitError(csound, "Failed to create a command commands! %s\n", cl_error_string(err)); // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) &code, NULL, &err); if (!program) return csound->InitError(csound, "Failed to create compute program! %s\n", cl_error_string(err)); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; csound->Message(csound, "Failed to build program executable! %s\n", cl_error_string(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); return csound->InitError(csound, "%s\n", buffer); } kernel1 = clCreateKernel(program, "sample", &err); if (!kernel1 || err != CL_SUCCESS) return csound->InitError(csound, "Failed to create sample compute kernel! %s\n", cl_error_string(err)); kernel2 = clCreateKernel(program, "update", &err); if (!kernel2 || err != CL_SUCCESS) return csound->InitError(csound,"Failed to create update compute kernel! %s\n", cl_error_string(err)); char name[128]; clGetDeviceInfo(device_id, CL_DEVICE_NAME, 128, name, NULL); csound->Message(csound, "using device: %s\n",name); p->bins = (p->fsig->N)/2; if(*p->inum > 0 && *p->inum < p->bins) p->bins = *p->inum; p->vsamps = p->fsig->overlap; p->threads = p->bins*p->vsamps; p->mthreads = (p->bins > p->vsamps ? p->bins : p->vsamps); asize = p->vsamps*sizeof(cl_float); ipsize = (p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_long); fpsize = p->fsig->N*sizeof(cl_float); p->out = clCreateBuffer(context,0, asize, NULL, NULL); p->frame = clCreateBuffer(context, CL_MEM_READ_ONLY, fpsize, NULL, NULL); p->ph = clCreateBuffer(context,0, ipsize, NULL, NULL); p->amps = clCreateBuffer(context,0,(p->bins > p->vsamps ? p->bins : p->vsamps)*sizeof(cl_float), NULL, NULL); // memset needed? asize = p->vsamps*sizeof(float); if(p->out_.auxp == NULL || p->out_.size < (unsigned long) asize) csound->AuxAlloc(csound, asize , &p->out_); csound->RegisterDeinitCallback(csound, p, destroy_cladsyn); p->count = 0; p->context = context; p->program = program; p->commands = commands; p->kernel1 = kernel1; p->kernel2 = kernel2; clGetKernelWorkGroupInfo(p->kernel1, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs1, NULL); clGetKernelWorkGroupInfo(p->kernel2, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(p->wgs1), &p->wgs2, NULL); p->sr = csound->GetSr(csound); clSetKernelArg(p->kernel1, 0, sizeof(cl_mem), &p->out); clSetKernelArg(p->kernel1, 1, sizeof(cl_mem), &p->frame); clSetKernelArg(p->kernel1, 2, sizeof(cl_mem), &p->ph); clSetKernelArg(p->kernel1, 3, sizeof(cl_mem), &p->amps); clSetKernelArg(p->kernel1, 5, sizeof(cl_int), &p->bins); clSetKernelArg(p->kernel1, 6, sizeof(cl_int), &p->vsamps); clSetKernelArg(p->kernel1, 7, sizeof(cl_float), &p->sr); clSetKernelArg(p->kernel2, 0, sizeof(cl_mem), &p->out); clSetKernelArg(p->kernel2, 1, sizeof(cl_mem), &p->frame); clSetKernelArg(p->kernel2, 2, sizeof(cl_mem), &p->ph); clSetKernelArg(p->kernel2, 3, sizeof(cl_mem), &p->amps); clSetKernelArg(p->kernel2, 5, sizeof(cl_int), &p->bins); clSetKernelArg(p->kernel2, 6, sizeof(cl_int), &p->vsamps); clSetKernelArg(p->kernel2, 7, sizeof(cl_float), &p->sr); return OK; }
GPUBase::GPUBase(char* source, char* KernelName) { kernelFuncName = KernelName; size_t szKernelLength; size_t szKernelLengthFilter; size_t szKernelLengthSum; char* SourceOpenCLShared; char* SourceOpenCL; iBlockDimX = 16; iBlockDimY = 16; GPUError = oclGetPlatformID(&cpPlatform); CheckError(GPUError); cl_uint uiNumAllDevs = 0; // Get the number of GPU devices available to the platform GPUError = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 0, NULL, &uiNumAllDevs); CheckError(GPUError); uiDevCount = uiNumAllDevs; // Create the device list cdDevices = new cl_device_id [uiDevCount]; GPUError = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, uiDevCount, cdDevices, NULL); CheckError(GPUError); // Create the OpenCL context on a GPU device GPUContext = clCreateContext(0, uiNumAllDevs, cdDevices, NULL, NULL, &GPUError); CheckError(GPUError); //The command-queue can be used to queue a set of operations (referred to as commands) in order. GPUCommandQueue = clCreateCommandQueue(GPUContext, cdDevices[0], 0, &GPUError); CheckError(GPUError); oclPrintDevName(LOGBOTH, cdDevices[0]); // Load OpenCL kernel SourceOpenCLShared = oclLoadProgSource("C:\\Dropbox\\MGR\\GPUFeatureExtraction\\GPU\\OpenCL\\GPUCode.cl", "// My comment\n", &szKernelLength); SourceOpenCL = oclLoadProgSource(source, "// My comment\n", &szKernelLengthFilter); szKernelLengthSum = szKernelLength + szKernelLengthFilter; char* sourceCL = new char[szKernelLengthSum]; strcpy(sourceCL,SourceOpenCLShared); strcat (sourceCL, SourceOpenCL); GPUProgram = clCreateProgramWithSource( GPUContext , 1, (const char **)&sourceCL, &szKernelLengthSum, &GPUError); CheckError(GPUError); // Build the program with 'mad' Optimization option char *flags = "-cl-unsafe-math-optimizations -cl-fast-relaxed-math -cl-mad-enable"; GPUError = clBuildProgram(GPUProgram, 0, NULL, flags, NULL, NULL); //error checking code if(!GPUError) { //print kernel compilation error char programLog[1024]; clGetProgramBuildInfo(GPUProgram, cdDevices[0], CL_PROGRAM_BUILD_LOG, 1024, programLog, 0); cout<<programLog<<endl; } cout << kernelFuncName << endl; GPUKernel = clCreateKernel(GPUProgram, kernelFuncName, &GPUError); CheckError(GPUError); }
int main(void) { //############################################### // // Declare variables for OpenCL // //############################################### int err; // error code returned from OpenCL calls size_t global; // global domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_calculate_imagerowdots_iterations; // compute kernel cl_kernel ko_calculate_colorrow; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory int i; //############################################### // // Set values for mandelbrot // //############################################### //plane section values float x_ebene_min = -1; float y_ebene_min = -1; float x_ebene_max = 2; float y_ebene_max = 1; //monitor resolution values const long x_mon = 640; const long y_mon = 480; //Iterations long itr = 100; //abort condition float abort_value = 2; //Number of images per second long fps = 24; //video duration in seconds long video_duration = 3; //zoom speed in percentage float reduction = 5; //zoom dot my_complex_t zoom_dot; //############################################### // // Set up platform and GPU device // //############################################### cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); checkError(err, "Finding platforms"); if (numPlatforms == 0) { printf("Found 0 platforms!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); checkError(err, "Getting platforms"); // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) { break; } } if (device_id == NULL) checkError(err, "Finding a device"); err = output_device_info(device_id); checkError(err, "Printing device output"); //############################################### // // Create context, command queue and kernel // //############################################### // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); checkError(err, "Creating command queue"); //Read Kernel source FILE *fp; char *source_str; size_t source_size, program_size; fp = fopen("./kernel/calculate_iterations.cl", "r"); if (!fp) { printf("Failed to load kernel\n"); return 1; } fseek(fp, 0, SEEK_END); program_size = ftell(fp); rewind(fp); source_str = (char*) malloc(program_size + 1); source_str[program_size] = '\0'; fread(source_str, sizeof(char), program_size, fp); fclose(fp); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &source_str, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); return EXIT_FAILURE; } // Create the compute kernel from the program ko_calculate_imagerowdots_iterations = clCreateKernel(program, "calculate_imagerowdots_iterations", &err); checkError(err, "Creating kernel"); // Create the compute kernel from the program ko_calculate_colorrow = clCreateKernel(program, "calculate_colorrow", &err); checkError(err, "Creating kernel"); int number_images = 0; do { //Get memory for image long* h_image = (long*) calloc(x_mon * y_mon, sizeof(long)); unsigned char* h_image_pixel = (unsigned char*) calloc( x_mon * y_mon * 3, sizeof(unsigned char)); //############################################### //############################################### // // Loop to calculate image dot iterations // //############################################### //############################################### float y_value = y_ebene_max; float delta_y = delta(y_ebene_min, y_ebene_max, y_mon); for (int row = 0; row < y_mon; ++row) { //############################################### // // Create and write buffer // //############################################### //Get memory for row long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector d_a = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long) * x_mon, NULL, &err); checkError(err, "Creating buffer d_a"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); checkError(err, "Copying h_a to device at d_a"); //############################################### // // Set the arguments to our compute kernel // //############################################### err = clSetKernelArg(ko_calculate_imagerowdots_iterations, 0, sizeof(float), &x_ebene_min); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 1, sizeof(float), &x_ebene_max); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 2, sizeof(float), &y_value); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 3, sizeof(long), &x_mon); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 4, sizeof(float), &abort_value); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 5, sizeof(long), &itr); err |= clSetKernelArg(ko_calculate_imagerowdots_iterations, 6, sizeof(cl_mem), &d_a); checkError(err, "Setting kernel arguments"); /*__kernel void calculate_imagerowdots_iterations(const float x_min, const float x_max, const float y_value, const long x_mon, const float abort_value, const long itr, __global long * imagerow)*/ // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = x_mon; err = clEnqueueNDRangeKernel(commands, ko_calculate_imagerowdots_iterations, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); // Wait for the commands to complete err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); // Read back the results from the compute device err = clEnqueueReadBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array!\n%s\n", err_code(err)); exit(1); } //reduce y y_value -= delta_y; //cope row to image memcpy(h_image + row * x_mon, h_image_row, sizeof(long) * x_mon); free(h_image_row); } // for (i = 0; i < x_mon * y_mon; ++i) { // printf("%ld ", h_image[i]); // } // fflush(stdout); //############################################### //############################################### // // End of loop to calculate image dot iterations // //############################################### //############################################### //############################################### //############################################### // // Beginn color calculation // //############################################### //############################################### for (int row = 0; row < y_mon; ++row) { //Get memory for row long* h_image_row = (long*) calloc(x_mon, sizeof(long)); // a vector memcpy(h_image_row, h_image + row * x_mon, sizeof(long) * x_mon); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(long) * x_mon, NULL, &err); checkError(err, "Creating buffer d_a"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(long) * x_mon, h_image_row, 0, NULL, NULL); checkError(err, "Copying h_image_row to device at d_a"); unsigned char* h_imagepixel_row = (unsigned char*) calloc(x_mon * 3, sizeof(unsigned char)); // a vector d_b = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(unsigned char) * x_mon * 3, NULL, &err); checkError(err, "Creating buffer d_b"); // Write a vector into compute device memory err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0, NULL, NULL); checkError(err, "Copying h_imagepixel_row to device at d_b"); //############################################### // // Set the arguments to our compute kernel // //############################################### err = clSetKernelArg(ko_calculate_colorrow, 0, sizeof(long), &x_mon); err |= clSetKernelArg(ko_calculate_colorrow, 1, sizeof(long), &itr); err |= clSetKernelArg(ko_calculate_colorrow, 2, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_calculate_colorrow, 3, sizeof(cl_mem), &d_b); checkError(err, "Setting kernel arguments"); /*__kernel void calculate_colorrow(const long width, long itr, long * imagerowvalues, unsigned char * imagerow)*/ // Execute the kernel over the entire range of our 1d input data set // letting the OpenCL runtime choose the work-group size global = x_mon; err = clEnqueueNDRangeKernel(commands, ko_calculate_colorrow, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); // Wait for the commands to complete err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); // Read back the results from the compute device err = clEnqueueReadBuffer(commands, d_b, CL_TRUE, 0, sizeof(unsigned char) * x_mon * 3, h_imagepixel_row, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read output array!\n%s\n", err_code(err)); exit(1); } memcpy(h_image_pixel + row * x_mon * 3, h_imagepixel_row, sizeof(unsigned char) * x_mon * 3); free(h_image_row); free(h_imagepixel_row); } if (number_images == 0) { zoom_dot = find_dot_to_zoom(x_ebene_min, x_ebene_max, y_ebene_min, y_ebene_max, h_image, y_mon, x_mon, itr); } reduce_plane_section_focus_dot(&x_ebene_min, &x_ebene_max, &y_ebene_min, &y_ebene_max, reduction, zoom_dot); // save the image char filename[50]; sprintf(filename, "img-%d.bmp", number_images); safe_image_to_bmp(x_mon, y_mon, h_image_pixel, filename); free(h_image); free(h_image_pixel); number_images++; itr = (long) (itr + itr * reduction / 100); printf("%d\n", number_images); fflush(stdout); } while (number_images < (fps * video_duration)); //############################################### // // cleanup then shutdown // //############################################### clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseProgram(program); clReleaseKernel(ko_calculate_imagerowdots_iterations); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main() { /* Host/device data structures */ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_int err; /* Program/kernel data structures */ cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; cl_kernel kernel; size_t offset = 0; size_t global_size, local_size; /* Data and buffers */ char pattern[16] = "thatwithhavefrom"; FILE *text_handle; char *text; size_t text_size; int chars_per_item; int result[4] = {0, 0, 0, 0}; cl_mem text_buffer, result_buffer; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err < 0) { perror("Couldn't access any devices"); exit(1); } /* Determine global size and local size */ clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(global_size), &global_size, NULL); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); global_size *= local_size; /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Read program file and place content into buffer */ program_handle = fopen(PROGRAM_FILE, "r"); if(program_handle == NULL) { perror("Couldn't find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)calloc(program_size+1, sizeof(char)); fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Read text file and place content into buffer */ text_handle = fopen(TEXT_FILE, "r"); if(text_handle == NULL) { perror("Couldn't find the text file"); exit(1); } fseek(text_handle, 0, SEEK_END); text_size = ftell(text_handle)-1; rewind(text_handle); text = (char*)calloc(text_size, sizeof(char)); fread(text, sizeof(char), text_size, text_handle); fclose(text_handle); chars_per_item = text_size / global_size + 1; /* Create program from file */ program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) calloc(log_size+1, sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create buffers to hold the text characters and count */ text_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, text_size, text, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(result), result, NULL); /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(pattern), pattern); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &text_buffer); err |= clSetKernelArg(kernel, 2, sizeof(chars_per_item), &chars_per_item); err |= clSetKernelArg(kernel, 3, 4 * sizeof(int), NULL); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &result_buffer); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); printf("Error code: %d\n", err); exit(1); } /* Read and print the result */ err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, sizeof(result), &result, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } printf("\nResults: \n"); printf("Number of occurrences of 'that': %d\n", result[0]); printf("Number of occurrences of 'with': %d\n", result[1]); printf("Number of occurrences of 'have': %d\n", result[2]); printf("Number of occurrences of 'from': %d\n", result[3]); /* Deallocate resources */ clReleaseMemObject(result_buffer); clReleaseMemObject(text_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main() { /* Host/device data structures */ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_int err, i; /* Program/kernel data structures */ cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; cl_kernel kernel; size_t global_size, local_size; /* Data and buffers */ int num_rows, num_cols, num_values; int *rows, *cols; float *values, *b_vec; float result[2]; double value_double; cl_mem rows_buffer, cols_buffer, values_buffer, b_buffer, result_buffer; /* Read sparse file */ FILE *mm_handle; MM_typecode code; /* Read matrix file */ if ((mm_handle = fopen(MM_FILE, "r")) == NULL) { perror("Couldn't open the MatrixMarket file"); exit(1); } mm_read_banner(mm_handle, &code); mm_read_mtx_crd_size(mm_handle, &num_rows, &num_cols, &num_values); /* Check for symmetry and allocate memory */ if(mm_is_symmetric(code) || mm_is_skew(code) || mm_is_hermitian(code)) { num_values += num_values - num_rows; } rows = (int*) malloc(num_values * sizeof(int)); cols = (int*) malloc(num_values * sizeof(int)); values = (float*) malloc(num_values * sizeof(float)); b_vec = (float*) malloc(num_rows * sizeof(float)); /* Read matrix data and close file */ i=0; while(i<num_values) { fscanf(mm_handle, "%d %d %lg\n", &rows[i], &cols[i], &value_double); values[i] = (float)value_double; cols[i]--; rows[i]--; if((rows[i] != cols[i]) && (mm_is_symmetric(code) || mm_is_skew(code) || mm_is_hermitian(code))) { i++; rows[i] = cols[i-1]; cols[i] = rows[i-1]; values[i] = values[i-1]; } i++; } sort(num_values, rows, cols, values); fclose(mm_handle); /* Initialize the b vector */ srand(time(0)); for(i=0; i<num_rows; i++) { b_vec[i] = (float)(rand() - RAND_MAX/2); } /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err < 0) { perror("Couldn't access any devices"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Read program file and place content into buffer */ program_handle = fopen(PROGRAM_FILE, "r"); if(program_handle == NULL) { perror("Couldn't find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)calloc(program_size+1, sizeof(char)); fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Create program from file */ program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) calloc(log_size+1, sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { printf("Couldn't create a kernel: %d", err); exit(1); }; /* Create buffers to hold the text characters and count */ rows_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, num_values*sizeof(int), rows, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; cols_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, num_values*sizeof(int), cols, NULL); values_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, num_values*sizeof(float), values, NULL); b_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, num_values*sizeof(float), b_vec, NULL); result_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 2*sizeof(float), NULL, NULL); /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(num_rows), &num_rows); err |= clSetKernelArg(kernel, 1, sizeof(num_values), &num_values); err |= clSetKernelArg(kernel, 2, num_rows * sizeof(float), NULL); err |= clSetKernelArg(kernel, 3, num_rows * sizeof(float), NULL); err |= clSetKernelArg(kernel, 4, num_rows * sizeof(float), NULL); err |= clSetKernelArg(kernel, 5, num_rows * sizeof(float), NULL); err |= clSetKernelArg(kernel, 6, sizeof(cl_mem), &rows_buffer); err |= clSetKernelArg(kernel, 7, sizeof(cl_mem), &cols_buffer); err |= clSetKernelArg(kernel, 8, sizeof(cl_mem), &values_buffer); err |= clSetKernelArg(kernel, 9, sizeof(cl_mem), &b_buffer); err |= clSetKernelArg(kernel, 10, sizeof(cl_mem), &result_buffer); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ global_size = num_rows; local_size = num_rows; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); printf("Error: %d\n", err); exit(1); } /* Read the results */ err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, 2*sizeof(float), result, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Print the result */ printf("After %d iterations, the residual length is %f.\n", (int)result[0], result[1]); /* Deallocate resources */ free(b_vec); free(rows); free(cols); free(values); clReleaseMemObject(b_buffer); clReleaseMemObject(rows_buffer); clReleaseMemObject(cols_buffer); clReleaseMemObject(values_buffer); clReleaseMemObject(result_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
///////////////////////////////////////////////////////////////// // Create OpenCL memory buffers ///////////////////////////////////////////////////////////////// bool initializeCLBuffers (JobToPUM *job) { cl_int status = CL_SUCCESS; int devID = job->runOn; cl_context *context = PUInfoStruct.PUsContexts[devID]; cl_device_id device;// = calloc(1, sizeof(cl_device_id)); size_t deviceListSize; status = clGetContextInfo(*context, CL_CONTEXT_DEVICES, 0, NULL, &deviceListSize); if (status != CL_SUCCESS) { fprintf(stderr, "clGetContextInfo failed (%i).\n", status); return false; } //Get the corresponding device (TODO: currently only one) status = clGetContextInfo(*context, CL_CONTEXT_DEVICES, sizeof(cl_device_id), &device, NULL); if (status != CL_SUCCESS) { fprintf(stderr, "clGetContextInfo failed (%i).\n", status); return false; } PUInfoStruct.argBuffers[devID] = calloc(job->nTotalArgs, sizeof(cl_mem)); //TODO: support other memory locations for (cl_uint i = 0; i < job->nTotalArgs; i++) { /* Create buffers */ PUInfoStruct.argBuffers[devID][i] = clCreateBuffer(*context, CL_MEM_READ_WRITE, job->argSizes[i], 0, &status); if (status != CL_SUCCESS) { fprintf(stderr,"clCreateBuffer failed. (inputBuffers[%i]))\n",i); return false; } if ((job->argTypes[i] == INPUT) || (job->argTypes[i] == INPUT_OUTPUT)) { if (job->probID == myid && job->jobID == 2){ for (int j = 0; j < 30; j++) { status = clEnqueueWriteBuffer(*(PUInfoStruct.PUsCmdQs[devID]), PUInfoStruct.argBuffers[devID][i], CL_TRUE, 0, job->argSizes[i], job->arguments[i], 0, 0, 0); if (status != CL_SUCCESS) { fprintf(stderr,"clEnqueueWriteBuffer failed. (inputBuffers[%i])\n",i); return false; } } } else { status = clEnqueueWriteBuffer(*(PUInfoStruct.PUsCmdQs[devID]), PUInfoStruct.argBuffers[devID][i], CL_TRUE, 0, job->argSizes[i], job->arguments[i], 0, 0, 0); if (status != CL_SUCCESS) { fprintf(stderr,"clEnqueueWriteBuffer failed. (inputBuffers[%i])\n",i); return false; } } } } ///////////////////////////////////////////////////////////////// // Load CL source, build CL program object, create CL kernel object ///////////////////////////////////////////////////////////////// /* create a CL program using the kernel source */ cl_program program;// = malloc(sizeof(cl_program)); // PUInfoStruct.currKernels[devID] = malloc(sizeof(cl_kernel)); const char *sourceStr = job->taskSource; if (debug_PUM) fprintf(stderr, "TRYING (%i)\n", job->taskSourceSize); if (sourceStr == NULL) { fprintf(stderr, "initializeCLBuffers srcStr is NULL\n"); return false; } size_t sourceSize[] = { strlen(sourceStr) }; assert(sourceSize[0] == job->taskSourceSize-1); program = clCreateProgramWithSource(*context, 1, &sourceStr, sourceSize, &status); if (status != CL_SUCCESS) { fprintf(stderr, "clCreateProgramWithSource failed.\n"); return false; } /* create a cl program executable for all the devices specified */ //currently only one device supported status = clBuildProgram(program, 1, &device, NULL, NULL, NULL); size_t ret_val_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); char *build_log = calloc(ret_val_size+1, sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); build_log[ret_val_size] = '\0'; if (!SILENT){ if (strlen(build_log) > 0) { fprintf(stderr, "PUM (%i): buildlog:\n>>>\n%s\n<<<\n__END OF BUILDLOG__\n",myid, build_log); } else { fprintf(stderr, "PUM (%i): NO BUILD LOG\n", myid); } } free(build_log); if (status != CL_SUCCESS) { fprintf(stderr, "clBuildProgram failed (%i).\n", status); return false; } /* get a kernel object handle for a kernel with the given name */ PUInfoStruct.currKernels[devID] = clCreateKernel(program, job->startingKernel, &status); if (status != CL_SUCCESS) { fprintf(stderr, "clCreateKernel failed.\n"); return false; } // free (device); // free (program); //TODO: check if this should be persistent return true; }
cl_program getOrBuildProgram(const Context* ctx, const cv::ocl::ProgramEntry* source, const String& options) { cl_int status = 0; cl_program program = NULL; std::vector<char> binary; if (!enable_disk_cache || !readConfigurationFromFile(options, binary)) { program = clCreateProgramWithSource(getClContext(ctx), 1, (const char**)&source->programStr, NULL, &status); openCLVerifyCall(status); cl_device_id device = getClDeviceID(ctx); status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL); if(status == CL_SUCCESS) { if (enable_disk_cache) { size_t binarySize; openCLSafeCall(clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binarySize, NULL)); std::vector<char> binary(binarySize); char* ptr = &binary[0]; openCLSafeCall(clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(char*), &ptr, NULL)); if (!writeConfigurationToFile(options, binary)) { std::cerr << "Can't write data to file: " << fileName_ << std::endl; } } } } else { cl_device_id device = getClDeviceID(ctx); size_t size = binary.size(); const char* ptr = &binary[0]; program = clCreateProgramWithBinary(getClContext(ctx), 1, &device, (const size_t *)&size, (const unsigned char **)&ptr, NULL, &status); openCLVerifyCall(status); status = clBuildProgram(program, 1, &device, options.c_str(), NULL, NULL); } if(status != CL_SUCCESS) { if(status == CL_BUILD_PROGRAM_FAILURE) { size_t buildLogSize = 0; openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx), CL_PROGRAM_BUILD_LOG, 0, NULL, &buildLogSize)); std::vector<char> buildLog; buildLog.resize(buildLogSize); memset(&buildLog[0], 0, buildLogSize); openCLSafeCall(clGetProgramBuildInfo(program, getClDeviceID(ctx), CL_PROGRAM_BUILD_LOG, buildLogSize, &buildLog[0], NULL)); std::cout << std::endl << "BUILD LOG: " << (source->name ? source->name : "dynamic program") << ": " << options << "\n"; std::cout << &buildLog[0] << std::endl; } openCLVerifyCall(status); } return program; }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; /* Prepare an array of __cl_float4 via dynamic memory allocation This will map to the native vector type which is SSE / SSE2 / AVX on Intel-compatible processors. */ cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device for( int i = 0; i < DATA_SIZE; ++i) { ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i}; ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f}; } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { cl_uint numOfDevices = 0; /* Determine how many devices are connected to your platform */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices); /* Load the information about your devices into the variable 'devices' */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } printf("Number of detected OpenCL devices: %d\n", numOfDevices); /* Create a context */ cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 }; context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* For each device, create a buffer and partition that data among the devices for compute! */ cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * DATA_SIZE, ud_in, &error); if(error != CL_SUCCESS) { perror("Can't create a buffer"); exit(1); } int offset = 0; for(int i = 0; i < numOfDevices; ++i, ++offset ) { /* Load the two source files into temporary datastores */ const char *file_names[] = {"vectorization.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; size_t log_size; char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe"; error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); /* Loop thru each kernel and execute on device */ for(cl_uint j = 0; j < numOfKernels; j++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); printf("About to create command queue and enqueue this kernel...\n"); /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } /* Create a buffer and copy the data from the main buffer */ cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * DATA_SIZE, 0, &error); if (error != CL_SUCCESS) { perror("Unable to create sub-buffer object"); exit(1); } /* Let OpenCL know that the kernel is suppose to receive an argument */ error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj); error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj); if (error != CL_SUCCESS) { perror("Unable to set buffer object in kernel"); exit(1); } /* Enqueue the kernel to the command queue */ error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL); if (error != CL_SUCCESS) { perror("Unable to enqueue task to command-queue"); exit(1); } printf("Task has been enqueued successfully!\n"); /* Enqueue the read-back from device to host */ error = clEnqueueReadBuffer(cQ, outobj, CL_TRUE, // blocking read 0, // read from the start sizeof(cl_float8)*DATA_SIZE, // how much to copy ud_out, 0, NULL, NULL); /* Check the returned data */ if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) { printf("Check passed!\n"); } else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); clReleaseMemObject(outobj); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); }// end of device loop and execution clReleaseMemObject(inobj); clReleaseContext(context); }// end of platform loop free(ud_in); free(ud_out); }
int main(int argc, char **argv){ printf("Check OpenCL environtment\n"); cl_platform_id platid; cl_device_id devid; cl_int res; size_t param; /* Query OpenCL, get some information about the returned device */ clGetPlatformIDs(1u, &platid, NULL); clGetDeviceIDs(platid, CL_DEVICE_TYPE_ALL, 1, &devid, NULL); cl_char vendor_name[1024] = {0}; cl_char device_name[1024] = {0}; clGetDeviceInfo(devid, CL_DEVICE_VENDOR, sizeof(vendor_name), vendor_name, NULL); clGetDeviceInfo(devid, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); printf("Connecting to OpenCL device:\t%s %s\n", vendor_name, device_name); clGetDeviceInfo(devid, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), ¶m, NULL); printf("CL_DEVICE_MAX_COMPUTE_UNITS\t%d\n", param); clGetDeviceInfo(devid, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), ¶m, NULL); printf("CL_DEVICE_MAX_WORK_GROUP_SIZE\t%u\n", param); clGetDeviceInfo(devid, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); printf("CL_DEVICE_LOCAL_MEM_SIZE\t%ub\n", param); /* Check if kernel source exists, we compile argv[1] passed kernel */ if(argv[1] == NULL) { printf("\nUsage: %s kernel_source.cl kernel_function\n", argv[0]); exit(1); } char *kernel_source; if(load_program_source(argv[1], &kernel_source)) return 1; printf("Building from OpenCL source: \t%s\n", argv[1]); printf("Compile/query OpenCL_program:\t%s\n", argv[2]); /* Create context and kernel program */ cl_context context = clCreateContext(0, 1, &devid, NULL, NULL, NULL); cl_program pro = clCreateProgramWithSource(context, 1, (const char **)&kernel_source, NULL, NULL); res = clBuildProgram(pro, 1, &devid, "-cl-fast-relaxed-math", NULL, NULL); if(res != CL_SUCCESS){ printf("clBuildProgram failed: %d\n", res); char buf[0x10000]; clGetProgramBuildInfo(pro, devid, CL_PROGRAM_BUILD_LOG, 0x10000, buf, NULL); printf("\n%s\n", buf); return(-1); } cl_kernel kernelobj = clCreateKernel(pro, argv[2], &res); check_return(res); /* Get the maximum work-group size for executing the kernel on the device */ size_t global, local; res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_WORK_GROUP_SIZE, sizeof(int), &local, NULL); check_return(res); printf("CL_KERNEL_WORK_GROUP_SIZE\t%u\n", local); res = clGetKernelWorkGroupInfo(kernelobj, devid, CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), ¶m, NULL); check_return(res); printf("CL_KERNEL_LOCAL_MEM_SIZE\t%ub\n", param); cl_command_queue cmd_queue = clCreateCommandQueue(context, devid, CL_QUEUE_PROFILING_ENABLE, NULL); if(cmd_queue == NULL) { printf("Compute device setup failed\n"); return(-1); } local = 4; int n = 2 * local; //num_group * local workgroup size global = n; int num_groups= global / local, allocated_local= sizeof(data) * local + sizeof(debug) * local; data *DP __attribute__ ((aligned(16))); DP = calloc(n, sizeof(data) *1); debug *dbg __attribute__ ((aligned(16))); dbg = calloc(n, sizeof(debug)); printf("global:%d, local:%d, (should be):%d groups\n", global, local, num_groups); printf("structs size: %db, %db, %db\n", sizeof(data), sizeof(Elliptic_Curve), sizeof(inv256)); printf("sets:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(cl_uint4) *5 *4, allocated_local); cl_mem cl_DP, cl_EC, cl_INV, DEBUG; cl_DP = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR, n * sizeof(data), NULL, &res); check_return(res); cl_EC = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(Elliptic_Curve), NULL, &res); check_return(res); //_constant address space cl_INV= clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, 1 * sizeof(u8) * 0x80, NULL, &res); check_return(res); DEBUG = clCreateBuffer(context, CL_MEM_ALLOC_HOST_PTR | CL_MEM_WRITE_ONLY, n * sizeof(debug), NULL, &res); check_return(res); Elliptic_Curve EC; /* Curve domain parameters, (test vectors) ------------------------------------------------------------------------------------- p: c1c627e1638fdc8e24299bb041e4e23af4bb5427 is prime a: c1c627e1638fdc8e24299bb041e4e23af4bb5424 divisor g = 62980 b: 877a6d84155a1de374b72d9f9d93b36bb563b2ab divisor g = 227169643 Gx: 010aff82b3ac72569ae645af3b527be133442131 divisor g = 32209245 Gy: 46b8ec1e6d71e5ecb549614887d57a287df573cc divisor g = 972 precomputed_per_curve_constants: U: c1c627e1638fdc8e24299bb041e4e23af4bb5425 V: 3e39d81e9c702371dbd6644fbe1b1dc50b44abd9 already prepared mod p to test: a: 07189f858e3f723890a66ec1079388ebd2ed509c b: 6043379beb0dade6eed1e9d6de64f4a0c50639d4 gx: 5ef84aacf4f0ea6752f572d0741f40049f354dca gy: 418c695435af6b3d4d7cbb72967395016ef67239 resulting point: P.x: 01718f862ebe9423bd661a65355aa1c86ba330f8 program MUST got this point !! P.y: 557e8ed53ffbfe2c990a121967b340f62e0e4fe2 taken mod p: P.x: 41da1a8f74ff8d3f1ce20ef3e9d8865c96014fe3 P.y: 73ca143c9badedf2d9d3c7573307115ccfe04f13 */ u8 *t; t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5427"); memcpy(EC.p, t, 20); t = _x_to_u8_buffer("07189f858e3f723890a66ec1079388ebd2ed509c"); memcpy(EC.a, t, 20); t = _x_to_u8_buffer("6043379beb0dade6eed1e9d6de64f4a0c50639d4"); memcpy(EC.b, t, 20); t = _x_to_u8_buffer("5ef84aacf4f0ea6752f572d0741f40049f354dca"); memcpy(EC.Gx, t, 20); t = _x_to_u8_buffer("418c695435af6b3d4d7cbb72967395016ef67239"); memcpy(EC.Gy, t, 20); t = _x_to_u8_buffer("c1c627e1638fdc8e24299bb041e4e23af4bb5425"); memcpy(EC.U, t, 20); t = _x_to_u8_buffer("3e39d81e9c702371dbd6644fbe1b1dc50b44abd9"); memcpy(EC.V, t, 20); /* we need to map buffer now to load some k into data */ DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_WRITE, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); t = _x_to_u8_buffer("00542d46e7b3daac8aeb81e533873aabd6d74bb710"); for(u8 i = 0; i < n; i++) memcpy(DP[i].k, t, 21); free(t); //d for(u8 i = 0; i < n; i++) bn_print("", DP[i].k, 21, 1); /* we can alter just a byte into a chosen k to verify that we'll get a different point! */ //DP[2].k[2] = 0x09; //no res = clEnqueueWriteBuffer(cmd_queue, cl_DP, CL_TRUE, 0, n * sizeof(data), &DP, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_EC, CL_TRUE, 0, 1 * sizeof(Elliptic_Curve), &EC, 0, NULL, NULL); check_return(res); res = clEnqueueWriteBuffer(cmd_queue, cl_INV, CL_TRUE, 0, 1 * sizeof(u8) * 0x80, &inv256, 0, NULL, NULL); check_return(res); res = clSetKernelArg(kernelobj, 0, sizeof(cl_mem), &cl_DP); /* i/o buffer */ res|= clSetKernelArg(kernelobj, 1, sizeof(data) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 2, sizeof(cl_mem), &cl_EC); res|= clSetKernelArg(kernelobj, 3, sizeof(cl_mem), &cl_INV); res|= clSetKernelArg(kernelobj, 4, sizeof(debug) * local *1, NULL); //allocate space for __local in kernel (just this!) one * localsize res|= clSetKernelArg(kernelobj, 5, sizeof(cl_mem), &DEBUG); //this used to debug kernel output check_return(res); // printf("n:%d, total of %db needed, allocated _local: %db\n", n, n * sizeof(debug), allocated_local); cl_event NDRangeEvent; cl_ulong start, end; /* Execute NDrange */ res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, &local, 0, NULL, &NDRangeEvent); check_return(res); // res = clEnqueueNDRangeKernel(cmd_queue, kernelobj, 1, NULL, &global, NULL, 0, NULL, &NDRangeEvent); check_return(res); printf("Read back, Mapping buffer:\t%db\n", n * sizeof(data)); DP = clEnqueueMapBuffer(cmd_queue, cl_DP, CL_TRUE, CL_MAP_READ, 0, n * sizeof(data), 0, NULL, NULL, &res); check_return(res); dbg =clEnqueueMapBuffer(cmd_queue, DEBUG, CL_TRUE, CL_MAP_READ, 0, n * sizeof(debug), 0, NULL, NULL, &res); check_return(res); /* using clEnqueueReadBuffer template */ // res = clEnqueueReadBuffer(cmd_queue, ST, CL_TRUE, 0, sets * sizeof(cl_uint8), dbg, 0, NULL, NULL); check_return(res); clFlush(cmd_queue); clFinish(cmd_queue); /* get NDRange execution time with internal ocl profiler */ res = clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); res|= clGetEventProfilingInfo(NDRangeEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); check_return(res); printf("kernel execution time:\t\t%.2f ms\n", (float) ((end - start) /1000000)); //relative to NDRange call printf("number of computes/sec:\t%.2f\n", (float) global *1000000 /((end - start))); printf("i,\tgid\tlid0\tlsize0\tgid0/lsz0,\tgsz0,\tn_gr0,\tlid5,\toffset\n"); for(int i = 0; i < n; i++) { // if(i %local == 0) { printf("%d \t", i); //printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", *p, *(p +1), *(p +2), *(p +3), *(p +4), *(p +5), *(p +6), *(p +7)); /* silence this doubled debug info printf("%u\t%u\t%u\t%u\t| %2u, %2u, %2u, %u\n", dbg[i].data[0], dbg[i].data[1], dbg[i].data[2], dbg[i].data[3], dbg[i].data[4], dbg[i].data[5], dbg[i].data[6], dbg[i].data[7]); */ //printf("%d %d\n", P[i].dig, P[i].c); bn_print("", DP[i].k, 21, 1); bn_print("", DP[i].rx, 20, 0); bn_print(" ", DP[i].ry, 20, 1); printf("%u(/%u) = %u*%u(/%u) +%u, offset:%u, stride:%u\n", DP[i].pad[0], DP[i].pad[1], DP[i].pad[2], DP[i].pad[3], DP[i].pad[4], DP[i].pad[5], DP[i].pad[6], DP[i].pad[7]); // } } /* Release OpenCL stuff, free the rest */ clReleaseMemObject(cl_DP); clReleaseMemObject(cl_EC); clReleaseMemObject(cl_INV); clReleaseMemObject(DEBUG); clReleaseKernel(kernelobj); clReleaseProgram(pro); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); free(kernel_source); puts("Done!"); return 0; }
int main() { srand(unsigned(time(nullptr))); int err; // error code returned from api calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; // set seed for rand() srand(2014); //Allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*)malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*)malloc(mem_size_B); //Initialize host memory randomMemInit(h_A, size_A); randomMemInit(h_B, size_B); //Allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*)malloc(mem_size_C); printf("Initializing OpenCL device...\n"); cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[100]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); // Connect to a compute device int gpu = 1; err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context){ printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands){ printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource); if (lFileSize < 0L){ perror("File read failed"); return 1; } //const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (!program){ printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS){ size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "matrixMul", &err); if (!kernel || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err); if (!d_A || !d_B || !d_C){ printf("Error: Failed to allocate device memory!\n"); exit(1); } printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB); //Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B); err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA); err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC); if (err != CL_SUCCESS){ printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to execute kernel! %d\n", err); exit(1); } //Retrieve result from device err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to read output array! %d\n", err); exit(1); } //print table A printf("\nMatrix A\n"); for (int i = 0; i < size_A; i++){ printf("%f\t", h_A[i]); if (((i + 1) % WA) == 0) printf("\n"); } //print table B printf("\nMatrix B\n"); for (int i = 0; i < size_B; i++){ printf("%f\t", h_B[i]); if (((i + 1) % WB) == 0) printf("\n"); } //print out the results printf("\nMatrix C (Results)\n"); for (int i = 0; i < size_C; i++){ printf("%f\t", h_C[i]); if (((i + 1) % WC) == 0) printf("\n"); } printf("\n"); printf("Matrix multiplication completed...\n"); //Shutdown and cleanup free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); std::cin.clear(); std::cin.sync(); std::cin.get(); }
int main () { int err, i; cl_platform_id platform; cl_device_id device; cl_context context; cl_context_properties context_props[3]; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_mem buffer; size_t len; const char *program_source = NULL; char *device_extensions = NULL; char kernel_build_opts[256]; size_t size = sizeof (cl_int) * SIZE; const size_t global_work_size[] = {SIZE, 0, 0}; /* size of each dimension */ cl_int *data; /* In order to see which devices the OpenCL implementation on your platform provides you may issue a call to the print_clinfo () fuction. */ /* Initialize the data the OpenCl program operates on. */ data = (cl_int*) calloc (1, size); if (data == NULL) { fprintf (stderr, "calloc failed\n"); exit (EXIT_FAILURE); } /* Pick the first platform. */ CHK (clGetPlatformIDs (1, &platform, NULL)); /* Get the default device and create context. */ CHK (clGetDeviceIDs (platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL)); context_props[0] = CL_CONTEXT_PLATFORM; context_props[1] = (cl_context_properties) platform; context_props[2] = 0; context = clCreateContext (context_props, 1, &device, NULL, NULL, &err); CHK_ERR ("clCreateContext", err); queue = clCreateCommandQueue (context, device, 0, &err); CHK_ERR ("clCreateCommandQueue", err); /* Query OpenCL extensions of that device. */ CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, 0, NULL, &len)); device_extensions = (char *) malloc (len); CHK (clGetDeviceInfo (device, CL_DEVICE_EXTENSIONS, len, device_extensions, NULL)); strcpy (kernel_build_opts, "-Werror -cl-opt-disable"); if (strstr (device_extensions, "cl_khr_fp64") != NULL) strcpy (kernel_build_opts + strlen (kernel_build_opts), " -D HAVE_cl_khr_fp64"); if (strstr (device_extensions, "cl_khr_fp16") != NULL) strcpy (kernel_build_opts + strlen (kernel_build_opts), " -D HAVE_cl_khr_fp16"); /* Read the OpenCL kernel source into the main memory. */ program_source = read_file (STRINGIFY (CL_SOURCE), &len); if (program_source == NULL) { fprintf (stderr, "file does not exist: %s\n", STRINGIFY (CL_SOURCE)); exit (EXIT_FAILURE); } /* Build the OpenCL kernel. */ program = clCreateProgramWithSource (context, 1, &program_source, &len, &err); free ((void*) program_source); CHK_ERR ("clCreateProgramWithSource", err); err = clBuildProgram (program, 0, NULL, kernel_build_opts, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char *clbuild_log = NULL; CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &len)); clbuild_log = malloc (len); if (clbuild_log) { CHK (clGetProgramBuildInfo (program, device, CL_PROGRAM_BUILD_LOG, len, clbuild_log, NULL)); fprintf (stderr, "clBuildProgram failed with:\n%s\n", clbuild_log); free (clbuild_log); } exit (EXIT_FAILURE); } /* In some cases it might be handy to save the OpenCL program binaries to do further analysis on them. In order to do so you may call the following function: save_program_binaries (program);. */ kernel = clCreateKernel (program, "testkernel", &err); CHK_ERR ("clCreateKernel", err); /* Setup the input data for the kernel. */ buffer = clCreateBuffer (context, CL_MEM_USE_HOST_PTR, size, data, &err); CHK_ERR ("clCreateBuffer", err); /* Execute the kernel (data parallel). */ CHK (clSetKernelArg (kernel, 0, sizeof (buffer), &buffer)); CHK (clEnqueueNDRangeKernel (queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, NULL)); /* Fetch the results (blocking). */ CHK (clEnqueueReadBuffer (queue, buffer, CL_TRUE, 0, size, data, 0, NULL, NULL)); /* Compare the results. */ for (i = 0; i < SIZE; i++) { if (data[i] != 0x1) { fprintf (stderr, "error: data[%d]: %d != 0x1\n", i, data[i]); exit (EXIT_FAILURE); } } /* Cleanup. */ CHK (clReleaseMemObject (buffer)); CHK (clReleaseKernel (kernel)); CHK (clReleaseProgram (program)); CHK (clReleaseCommandQueue (queue)); CHK (clReleaseContext (context)); free (data); return 0; }
/** * @brief Creates an array of objects containing the OpenCL variables of each device * @param trDataBase The training database which will contain the instances and the features * @param selInstances The instances choosen as initial centroids * @param transposedTrDataBase The training database already transposed * @param conf The structure with all configuration parameters * @return A pointer containing the objects */ CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) { /********** Find the OpenCL devices specified in configuration ***********/ // OpenCL variables cl_uint numPlatformsDevices; cl_device_type deviceType; cl_program program; cl_kernel kernel; cl_int status; // Others variables auto allDevices = getAllDevices(); CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)]; for (int dev = 0; dev < conf -> nDevices; ++dev) { bool found = false; for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) { // Get the specified OpenCL device char dbuff[120]; check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME); // If the device exists... if (conf -> devices[dev] == dbuff) { devices[dev].device = allDevices[allDev]; devices[dev].deviceName = dbuff; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE); /********** Device local memory usage ***********/ long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer // Get the maximum local memory size long int maxMemory; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM); // Avoid exceeding the maximum local memory available. 1024 bytes of margin check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory); /********** Create context ***********/ devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT); /********** Create Command queue ***********/ devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE); /********** Create kernel ***********/ // Open the file containing the kernels std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in); check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN); // Obtain the size kernels.seekg(0, kernels.end); size_t fSize = kernels.tellg(); kernels.seekg(0, kernels.beg); char *kernelSource = new char[fSize]; kernels.read(kernelSource, fSize); kernels.close(); // Create program program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD); // Build program for the device in the context char buildOptions[196]; sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans); if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) { char buffer[4096]; fprintf(stderr, "Error: Could not build the program\n"); check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS); check(true, "%s\n", buffer); } // Create kernel const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : ""; devices[dev].kernel = clCreateKernel(program, kernelName, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD); /******* Work-items *******/ devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str()); devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str()); devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal; /******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/ // Create buffers devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS); devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB); devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB); devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS); // Sets kernel arguments check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1); check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2); check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3); check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6); // Write buffers check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB); // Resources used are released delete[] kernelSource; clReleaseProgram(program); found = true; allDevices.erase(allDevices.begin() + allDev); } } check(!found, "%s\n", CL_ERROR_DEVICE_FOUND); } /********** Add the CPU if has been enabled in configuration ***********/ if (conf -> ompThreads > 0) { devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU; devices[conf -> nDevices].computeUnits = conf -> ompThreads; ++(conf -> nDevices); } return devices; }
void init(OptionParser& op, bool _do_dp) { cl_int err; do_dp = _do_dp; if (!fftCtx) { // first get the device int device, platform = op.getOptionInt("platform"); if (op.getOptionVecInt("device").size() > 0) { device = op.getOptionVecInt("device")[0]; } else { device = 0; } fftDev = ListDevicesAndGetDevice(platform, device); // now get the context fftCtx = clCreateContext(NULL, 1, &fftDev, NULL, NULL, &err); CL_CHECK_ERROR(err); } if (!fftQueue) { // get a queue fftQueue = clCreateCommandQueue(fftCtx, fftDev, CL_QUEUE_PROFILING_ENABLE, &err); CL_CHECK_ERROR(err); } // create the program... fftProg = clCreateProgramWithSource(fftCtx, 1, &cl_source_fft, NULL, &err); CL_CHECK_ERROR(err); // ...and build it string args = " -cl-mad-enable "; if (op.getOptionBool("use-native")) { args += " -cl-fast-relaxed-math "; } if (!do_dp) { args += " -DSINGLE_PRECISION "; } else if (checkExtension(fftDev, "cl_khr_fp64")) { args += " -DK_DOUBLE_PRECISION "; } else if (checkExtension(fftDev, "cl_amd_fp64")) { args += " -DAMD_DOUBLE_PRECISION "; } err = clBuildProgram(fftProg, 0, NULL, args.c_str(), NULL, NULL); { char* log = NULL; size_t bytesRequired = 0; err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, 0, NULL, &bytesRequired ); log = (char*)malloc( bytesRequired + 1 ); err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, bytesRequired, log, NULL ); std::cout << log << std::endl; free( log ); } if (err != CL_SUCCESS) { char log[50000]; size_t retsize = 0; err = clGetProgramBuildInfo(fftProg, fftDev, CL_PROGRAM_BUILD_LOG, 50000*sizeof(char), log, &retsize); CL_CHECK_ERROR(err); cout << "Retsize: " << retsize << endl; cout << "Log: " << log << endl; dumpPTXCode(fftCtx, fftProg, "oclFFT"); exit(-1); } else { // dumpPTXCode(fftCtx, fftProg, "oclFFT"); } // Create kernel for forward FFT fftKrnl = clCreateKernel(fftProg, "fft1D_512", &err); CL_CHECK_ERROR(err); // Create kernel for inverse FFT ifftKrnl = clCreateKernel(fftProg, "ifft1D_512", &err); CL_CHECK_ERROR(err); // Create kernel for check chkKrnl = clCreateKernel(fftProg, "chk1D_512", &err); CL_CHECK_ERROR(err); }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); float* h_areas = new float[nelr]; int* h_elements_surrounding_elements = new int[nelr*NNB]; float* h_normals = new float[nelr*NDIM*NNB]; // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } // Get program source. long kernelSize = getKernelSize(); char* kernelSource = new char[kernelSize]; getKernelSource(kernelSource, kernelSize); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); CHKERR(err, "Failed to create a compute program!"); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); log = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log); free(log); exit(1); } CHKERR(err, "Failed to build program!"); delete[] kernelSource; // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
int main(int argc, char** argv) { int err; // error code returned from api calls float *cpu_xyz; //calculate the results on the CPU float *gpu_xyz; //calculate the results on the GPU unsigned int correct; // number of correct results returned char* kernel_source; //kernel source code cl_platform_id platform_id; // compute platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel code //stuff were going to query cl_int preferred_workgroup_size; cl_int max_workgroup_size; cl_mem cl_output; // device memory used for the output array int i, j, k; if(!cl_load("main.cl", &kernel_source)) { printf("Your file didn't load."); return 1; } int theta, phi, r; for(i=0; i<V_THETA_MAX; i++) { theta = i*V_THETA_INC; for(j=0; j<V_PHI_MAX; j++) { phi = j*V_PHI_INC; for(k=0; k<V_R_MAX; k++) { r = k*V_R_INC; } } } //get the platform information. This corresponds to vendor implementations of opencl cl_uint platforms; err = clGetPlatformIDs(1, &platform_id, &platforms); if(err != CL_SUCCESS) { printf("Error: Failed to query platform ids!\n"); return EXIT_FAILURE; } // Connect to a compute device. A GPU in this case. // cl_uint num_devices; err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, &num_devices); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context. A handle to the combination of platform and device. // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue. We will use this to send work to the CPU. // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // create a program, given the loaded source code. // program = clCreateProgramWithSource(context, 1, (const char **) &kernel_source, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Compile the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // create (select) a kernel function from the compiled program // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Get the preferred workgroup size multiple // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(preferred_workgroup_size), &preferred_workgroup_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Get the preferred workgroup size // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(max_workgroup_size), &max_workgroup_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Create the output array in device memory for our calculation // output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // // global = count; global = local; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if((results[i]) == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
void WorkScheduler::initialize(bool use_opencl, int num_cpu_threads) { /* initialize highlighting */ if (!g_highlightInitialized) { if (g_highlightedNodesRead) MEM_freeN(g_highlightedNodesRead); if (g_highlightedNodes) MEM_freeN(g_highlightedNodes); g_highlightedNodesRead = NULL; g_highlightedNodes = NULL; COM_startReadHighlights(); g_highlightInitialized = true; } #if COM_CURRENT_THREADING_MODEL == COM_TM_QUEUE /* deinitialize if number of threads doesn't match */ if (g_cpudevices.size() != num_cpu_threads) { Device *device; while (g_cpudevices.size() > 0) { device = g_cpudevices.back(); g_cpudevices.pop_back(); device->deinitialize(); delete device; } g_cpuInitialized = false; } /* initialize CPU threads */ if (!g_cpuInitialized) { for (int index = 0; index < num_cpu_threads; index++) { CPUDevice *device = new CPUDevice(); device->initialize(); g_cpudevices.push_back(device); } g_cpuInitialized = true; } #ifdef COM_OPENCL_ENABLED /* deinitialize OpenCL GPU's */ if (use_opencl && !g_openclInitialized) { g_context = NULL; g_program = NULL; if (clewInit() != CLEW_SUCCESS) /* this will check for errors and skip if already initialized */ return; if (clCreateContextFromType) { cl_uint numberOfPlatforms = 0; cl_int error; error = clGetPlatformIDs(0, 0, &numberOfPlatforms); if (error == -1001) { } /* GPU not supported */ else if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } if (G.f & G_DEBUG) printf("%u number of platforms\n", numberOfPlatforms); cl_platform_id *platforms = (cl_platform_id *)MEM_mallocN(sizeof(cl_platform_id) * numberOfPlatforms, __func__); error = clGetPlatformIDs(numberOfPlatforms, platforms, 0); unsigned int indexPlatform; for (indexPlatform = 0; indexPlatform < numberOfPlatforms; indexPlatform++) { cl_platform_id platform = platforms[indexPlatform]; cl_uint numberOfDevices = 0; clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 0, 0, &numberOfDevices); if (numberOfDevices <= 0) continue; cl_device_id *cldevices = (cl_device_id *)MEM_mallocN(sizeof(cl_device_id) * numberOfDevices, __func__); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, numberOfDevices, cldevices, 0); g_context = clCreateContext(NULL, numberOfDevices, cldevices, clContextError, NULL, &error); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } const char *cl_str[2] = {datatoc_COM_OpenCLKernels_cl, NULL}; g_program = clCreateProgramWithSource(g_context, 1, cl_str, 0, &error); error = clBuildProgram(g_program, numberOfDevices, cldevices, 0, 0, 0); if (error != CL_SUCCESS) { cl_int error2; size_t ret_val_size = 0; printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } char *build_log = (char *)MEM_mallocN(sizeof(char) * ret_val_size + 1, __func__); error2 = clGetProgramBuildInfo(g_program, cldevices[0], CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } build_log[ret_val_size] = '\0'; printf("%s", build_log); MEM_freeN(build_log); } else { unsigned int indexDevices; for (indexDevices = 0; indexDevices < numberOfDevices; indexDevices++) { cl_device_id device = cldevices[indexDevices]; cl_int vendorID = 0; cl_int error2 = clGetDeviceInfo(device, CL_DEVICE_VENDOR_ID, sizeof(cl_int), &vendorID, NULL); if (error2 != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error2, clewErrorString(error2)); } OpenCLDevice *clDevice = new OpenCLDevice(g_context, device, g_program, vendorID); clDevice->initialize(); g_gpudevices.push_back(clDevice); } } MEM_freeN(cldevices); } MEM_freeN(platforms); } g_openclInitialized = true; } #endif #endif }
int main(int argc, char *argv[]){ cl_uint numPlatforms; cl_platform_id* clSelectedPlatformID = NULL; int err; // error code returned from api calls int data[DATA_SIZE]; // original data set given to device int results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array if(parseArgs(argc, argv)){ return 0; } // Fill our data set with random int values unsigned int count = DATA_SIZE; //////////////////////////////////////////////////////////////////////////////// // Simple compute kernel which computes the collatz of an input array // const char *KernelSource = fileToString("gpuFunctions.c"); //get Platform clGetPlatformIDs(0, NULL, &numPlatforms); clSelectedPlatformID = (cl_platform_id*)malloc(sizeof(cl_platform_id)*numPlatforms); err = clGetPlatformIDs(numPlatforms, clSelectedPlatformID, NULL); //get Device err = clGetDeviceIDs(clSelectedPlatformID[0], CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } //create context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "allToOne", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } timer t = createTimer(); for(int i =0;i<rep;i++){ initData(data); // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } } double timeEnd = getTime(t); // Validate our results // correct = 0; for(int i = 0; i < arraySize; i++) { if(results[i] >= 0){ correct++; if(i==0){ printf("%d",results[i]); }else{ printf(",%d",results[i]); } } } printf("\n"); // Print a brief summary detailing the results printf("Computed '%d/%d' values to 1!\n", correct, arraySize); printf("TIME- %f\n",timeEnd); // Shutdown and cleanup clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'signbit_float16.cl' */ source_code = read_buffer("signbit_float16.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "signbit_float16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float16){{2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_int16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_int16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_int16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_int16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_int16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_int16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
void opencl_init(void) { // get the platform cl_uint num_platforms; clError = clGetPlatformIDs(0, NULL, &num_platforms); checkErr(clError, "clGetPlatformIDs( 0, NULL, &num_platforms );"); if (num_platforms <= 0) { std::cout << "No platform..." << std::endl; exit(1); } cl_platform_id* platforms = new cl_platform_id[num_platforms]; clError = clGetPlatformIDs(num_platforms, platforms, NULL); checkErr(clError, "clGetPlatformIDs( num_platforms, &platforms, NULL );"); if (num_platforms > 1) { char platformName[256]; clError = clGetPlatformInfo(platforms[0], CL_PLATFORM_VENDOR, sizeof(platformName), platformName, NULL); std::cerr << "Multiple platforms found defaulting to: " << platformName << std::endl; } platform_id = platforms[0]; if (getenv("OPENCL_PLATEFORM")) platform_id = platforms[1]; delete platforms; // Connect to a compute device // cl_uint device_count = 0; clError = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, 0, NULL, &device_count); checkErr(clError, "Failed to create a device group"); cl_device_id* deviceIds = (cl_device_id*) malloc( sizeof(cl_device_id) * device_count); clError = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_ALL, device_count, deviceIds, NULL); if (device_count > 1) { char device_name[256]; int compute_units; clError = clGetDeviceInfo(deviceIds[0], CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); checkErr(clError, "clGetDeviceInfo failed"); clError = clGetDeviceInfo(deviceIds[0], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &compute_units, NULL); checkErr(clError, "clGetDeviceInfo failed"); std::cerr << "Multiple devices found defaulting to: " << device_name; std::cerr << " with " << compute_units << " compute units" << std::endl; } device_id = deviceIds[0]; delete deviceIds; // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &clError); checkErr(clError, "Failed to create a compute context!"); // Create a command commands // commandQueue = clCreateCommandQueue(context, device_id, 0, &clError); checkErr(clError, "Failed to create a command commands!"); // READ KERNEL FILENAME std::string filename = "NOTDEFINED.cl"; char const* tmp_name = getenv("OPENCL_KERNEL"); if (tmp_name) { filename = std::string(tmp_name); } else { filename = std::string(__FILE__); filename = filename.substr(0, filename.length() - 17); filename += "/kernels.cl"; } // READ OPENCL_PARAMETERS std::string compile_parameters = ""; char const* tmp_params = getenv("OPENCL_PARAMETERS"); if (tmp_params) { compile_parameters = std::string(tmp_params); } std::ifstream kernelFile(filename.c_str(), std::ios::in); if (!kernelFile.is_open()) { std::cout << "Unable to open " << filename << ". " << __FILE__ << ":" << __LINE__ << "Please set OPENCL_KERNEL" << std::endl; exit(1); } /* * Read the kernel file into an output stream. * Convert this into a char array for passing to OpenCL. */ std::ostringstream outputStringStream; outputStringStream << kernelFile.rdbuf(); std::string srcStdStr = outputStringStream.str(); const char* charSource = srcStdStr.c_str(); kernelFile.close(); // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) &charSource, NULL, &clError); if (!program) { printf("Error: Failed to create compute program!\n"); exit(1); } // Build the program executable // clError = clBuildProgram(program, 0, NULL, compile_parameters.c_str(), NULL, NULL); /* Get the size of the build log. */ size_t logSize = 0; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logSize); if (clError != CL_SUCCESS) { if (logSize > 1) { char* log = new char[logSize]; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logSize, log, NULL); std::string stringChars(log, logSize); std::cerr << "Build log:\n " << stringChars << std::endl; delete[] log; } printf("Error: Failed to build program executable!\n"); exit(1); } return; }