int main() { /* Host/device data structures */ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_int err; /* Program/kernel data structures */ cl_program program; FILE *program_handle; char *program_buffer, *program_log; size_t program_size, log_size; cl_kernel kernel; size_t offset = 0; size_t global_size, local_size; /* Data and buffers */ char pattern[16] = "thatwithhavefrom"; FILE *text_handle; char *text; size_t text_size; int chars_per_item; int result[4] = {0, 0, 0, 0}; cl_mem text_buffer, result_buffer; /* Identify a platform */ err = clGetPlatformIDs(1, &platform, NULL); if(err < 0) { perror("Couldn't identify a platform"); exit(1); } /* Access a device */ err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if(err < 0) { perror("Couldn't access any devices"); exit(1); } /* Determine global size and local size */ clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(global_size), &global_size, NULL); clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); global_size *= local_size; /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Read program file and place content into buffer */ program_handle = fopen(PROGRAM_FILE, "r"); if(program_handle == NULL) { perror("Couldn't find the program file"); exit(1); } fseek(program_handle, 0, SEEK_END); program_size = ftell(program_handle); rewind(program_handle); program_buffer = (char*)calloc(program_size+1, sizeof(char)); fread(program_buffer, sizeof(char), program_size, program_handle); fclose(program_handle); /* Read text file and place content into buffer */ text_handle = fopen(TEXT_FILE, "r"); if(text_handle == NULL) { perror("Couldn't find the text file"); exit(1); } fseek(text_handle, 0, SEEK_END); text_size = ftell(text_handle)-1; rewind(text_handle); text = (char*)calloc(text_size, sizeof(char)); fread(text, sizeof(char), text_size, text_handle); fclose(text_handle); chars_per_item = text_size / global_size + 1; /* Create program from file */ program = clCreateProgramWithSource(context, 1, (const char**)&program_buffer, &program_size, &err); if(err < 0) { perror("Couldn't create the program"); exit(1); } free(program_buffer); /* Build program */ err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if(err < 0) { /* Find size of log and print to std output */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) calloc(log_size+1, sizeof(char)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("%s\n", program_log); free(program_log); exit(1); } /* Create a kernel */ kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create buffers to hold the text characters and count */ text_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, text_size, text, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; result_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(result), result, NULL); /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(pattern), pattern); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &text_buffer); err |= clSetKernelArg(kernel, 2, sizeof(chars_per_item), &chars_per_item); err |= clSetKernelArg(kernel, 3, 4 * sizeof(int), NULL); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &result_buffer); if(err < 0) { printf("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueNDRangeKernel(queue, kernel, 1, &offset, &global_size, &local_size, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); printf("Error code: %d\n", err); exit(1); } /* Read and print the result */ err = clEnqueueReadBuffer(queue, result_buffer, CL_TRUE, 0, sizeof(result), &result, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } printf("\nResults: \n"); printf("Number of occurrences of 'that': %d\n", result[0]); printf("Number of occurrences of 'with': %d\n", result[1]); printf("Number of occurrences of 'have': %d\n", result[2]); printf("Number of occurrences of 'from': %d\n", result[3]); /* Deallocate resources */ clReleaseMemObject(result_buffer); clReleaseMemObject(text_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int err; /* Data and events */ char *kernel_msg; float data[4096]; cl_mem data_buffer; cl_event kernel_event, read_event; /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(data), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read the buffer */ err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, sizeof(data), &data, 0, NULL, &read_event); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Set event handling routines */ kernel_msg = "The kernel finished successfully.\n\0"; err = clSetEventCallback(kernel_event, CL_COMPLETE, &kernel_complete, kernel_msg); if(err < 0) { perror("Couldn't set callback for event"); exit(1); } clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main(int argc, char **argv) { if (find_option(argc, argv, "-h") >= 0) { printf("Options:\n"); printf("-h to see this help\n"); printf("-n <int> to set the number of particles\n"); printf("-o <filename> to specify the output file name\n"); printf("-s <filename> to specify the summary output file name\n"); return 0; } int n = read_int(argc, argv, "-n", 1000); char *savename = read_string(argc, argv, "-o", NULL); char *sumname = read_string(argc, argv, "-s", NULL); // For return values. cl_int ret; // OpenCL stuff. // Loading kernel files. FILE *kernelFile; char *kernelSource; size_t kernelSize; kernelFile = fopen("simulationKernel.cl", "r"); if (!kernelFile) { fprintf(stderr, "No file named simulationKernel.cl was found\n"); exit(-1); } kernelSource = (char*)malloc(MAX_SOURCE_SIZE); kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile); fclose(kernelFile); // Getting platform and device information cl_platform_id platformId = NULL; cl_device_id deviceID = NULL; cl_uint retNumDevices; cl_uint retNumPlatforms; ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); // Different types of devices to pick from. At the moment picks the default opencl device. //CL_DEVICE_TYPE_GPU //CL_DEVICE_TYPE_ACCELERATOR //CL_DEVICE_TYPE_DEFAULT //CL_DEVICE_TYPE_CPU ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices); // Max workgroup size size_t max_available_local_wg_size; ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL); // Creating context. cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret); // Creating command queue cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret); // Build program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret); // printf("program = ret %i \n", ret); ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); // printf("clBuildProgram: ret %i \n", ret); // Create kernels cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret); cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret); cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret); cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret); FILE *fsave = savename ? fopen(savename, "w") : NULL; FILE *fsum = sumname ? fopen(sumname, "a") : NULL; particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t)); // GPU particle data structure cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret); // Set size set_size(n); init_particles(n, particles); double copy_time = read_timer(); // Copy particles to device. ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL); copy_time = read_timer() - copy_time; // Calculating thread and thread block counts. // sizes size_t globalItemSize; size_t localItemSize; // Global item size if (n <= NUM_THREADS) { globalItemSize = NUM_THREADS; localItemSize = 16; } else if (n % NUM_THREADS != 0) { globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS; } else { globalItemSize = n; } // Local item size localItemSize = globalItemSize / NUM_THREADS; // Bins and bin sizes. // Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10. // There will never be 10 particles in one bin. int maxParticles = 10; // Calculating the number of bins. int numberOfBins = (int)ceil(size/(2*cutoff)) + 2; // Bins will only exist on the device. particle_t* bins; // How many particles are there in each bin - also only exists on the device. volatile int* binSizes; // Number of bins to be initialized. size_t clearAmt = numberOfBins*numberOfBins; // Allocate memory for bins on the device. cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret); cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret); // SETTING ARGUMENTS FOR THE KERNELS // Set arguments for the init / clear kernel ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins); // Set arguments for the binning kernel ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(binKernel, 1, sizeof(int), &n); ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins); // Set arguments for force kernel. ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n); ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins); // Set arguments for move kernel ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n); ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size); // Variable to check if kernel execution is done. cl_event kernelDone; double simulation_time = read_timer(); int step = 0; for (step = 0; step < NSTEPS; step++) { // Execute bin initialization (clearing after first iteration) ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute binning kernel ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); // ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute force kernel ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute move kernel ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); if (fsave && (step%SAVEFREQ) == 0) { // Copy the particles back to the CPU ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); save(fsave, n, particles); } } simulation_time = read_timer() - simulation_time; printf("CPU-GPU copy time = %g seconds\n", copy_time); printf("n = %d, simulation time = %g seconds\n", n, simulation_time); if (fsum) fprintf(fsum, "%d %lf \n", n, simulation_time); if (fsum) fclose(fsum); free(particles); if (fsave) fclose(fsave); ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseKernel(forceKernel); ret = clReleaseKernel(moveKernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(d_particles); ret = clReleaseContext(context); return 0; }
void clean_all(void) { printf("Cleaning Variables ... \n\n"); // Opencl environment variables clReleaseCommandQueue(command_queue); clReleaseContext(context); // Release all memory allocated if (Data_MeshType == UNSTRUCTURED) { // Mesh Variables free(MeshElementArray.Node1); free(MeshElementArray.Node2); free(MeshElementArray.Node3); free(MeshElementArray.Node4); free(MeshNodeArray_double.x); free(MeshNodeArray_double.y); free(MeshNodeArray_double.z); free(MeshElementArray.Neighborindex1); free(MeshElementArray.Neighborindex2); free(MeshElementArray.Neighborindex3); free(MeshElementArray.Neighborindex4); clReleaseMemObject(Mesh_Node_x); clReleaseMemObject(Mesh_Node_y); clReleaseMemObject(Mesh_Node_z); clReleaseMemObject(Mesh_Element_Node1); clReleaseMemObject(Mesh_Element_Node2); clReleaseMemObject(Mesh_Element_Node3); clReleaseMemObject(Mesh_Element_Node4); clReleaseMemObject(Mesh_Element_Neighborindex1); clReleaseMemObject(Mesh_Element_Neighborindex2); clReleaseMemObject(Mesh_Element_Neighborindex3); clReleaseMemObject(Mesh_Element_Neighborindex4); clReleaseMemObject(r); clReleaseMemObject(s); clReleaseMemObject(t); clReleaseMemObject(eid); } // Cleaning Velocity variables free(velocity.u0); free(velocity.v0); free(velocity.w0); free(velocity.u1); free(velocity.v1); free(velocity.w1); free(velocity.time0); free(velocity.time1); free(Tracer.x); Tracer.x = NULL; free(Tracer.y); Tracer.y = NULL; free(Tracer.z); Tracer.z = NULL; free(Tracer.ElementIndex); Tracer.ElementIndex = NULL; free(Tracer.Start_time); Tracer.Start_time = NULL; free(Tracer.Stop_time); Tracer.Stop_time = NULL; free(Tracer.LeftDomain); Tracer.LeftDomain = NULL; if (Trace_ReleaseStrategy == 1) { free(Tracer1.x); Tracer1.x = NULL; free(Tracer1.y); Tracer1.y = NULL; free(Tracer1.z); Tracer1.z = NULL; free(Tracer1.ElementIndex); Tracer1.ElementIndex = NULL; free(Tracer1.Start_time); Tracer1.Start_time = NULL; free(Tracer1.Stop_time); Tracer1.Stop_time = NULL; free(Tracer1.LeftDomain); Tracer1.LeftDomain = NULL; free(index1); index1 = NULL; free(Tracer.Status); Tracer.Status = NULL; } free(DataTime1); free(Output_time); free(Launch_time); clReleaseMemObject(Vel_U0); clReleaseMemObject(Vel_U1); clReleaseMemObject(Vel_V0); clReleaseMemObject(Vel_V1); clReleaseMemObject(Vel_W0); clReleaseMemObject(Vel_W1); clReleaseMemObject(x_dev); clReleaseMemObject(y_dev); clReleaseMemObject(posx); clReleaseMemObject(posy); clReleaseMemObject(xn0); clReleaseMemObject(xn1); clReleaseMemObject(integrate); if (Dimensions == 3) { clReleaseMemObject(z_dev); clReleaseMemObject(posz); clReleaseMemObject(xn2); } clReleaseMemObject(Start_time_dev); clReleaseMemObject(Stop_time_dev); clReleaseMemObject(ElementIndex_dev); clReleaseMemObject(LeftDomain_dev); // Remove Temp file containing tracer release information if (!Keep_Tempfile) { char BinFile[LONGSTRING]; sprintf(BinFile, "%s%s.bin", Path_Output, Temp_OutFilePrefix); if(remove(BinFile)) fprintf(stderr, "Warning: Could not delete file %s\n", BinFile); } CL_CHECK(clReleaseKernel(kernel1)); CL_CHECK(clReleaseKernel(kernel2)); CL_CHECK(clReleaseKernel(kernel3)); CL_CHECK(clReleaseKernel(kernel4)); CL_CHECK(clReleaseKernel(kernel5)); CL_CHECK(clReleaseProgram(program)); printf("Cleaning Successfull \n\n"); }
void OpenCLDevice::deinitialize() { if (this->m_queue) { clReleaseCommandQueue(this->m_queue); } }
int main(void) { //time meassuring struct timeval tvs; //variables int Nx=1024; int Ny=1024; int plotnum=0; int Tmax=2; int plottime=0; int plotgap=1; double Lx=1.0; double Ly=1.0; double dt=0.0; double A=0.0; double B=0.0; double Du=0.0; double Dv=0.0; //splitting coefficients double a=0.5; double b=0.5; double c=1.0; //loop counters int i=0; int j=0; int n=0; double*umax=NULL; double*vmax=NULL; parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B); plottime=plotgap; vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); //openCL variables cl_platform_id *platform_id = NULL; cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL; cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL; cl_int ret; cl_uint num_platforms; // Detect how many platforms there are. ret = clGetPlatformIDs(0, NULL, &num_platforms); // Allocate enough space for the number of platforms. platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id)); // Store the platforms ret = clGetPlatformIDs(num_platforms, platform_id, NULL); printf("Found %d platform(s)!\n",num_platforms); cl_uint *num_devices; num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint)); cl_device_id **device_id = NULL; device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*)); // Detect number of devices in the platforms for(i=0;i<num_platforms;i++){ char buf[65536]; size_t size; ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size); printf("%s\n",buf); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices); printf("Found %d device(s) on platform %d!\n", num_devices[i],i); ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size); printf("%s ",buf); // Store numDevices from platform device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id)); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL); for(j=0;j<num_devices[i];j++){ ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size); printf("%s (%d,%d)\n",buf,i,j); } } //create context and command_queue cl_context context = NULL; cl_command_queue command_queue = NULL; //Which platform and device do i choose? int chooseplatform=0; int choosedevice=0; printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice); context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret); if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); } command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret); if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); } //OpenCL arrays cl_mem cl_u = NULL,cl_v = NULL; cl_mem cl_uhat = NULL, cl_vhat = NULL; cl_mem cl_kx = NULL, cl_ky = NULL; //FFT clfftPlanHandle planHandle; cl_mem tmpBuffer = NULL; fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny); //allocate gpu memory/ cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret); cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret); printf("allocated space\n"); //load the kernels loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies"); loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); size_t global_work_size[1] = {Nx*Ny}; size_t global_work_size_X[1] = {Nx}; size_t global_work_size_Y[1] = {Ny}; //frequencies ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //printCL(&cl_kx,&command_queue,Nx,1); //printCL(&cl_ky,&command_queue,1,Ny); //inintial data ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx); ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly); ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //make output writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); printf("Got initial data, starting timestepping\n"); mtime_s(&tvs); for(n=0;n<=Tmax;n++){ //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //linear fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //printf("A%f,B%f\n",A,B); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du); ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv); ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A); ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B); ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt); ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c); ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]); plottime=plottime+plotgap; plotnum=plotnum+1; writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); } }//end timestepping printf("Finished time stepping\n"); mtime_e(&tvs,"Programm took:"); writearray(umax,(Tmax/plotgap)+1,"u"); writearray(vmax,(Tmax/plotgap)+1,"v"); free(umax); free(vmax); clReleaseMemObject(cl_u); clReleaseMemObject(cl_v); clReleaseMemObject(cl_uhat); clReleaseMemObject(cl_vhat); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); ret = clReleaseKernel(initialdata); ret = clReleaseKernel(frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseKernel(nonlinearpart_a); ret = clReleaseKernel(nonlinearpart_b); fftdestroy(&planHandle, &tmpBuffer); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); for(i=0;i<num_platforms;i++){free(device_id[i]);} free(device_id); free(platform_id); free(num_devices); printf("Program execution complete\n"); return 0; }
void xcl_release_world(xcl_world world) { clReleaseCommandQueue(world.command_queue); clReleaseContext(world.context); }
ofxClScheduler::~ofxClScheduler() { if(globalQ) clReleaseCommandQueue(globalQ); if(context) clReleaseContext(context); }
int main(int argc, char** argv) { /* OpenCL 1.1 data structures */ cl_platform_id* platforms; cl_program program; cl_context context; /* OpenCL 1.1 scalar data types */ cl_uint numOfPlatforms; cl_int error; /* Prepare an array of __cl_float4 via dynamic memory allocation This will map to the native vector type which is SSE / SSE2 / AVX on Intel-compatible processors. */ cl_float8* ud_in = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // input to device cl_float8* ud_out = (cl_float8*) malloc( sizeof(cl_float8) * DATA_SIZE); // output from device for( int i = 0; i < DATA_SIZE; ++i) { ud_in[i] = (cl_float8){(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i,(float)i}; ud_out[i] = (cl_float8){(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f,(float)0.f}; } /* Get the number of platforms Remember that for each vendor's SDK installed on the computer, the number of available platform also increased. */ error = clGetPlatformIDs(0, NULL, &numOfPlatforms); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } platforms = (cl_platform_id*) alloca(sizeof(cl_platform_id) * numOfPlatforms); printf("Number of OpenCL platforms found: %d\n", numOfPlatforms); error = clGetPlatformIDs(numOfPlatforms, platforms, NULL); if(error != CL_SUCCESS ) { perror("Unable to find any OpenCL platforms"); exit(1); } // Search for a CPU/GPU device through the installed platforms // Build a OpenCL program and do not run it. for(cl_uint i = 0; i < numOfPlatforms; i++ ) { cl_uint numOfDevices = 0; /* Determine how many devices are connected to your platform */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &numOfDevices); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } cl_device_id* devices = (cl_device_id*) alloca(sizeof(cl_device_id) * numOfDevices); /* Load the information about your devices into the variable 'devices' */ error = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numOfDevices, devices, NULL); if (error != CL_SUCCESS ) { perror("Unable to obtain any OpenCL compliant device info"); exit(1); } printf("Number of detected OpenCL devices: %d\n", numOfDevices); /* Create a context */ cl_context_properties ctx[3] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[i], 0 }; context = clCreateContext(ctx, numOfDevices, devices, NULL, NULL, &error); if(error != CL_SUCCESS) { perror("Can't create a valid OpenCL context"); exit(1); } /* For each device, create a buffer and partition that data among the devices for compute! */ cl_mem inobj = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float8) * DATA_SIZE, ud_in, &error); if(error != CL_SUCCESS) { perror("Can't create a buffer"); exit(1); } int offset = 0; for(int i = 0; i < numOfDevices; ++i, ++offset ) { /* Load the two source files into temporary datastores */ const char *file_names[] = {"vectorization.cl"}; const int NUMBER_OF_FILES = 1; char* buffer[NUMBER_OF_FILES]; size_t sizes[NUMBER_OF_FILES]; loadProgramSource(file_names, NUMBER_OF_FILES, buffer, sizes); /* Create the OpenCL program object */ program = clCreateProgramWithSource(context, NUMBER_OF_FILES, (const char**)buffer, sizes, &error); if(error != CL_SUCCESS) { perror("Can't create the OpenCL program object"); exit(1); } /* Build OpenCL program object and dump the error message, if any */ char *program_log; size_t log_size; char* build_options = "-fbin-llvmir -fbin-amdil -fbin-exe"; error = clBuildProgram(program, 1, &devices[i], build_options, NULL, NULL); if(error != CL_SUCCESS) { // If there's an error whilst building the program, dump the log clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); program_log = (char*) malloc(log_size+1); program_log[log_size] = '\0'; clGetProgramBuildInfo(program, devices[i], CL_PROGRAM_BUILD_LOG, log_size+1, program_log, NULL); printf("\n=== ERROR ===\n\n%s\n=============\n", program_log); free(program_log); exit(1); } /* Query the program as to how many kernels were detected */ cl_uint numOfKernels; error = clCreateKernelsInProgram(program, 0, NULL, &numOfKernels); if (error != CL_SUCCESS) { perror("Unable to retrieve kernel count from program"); exit(1); } cl_kernel* kernels = (cl_kernel*) alloca(sizeof(cl_kernel) * numOfKernels); error = clCreateKernelsInProgram(program, numOfKernels, kernels, NULL); /* Loop thru each kernel and execute on device */ for(cl_uint j = 0; j < numOfKernels; j++) { char kernelName[32]; cl_uint argCnt; clGetKernelInfo(kernels[j], CL_KERNEL_FUNCTION_NAME, sizeof(kernelName), kernelName, NULL); clGetKernelInfo(kernels[j], CL_KERNEL_NUM_ARGS, sizeof(argCnt), &argCnt, NULL); printf("Kernel name: %s with arity: %d\n", kernelName, argCnt); printf("About to create command queue and enqueue this kernel...\n"); /* Create a command queue */ cl_command_queue cQ = clCreateCommandQueue(context, devices[i], 0, &error); if (error != CL_SUCCESS) { perror("Unable to create command-queue"); exit(1); } /* Create a buffer and copy the data from the main buffer */ cl_mem outobj = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float8) * DATA_SIZE, 0, &error); if (error != CL_SUCCESS) { perror("Unable to create sub-buffer object"); exit(1); } /* Let OpenCL know that the kernel is suppose to receive an argument */ error = clSetKernelArg(kernels[j], 0, sizeof(cl_mem), &inobj); error = clSetKernelArg(kernels[j], 1, sizeof(cl_mem), &outobj); if (error != CL_SUCCESS) { perror("Unable to set buffer object in kernel"); exit(1); } /* Enqueue the kernel to the command queue */ error = clEnqueueTask(cQ, kernels[j], 0, NULL, NULL); if (error != CL_SUCCESS) { perror("Unable to enqueue task to command-queue"); exit(1); } printf("Task has been enqueued successfully!\n"); /* Enqueue the read-back from device to host */ error = clEnqueueReadBuffer(cQ, outobj, CL_TRUE, // blocking read 0, // read from the start sizeof(cl_float8)*DATA_SIZE, // how much to copy ud_out, 0, NULL, NULL); /* Check the returned data */ if ( valuesOK(ud_in, ud_out, DATA_SIZE) ) { printf("Check passed!\n"); } else printf("Check failed!\n"); /* Release the command queue */ clReleaseCommandQueue(cQ); clReleaseMemObject(outobj); } /* Clean up */ for(cl_uint i = 0; i < numOfKernels; i++) { clReleaseKernel(kernels[i]); } for(int i=0; i< NUMBER_OF_FILES; i++) { free(buffer[i]); } clReleaseProgram(program); }// end of device loop and execution clReleaseMemObject(inobj); clReleaseContext(context); }// end of platform loop free(ud_in); free(ud_out); }
int main(int argc, char** argv) { int rank, size; // MPI rank & size int err; // error code returned from OpenCL calls float h_a[LENGTH]; // a vector float h_b[LENGTH]; // b vector float h_c[LENGTH]; // c vector (a+b) returned from the compute device (local per task) float _h_c[LENGTH]; // c vector (a+b) returned from the compute device (global for master) unsigned int correct; // number of correct results size_t global; // global domain size size_t local; // local domain size cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel ko_vadd; // compute kernel cl_mem d_a; // device memory used for the input a vector cl_mem d_b; // device memory used for the input b vector cl_mem d_c; // device memory used for the output c vector int mycount, i; err = MPI_Init (&argc, &argv); if (err != MPI_SUCCESS) { printf ("MPI_Init failed!\n"); exit (-1); } err = MPI_Comm_rank (MPI_COMM_WORLD, &rank); if (err != MPI_SUCCESS) { printf ("MPI_Comm_rank failed!\n"); exit (-1); } err = MPI_Comm_size (MPI_COMM_WORLD, &size); if (err != MPI_SUCCESS) { printf ("MPI_Comm_size failed\n"); exit (-1); } if (LENGTH % size != 0) { printf ("Number of MPI processes must divide LENGTH (%d)\n", LENGTH); exit (-1); } mycount = LENGTH / size; if (rank == 0) { for (i = 0; i < LENGTH; i++) { h_a[i] = rand() / (float)RAND_MAX; h_b[i] = rand() / (float)RAND_MAX; h_a[i] = i; h_b[i] = i*2; } err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed transferring h_b\n"); exit (-1); } } else { err = MPI_Bcast (h_a, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_a\n"); exit (-1); } err = MPI_Bcast (h_b, LENGTH, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Bcast failed receiving h_b\n"); exit (-1); } } // Set up platform cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n"); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n"); return EXIT_FAILURE; } // Secure a GPU for (i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } else { if (output_device_info (rank, device_id) != CL_SUCCESS) return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel from the program ko_vadd = clCreateKernel(program, "vadd", &err); if (!ko_vadd || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input (a, b) and output (c) arrays in device memory d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * mycount, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * mycount, NULL, NULL); if (!d_a || !d_b || !d_c) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write a and b vectors into compute device memory err = clEnqueueWriteBuffer(commands, d_a, CL_TRUE, 0, sizeof(float) * mycount, &h_a[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_a to source array!\n"); exit(1); } err = clEnqueueWriteBuffer(commands, d_b, CL_TRUE, 0, sizeof(float) * mycount, &h_b[rank*mycount], 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write h_b to source array!\n"); exit(1); } // Set the arguments to our compute kernel err = clSetKernelArg(ko_vadd, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(ko_vadd, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(ko_vadd, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(ko_vadd, 3, sizeof(unsigned int), &mycount); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(ko_vadd, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device global = LENGTH; err = clEnqueueNDRangeKernel(commands, ko_vadd, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the commands to complete before reading back results clFinish(commands); // Read back the results from the compute device err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * mycount, &h_c, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } err = MPI_Gather (h_c, mycount, MPI_FLOAT, _h_c, mycount, MPI_FLOAT, 0, MPI_COMM_WORLD); if (err != MPI_SUCCESS) { printf ("MPI_Gather failed receiving h_c\n"); exit (-1); } if (rank == 0) { // Test the results correct = 0; float tmp; for(i = 0; i < LENGTH; i++) { tmp = h_a[i] + h_b[i]; // assign element i of a+b to tmp tmp -= _h_c[i]; // compute deviation of expected and output result if(tmp*tmp < TOL*TOL) // correct if square deviation is less than tolerance squared correct++; else printf(" tmp %f h_a %f h_b %f h_c %f \n",tmp, h_a[i], h_b[i], _h_c[i]); } // summarize results printf("C = A+B: %d out of %d results were correct.\n", correct, LENGTH); } // cleanup then shutdown clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(ko_vadd); clReleaseCommandQueue(commands); clReleaseContext(context); err = MPI_Finalize (); if (err != MPI_SUCCESS) { printf ("MPI_Finalize failed!\n"); exit (-1); } return 0; }
int main() { struct ecl_context ctx; cl_program program; cl_kernel kernel; cl_int err; cl_mem in, out; size_t globWorkSize; int n = 100000; cl_event event; cl_ulong start, end; err = eclGetSomeContext(&ctx); assert(err == CL_SUCCESS); err = eclGetProgramFromSource(ctx.context, ctx.device, src, &program); assert(err == CL_SUCCESS); kernel = clCreateKernel(program, "stream", &err); assert(err == CL_SUCCESS); in = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, n * sizeof(float), 0, &err); assert(err == CL_SUCCESS); out = clCreateBuffer(ctx.context, CL_MEM_READ_ONLY, n * sizeof(float), 0, &err); assert(err == CL_SUCCESS); err = clSetKernelArg(kernel, 0, sizeof(in), &in); assert(err == CL_SUCCESS); err = clSetKernelArg(kernel, 1, sizeof(out), &out); assert(err == CL_SUCCESS); err = clSetKernelArg(kernel, 2, sizeof(n), &n); assert(err == CL_SUCCESS); globWorkSize = n; err = clEnqueueNDRangeKernel(ctx.queue, kernel, 1, 0, &globWorkSize, 0, 0, 0, &event); assert(err == CL_SUCCESS); clWaitForEvents(1, &event); err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(start), &start, 0); if (err == CL_PROFILING_INFO_NOT_AVAILABLE) { printf("Profling info not available.\n"); return err; } else if (err) { printf("An error occurred getting profiling info.\n"); return err; } err = clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(end), &end, 0); if (err == CL_PROFILING_INFO_NOT_AVAILABLE) { printf("Profling info not available.\n"); return err; } else if (err) { printf("An error occurred getting profiling info.\n"); return err; } printf("T/ms: %lf\n", (double)(end - start) / 1.0e6); printf("MB: %lf\n", (double)n * sizeof(float) / 1.0e6); printf("GB/s: %lf\n", (double)n * sizeof(float) / (end - start)); clReleaseCommandQueue(ctx.queue); clReleaseContext(ctx.context); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseMemObject(in); clReleaseMemObject(out); clReleaseEvent(event); return err; }
void OpenCLFree() { clReleaseCommandQueue(opencl_command_queue); clReleaseContext(opencl_context); }
void call_kernel(float *data,unsigned int count,char * cl_name,float *results) { FILE* programHandle; size_t programSize, KernelSourceSize; char *programBuffer, *KernelSource; size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array int err; int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); commands = clCreateCommandQueue(context, device_id, 0, &err); //---------------------------------------------------------------------------- // get size of kernel source programHandle = fopen(cl_name, "r"); fseek(programHandle, 0, SEEK_END); programSize = ftell(programHandle); rewind(programHandle); programBuffer = (char*) malloc(programSize + 1); programBuffer[programSize] = '\0'; fread(programBuffer, sizeof(char), programSize, programHandle); fclose(programHandle); // create program from buffer program = clCreateProgramWithSource(context,1,(const char**) &programBuffer,&programSize, NULL); free(programBuffer); // read kernel source back in from program to check clGetProgramInfo(program, CL_PROGRAM_SOURCE, 0, NULL, &KernelSourceSize); KernelSource = (char*) malloc(KernelSourceSize); clGetProgramInfo(program, CL_PROGRAM_SOURCE, KernelSourceSize, KernelSource, NULL); program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); kernel = clCreateKernel(program, "square", &err); //---------------------------------------------------------------------------- input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); clFinish(commands); err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); printf("nKernel source:\n\n %s \n", KernelSource); free(KernelSource); }
int main() { cl_device_id device = new_device(); cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, j, err; float matrix_1[80], matrix_2[80], matrix_3[80]; const size_t buffer_origin[3] = { 5 * sizeof(float), 3, 0 }; const size_t host_origin[3] = { 1 * sizeof(float), 1, 0 }; const size_t region[3] = { 4 * sizeof(float), 4, 1 }; cl_mem matrix_buffer_1, matrix_buffer_2, matrix_buffer_3; for (i = 0; i < 80; i++) { matrix_1[i] = i * 1.0f; matrix_2[i] = 3.0; matrix_3[i] = 0; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if (err < 0) { perror("Couldn't create a context\n"); exit(1); } program = build_program(context, device, FILE_NAME); kernel = clCreateKernel(program, "add", &err); if (err < 0) { perror("Couldn't create a kernel\n"); exit(1); } matrix_buffer_1 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(matrix_1), matrix_1, &err); if (err < 0) { perror("Couldn't create a buffer\n"); exit(1); } matrix_buffer_2 = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(matrix_2), matrix_2, &err); if (err < 0) { perror("Couldn't create a buffer\n"); exit(1); } matrix_buffer_3 = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(matrix_3), matrix_3, &err); if (err < 0) { perror("Couldn't create a buffer\n"); exit(1); } int row = 8; int col = 10; err = clSetKernelArg(kernel, 0, sizeof(int), &row); err = clSetKernelArg(kernel, 1, sizeof(int), &col); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &matrix_buffer_1); err = clSetKernelArg(kernel, 3, sizeof(cl_mem), &matrix_buffer_2); err = clSetKernelArg(kernel, 4, sizeof(cl_mem), &matrix_buffer_3); queue = clCreateCommandQueue(context, device, 0, &err); if (err < 0) { perror("Couldn't create a command queue\n"); exit(1); } err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if (err < 0) { perror("Couldn't enque task\n"); exit(1); } err = clEnqueueReadBuffer(queue, matrix_buffer_3, CL_TRUE, 0, sizeof(matrix_3), &matrix_3, 0, NULL, NULL); for (i = 0; i < 8; i++) { for (j = 0; j < 10; j++) { printf("%6.1f ", matrix_3[j + i * 10]); } printf("\n"); } clReleaseMemObject(matrix_buffer_1); clReleaseMemObject(matrix_buffer_2); clReleaseMemObject(matrix_buffer_3); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); return 0; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
int vadd(void) { // Create the two input vectors int i; const int LIST_SIZE = 1024; int *A = (int*)malloc(sizeof(int)*LIST_SIZE); int *B = (int*)malloc(sizeof(int)*LIST_SIZE); for(i = 0; i < LIST_SIZE; i++) { A[i] = i; B[i] = LIST_SIZE - i; } // Load the kernel source code into the array source_str FILE *fp; char *source_str; size_t source_size; const char *fname = "/home/ckit/program/workspace_java/OpenCLHookSample/jni/vector_add_kernel.cl"; // const char *fname = "vadd.ir"; fp = fopen(fname, "rb"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char*)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp); fclose( fp ); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platform_id ); //#define XXX CL_DEVICE_TYPE_DEFAULT // #define XXX CL_DEVICE_TYPE_ALL // #define XXX CL_DEVICE_TYPE_GPU #define XXX CL_DEVICE_TYPE_CPU cl_uint num_platforms = 2; cl_platform_id* platforms = (cl_platform_id*)malloc(sizeof(cl_platform_id)* num_platforms); if(NULL == platforms){ printf("malloc err!\n"); } ret = clGetPlatformIDs(2, platforms, &ret_num_platforms); printf("clGetPlatformIDs err=%d,num_platforms=%d, platform_id=%x\n", ret, ret_num_platforms, (unsigned int)platforms[1] ); ret = clGetDeviceIDs( platforms[0], XXX, 1, &device_id, &ret_num_devices); printf("clGetDeviceIDs err=%d,num_platforms=%d, device_id=%x\n", ret, ret_num_platforms, (unsigned int)device_id ); char name[64]; ret = clGetDeviceInfo(device_id, CL_DEVICE_NAME, sizeof(char)*64, name, NULL); printf("device_name : %s\n", name); // Create an OpenCL context // cl_context context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); cl_context_properties cps[3] = { (cl_context_properties)CL_CONTEXT_PLATFORM, (cl_context_properties)platforms[0], (cl_context_properties)0 }; cl_context context = clCreateContextFromType( cps, XXX, NULL, NULL, &ret); printf("clCreateContextFromType err=%d,device_type=%x\n", ret, (unsigned int)XXX); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, LIST_SIZE * sizeof(int), NULL, &ret); // Copy the lists A and B to their respective memory buffers ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), A, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), B, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); // cl_int status; // cl_int err; // cl_program program = clCreateProgramWithBinary( // context, 1, &device_id, &source_size, (const unsigned char **)&source_str, &status, &err); // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "vector_add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); // Execute the OpenCL kernel on the list size_t global_item_size = LIST_SIZE; // Process the entire lists size_t local_item_size = 64; // Process in groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C int *C = (int*)malloc(sizeof(int)*LIST_SIZE); ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, LIST_SIZE * sizeof(int), C, 0, NULL, NULL); // Display the result to the screen for(i = 0; i < /*LIST_SIZE*/10; i++) printf("%d + %d = %d\n", A[i], B[i], C[i]); // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(A); free(B); free(C); return 0; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }
int main(int argc, char *argv[]) { int error, xsize, ysize, rgb_max; int *r, *b, *g; float *gray, *congray, *congray2, *congray_cl; // identity kernel // float filter[] = { // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,1,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // 0,0,0,0,0,0,0, // }; // 45 degree motion blur float filter[] = {0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0, 0.0376, 0.1283, 0.0145, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0376, 0.1283, 0.0376, 0, 0, 0, 0.0145, 0.1283, 0.0376, 0, 0, 0, 0, 0, 0.0145, 0, 0, 0, 0, 0}; // mexican hat kernel // float filter[] = { // 0, 0,-1,-1,-1, 0, 0, // 0,-1,-3,-3,-3,-1, 0, // -1,-3, 0, 7, 0,-3,-1, // -1,-3, 7,24, 7,-3,-1, // -1,-3, 0, 7, 0,-3,-1, // 0,-1,-3,-3,-3,-1, 0, // 0, 0,-1,-1,-1, 0, 0 // }; if(argc != 3) { fprintf(stderr, "Usage: %s image.ppm num_loops\n", argv[0]); abort(); } const char* filename = argv[1]; const int num_loops = atoi(argv[2]); // -------------------------------------------------------------------------- // load image // -------------------------------------------------------------------------- printf("Reading ``%s''\n", filename); ppma_read(filename, &xsize, &ysize, &rgb_max, &r, &g, &b); printf("Done reading ``%s'' of size %dx%d\n", filename, xsize, ysize); // -------------------------------------------------------------------------- // allocate CPU buffers // -------------------------------------------------------------------------- posix_memalign((void**)&gray, 32, 4*xsize*ysize*sizeof(float)); if(!gray) { fprintf(stderr, "alloc gray"); abort(); } posix_memalign((void**)&congray, 32, 4*xsize*ysize*sizeof(float)); if(!congray) { fprintf(stderr, "alloc congray"); abort(); } posix_memalign((void**)&congray2, 32, 4*xsize*ysize*sizeof(float)); if(!congray2) { fprintf(stderr, "alloc congray2"); abort(); } posix_memalign((void**)&congray_cl, 32, 4*xsize*ysize*sizeof(float)); if(!congray_cl) { fprintf(stderr, "alloc congray_cl"); abort(); } // -------------------------------------------------------------------------- // convert image to grayscale // -------------------------------------------------------------------------- for(int n = 0; n < xsize*ysize; ++n) { gray[4*n] = r[n]; gray[4*n+1] = g[n]; gray[4*n+2] = b[n]; gray[4*n+3] = (0.21f*r[n])/rgb_max + (0.72f*g[n])/rgb_max + (0.07f*b[n])/rgb_max; } for(int n = 0; n < 4*xsize*ysize; ++n) { congray[n]=gray[n]; } // -------------------------------------------------------------------------- // execute filter on cpu // -------------------------------------------------------------------------- for(int s=0;s<num_loops;s++){ for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { float sumR = 0; float sumG = 0; float sumB = 0; float sum = 0; for(int k = -HALF_FILTER_WIDTH; k <= HALF_FILTER_WIDTH; ++k) { for(int l = -HALF_FILTER_WIDTH; l <= HALF_FILTER_WIDTH; ++l){ sumR += congray[4*((i+k)*xsize + (j+l))] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; sumG += congray[4*((i+k)*xsize + (j+l))+1] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; sumB += congray[4*((i+k)*xsize + (j+l))+2] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; //sum += congray[4*((i+k)*xsize + (j+l))+3] * filter[(k+HALF_FILTER_WIDTH)*FILTER_WIDTH + (l+HALF_FILTER_WIDTH)]; } } congray2[4*(i*xsize + j)] = sumR; congray2[4*(i*xsize + j)+1] = sumG; congray2[4*(i*xsize + j)+2] = sumB; congray2[4*(i*xsize + j)+3] = sum; } } for(int i = HALF_FILTER_WIDTH; i < ysize - HALF_FILTER_WIDTH; ++i) { for(int j = HALF_FILTER_WIDTH; j < xsize - HALF_FILTER_WIDTH; ++j) { congray[4*(i*xsize + j)] = congray2[4*(i*xsize + j)]; congray[4*(i*xsize + j)+1] = congray2[4*(i*xsize + j)+1]; congray[4*(i*xsize + j)+2] = congray2[4*(i*xsize + j)+2]; congray[4*(i*xsize + j)+3] = congray2[4*(i*xsize + j)+3]; } } } // -------------------------------------------------------------------------- // output cpu filtered image // -------------------------------------------------------------------------- printf("Writing cpu filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) { r[n] = (int)(congray[4*n] ); g[n] = (int)(congray[4*n+1] ); b[n] = (int)(congray[4*n+2]); } error = ppma_write("output_cpu.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // get an OpenCL context and queue // -------------------------------------------------------------------------- cl_context ctx; cl_command_queue queue; create_context_on(CHOOSE_INTERACTIVELY, CHOOSE_INTERACTIVELY, 0, &ctx, &queue, 0); print_device_info_from_queue(queue); // -------------------------------------------------------------------------- // load kernels // -------------------------------------------------------------------------- char *knl_text = read_file("convolution_color.cl"); cl_kernel knl = kernel_from_string(ctx, knl_text, "convolution_color", NULL); free(knl_text); #ifdef NON_OPTIMIZED int deviceWidth = xsize; #else int deviceWidth = ((xsize + WGX - 1)/WGX)* WGX; #endif int deviceHeight = ysize; size_t deviceDataSize = 4 * deviceHeight*deviceWidth*sizeof(float); // -------------------------------------------------------------------------- // allocate device memory // -------------------------------------------------------------------------- cl_int status; cl_mem buf_gray = clCreateBuffer(ctx, CL_MEM_READ_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_congray = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, deviceDataSize, 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); cl_mem buf_filter = clCreateBuffer(ctx, CL_MEM_READ_ONLY, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), 0, &status); CHECK_CL_ERROR(status, "clCreateBuffer"); // -------------------------------------------------------------------------- // transfer to device // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0, deviceDataSize, gray, 0, NULL, NULL)); #else size_t buffer_origin[3] = {0,0,0}; size_t host_origin[3] = {0,0,0}; size_t region[3] = {deviceWidth*sizeof(float), ysize, 1}; clEnqueueWriteBufferRect(queue, buf_gray, CL_TRUE, buffer_origin, host_origin, region, deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0, gray, 0, NULL, NULL); #endif CALL_CL_SAFE(clEnqueueWriteBuffer( queue, buf_filter, /*blocking*/ CL_TRUE, /*offset*/ 0, FILTER_WIDTH*FILTER_WIDTH*sizeof(float), filter, 0, NULL, NULL)); // -------------------------------------------------------------------------- // run code on device // -------------------------------------------------------------------------- cl_int rows = ysize; cl_int cols = xsize; cl_int filterWidth = FILTER_WIDTH; cl_int paddingPixels = 2*HALF_FILTER_WIDTH; size_t local_size[] = { WGX, WGY }; size_t global_size[] = { ((xsize-paddingPixels + local_size[0] - 1)/local_size[0])* local_size[0], ((ysize-paddingPixels + local_size[1] - 1)/local_size[1])* local_size[1], }; cl_int localWidth = local_size[0] + paddingPixels; cl_int localHeight = local_size[1] + paddingPixels; size_t localMemSize = 4 * localWidth * localHeight * sizeof(float); CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 2, sizeof(buf_filter), &buf_filter)); CALL_CL_SAFE(clSetKernelArg(knl, 3, sizeof(rows), &rows)); CALL_CL_SAFE(clSetKernelArg(knl, 4, sizeof(cols), &cols)); CALL_CL_SAFE(clSetKernelArg(knl, 5, sizeof(filterWidth), &filterWidth)); CALL_CL_SAFE(clSetKernelArg(knl, 6, localMemSize, NULL)); CALL_CL_SAFE(clSetKernelArg(knl, 7, sizeof(localHeight), &localHeight)); CALL_CL_SAFE(clSetKernelArg(knl, 8, sizeof(localWidth), &localWidth)); // -------------------------------------------------------------------------- // print kernel info // -------------------------------------------------------------------------- print_kernel_info(queue, knl); CALL_CL_SAFE(clFinish(queue)); timestamp_type tic, toc; get_timestamp(&tic); for(int loop = 0; loop < num_loops; ++loop) { if (loop%2==0){ CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_gray), &buf_gray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_congray), &buf_congray)); } else{ CALL_CL_SAFE(clSetKernelArg(knl, 0, sizeof(buf_congray), &buf_congray)); CALL_CL_SAFE(clSetKernelArg(knl, 1, sizeof(buf_gray), &buf_gray)); } CALL_CL_SAFE(clEnqueueNDRangeKernel(queue, knl, 2, NULL, global_size, local_size, 0, NULL, NULL)); } CALL_CL_SAFE(clFinish(queue)); get_timestamp(&toc); double elapsed = timestamp_diff_in_seconds(tic,toc)/num_loops; printf("%f s\n", elapsed); printf("%f MPixels/s\n", xsize*ysize/1e6/elapsed); printf("%f GBit/s\n", 4*2*xsize*ysize*sizeof(float)/1e9/elapsed); printf("%f GFlop/s\n",4*(xsize-HALF_FILTER_WIDTH)*(ysize-HALF_FILTER_WIDTH) *FILTER_WIDTH*FILTER_WIDTH/1e9/elapsed); // -------------------------------------------------------------------------- // transfer back & check // -------------------------------------------------------------------------- #ifdef NON_OPTIMIZED if (num_loops%2==0) CALL_CL_SAFE(clEnqueueReadBuffer(queue, buf_gray, /*blocking*/ CL_TRUE, /*offset*/ 0,deviceDataSize,congray_cl,0, NULL, NULL)); else CALL_CL_SAFE(clEnqueueReadBuffer(queue, buf_congray, /*blocking*/ CL_TRUE, /*offset*/ 0,deviceDataSize, congray_cl, 0, NULL, NULL)); #else buffer_origin[0] = 3*sizeof(float); buffer_origin[1] = 3; buffer_origin[2] = 0; host_origin[0] = 3*sizeof(float); host_origin[1] = 3; host_origin[2] = 0; region[0] = (xsize-paddingPixels)*sizeof(float); region[1] = (ysize-paddingPixels); region[2] = 1; if (num_loops%2==0) clEnqueueReadBufferRect(queue, buf_gray, CL_TRUE,buffer_origin, host_origin, region,deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,congray_cl, 0, NULL, NULL); else clEnqueueReadBufferRect(queue, buf_congray, CL_TRUE,buffer_origin, host_origin, region,deviceWidth*sizeof(float), 0, xsize*sizeof(float), 0,congray_cl, 0, NULL, NULL); #endif // -------------------------------------------------------------------------- // output OpenCL filtered image // -------------------------------------------------------------------------- printf("Writing OpenCL filtered image\n"); for(int n = 0; n < xsize*ysize; ++n) { //r[n] = (int)(congray_cl[4*n] * rgb_max); //g[n] = (int)(congray_cl[4*n+1] * rgb_max); //b[n] = (int)(congray_cl[4*n+2] * rgb_max); r[n] = (int)(congray_cl[4*n]); g[n] = (int)(congray_cl[4*n+1]); b[n] = (int)(congray_cl[4*n+2]); } error = ppma_write("output_cl.ppm", xsize, ysize, r, g, b); if(error) { fprintf(stderr, "error writing image"); abort(); } // -------------------------------------------------------------------------- // clean up // -------------------------------------------------------------------------- CALL_CL_SAFE(clReleaseMemObject(buf_congray)); CALL_CL_SAFE(clReleaseMemObject(buf_gray)); CALL_CL_SAFE(clReleaseMemObject(buf_filter)); CALL_CL_SAFE(clReleaseKernel(knl)); CALL_CL_SAFE(clReleaseCommandQueue(queue)); CALL_CL_SAFE(clReleaseContext(ctx)); free(gray); free(congray); free(congray_cl); free(r); free(b); free(g); }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
int main() { srand(unsigned(time(nullptr))); int err; // error code returned from api calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel // OpenCL device memory for matrices cl_mem d_A; cl_mem d_B; cl_mem d_C; // set seed for rand() srand(2014); //Allocate host memory for matrices A and B unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*)malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*)malloc(mem_size_B); //Initialize host memory randomMemInit(h_A, size_A); randomMemInit(h_B, size_B); //Allocate host memory for the result C unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*)malloc(mem_size_C); printf("Initializing OpenCL device...\n"); cl_uint dev_cnt = 0; clGetPlatformIDs(0, 0, &dev_cnt); cl_platform_id platform_ids[100]; clGetPlatformIDs(dev_cnt, platform_ids, NULL); // Connect to a compute device int gpu = 1; err = clGetDeviceIDs(platform_ids[0], gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context){ printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands){ printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source file char *KernelSource; long lFileSize = LoadOpenCLKernel("matrixmul_kernel.cl", &KernelSource); if (lFileSize < 0L){ perror("File read failed"); return 1; } //const char* KernelSource = loadKernelCPP(".\\matrixmul_kernel.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&KernelSource, NULL, &err); if (!program){ printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS){ size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "matrixMul", &err); if (!kernel || err != CL_SUCCESS){ printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation d_C = clCreateBuffer(context, CL_MEM_READ_WRITE, mem_size_A, NULL, &err); d_A = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &err); d_B = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &err); if (!d_A || !d_B || !d_C){ printf("Error: Failed to allocate device memory!\n"); exit(1); } printf("Running matrix multiplication for matrices A (%dx%d) and B (%dx%d) ...\n", WA, HA, WB, HB); //Launch OpenCL kernel size_t localWorkSize[2], globalWorkSize[2]; int wA = WA; int wC = WC; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&d_C); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&d_A); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&d_B); err |= clSetKernelArg(kernel, 3, sizeof(int), (void *)&wA); err |= clSetKernelArg(kernel, 4, sizeof(int), (void *)&wC); if (err != CL_SUCCESS){ printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } localWorkSize[0] = 16; localWorkSize[1] = 16; globalWorkSize[0] = 1024; globalWorkSize[1] = 1024; err = clEnqueueNDRangeKernel(commands, kernel, 2, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to execute kernel! %d\n", err); exit(1); } //Retrieve result from device err = clEnqueueReadBuffer(commands, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); if (err != CL_SUCCESS){ printf("Error: Failed to read output array! %d\n", err); exit(1); } //print table A printf("\nMatrix A\n"); for (int i = 0; i < size_A; i++){ printf("%f\t", h_A[i]); if (((i + 1) % WA) == 0) printf("\n"); } //print table B printf("\nMatrix B\n"); for (int i = 0; i < size_B; i++){ printf("%f\t", h_B[i]); if (((i + 1) % WB) == 0) printf("\n"); } //print out the results printf("\nMatrix C (Results)\n"); for (int i = 0; i < size_C; i++){ printf("%f\t", h_C[i]); if (((i + 1) % WC) == 0) printf("\n"); } printf("\n"); printf("Matrix multiplication completed...\n"); //Shutdown and cleanup free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); std::cin.clear(); std::cin.sync(); std::cin.get(); }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
void release_cl(t_cl *cl) { clReleaseCommandQueue(cl->cmd_queue); clReleaseContext(cl->context); }
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA, const void *inMatrixB, int nrowB, int ncolB, bool transposeB, double alpha, double beta, void *outMatrix, bool use_float) { std::stringstream result; float *input_matrixA_f = (float *)inMatrixA; float *input_matrixB_f = (float *)inMatrixB; float *output_matrix_f = (float *)outMatrix; double *input_matrixA_d = (double *)inMatrixA; double *input_matrixB_d = (double *)inMatrixB; double *output_matrix_d = (double *)outMatrix; if (debug) { result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")" << std::endl << std::endl; } cl_int err = CL_SUCCESS; clblasStatus status = clblasSetup(); if (status != CL_SUCCESS) { if (debug) { result << "clblasSetup: " << clblasErrorToString(status) << std::endl; } err = CL_INVALID_OPERATION; } // get first platform cl_platform_id platform = NULL; if (err == CL_SUCCESS) { err = clGetPlatformIDs(1, &platform, NULL); } if (debug && err == CL_SUCCESS) { result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl; result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl; } // context cl_context context = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateContext:" << std::endl; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); } // queue cl_command_queue queue = NULL; if (err == CL_SUCCESS) { #ifdef CL_VERSION_2_0 if (debug) { result << "clCreateCommandQueueWithProperties:" << std::endl; } queue = clCreateCommandQueueWithProperties(context, device, NULL, &err); #else if (debug) { result << "clCreateCommandQueue:" << std::endl; } queue = clCreateCommandQueue(context, device, 0, &err); #endif } // buffers cl_mem cl_input_matrixA = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixA:" << std::endl; } if (use_float) { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(float), input_matrixA_f, &err); } else { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(double), input_matrixA_d, &err); } } cl_mem cl_input_matrixB = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixB:" << std::endl; } if (use_float) { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(float), input_matrixB_f, &err); } else { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(double), input_matrixB_d, &err); } } int nrowC = transposeA ? ncolA : nrowA; int ncolC = transposeB ? nrowB : ncolB; cl_mem cl_output_matrix = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_output_vector:" << std::endl; } if (use_float) { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(float), output_matrix_f, &err); } else { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(double), output_matrix_d, &err); } } // ++++++++++++ const int lda = nrowA; // first dimension of A (rows), before any transpose const int ldb = nrowB; // first dimension of B (rows), before any transpose const int ldc = nrowC; // first dimension of C (rows) const int M = transposeA ? ncolA : nrowA; // rows in A (after transpose, if any) and C const int N = transposeB ? nrowB : ncolB; // cols in B (after transpose, if any) and C const int K = transposeA ? nrowA : ncolA; // cols in A and rows in B (after transposes, if any) const clblasOrder order = clblasColumnMajor; const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans; const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans; cl_event event = NULL; if (err == CL_SUCCESS) { if (use_float) { if (debug) { result << "clblasSgemm:" << std::endl; } status = clblasSgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS && debug) { result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl; } } else { if (debug) { result << "clblasDgemm:" << std::endl; } status = clblasDgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { if (debug) { result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl; } err = status; } } } if (err == CL_SUCCESS) { /* Wait for calculations to be finished. */ if (debug) { result << "clWaitForEvents:" << std::endl; } err = clWaitForEvents(1, &event); } // retrieve result if (err == CL_SUCCESS) { if (debug) { result << "Retrieve result:" << std::endl; } if (use_float) { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL); } else { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL); } } std::string err_str = clErrorToString(err); result << std::endl << err_str << std::endl; // cleanup clReleaseMemObject(cl_output_matrix); cl_output_matrix = NULL; clReleaseMemObject(cl_input_matrixA); cl_input_matrixA = NULL; clReleaseMemObject(cl_input_matrixB); cl_input_matrixB = NULL; clReleaseCommandQueue(queue); queue = NULL; clReleaseContext(context); context = NULL; if (debug) { CERR << result.str(); } ErrorStatus errorStatus = { err, status }; return errorStatus; }
int main(int argc, char** argv) { int err; // error code returned from api calls float data[DATA_SIZE]; // original data set given to device float results[DATA_SIZE]; // results returned from device unsigned int correct; // number of correct results returned size_t global; // global domain size for our calculation size_t local; // local domain size for our calculation cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel cl_mem input; // device memory used for the input array cl_mem output; // device memory used for the output array // Fill our data set with random float values // int i = 0; unsigned int count = DATA_SIZE; for(i = 0; i < count; i++) data[i] = rand() / (float)RAND_MAX; // Connect to a compute device // int gpu = 1; err = clGetDeviceIDs(NULL, gpu ? CL_DEVICE_TYPE_GPU : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); return EXIT_FAILURE; } // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); return EXIT_FAILURE; } // Create a command commands // commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); return EXIT_FAILURE; } // Create the compute program from the source buffer // program = clCreateProgramWithSource(context, 1, (const char **) & KernelSource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n"); return EXIT_FAILURE; } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); exit(1); } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "square", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); exit(1); } // Create the input and output arrays in device memory for our calculation // input = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * count, NULL, NULL); output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * count, NULL, NULL); if (!input || !output) { printf("Error: Failed to allocate device memory!\n"); exit(1); } // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, input, CL_TRUE, 0, sizeof(float) * count, data, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array!\n"); exit(1); } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output); err |= clSetKernelArg(kernel, 2, sizeof(unsigned int), &count); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); exit(1); } // Get the maximum work group size for executing the kernel on the device // err = clGetKernelWorkGroupInfo(kernel, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to retrieve kernel work group info! %d\n", err); exit(1); } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // global = count; err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel!\n"); return EXIT_FAILURE; } // Wait for the command commands to get serviced before reading back results // clFinish(commands); // Read back the results from the device to verify the output // err = clEnqueueReadBuffer( commands, output, CL_TRUE, 0, sizeof(float) * count, results, 0, NULL, NULL ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); exit(1); } // Validate our results // correct = 0; for(i = 0; i < count; i++) { if(results[i] == data[i] * data[i]) correct++; } // Print a brief summary detailing the results // printf("Computed '%d/%d' correct values!\n", correct, count); // Shutdown and cleanup // clReleaseMemObject(input); clReleaseMemObject(output); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return 0; }
int main(void) { const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float); // Generate the input array on the host. float h_a[ARRAY_SIZE]; float h_b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { h_a[i] = (float)i; h_b[i] = (float)(2 * i); } float h_c[ARRAY_SIZE]; FILE *fp; char *source_str; size_t source_size; fp = fopen("vectors_cl.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_BYTES, NULL, &ret); // Copy h_a and h_b to memory buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_a, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_b, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != 0) { printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 0) { printf("clBuildProgram returned non-zero status %d: ", ret); if (ret == CL_INVALID_PROGRAM) { printf("invalid program\n"); } else if (ret == CL_INVALID_VALUE) { printf("invalid value\n"); } else if (ret == CL_INVALID_DEVICE) { printf("invalid device\n"); } else if (ret == CL_INVALID_BINARY) { printf("invalid binary\n"); } else if (ret == CL_INVALID_BUILD_OPTIONS) { printf("invalid build options\n"); } else if (ret == CL_INVALID_OPERATION) { printf("invalid operation\n"); } else if (ret == CL_COMPILER_NOT_AVAILABLE) { printf("compiler not available\n"); } else if (ret == CL_BUILD_PROGRAM_FAILURE) { printf("build program failure\n"); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); } else if (ret == CL_OUT_OF_HOST_MEMORY) { printf("out of host memory\n"); } exit(1); } // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); size_t array_size = ARRAY_SIZE; ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size); // Execute the OpenCL kernel on the list size_t global_item_size = ARRAY_SIZE; // Process the entire lists size_t local_item_size = 1; // Divide work items into groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_c, 0, NULL, NULL); // Print out the resulting array. for (int i = 0; i < 8; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } printf("...\n"); for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); float* h_areas = new float[nelr]; int* h_elements_surrounding_elements = new int[nelr*NNB]; float* h_normals = new float[nelr*NDIM*NNB]; // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } // Get program source. long kernelSize = getKernelSize(); char* kernelSource = new char[kernelSize]; getKernelSource(kernelSource, kernelSize); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); CHKERR(err, "Failed to create a compute program!"); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); log = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log); free(log); exit(1); } CHKERR(err, "Failed to build program!"); delete[] kernelSource; // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float mem[MEM_SIZE]; FILE *fp; char fileName[] = "./kernel.clbin"; size_t binary_size; char *binary_buf; cl_int binary_status; cl_int i; /* カーネルを含むオブジェクトファイルをロード */ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } binary_buf = (char *)malloc(MAX_BINARY_SIZE); binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp ); fclose( fp ); /* データを初期化 */ for( i = 0; i < MEM_SIZE; i++ ) { mem[i] = i; } /* プラットフォーム・デバイスの情報の取得 */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* OpenCLコンテキストの作成 */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* コマンドキューの作成 */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* メモリバッファの作成 */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret); /* メモリバッファにデータを転送 */ ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 読み込んだバイナリからカーネルプログラムを作成 */ program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, (const unsigned char **)&binary_buf, &binary_status, &ret); /* OpenCLカーネルの作成 */ kernel = clCreateKernel(program, "vecAdd", &ret); printf("err:%d\n", ret); /* OpenCLカーネル引数の設定 */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); size_t global_work_size[3] = {MEM_SIZE, 0, 0}; size_t local_work_size[3] = {MEM_SIZE, 0, 0}; /* OpenCLカーネルを実行 */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); /* メモリバッファから結果を取得 */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 結果の表示 */ for(i=0; i<MEM_SIZE; i++) { printf("mem[%d] : %f\n", i, mem[i]); } /* 終了処理 */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(binary_buf); return 0; }
void DeleteCL() { clReleaseContext(g_cxMainContext); clReleaseCommandQueue(g_cqCommandQue); }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'post_increment_short4.cl' */ source_code = read_buffer("post_increment_short4.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "post_increment_short4", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_short4 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_short4)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_short4){{2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short4), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_short4 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_short4)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short4)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short4), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short4), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_short4)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
//Do the proper test using different sizes. static cl_ulong gws_test(size_t num, struct fmt_main * self) { cl_event myEvent; cl_int ret_code; cl_uint *tmpbuffer; cl_ulong startTime, endTime, runtime; int i, loops; //Prepare buffers. create_clobj(num, self); tmpbuffer = mem_alloc(sizeof(sha512_hash) * num); if (tmpbuffer == NULL) { fprintf(stderr, "Malloc failure in find_best_gws\n"); exit(EXIT_FAILURE); } queue_prof = clCreateCommandQueue(context[ocl_gpu_id], devices[ocl_gpu_id], CL_QUEUE_PROFILING_ENABLE, &ret_code); HANDLE_CLERROR(ret_code, "Failed in clCreateCommandQueue"); // Set salt. set_salt(get_salt("$6$saltstring$")); salt->initial = salt->rounds - get_multiple(salt->rounds, HASH_LOOPS); // Set keys for (i = 0; i < num; i++) { set_key("aaabaabaaa", i); } //** Get execution time **// HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, salt_buffer, CL_FALSE, 0, sizeof(sha512_salt), salt, 0, NULL, &myEvent), "Failed in clEnqueueWriteBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime = endTime - startTime; //** Get execution time **// HANDLE_CLERROR(clEnqueueWriteBuffer(queue_prof, pass_buffer, CL_FALSE, 0, sizeof(sha512_password) * num, plaintext, 0, NULL, &myEvent), "Failed in clEnqueueWriteBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; //** Get execution time **// if (gpu(source_in_use) || use_local(source_in_use)) { ret_code = clEnqueueNDRangeKernel(queue_prof, prepare_kernel, 1, NULL, &num, &local_work_size, 0, NULL, &myEvent); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; } loops = gpu(source_in_use) || use_local(source_in_use) ? (salt->rounds / HASH_LOOPS) : 1; //** Get execution time **// for (i = 0; i < loops; i++) { ret_code = clEnqueueNDRangeKernel(queue_prof, crypt_kernel, 1, NULL, &num, &local_work_size, 0, NULL, &myEvent); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; } //** Get execution time **// HANDLE_CLERROR(clEnqueueReadBuffer(queue_prof, hash_buffer, CL_FALSE, 0, sizeof(sha512_hash) * num, tmpbuffer, 0, NULL, &myEvent), "Failed in clEnqueueReadBuffer"); HANDLE_CLERROR(clFinish(queue_prof), "Failed in clFinish"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_SUBMIT, sizeof(cl_ulong), &startTime, NULL), "Failed in clGetEventProfilingInfo I"); HANDLE_CLERROR(clGetEventProfilingInfo(myEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, NULL), "Failed in clGetEventProfilingInfo II"); HANDLE_CLERROR(clReleaseEvent(myEvent), "Failed in clReleaseEvent"); runtime += endTime - startTime; MEM_FREE(tmpbuffer); HANDLE_CLERROR(clReleaseCommandQueue(queue_prof), "Failed in clReleaseCommandQueue"); release_clobj(); if (ret_code != CL_SUCCESS) { if (ret_code != CL_INVALID_WORK_GROUP_SIZE) fprintf(stderr, "Error %d\n", ret_code); return 0; } return runtime; }