int main(int argc, char** argv) { int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem in_array; // device memory used for the input array //cl_mem synaptic_weights; // device memory used for the input array cl_mem out_array; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return -1; } //float in_array[NO_NODES]; //float out_array[NO_NODES]; //float synaptic_weights[NO_NODES*NO_NODES]; float in_array_tb[NO_NODES]; float out_array_tb[NO_NODES]; //float synaptic_weights_tb[NO_NODES*NO_NODES]; float temp =0; int i = 0; int j = 0; int index = 0; FILE* ifp; char* mode = "r"; // // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return -1; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return -1; } // // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return -1; } //relu_1(in_array,synaptic_weights,out_array); // Fill our data sets with pattern // //int i = 0; //for(i = 0; i < DATA_SIZE; i++) { // a[i] = (int)i; // b[i] = (int)i; // results[i] = 0; //} // // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return -1; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return -1; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); printf("err : %d %s\n",err,err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return -1; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "relu_1", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return -1; } // Create the input and output arrays in device memory for our calculation // in_array = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES, NULL, NULL); //synaptic_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES * NO_NODES, NULL, NULL); out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL); if (!in_array || /*!synaptic_weights ||*/ !out_array) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return -1; } ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode); if(ifp == NULL) { printf("Input file not found \n"); return -1; } while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) { in_array_tb[index++] = temp; } index = 0; temp = 0; //ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode); //if(ifp == NULL) //{ // printf("Weight file not found \n"); // return -1; //} //while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) { // synaptic_weights_tb[index++] = temp; //} // // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return -1; } // Write our data set into the input array in device memory // //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) * NO_NODES * NO_NODES, synaptic_weights_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return -1; } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array); //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return -1; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return -1; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return -1; } clWaitForEvents(1, &readevent); //printf("A\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",a[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("B\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",b[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("res\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // Validate our results // //correct = 0; //for(i = 0; i < DATA_SIZE; i++) //{ // int row = i/MATRIX_RANK; // int col = i%MATRIX_RANK; // int running = 0; // int index; // for (index=0;index<MATRIX_RANK;index++) { // int aIndex = row*MATRIX_RANK + index; // int bIndex = col + index*MATRIX_RANK; // running += a[aIndex] * b[bIndex]; // } // sw_results[i] = running; //} // //for (i = 0;i < DATA_SIZE; i++) // if(results[i] == sw_results[i]) // correct++; //printf("Software\n"); //for (i=0;i<DATA_SIZE;i++) { // //printf("%0.2f ",sw_results[i]); // printf("%d ",sw_results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // // //// Print a brief summary detailing the results //// //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE); // // Shutdown and cleanup int temp_ = 0; for (j = 0; j < NO_NODES; j++) { if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0) { //printf("out_array[%d] = %f \n", j, out_array[j]); temp_++; } } clReleaseMemObject(in_array); //clReleaseMemObject(synaptic_weights); clReleaseMemObject(out_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if (temp_ == NO_NODES) { printf("*********************************************************** \n"); printf("TEST PASSED !!!!!! The output matches the desired output. \n"); printf("*********************************************************** \n"); return EXIT_SUCCESS; } else { printf("**************************************************************** \n"); printf("TEST Failed !!!!!! The output does not match the desired output. \n"); printf("**************************************************************** \n"); return -1; } //if(correct == DATA_SIZE){ // printf("Test passed!\n"); // return EXIT_SUCCESS; //} //else{ // printf("Test failed\n"); // return -1; //} }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); float* h_areas = new float[nelr]; int* h_elements_surrounding_elements = new int[nelr*NNB]; float* h_normals = new float[nelr*NDIM*NNB]; // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } // Get program source. long kernelSize = getKernelSize(); char* kernelSource = new char[kernelSize]; getKernelSource(kernelSource, kernelSize); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); CHKERR(err, "Failed to create a compute program!"); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); log = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log); free(log); exit(1); } CHKERR(err, "Failed to build program!"); delete[] kernelSource; // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program binary */ unsigned char *bin; size_t bin_len; cl_int bin_ret; /* Read program binary */ if (argc == 2) bin = read_buffer((char *)argv[1], &bin_len); else { printf("error: No binary specified\n"); exit(1); } /* Create a program */ cl_program program; program = clCreateProgramWithBinary(context, 1, &device, &bin_len, (const unsigned char **)&bin, &bin_ret, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithBinary' failed\n"); exit(1); } if (bin_ret != CL_SUCCESS) { printf("error: Invalid binary for device\n"); exit(1); } printf("program=%p\n", program); /* Free binary */ free(bin); printf("program binary loaded\n"); printf("\n"); ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object*/ cl_kernel kernel; kernel = clCreateKernel(program, "sign_float16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_float16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_float16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_float16){{2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_float16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_float16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_float16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_float16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_float16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_float16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_float16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_float16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
int main(void) { //time meassuring struct timeval tvs; struct timeval tve; float elapsedTime; int Nx; int Ny; int Nz; int N; int plotnum=0; int Tmax=0; int plottime=0; int plotgap=0; float Lx,Ly,Lz; float dt=0.0; float A=0.0; float B=0.0; float Du=0.0; float Dv=0.0; float a[2]={1.0,0.0}; float b[2]={0.5,0.0}; float* x,*y,*z ; float* u[2],*v[2]; //openCL variables cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem cl_u[2] = {NULL,NULL}; cl_mem cl_v[2] = {NULL,NULL}; cl_mem cl_uhat[2] = {NULL,NULL}; cl_mem cl_vhat[2] = {NULL,NULL}; cl_mem cl_x = NULL; cl_mem cl_y = NULL; cl_mem cl_z = NULL; cl_mem cl_kx = NULL; cl_mem cl_ky = NULL; cl_mem cl_kz = NULL; cl_program p_grid = NULL,p_frequencies = NULL,p_initialdata = NULL,p_linearpart=NULL,p_nonlinearpart=NULL; cl_kernel grid = NULL,frequencies = NULL,initialdata = NULL,linearpart=NULL,nonlinearpart=NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_CPU, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); command_queue = clCreateCommandQueue(context, device_id, 0, &ret); size_t source_size; char *source_str; //end opencl int i,n; int status=0; //int start, finish, count_rate, ind, numthreads char nameconfig[100]=""; //Read infutfile char InputFileName[]="./INPUTFILE"; FILE*fp; fp=fopen(InputFileName,"r"); if(!fp) {fprintf(stderr, "Failed to load IPUTFILE.\n");exit(1);} int ierr=fscanf(fp, "%d %d %d %d %d %f %f %f %f %f %f %f %f", &Nx,&Ny,&Nz,&Tmax,&plotgap,&Lx,&Ly,&Lz,&dt,&Du,&Dv,&A,&B); if(ierr!=13){fprintf(stderr, "INPUTFILE corrupted.\n");exit(1);} fclose(fp); printf("NX %d\n",Nx); printf("NY %d\n",Ny); printf("NZ %d\n",Nz); printf("Tmax %d\n",Tmax); printf("plotgap %d\n",plotgap); printf("Lx %f\n",Lx); printf("Ly %f\n",Ly); printf("Lz %f\n",Lz); printf("dt %f\n",dt); printf("Du %f\n",Du); printf("Dv %f\n",Dv); printf("F %f\n",A); printf("k %f\n",B); printf("Read inputfile\n"); N=Nx*Ny*Nz; plottime=plotgap; B=A+B; //ALLocate the memory u[0]=(float*) malloc(N*sizeof(float)); v[0]=(float*) malloc(N*sizeof(float)); x=(float*) malloc(Nx*sizeof(float)); y=(float*) malloc(Ny*sizeof(float)); z=(float*) malloc(Nz*sizeof(float)); //allocate gpu mem cl_u[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_u[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_v[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[0] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_uhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); cl_vhat[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, N * sizeof(float), NULL, &ret); printf("allocated space\n"); // FFT library realted declarations. clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {Nx, Ny, Nz}; // Setup clFFT. clfftSetupData fftSetup; ret = clfftInitSetupData(&fftSetup); ret = clfftSetup(&fftSetup); // Create a default plan for a complex FFT. ret = clfftCreateDefaultPlan(&planHandle, context, dim, clLengths); // Set plan parameters. ret = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); ret = clfftSetLayout(planHandle, CLFFT_COMPLEX_PLANAR, CLFFT_COMPLEX_PLANAR); ret = clfftSetResultLocation(planHandle, CLFFT_OUTOFPLACE); // Bake the plan. ret = clfftBakePlan(planHandle, 1, &command_queue, NULL, NULL); // Create temporary buffer. cl_mem tmpBufferu = 0; cl_mem tmpBufferv = 0; // Size of temp buffer. size_t tmpBufferSize = 0; status = clfftGetTmpBufSize(planHandle, &tmpBufferSize); if ((status == 0) && (tmpBufferSize > 0)) { tmpBufferu = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); tmpBufferv = clCreateBuffer(context, CL_MEM_READ_WRITE, tmpBufferSize, NULL, &ret); if (ret != CL_SUCCESS) printf("Error with tmpBuffer clCreateBuffer\n"); } //kernel grid fp = fopen("./grid.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load grid.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_grid = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_grid, 1, &device_id, NULL, NULL, NULL); grid = clCreateKernel(p_grid, "grid", &ret); //first x cl_x = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_x); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nx); size_t global_work_size_x[3] = {Nx, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_x, CL_TRUE, 0, Nx * sizeof(float), x, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_y = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_y); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Ny); size_t global_work_size_y[3] = {Ny, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_y, CL_TRUE, 0, Ny * sizeof(float), y, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_z = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(grid, 0, sizeof(cl_mem), (void *)&cl_z); ret = clSetKernelArg(grid, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(grid, 2, sizeof(int),(void*)&Nz); size_t global_work_size_z[3] = {Nz, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, grid, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_z, CL_TRUE, 0, Nz * sizeof(float), z, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(grid); ret = clReleaseProgram(p_grid); //kernel initial data fp = fopen("./initialdata.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load initialdata.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_initialdata = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_initialdata, 1, &device_id, NULL, NULL, NULL); initialdata = clCreateKernel(p_initialdata, "initialdata", &ret); ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(initialdata, 2, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(initialdata, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(initialdata, 4, sizeof(cl_mem),(void* )&cl_x); ret = clSetKernelArg(initialdata, 5, sizeof(cl_mem),(void* )&cl_y); ret = clSetKernelArg(initialdata, 6, sizeof(cl_mem),(void* )&cl_z); ret = clSetKernelArg(initialdata, 7, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 8, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 9, sizeof(int),(void* )&Nz); size_t global_work_size[3] = {N, 0, 0}; ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseKernel(initialdata); ret = clReleaseProgram(p_initialdata); ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); ret = clReleaseMemObject(cl_x); ret = clReleaseMemObject(cl_y); ret = clReleaseMemObject(cl_z); //write to disk fp=fopen("./data/xcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write xcoord.dat.\n"); exit(1); } for(i=0;i<Nx;i++){fprintf(fp,"%f\n",x[i]);} fclose( fp ); fp=fopen("./data/ycoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write ycoord.dat.\n"); exit(1); } for(i=0;i<Ny;i++){fprintf(fp,"%f\n",y[i]);} fclose( fp ); fp=fopen("./data/zcoord.dat","w"); if (!fp) {fprintf(stderr, "Failed to write zcoord.dat.\n"); exit(1); } for(i=0;i<Nz;i++){fprintf(fp,"%f\n",z[i]);} fclose( fp ); free(x); free(y); free(z); n=0; plotnum=0; //output of initial data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write initialdata.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); //frequencies kernel fp = fopen("./frequencies.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load frequencies.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_frequencies = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_frequencies, 1, &device_id, NULL, NULL, NULL); frequencies = clCreateKernel(p_frequencies, "frequencies", &ret); //get frequencies first x cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_x, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //then y cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //last z cl_kz = clCreateBuffer(context, CL_MEM_READ_WRITE, Nz * sizeof(float), NULL, &ret); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem), (void *)&cl_kz); ret = clSetKernelArg(frequencies, 1, sizeof(float),(void*)&Lz); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void*)&Nz); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_z, NULL, 0, NULL, NULL); ret = clFinish(command_queue); printf("Setup grid, fourier frequencies and initialcondition\n"); //load the rest of the kernels //linearpart kernel fp = fopen("./linearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load linearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_linearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_linearpart, 1, &device_id, NULL, NULL, NULL); linearpart = clCreateKernel(p_linearpart, "linearpart", &ret); //kernel nonlinear fp = fopen("./nonlinearpart.cl", "r"); if (!fp) {fprintf(stderr, "Failed to load nonlinearpart.\n"); exit(1); } free(source_str); source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread( source_str, 1, MAX_SOURCE_SIZE, fp ); fclose( fp ); p_nonlinearpart = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); ret = clBuildProgram(p_nonlinearpart, 1, &device_id, NULL, NULL, NULL); nonlinearpart = clCreateKernel(p_nonlinearpart, "nonlinearpart", &ret); printf("Got initial data, starting timestepping\n"); gettimeofday(&tvs, NULL); for(n=0;n<=Tmax;n++){ //linear ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); //nonlinearpart ret = clSetKernelArg(nonlinearpart, 0, sizeof(cl_mem),(void *)&cl_u[0]); ret = clSetKernelArg(nonlinearpart, 1, sizeof(cl_mem),(void *)&cl_u[1]); ret = clSetKernelArg(nonlinearpart, 2, sizeof(cl_mem),(void* )&cl_v[0]); ret = clSetKernelArg(nonlinearpart, 3, sizeof(cl_mem),(void* )&cl_v[1]); ret = clSetKernelArg(nonlinearpart, 4, sizeof(float),(void* )&dt); ret = clSetKernelArg(nonlinearpart, 5, sizeof(float),(void* )&a[0]); ret = clSetKernelArg(nonlinearpart, 6, sizeof(float),(void* )&a[1]); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // linear part ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_u, cl_uhat, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &command_queue, 0, NULL, NULL,cl_v, cl_vhat, tmpBufferv); ret = clFinish(command_queue); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat[0]); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_uhat[1]); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void *)&cl_vhat[0]); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void *)&cl_vhat[1]); ret = clSetKernelArg(linearpart, 4, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 5, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 6, sizeof(cl_mem),(void* )&cl_kz); ret = clSetKernelArg(linearpart, 7, sizeof(float),(void* )&dt); ret = clSetKernelArg(linearpart, 8, sizeof(float),(void* )&Du); ret = clSetKernelArg(linearpart, 9, sizeof(float),(void* )&Dv); ret = clSetKernelArg(linearpart, 10, sizeof(float),(void* )&A); ret = clSetKernelArg(linearpart, 11, sizeof(float),(void* )&B); ret = clSetKernelArg(linearpart, 12, sizeof(float),(void* )&b[0]); ret = clSetKernelArg(linearpart, 13, sizeof(float),(void* )&b[1]); ret = clSetKernelArg(linearpart, 14, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 15, sizeof(int),(void* )&Ny); ret = clSetKernelArg(linearpart, 16, sizeof(int),(void* )&Nz); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_uhat, cl_u, tmpBufferu); ret = clfftEnqueueTransform(planHandle, CLFFT_BACKWARD, 1, &command_queue, 0, NULL, NULL,cl_vhat, cl_v, tmpBufferv); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d\n",n*dt,n,plotnum); plottime=plottime+plotgap; plotnum=plotnum+1; ret = clEnqueueReadBuffer(command_queue, cl_u[0], CL_TRUE, 0, N * sizeof(float), u[0], 0, NULL, NULL); ret = clEnqueueReadBuffer(command_queue, cl_v[0], CL_TRUE, 0, N * sizeof(float), v[0], 0, NULL, NULL); ret = clFinish(command_queue); //output of data U char tmp_str[10]; strcpy(nameconfig,"./data/u"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write u-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&u[0][i], sizeof(float), 1, fp);} fclose( fp ); //V strcpy(nameconfig,"./data/v"); sprintf(tmp_str,"%d",10000000+plotnum); strcat(nameconfig,tmp_str); strcat(nameconfig,".datbin"); fp=fopen(nameconfig,"wb"); if (!fp) {fprintf(stderr, "Failed to write v-data.\n"); exit(1); } for(i=0;i<N;i++){fwrite(&v[0][i], sizeof(float), 1, fp);} fclose( fp ); } } gettimeofday(&tve, NULL); printf("Finished time stepping\n"); elapsedTime = (tve.tv_sec - tvs.tv_sec) * 1000.0; // sec to ms elapsedTime += (tve.tv_usec - tvs.tv_usec) / 1000.0; // us to ms printf("%f,",elapsedTime); clReleaseMemObject(cl_u[0]); clReleaseMemObject(cl_u[1]); clReleaseMemObject(cl_v[0]); clReleaseMemObject(cl_v[1]); clReleaseMemObject(cl_uhat[0]); clReleaseMemObject(cl_uhat[1]); clReleaseMemObject(cl_vhat[0]); clReleaseMemObject(cl_vhat[1]); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); clReleaseMemObject(cl_kz); ret = clReleaseKernel(frequencies); ret = clReleaseProgram(p_frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseProgram(p_linearpart); ret = clReleaseKernel(nonlinearpart); ret = clReleaseProgram(p_nonlinearpart); free(u[0]); free(v[0]); clReleaseMemObject(tmpBufferu); clReleaseMemObject(tmpBufferv); /* Release the plan. */ ret = clfftDestroyPlan(&planHandle); /* Release clFFT library. */ clfftTeardown(); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); printf("Program execution complete\n"); return 0; }
Context::~Context() { clReleaseCommandQueue(queue); clReleaseContext(context); }
int main(int argc, char** argv) { srand(1000); int i; unsigned int size_A = WA * HA; unsigned int mem_size_A = sizeof(float) * size_A; float* h_A = (float*) malloc(mem_size_A); unsigned int size_B = WB * HB; unsigned int mem_size_B = sizeof(float) * size_B; float* h_B = (float*) malloc(mem_size_B); randomInit(h_A, size_A); randomInit(h_B, size_B); unsigned int size_C = WC * HC; unsigned int mem_size_C = sizeof(float) * size_C; float* h_C = (float*) malloc(mem_size_C); cl_context clGPUContext; cl_command_queue clCommandQue; cl_program clProgram; cl_kernel clKernel; cl_event mm; size_t dataBytes; size_t kernelLength; cl_int errcode; cl_mem d_A; cl_mem d_B; cl_mem d_C; clGPUContext = clCreateContextFromType(0, CL_DEVICE_TYPE_GPU, NULL, NULL, &errcode); errcode = clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, 0, NULL, &dataBytes); cl_device_id *clDevices = (cl_device_id *) malloc(dataBytes); errcode |= clGetContextInfo(clGPUContext, CL_CONTEXT_DEVICES, dataBytes, clDevices, NULL); clCommandQue = clCreateCommandQueue(clGPUContext, clDevices[0], CL_QUEUE_PROFILING_ENABLE, &errcode); d_C = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE, mem_size_A, NULL, &errcode); d_A = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_A, h_A, &errcode); d_B = clCreateBuffer(clGPUContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, mem_size_B, h_B, &errcode); FILE* fp = fopen("hw2.cl", "r"); fseek (fp , 0 , SEEK_END); const size_t lSize = ftell(fp); rewind(fp); unsigned char* buffer; buffer = (unsigned char*) malloc (lSize); fread(buffer, 1, lSize, fp); fclose(fp); cl_int status; clProgram = clCreateProgramWithBinary(clGPUContext, 1, (const cl_device_id *)clDevices, &lSize, (const unsigned char**)&buffer, &status, &errcode); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); errcode = clBuildProgram(clProgram, 0, NULL, NULL, NULL, NULL); clKernel = clCreateKernel(clProgram, "MM", &errcode); size_t globalWorkSize[2]; int wA = WA; int wC = WC; errcode = clSetKernelArg(clKernel, 0, sizeof(cl_mem), (void *)&d_C); errcode |= clSetKernelArg(clKernel, 1, sizeof(cl_mem), (void *)&d_A); errcode |= clSetKernelArg(clKernel, 2, sizeof(cl_mem), (void *)&d_B); errcode |= clSetKernelArg(clKernel, 3, sizeof(int), (void *)&wA); errcode |= clSetKernelArg(clKernel, 4, sizeof(int), (void *)&wC); globalWorkSize[0] = 16; globalWorkSize[1] = 16; cl_ulong time_start, time_end, total_time = 0; errcode = clEnqueueNDRangeKernel(clCommandQue, clKernel, 2, NULL, globalWorkSize, NULL, 0, NULL, &mm); printf("Average time = %lu\n"); clFinish(clCommandQue); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(mm, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("Average time = %lu\n", total_time); errcode = clEnqueueReadBuffer(clCommandQue, d_C, CL_TRUE, 0, mem_size_C, h_C, 0, NULL, NULL); free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_A); clReleaseMemObject(d_C); clReleaseMemObject(d_B); free(clDevices); clReleaseContext(clGPUContext); clReleaseKernel(clKernel); clReleaseProgram(clProgram); clReleaseCommandQueue(clCommandQue); }
int main() { int MaxIter = 50; if(argc<2) return -1; char* fname = argv[1]; char fname_out[50] = "contour.bmp"; if (argc>2) strcpy(fname_out, argv[2]); if (argc>3) MaxIter = atoi(argv[3]); int N1; int N2; int i, j; float *img; //reads fname, stores the array of floats in img, N1 = width of image, N2 = height of image int err = imread(&img, &N1, &N2, fname); if (err!=0) return err; // Elements in each array const int elements = N1 * N2; // Compute the size of the data size_t datasize = sizeof(float)*elements; // Allocate space for input/output data //Float *img is u float *contour = (float*)calloc(datasize); float *curv = (float*)calloc(datasize); float *phi = (float*)calloc(datasize); int *dataDimensions = (int*)malloc(3*sizeof(int)); // Init data dataDimensions[0] = N1; dataDimensions[1] = N2; dataDimensions[2] = MaxIter; // -------------------------DONT MODIFY SECTION BELOW------------------------------- // Use this to check the output of each API call cl_int status; // Retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0, NULL, &numPlatforms); // Allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id*)malloc( numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); // Allocate enough space for each device cl_device_id *devices; devices = (cl_device_id*)malloc( numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); // Create a context and associate it with the devices cl_context context; context = clCreateContext(NULL, numDevices, devices, NULL, NULL, &status); // Create a command queue and associate it with the device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0], 0, &status); // -----------------------------DONT EDIT SECTION ABOVE THIS------------------------------- // Create a buffer object that will contain the data // from the host array A cl_mem contourBuf; contourBuf = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will contain the data // from the host array B cl_mem imgBuf; imgBuf = clCreateBuffer(context, CL_MEM_READ_ONLY, datasize, NULL, &status); // Create a buffer object that will hold the output data cl_mem dataDimensionsBuf; dataDimensionsBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, 3 * sizeof(int), NULL, &status); // Create a buffer object that will hold the output data cl_mem curvBuf; curBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); // Create a buffer object that will hold the output data cl_mem phiBuf; phiBuf = clCreateBuffer(context, CL_MEM_WRITE_ONLY, datasize, NULL, &status); status = clEnqueueWriteBuffer(cmdQueue, imgBuf, CL_FALSE, 0, datasize, img, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, dataDimensionsBuf, CL_FALSE, 0, 3 * sizeof(int), dataDimensions, 0, NULL, NULL); status = clEnqueueWriteBuffer(cmdQueue, curvBuf, CL_FALSE, 0, datasize, curv, 0, NULL, NULL); // Write input array A to the device buffer bufferA status = clEnqueueWriteBuffer(cmdQueue, phiBuf, CL_FALSE, 0, datasize, phi, 0, NULL, NULL); // Create a program with source code cl_program program = clCreateProgramWithSource(context, 1, (const char**)&programSource, NULL, &status); // Build (compile) the program for the device status = clBuildProgram(program, numDevices, devices, NULL, NULL, NULL); // Create the vector addition kernel cl_kernel kernel; kernel = clCreateKernel(program, "segmentation", &status); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &contourBuf); status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &imgBuf); status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &dataDimensionsBuf); status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &curvBuf); status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &phiBuf); // Define an index space (global work size) of work // items for execution. A workgroup size (local work size) // is not required, but can be used. size_t globalWorkSize[1]; // There are 'elements' work-items globalWorkSize[0] = elements; // Execute the kernel for execution status = clEnqueueNDRangeKernel(cmdQueue, kernel, 1, NULL, globalWorkSize, NULL, 0, NULL, NULL); // Read the device output buffer to the host output array clEnqueueReadBuffer(cmdQueue, contourBuf, CL_TRUE, 0, datasize, contour, 0, NULL, NULL); imwrite(contour, N1, N2, fname_out); // Free OpenCL resources clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(contourBuf); clReleaseMemObject(imgBuf); clReleaseMemObject(dataDimensionsBuf); clReleaseMemObject(curvBuf); clReleaseMemObject(phiBuf); clReleaseContext(context); // Free host resources free(phi); free(curv); free(img); free(contour); free(dataDimensions); free(platforms); free(devices); return 0; }
int main() { cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */ int count_all, i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */ double pi; cl_platform_id platform_id = NULL; cl_uint ret_num_platforms; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel_mt = NULL, kernel_pi = NULL; size_t kernel_code_size; char *kernel_src_str; cl_uint *result; cl_int ret; FILE *fp; cl_mem rand, count; size_t global_item_size[3], local_item_size[3]; cl_mem dev_mts; cl_event ev_mt_end, ev_pi_end, ev_copy_end; cl_ulong prof_start, prof_mt_end, prof_pi_end, prof_copy_end; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator); command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); fp = fopen("mt.cl", "r"); kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE); kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Create output buffer */ rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret); count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret); /* Build Program*/ program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret); clBuildProgram(program, 1, &device_id, "", NULL, NULL); kernel_mt = clCreateKernel(program, "genrand", &ret); kernel_pi = clCreateKernel(program, "calc_pi", &ret); /* Create input parameter */ dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret); clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL); /* Set Kernel Arguments */ clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */ clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */ clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */ clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */ clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */ clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */ global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1; local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1; /* Create a random number array */ clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end); /* Compute PI */ clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end); /* Get result */ clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end); /* Average the values of PI */ count_all = 0; for (i=0; i < num_generator; i++) { count_all += result[i]; } pi = ((double)count_all)/(num_rand * num_generator) * 4; printf("pi = %f\n", pi); /* Get execution time info */ clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL); clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL); clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL); clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL); printf(" mt: %f[ms]\n" " pi: %f[ms]\n" " copy: %f[ms]\n", (prof_mt_end - prof_start)/(1000000.0), (prof_pi_end - prof_mt_end)/(1000000.0), (prof_copy_end - prof_pi_end)/(1000000.0)); clReleaseEvent(ev_mt_end); clReleaseEvent(ev_pi_end); clReleaseEvent(ev_copy_end); clReleaseMemObject(rand); clReleaseMemObject(count); clReleaseKernel(kernel_mt); clReleaseKernel(kernel_pi); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(kernel_src_str); free(result); return 0; }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
void xcl_release_world(xcl_world world) { clReleaseCommandQueue(world.command_queue); clReleaseContext(world.context); }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
int main(void) { //time meassuring struct timeval tvs; //variables int Nx=1024; int Ny=1024; int plotnum=0; int Tmax=2; int plottime=0; int plotgap=1; double Lx=1.0; double Ly=1.0; double dt=0.0; double A=0.0; double B=0.0; double Du=0.0; double Dv=0.0; //splitting coefficients double a=0.5; double b=0.5; double c=1.0; //loop counters int i=0; int j=0; int n=0; double*umax=NULL; double*vmax=NULL; parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B); plottime=plotgap; vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); //openCL variables cl_platform_id *platform_id = NULL; cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL; cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL; cl_int ret; cl_uint num_platforms; // Detect how many platforms there are. ret = clGetPlatformIDs(0, NULL, &num_platforms); // Allocate enough space for the number of platforms. platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id)); // Store the platforms ret = clGetPlatformIDs(num_platforms, platform_id, NULL); printf("Found %d platform(s)!\n",num_platforms); cl_uint *num_devices; num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint)); cl_device_id **device_id = NULL; device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*)); // Detect number of devices in the platforms for(i=0;i<num_platforms;i++){ char buf[65536]; size_t size; ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size); printf("%s\n",buf); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices); printf("Found %d device(s) on platform %d!\n", num_devices[i],i); ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size); printf("%s ",buf); // Store numDevices from platform device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id)); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL); for(j=0;j<num_devices[i];j++){ ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size); printf("%s (%d,%d)\n",buf,i,j); } } //create context and command_queue cl_context context = NULL; cl_command_queue command_queue = NULL; //Which platform and device do i choose? int chooseplatform=0; int choosedevice=0; printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice); context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret); if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); } command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret); if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); } //OpenCL arrays cl_mem cl_u = NULL,cl_v = NULL; cl_mem cl_uhat = NULL, cl_vhat = NULL; cl_mem cl_kx = NULL, cl_ky = NULL; //FFT clfftPlanHandle planHandle; cl_mem tmpBuffer = NULL; fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny); //allocate gpu memory/ cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret); cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret); printf("allocated space\n"); //load the kernels loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies"); loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); size_t global_work_size[1] = {Nx*Ny}; size_t global_work_size_X[1] = {Nx}; size_t global_work_size_Y[1] = {Ny}; //frequencies ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //printCL(&cl_kx,&command_queue,Nx,1); //printCL(&cl_ky,&command_queue,1,Ny); //inintial data ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx); ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly); ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //make output writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); printf("Got initial data, starting timestepping\n"); mtime_s(&tvs); for(n=0;n<=Tmax;n++){ //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //linear fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //printf("A%f,B%f\n",A,B); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du); ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv); ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A); ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B); ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt); ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c); ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]); plottime=plottime+plotgap; plotnum=plotnum+1; writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); } }//end timestepping printf("Finished time stepping\n"); mtime_e(&tvs,"Programm took:"); writearray(umax,(Tmax/plotgap)+1,"u"); writearray(vmax,(Tmax/plotgap)+1,"v"); free(umax); free(vmax); clReleaseMemObject(cl_u); clReleaseMemObject(cl_v); clReleaseMemObject(cl_uhat); clReleaseMemObject(cl_vhat); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); ret = clReleaseKernel(initialdata); ret = clReleaseKernel(frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseKernel(nonlinearpart_a); ret = clReleaseKernel(nonlinearpart_b); fftdestroy(&planHandle, &tmpBuffer); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); for(i=0;i<num_platforms;i++){free(device_id[i]);} free(device_id); free(platform_id); free(num_devices); printf("Program execution complete\n"); return 0; }
void runProgram(int N, char *fileName) { printf("GPU Symmetrize()..." "\nSquareMatrix[%d][%d]\n", N, N); int i,j; // initialize input array float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = j; } } // result float *Aout; Aout = (float*)malloc(sizeof(float)*N*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); #endif int NumK = 1; int NumE = 2; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); //printf("binary size = %ld\n", binary_size); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*) , &bin, NULL); OCL_CHECK(err); //puts("save binaries"); // Print the binary out to the output file fh = fopen("kernel.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); puts("done save binaries"); #endif kernel[0] = clCreateKernel(program, "kernel_a", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem Aout_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); // copy data to device err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , &event[0]); OCL_CHECK(err); size_t localsize[2]; size_t globalsize[2]; localsize[0] = 16; localsize[1] = 16; globalsize[0] = N; globalsize[1] = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &Aout_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 2, NULL, globalsize, localsize, 0, NULL, NULL); OCL_CHECK(err); clFinish(queue); // read device data back to host clEnqueueReadBuffer(queue, Aout_d, CL_TRUE, 0, sizeof(float)*N*N, Aout, 0, NULL , &event[1]); err = clWaitForEvents(1,&event[1]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[1], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("Output"); check_2d_f(Aout,N,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(Aout_d); // // check // int flag = 1; // for(i=0;i<N;++i){ // for(j=0;j<N;++j){ // if(A[i*N+j] != At[j*N+i]) // { // flag = 0; // break; // } // } // } // if( flag == 0 ) // { // puts("Bugs! Check program."); // }else{ // puts("Succeed!"); // } clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(Aout); return; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufY; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); int lenY = 1 + (N-1)*abs(incy); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenX*sizeof(cl_float)), NULL, &err); bufY = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (lenY*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); printResult(); /* Call clblas function. */ err = clblasSrot(N, bufX, 0, incx, bufY, 0, incy, C, S, 1, &queue, 0, NULL, &event); // printf("here\n"); if (err != CL_SUCCESS) { printf("clblasSrot() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufY, CL_TRUE, 0, (lenY*sizeof(cl_float)), Y, 0, NULL, NULL); err = clEnqueueReadBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* At this point you will get the result of SROT placed in vector Y. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufY); clReleaseMemObject(bufX); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
void DeleteCL() { clReleaseContext(g_cxMainContext); clReleaseCommandQueue(g_cqCommandQue); }
int simpleExample() { /* Create device and determine local size */ device = create_device(); err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); if(err < 0) { perror("Couldn't obtain device information"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build program */ program = build_program(context, device, PROGRAM_FILE); /* Create data buffer */ data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err); sum_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Create kernels */ vector_kernel = clCreateKernel(program, KERNEL_1, &err); complete_kernel = clCreateKernel(program, KERNEL_2, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Set arguments for vector kernel */ err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(vector_kernel, 1, local_size * 4 * sizeof(float), NULL); /* Set arguments for complete kernel */ err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(complete_kernel, 1, local_size * 4 * sizeof(float), NULL); err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer); if(err < 0) { perror("Couldn't create a kernel argument"); exit(1); } /* Enqueue kernels */ global_size = ARRAY_SIZE/4; err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, &start_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } printf("Global size = %lu\n", global_size); /* Perform successive stages of the reduction */ while(global_size/local_size > local_size) { global_size = global_size/local_size; err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); printf("Global size = %lu\n", global_size); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } } global_size = global_size/local_size; err = clEnqueueNDRangeKernel(queue, complete_kernel, 1, NULL, &global_size, NULL, 0, NULL, &end_event); printf("Global size = %lu\n", global_size); /* Finish processing the queue and get profiling information */ clFinish(queue); clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(end_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time = time_end - time_start; /* Read the result */ err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, sizeof(float), &sum, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Check result */ actual_sum = 1.0f * (ARRAY_SIZE/2)*(ARRAY_SIZE-1); if(fabs(sum - actual_sum) > 0.01*fabs(sum)) printf("Check failed.\n"); else printf("Check passed.\n"); printf("Total time = %lu\n", total_time); /* Deallocate resources */ clReleaseEvent(start_event); clReleaseEvent(end_event); clReleaseMemObject(sum_buffer); clReleaseMemObject(data_buffer); clReleaseKernel(vector_kernel); clReleaseKernel(complete_kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
void clean_all(void) { printf("Cleaning Variables ... \n\n"); // Opencl environment variables clReleaseCommandQueue(command_queue); clReleaseContext(context); // Release all memory allocated if (Data_MeshType == UNSTRUCTURED) { // Mesh Variables free(MeshElementArray.Node1); free(MeshElementArray.Node2); free(MeshElementArray.Node3); free(MeshElementArray.Node4); free(MeshNodeArray_double.x); free(MeshNodeArray_double.y); free(MeshNodeArray_double.z); free(MeshElementArray.Neighborindex1); free(MeshElementArray.Neighborindex2); free(MeshElementArray.Neighborindex3); free(MeshElementArray.Neighborindex4); clReleaseMemObject(Mesh_Node_x); clReleaseMemObject(Mesh_Node_y); clReleaseMemObject(Mesh_Node_z); clReleaseMemObject(Mesh_Element_Node1); clReleaseMemObject(Mesh_Element_Node2); clReleaseMemObject(Mesh_Element_Node3); clReleaseMemObject(Mesh_Element_Node4); clReleaseMemObject(Mesh_Element_Neighborindex1); clReleaseMemObject(Mesh_Element_Neighborindex2); clReleaseMemObject(Mesh_Element_Neighborindex3); clReleaseMemObject(Mesh_Element_Neighborindex4); clReleaseMemObject(r); clReleaseMemObject(s); clReleaseMemObject(t); clReleaseMemObject(eid); } // Cleaning Velocity variables free(velocity.u0); free(velocity.v0); free(velocity.w0); free(velocity.u1); free(velocity.v1); free(velocity.w1); free(velocity.time0); free(velocity.time1); free(Tracer.x); Tracer.x = NULL; free(Tracer.y); Tracer.y = NULL; free(Tracer.z); Tracer.z = NULL; free(Tracer.ElementIndex); Tracer.ElementIndex = NULL; free(Tracer.Start_time); Tracer.Start_time = NULL; free(Tracer.Stop_time); Tracer.Stop_time = NULL; free(Tracer.LeftDomain); Tracer.LeftDomain = NULL; if (Trace_ReleaseStrategy == 1) { free(Tracer1.x); Tracer1.x = NULL; free(Tracer1.y); Tracer1.y = NULL; free(Tracer1.z); Tracer1.z = NULL; free(Tracer1.ElementIndex); Tracer1.ElementIndex = NULL; free(Tracer1.Start_time); Tracer1.Start_time = NULL; free(Tracer1.Stop_time); Tracer1.Stop_time = NULL; free(Tracer1.LeftDomain); Tracer1.LeftDomain = NULL; free(index1); index1 = NULL; free(Tracer.Status); Tracer.Status = NULL; } free(DataTime1); free(Output_time); free(Launch_time); clReleaseMemObject(Vel_U0); clReleaseMemObject(Vel_U1); clReleaseMemObject(Vel_V0); clReleaseMemObject(Vel_V1); clReleaseMemObject(Vel_W0); clReleaseMemObject(Vel_W1); clReleaseMemObject(x_dev); clReleaseMemObject(y_dev); clReleaseMemObject(posx); clReleaseMemObject(posy); clReleaseMemObject(xn0); clReleaseMemObject(xn1); clReleaseMemObject(integrate); if (Dimensions == 3) { clReleaseMemObject(z_dev); clReleaseMemObject(posz); clReleaseMemObject(xn2); } clReleaseMemObject(Start_time_dev); clReleaseMemObject(Stop_time_dev); clReleaseMemObject(ElementIndex_dev); clReleaseMemObject(LeftDomain_dev); // Remove Temp file containing tracer release information if (!Keep_Tempfile) { char BinFile[LONGSTRING]; sprintf(BinFile, "%s%s.bin", Path_Output, Temp_OutFilePrefix); if(remove(BinFile)) fprintf(stderr, "Warning: Could not delete file %s\n", BinFile); } CL_CHECK(clReleaseKernel(kernel1)); CL_CHECK(clReleaseKernel(kernel2)); CL_CHECK(clReleaseKernel(kernel3)); CL_CHECK(clReleaseKernel(kernel4)); CL_CHECK(clReleaseKernel(kernel5)); CL_CHECK(clReleaseProgram(program)); printf("Cleaning Successfull \n\n"); }
ErrorStatus gemm_clblas(cl_device_id device, const void *inMatrixA, int nrowA, int ncolA, bool transposeA, const void *inMatrixB, int nrowB, int ncolB, bool transposeB, double alpha, double beta, void *outMatrix, bool use_float) { std::stringstream result; float *input_matrixA_f = (float *)inMatrixA; float *input_matrixB_f = (float *)inMatrixB; float *output_matrix_f = (float *)outMatrix; double *input_matrixA_d = (double *)inMatrixA; double *input_matrixB_d = (double *)inMatrixB; double *output_matrix_d = (double *)outMatrix; if (debug) { result << "gemm_clblas( " << (use_float ? "FLOAT" : "DOUBLE") << ")" << std::endl << std::endl; } cl_int err = CL_SUCCESS; clblasStatus status = clblasSetup(); if (status != CL_SUCCESS) { if (debug) { result << "clblasSetup: " << clblasErrorToString(status) << std::endl; } err = CL_INVALID_OPERATION; } // get first platform cl_platform_id platform = NULL; if (err == CL_SUCCESS) { err = clGetPlatformIDs(1, &platform, NULL); } if (debug && err == CL_SUCCESS) { result << "Platform: " << getPlatformInfoString(platform, CL_PLATFORM_NAME) << std::endl; result << "Device: " << getDeviceInfoString(device, CL_DEVICE_NAME) << std::endl; } // context cl_context context = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateContext:" << std::endl; } context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); } // queue cl_command_queue queue = NULL; if (err == CL_SUCCESS) { #ifdef CL_VERSION_2_0 if (debug) { result << "clCreateCommandQueueWithProperties:" << std::endl; } queue = clCreateCommandQueueWithProperties(context, device, NULL, &err); #else if (debug) { result << "clCreateCommandQueue:" << std::endl; } queue = clCreateCommandQueue(context, device, 0, &err); #endif } // buffers cl_mem cl_input_matrixA = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixA:" << std::endl; } if (use_float) { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(float), input_matrixA_f, &err); } else { cl_input_matrixA = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowA * ncolA * sizeof(double), input_matrixA_d, &err); } } cl_mem cl_input_matrixB = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_input_matrixB:" << std::endl; } if (use_float) { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(float), input_matrixB_f, &err); } else { cl_input_matrixB = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, nrowB * ncolB * sizeof(double), input_matrixB_d, &err); } } int nrowC = transposeA ? ncolA : nrowA; int ncolC = transposeB ? nrowB : ncolB; cl_mem cl_output_matrix = NULL; if (err == CL_SUCCESS) { if (debug) { result << "clCreateBuffer cl_output_vector:" << std::endl; } if (use_float) { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(float), output_matrix_f, &err); } else { cl_output_matrix = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, nrowC * ncolC * sizeof(double), output_matrix_d, &err); } } // ++++++++++++ const int lda = nrowA; // first dimension of A (rows), before any transpose const int ldb = nrowB; // first dimension of B (rows), before any transpose const int ldc = nrowC; // first dimension of C (rows) const int M = transposeA ? ncolA : nrowA; // rows in A (after transpose, if any) and C const int N = transposeB ? nrowB : ncolB; // cols in B (after transpose, if any) and C const int K = transposeA ? nrowA : ncolA; // cols in A and rows in B (after transposes, if any) const clblasOrder order = clblasColumnMajor; const clblasTranspose transA = transposeA ? clblasTrans : clblasNoTrans; const clblasTranspose transB = transposeB ? clblasTrans : clblasNoTrans; cl_event event = NULL; if (err == CL_SUCCESS) { if (use_float) { if (debug) { result << "clblasSgemm:" << std::endl; } status = clblasSgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS && debug) { result << "clblasSgemm error:" << clblasErrorToString(status) << std::endl; } } else { if (debug) { result << "clblasDgemm:" << std::endl; } status = clblasDgemm(order, transA, transB, M, N, K, alpha, cl_input_matrixA, 0, lda, cl_input_matrixB, 0, ldb, beta, cl_output_matrix, 0, ldc, 1, &queue, 0, NULL, &event); if (status != CL_SUCCESS) { if (debug) { result << "clblasDgemm error:" << clblasErrorToString(status) << std::endl; } err = status; } } } if (err == CL_SUCCESS) { /* Wait for calculations to be finished. */ if (debug) { result << "clWaitForEvents:" << std::endl; } err = clWaitForEvents(1, &event); } // retrieve result if (err == CL_SUCCESS) { if (debug) { result << "Retrieve result:" << std::endl; } if (use_float) { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(float), output_matrix_f, 0, NULL, NULL); } else { clEnqueueReadBuffer(queue, cl_output_matrix, CL_TRUE, 0, nrowC * ncolC * sizeof(double), output_matrix_d, 0, NULL, NULL); } } std::string err_str = clErrorToString(err); result << std::endl << err_str << std::endl; // cleanup clReleaseMemObject(cl_output_matrix); cl_output_matrix = NULL; clReleaseMemObject(cl_input_matrixA); cl_input_matrixA = NULL; clReleaseMemObject(cl_input_matrixB); cl_input_matrixB = NULL; clReleaseCommandQueue(queue); queue = NULL; clReleaseContext(context); context = NULL; if (debug) { CERR << result.str(); } ErrorStatus errorStatus = { err, status }; return errorStatus; }
void sum_gpu(long long *in, long long *out, unsigned int n) { size_t global_size; size_t local_size; char *kernel_src; cl_int err; cl_platform_id platform_id; cl_device_id device_id; cl_uint max_compute_units; size_t max_workgroup_size; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem d_array; cl_event event; cl_ulong start, end; /* start OpenCL */ err = clGetPlatformIDs(1, &platform_id,NULL); clErrorHandling("clGetPlatformIDs"); err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); clErrorHandling("clGetDeviceIDs"); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); clErrorHandling("clCreateContext"); commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); clErrorHandling("clCreateCommandQueue"); /* create kernel */ kernel_src = file_to_string(KERNEL_SRC); program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err); free(kernel_src); clErrorHandling("clCreateProgramWithSource"); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); clErrorHandling("clBuildProgram"); kernel = clCreateKernel(program, "matrix_mult", &err); clErrorHandling("clCreateKernel"); /* allocate memory and send to gpu */ d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err); clErrorHandling("clCreateBuffer"); err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL); clErrorHandling("clEnqueueWriteBuffer"); err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL); clErrorHandling("clGetDeviceInfo"); /* prepare kernel args */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array); err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n); /* execute */ local_size = n / max_compute_units / 8; if (local_size > max_workgroup_size) local_size = max_workgroup_size; /* * Usually it would be * global_size = local_size * max_compute_units; * but that would only be valid if local_size = n / max_compute_units; * local_size is n / max_compute_units / 8 because it obtains its hightest performance. */ for (global_size = local_size; global_size < n; global_size += local_size); err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event); clErrorHandling("clEnqueueNDRangeKernel"); clWaitForEvents(1, &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0); err = clFinish(commands); clErrorHandling("clFinish"); /* transfer back */ err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long clErrorHandling("clEnqueueReadBuffer"); /* cleanup*/ clReleaseMemObject(d_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); clReleaseEvent(event); }
int main(void) { const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float); // Generate the input array on the host. float h_a[ARRAY_SIZE]; float h_b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { h_a[i] = (float)i; h_b[i] = (float)(2 * i); } float h_c[ARRAY_SIZE]; FILE *fp; char *source_str; size_t source_size; fp = fopen("vectors_cl.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_BYTES, NULL, &ret); // Copy h_a and h_b to memory buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_a, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_b, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != 0) { printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 0) { printf("clBuildProgram returned non-zero status %d: ", ret); if (ret == CL_INVALID_PROGRAM) { printf("invalid program\n"); } else if (ret == CL_INVALID_VALUE) { printf("invalid value\n"); } else if (ret == CL_INVALID_DEVICE) { printf("invalid device\n"); } else if (ret == CL_INVALID_BINARY) { printf("invalid binary\n"); } else if (ret == CL_INVALID_BUILD_OPTIONS) { printf("invalid build options\n"); } else if (ret == CL_INVALID_OPERATION) { printf("invalid operation\n"); } else if (ret == CL_COMPILER_NOT_AVAILABLE) { printf("compiler not available\n"); } else if (ret == CL_BUILD_PROGRAM_FAILURE) { printf("build program failure\n"); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); } else if (ret == CL_OUT_OF_HOST_MEMORY) { printf("out of host memory\n"); } exit(1); } // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); size_t array_size = ARRAY_SIZE; ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size); // Execute the OpenCL kernel on the list size_t global_item_size = ARRAY_SIZE; // Process the entire lists size_t local_item_size = 1; // Divide work items into groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_c, 0, NULL, NULL); // Print out the resulting array. for (int i = 0; i < 8; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } printf("...\n"); for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
int main(int argc, char** argv) { cl_platform_id pf[MAX_PLATFORMS]; cl_uint nb_platforms = 0; cl_int err; // error code returned from api calls cl_device_type device_type = CL_DEVICE_TYPE_ALL; // Filter args // argv++; while (argc > 1) { if(!strcmp(*argv, "-g") || !strcmp(*argv, "--gpu-only")) { if(device_type != CL_DEVICE_TYPE_ALL) error("--gpu-only and --cpu-only can not be specified at the same time\n"); device_type = CL_DEVICE_TYPE_GPU; } else if(!strcmp(*argv, "-c") || !strcmp(*argv, "--cpu-only")) { if(device_type != CL_DEVICE_TYPE_ALL) error("--gpu-only and --cpu-only can not be specified at the same time\n"); device_type = CL_DEVICE_TYPE_CPU; } else if(!strcmp(*argv, "-s") || !strcmp(*argv, "--size")) { unsigned i; int r; char c; r = sscanf(argv[1], "%u%[mMkK]", &SIZE, &c); if (r == 2) { if (c == 'k' || c == 'K') SIZE *= 1024; else if (c == 'm' || c == 'M') SIZE *= 1024 * 1024; } argc--; argv++; } else break; argc--; argv++; } if(argc > 1) TILE = atoi(*argv); // Get list of OpenCL platforms detected // err = clGetPlatformIDs(3, pf, &nb_platforms); check(err, "Failed to get platform IDs"); printf("%d OpenCL platforms detected\n", nb_platforms); // For each platform do // for (cl_int p = 0; p < nb_platforms; p++) { cl_uint num; int platform_valid = 1; char name[1024], vendor[1024]; cl_device_id devices[MAX_DEVICES]; cl_uint nb_devices = 0; cl_context context; // compute context cl_program program; // compute program cl_kernel kernel; err = clGetPlatformInfo(pf[p], CL_PLATFORM_NAME, 1024, name, NULL); check(err, "Failed to get Platform Info"); err = clGetPlatformInfo(pf[p], CL_PLATFORM_VENDOR, 1024, vendor, NULL); check(err, "Failed to get Platform Info"); printf("Platform %d: %s - %s\n", p, name, vendor); // Get list of devices // err = clGetDeviceIDs(pf[p], device_type, MAX_DEVICES, devices, &nb_devices); printf("nb devices = %d\n", nb_devices); if(nb_devices == 0) continue; // Create compute context with "device_type" devices // context = clCreateContext (0, nb_devices, devices, NULL, NULL, &err); check(err, "Failed to create compute context"); // Load program source into memory // const char *opencl_prog; opencl_prog = file_load(KERNEL_FILE); // Attach program source to context // program = clCreateProgramWithSource(context, 1, &opencl_prog, NULL, &err); check(err, "Failed to create program"); // Compile program // { char flags[1024]; sprintf (flags, "-cl-mad-enable -cl-fast-relaxed-math -DSIZE=%d -DTILE=%d -DTYPE=%s", SIZE, TILE, "float"); err = clBuildProgram (program, 0, NULL, flags, NULL, NULL); if(err != CL_SUCCESS) { size_t len; // Display compiler log // clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &len); { char buffer[len+1]; fprintf(stderr, "--- Compiler log ---\n"); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "%s\n", buffer); fprintf(stderr, "--------------------\n"); } if(err != CL_SUCCESS) error("Failed to build program!\n"); } } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, KERNEL_NAME, &err); check(err, "Failed to create compute kernel"); // Allocate and initialize input data // alloc_buffers_and_user_data(context); // Iterate over devices // for(cl_int dev = 0; dev < nb_devices; dev++) { cl_command_queue queue; char name[1024]; cl_device_type dtype; err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, 1024, name, NULL); check(err, "Cannot get type of device"); err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &dtype, NULL); check(err, "Cannot get type of device"); printf("\tDevice %d : %s [%s]\n", dev, (dtype == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU", name); // Create a command queue // queue = clCreateCommandQueue(context, devices[dev], CL_QUEUE_PROFILING_ENABLE, &err); check(err,"Failed to create command queue"); // Write our data set into device buffer // send_input(queue); // Execute kernel // { cl_event prof_event; cl_ulong start, end; struct timeval t1,t2; double timeInMicroseconds; size_t global[2] = { SIZE, SIZE }; // global domain size for our calculation size_t local[2] = { TILE, TILE }; // local domain size for our calculation printf("\t%dx%d Threads in workgroups of %dx%d\n", global[0], global[1], local[0], local[1]); // Set kernel arguments // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer); check(err, "Failed to set kernel arguments"); gettimeofday (&t1, NULL); for (unsigned iter = 0; iter < ITERATIONS; iter++) { err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, &prof_event); check(err, "Failed to execute kernel"); } // Wait for the command commands to get serviced before reading back results // clFinish(queue); gettimeofday (&t2,NULL); // Check performance // timeInMicroseconds = (double)TIME_DIFF(t1, t2) / ITERATIONS; printf("\tComputation performed in %lf µs over device #%d\n", timeInMicroseconds, dev); clReleaseEvent(prof_event); } // Read back the results from the device to verify the output // retrieve_output(queue); // Validate computation // check_output_data(); clReleaseCommandQueue(queue); } // Cleanup // free_buffers_and_user_data(); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseContext(context); } return 0; }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int err; /* Data and events */ char *kernel_msg; float data[4096]; cl_mem data_buffer; cl_event kernel_event, read_event; /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(data), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read the buffer */ err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, sizeof(data), &data, 0, NULL, &read_event); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Set event handling routines */ kernel_msg = "The kernel finished successfully.\n\0"; err = clSetEventCallback(kernel_event, CL_COMPLETE, &kernel_complete, kernel_msg); if(err < 0) { perror("Couldn't set callback for event"); exit(1); } clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main(int argc, char **argv) { printf("start \n"); int x, y, nsteps, i, j; float *u_h; double *f_h; //pointers to host memory int ArraySizeX = 5122; int ArraySizeY = 5122; double n, ux, uy, uxx, uxy, uyy, usq; FILE *fp; size_t size = ArraySizeX*ArraySizeY*sizeof(float); size_t size1 = ArraySizeX*ArraySizeY*9*sizeof(double); u_h = (float *)calloc(ArraySizeX*ArraySizeY,sizeof(float)); f_h = (double *)calloc(ArraySizeX*ArraySizeY*9,sizeof(double)); printf("initialization \n"); // initialization for( x = 0;x<ArraySizeX;x++){ for( y =0;y<ArraySizeY;y++){ // define the macroscopic properties of the initial condition. n = 1 + Amp2*exp(-(pow(x-ArraySizeX/2,2)+pow(y-ArraySizeY/2,2))/Width); ux = 0; uy = 0; // intialize f to be the local equilibrium values uxx = ux*ux; uyy = uy*uy; uxy = 2*ux*uy; usq = uxx+ uyy; f_h[x*ArraySizeY*9+y*9] = w1*n*(1-1.5*usq); f_h[x*ArraySizeY*9+y*9+1] = w2*n*(1+3*ux+4.5*uxx-1.5*usq); f_h[x*ArraySizeY*9+y*9+2] = w2*n*(1-3*ux+4.5*uxx-1.5*usq); f_h[x*ArraySizeY*9+y*9+3] = w2*n*(1+3*uy+4.5*uyy-1.5*usq); f_h[x*ArraySizeY*9+y*9+4]= w2*n*(1-3*uy+4.5*uyy-1.5*usq); f_h[x*ArraySizeY*9+y*9+5] = w3*n*(1+3*(ux+uy)+4.5*(uxx+uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+6] = w3*n*(1+3*(-ux+uy)+4.5*(uxx-uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+7] = w3*n*(1+3*(-ux-uy)+4.5*(uxx+uxy+uyy)-1.5*usq); f_h[x*ArraySizeY*9+y*9+8] = w3*n*(1+3*(ux-uy)+4.5*(uxx-uxy+uyy)-1.5*usq); } } cl_event event; cl_ulong time_start, time_end, total_time; // use this to check the output of each API call cl_int status; // retrieve the number of platforms cl_uint numPlatforms = 0; status = clGetPlatformIDs(0,NULL,&numPlatforms); chk(status, "clGetPlatformIDs0"); // allocate enough space for each platform cl_platform_id *platforms = NULL; platforms = (cl_platform_id *) malloc(numPlatforms*sizeof(cl_platform_id)); // Fill in the platforms status = clGetPlatformIDs(numPlatforms, platforms, NULL); chk(status, "clGetPlatformIDs1"); // Retrieve the number of devices cl_uint numDevices = 0; status = clGetDeviceIDs(platforms[0],CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); chk(status, "clGetDeviceIDs0"); // Allocate enough space for each device cl_device_id *devices = NULL; devices = (cl_device_id *) malloc(numDevices*sizeof(cl_device_id)); // Fill in the devices status = clGetDeviceIDs(platforms[0], CL_DEVICE_TYPE_ALL, numDevices, devices, NULL); chk(status, "clGetDeviceIDs1"); // Create a context and associate it with devices cl_context context; context = clCreateContext(NULL,numDevices, devices, NULL, NULL, &status); chk(status,"clCreateContext"); // Create a command queue and associate it with device cl_command_queue cmdQueue; cmdQueue = clCreateCommandQueue(context, devices[0],CL_QUEUE_PROFILING_ENABLE,&status); chk(status,"clCreateCommandQueue"); // Create Buffer objects on devices cl_mem u_d, f_d; u_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size, NULL, &status); chk(status,"clCreatebuffer"); f_d = clCreateBuffer(context, CL_MEM_READ_WRITE, size1, NULL, &status); chk(status, "clCreatebuffer"); // perform computing on GPU // copy data from host to device status = clEnqueueWriteBuffer(cmdQueue, u_d, CL_FALSE, 0, size, u_h, 0, NULL, NULL); chk(status,"ClEnqueueWriteBuffer"); status = clEnqueueWriteBuffer(cmdQueue, f_d, CL_FALSE, 0, size1, f_h, 0, NULL, NULL); chk(status, "clEnqueueWriteBuffer"); // create program with source code cl_program program = clCreateProgramWithSource(context,1,(const char**)&programSource, NULL, &status); chk(status, "clCreateProgramWithSource"); // Compile program for the device status = clBuildProgram(program, numDevices, devices, NULL, NULL,NULL); // chk(status, "ClBuildProgram"); if(status != CL_SUCCESS){ printf("clBuildProgram failed (%d) \n", status); size_t log_size; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); char *log = (char *) malloc(log_size); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, log_size, log, NULL); printf("%s\n", log); exit(-1); } printf("successfully built program \n"); // Create lattice-boltzman kernel cl_kernel kernel, kernel1; kernel = clCreateKernel(program, "lbiteration", &status); kernel1 = clCreateKernel(program, "Denrho", &status); chk(status, "clCreateKernel"); printf("successfully create kernel \n"); // Associate the input and output buffers with the kernel status = clSetKernelArg(kernel,0, sizeof(cl_mem), &f_d); status |= clSetKernelArg(kernel1,0, sizeof(cl_mem), &u_d); status |= clSetKernelArg(kernel1,1, sizeof(cl_mem), &f_d); status |= clSetKernelArg(kernel, 1, sizeof(int), &ArraySizeX); status |= clSetKernelArg(kernel1,2, sizeof(int), &ArraySizeX); status |= clSetKernelArg(kernel, 2, sizeof(int), &ArraySizeY); status |= clSetKernelArg(kernel1,3, sizeof(int),&ArraySizeY); chk(status, "clSerKernelArg"); // set the work dimensions size_t localworksize[2] = {BLOCK_SIZE_X,BLOCK_SIZE_Y}; int nBLOCKSX = (ArraySizeX-2)/(BLOCK_SIZE_X -2); int nBLOCKSY = (ArraySizeY-2)/(BLOCK_SIZE_Y -2); size_t globalworksize[2] = {nBLOCKSX*BLOCK_SIZE_X,nBLOCKSY*BLOCK_SIZE_Y}; // loop the kernel for( nsteps = 0; nsteps < 100; nsteps++){ status = clEnqueueNDRangeKernel(cmdQueue, kernel, 2, NULL, globalworksize,localworksize,0,NULL,&event); clWaitForEvents(1 , &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; } printf("Good so far \n"); status = clEnqueueNDRangeKernel(cmdQueue, kernel1, 2, NULL, globalworksize,localworksize,0,NULL,&event); chk(status, "clEnqueueNDR"); clWaitForEvents(1 , &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; printf("running time is %0.3f \n",(total_time/1000000000.0)); // retrieve data from device status = clEnqueueReadBuffer(cmdQueue, u_d, CL_TRUE, 0, size, u_h, 0, NULL, NULL); chk(status, "clEnqueueReadBuffer"); // Output results fp = fopen("SolutionCL.txt", "wt"); for(i= 0;i<ArraySizeX;i++){ for(j=0;j<ArraySizeY;j++) fprintf(fp, " %f", u_h[i*ArraySizeY+j]); fprintf(fp, "\n"); } fclose(fp); //cleanup clReleaseKernel(kernel); clReleaseKernel(kernel1); clReleaseProgram(program); clReleaseCommandQueue(cmdQueue); clReleaseMemObject(u_d); clReleaseMemObject(f_d); clReleaseContext(context); free(u_h); free(f_h); free(platforms); free(devices); return 0; }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float mem[MEM_SIZE]; FILE *fp; char fileName[] = "./kernel.clbin"; size_t binary_size; char *binary_buf; cl_int binary_status; cl_int i; /* カーネルを含むオブジェクトファイルをロード */ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } binary_buf = (char *)malloc(MAX_BINARY_SIZE); binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp ); fclose( fp ); /* データを初期化 */ for( i = 0; i < MEM_SIZE; i++ ) { mem[i] = i; } /* プラットフォーム・デバイスの情報の取得 */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* OpenCLコンテキストの作成 */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* コマンドキューの作成 */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* メモリバッファの作成 */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret); /* メモリバッファにデータを転送 */ ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 読み込んだバイナリからカーネルプログラムを作成 */ program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, (const unsigned char **)&binary_buf, &binary_status, &ret); /* OpenCLカーネルの作成 */ kernel = clCreateKernel(program, "vecAdd", &ret); printf("err:%d\n", ret); /* OpenCLカーネル引数の設定 */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); size_t global_work_size[3] = {MEM_SIZE, 0, 0}; size_t local_work_size[3] = {MEM_SIZE, 0, 0}; /* OpenCLカーネルを実行 */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); /* メモリバッファから結果を取得 */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 結果の表示 */ for(i=0; i<MEM_SIZE; i++) { printf("mem[%d] : %f\n", i, mem[i]); } /* 終了処理 */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(binary_buf); return 0; }
int main(int argc, char **argv) { cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err)); const char *program_source[] = { "__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n", "{\n", " int i = get_global_id(0);\n", " dst[i] = src[i] * factor;\n", "}\n" }; cl_program program; program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err)); if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) { char buffer[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "CL Compilation failed:\n%s", buffer); abort(); } CL_CHECK(clUnloadCompiler()); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); int factor = 2; cl_kernel kernel; kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err)); for (int i=0; i<NUM_DATA; i++) { CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { int data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL)); printf(" %d", data); } printf("\n"); CL_CHECK(clReleaseMemObject(input_buffer)); CL_CHECK(clReleaseMemObject(output_buffer)); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
int main(int argc, char **argv) { if (find_option(argc, argv, "-h") >= 0) { printf("Options:\n"); printf("-h to see this help\n"); printf("-n <int> to set the number of particles\n"); printf("-o <filename> to specify the output file name\n"); printf("-s <filename> to specify the summary output file name\n"); return 0; } int n = read_int(argc, argv, "-n", 1000); char *savename = read_string(argc, argv, "-o", NULL); char *sumname = read_string(argc, argv, "-s", NULL); // For return values. cl_int ret; // OpenCL stuff. // Loading kernel files. FILE *kernelFile; char *kernelSource; size_t kernelSize; kernelFile = fopen("simulationKernel.cl", "r"); if (!kernelFile) { fprintf(stderr, "No file named simulationKernel.cl was found\n"); exit(-1); } kernelSource = (char*)malloc(MAX_SOURCE_SIZE); kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile); fclose(kernelFile); // Getting platform and device information cl_platform_id platformId = NULL; cl_device_id deviceID = NULL; cl_uint retNumDevices; cl_uint retNumPlatforms; ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); // Different types of devices to pick from. At the moment picks the default opencl device. //CL_DEVICE_TYPE_GPU //CL_DEVICE_TYPE_ACCELERATOR //CL_DEVICE_TYPE_DEFAULT //CL_DEVICE_TYPE_CPU ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices); // Max workgroup size size_t max_available_local_wg_size; ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL); // Creating context. cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret); // Creating command queue cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret); // Build program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret); // printf("program = ret %i \n", ret); ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); // printf("clBuildProgram: ret %i \n", ret); // Create kernels cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret); cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret); cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret); cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret); FILE *fsave = savename ? fopen(savename, "w") : NULL; FILE *fsum = sumname ? fopen(sumname, "a") : NULL; particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t)); // GPU particle data structure cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret); // Set size set_size(n); init_particles(n, particles); double copy_time = read_timer(); // Copy particles to device. ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL); copy_time = read_timer() - copy_time; // Calculating thread and thread block counts. // sizes size_t globalItemSize; size_t localItemSize; // Global item size if (n <= NUM_THREADS) { globalItemSize = NUM_THREADS; localItemSize = 16; } else if (n % NUM_THREADS != 0) { globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS; } else { globalItemSize = n; } // Local item size localItemSize = globalItemSize / NUM_THREADS; // Bins and bin sizes. // Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10. // There will never be 10 particles in one bin. int maxParticles = 10; // Calculating the number of bins. int numberOfBins = (int)ceil(size/(2*cutoff)) + 2; // Bins will only exist on the device. particle_t* bins; // How many particles are there in each bin - also only exists on the device. volatile int* binSizes; // Number of bins to be initialized. size_t clearAmt = numberOfBins*numberOfBins; // Allocate memory for bins on the device. cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret); cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret); // SETTING ARGUMENTS FOR THE KERNELS // Set arguments for the init / clear kernel ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins); // Set arguments for the binning kernel ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(binKernel, 1, sizeof(int), &n); ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins); // Set arguments for force kernel. ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n); ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins); // Set arguments for move kernel ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n); ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size); // Variable to check if kernel execution is done. cl_event kernelDone; double simulation_time = read_timer(); int step = 0; for (step = 0; step < NSTEPS; step++) { // Execute bin initialization (clearing after first iteration) ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute binning kernel ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); // ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute force kernel ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute move kernel ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); if (fsave && (step%SAVEFREQ) == 0) { // Copy the particles back to the CPU ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); save(fsave, n, particles); } } simulation_time = read_timer() - simulation_time; printf("CPU-GPU copy time = %g seconds\n", copy_time); printf("n = %d, simulation time = %g seconds\n", n, simulation_time); if (fsum) fprintf(fsum, "%d %lf \n", n, simulation_time); if (fsum) fclose(fsum); free(particles); if (fsave) fclose(fsave); ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseKernel(forceKernel); ret = clReleaseKernel(moveKernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(d_particles); ret = clReleaseContext(context); return 0; }
bor_cl_t *borCLNewSimple2(size_t program_count, const char **program, const char *buildopts) { cl_uint num_platforms, num_devices, i; cl_int err; cl_platform_id *platforms, platform; cl_device_id device; size_t bufsize; char buf[1024], *buf2; bor_cl_t *cl; // find platform and device platform = (cl_platform_id)-1; err = clGetPlatformIDs(0, NULL, &num_platforms); if (__borCLErrorCheck(err, "Can't get any platform") != 0) return NULL; if (num_platforms == 0) return NULL; platforms = BOR_ALLOC_ARR(cl_platform_id, num_platforms); err = clGetPlatformIDs(num_platforms, platforms, NULL); if (__borCLErrorCheck(err, "Can't get any platform") != 0) return NULL; for (i = 0; i < num_platforms; i++){ err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 0, NULL, &num_devices); if (__borCLErrorCheck(err, "Cant'get any device") != 0) break; err = clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (__borCLErrorCheck(err, "Cant'get any device") != 0) break; if (num_devices > 0){ platform = platforms[i]; break; } } BOR_FREE(platforms); if (platform == (cl_platform_id)-1) return NULL; cl = BOR_ALLOC(bor_cl_t); cl->platform = platform; cl->device = device; // create context cl->context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if (__borCLErrorCheck(err, "Can't create context") != 0){ BOR_FREE(cl); return NULL; } // create queue cl->queue = clCreateCommandQueue(cl->context, cl->device, 0, &err); if (__borCLErrorCheck(err, "Can't create command queue") != 0){ clReleaseContext(cl->context); BOR_FREE(cl); return NULL; } // create program cl->program = clCreateProgramWithSource(cl->context, program_count, program, NULL, &err); if (__borCLErrorCheck(err, "Can't create program") != 0){ clReleaseCommandQueue(cl->queue); clReleaseContext(cl->context); BOR_FREE(cl); return NULL; } // build program err = clBuildProgram(cl->program, 1, &cl->device, buildopts, NULL, NULL); if (__borCLErrorCheck(err, "Can't build program") != 0){ err = clGetProgramBuildInfo(cl->program, cl->device, CL_PROGRAM_BUILD_LOG, 1024, buf, &bufsize); if (err == CL_INVALID_VALUE && bufsize > 1024){ buf2 = BOR_ALLOC_ARR(char, bufsize); err = clGetProgramBuildInfo(cl->program, cl->device, CL_PROGRAM_BUILD_LOG, bufsize, buf2, NULL); if (__borCLErrorCheck(err, "Can't obtain build log") == 0){ fprintf(stderr, " >> Build log:\n%s\n", buf2); } BOR_FREE(buf2); }else{
int main( void ) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX; float *X; cl_event event = NULL; int ret = 0; const size_t N0 = 4, N1 = 4, N2 = 4; char platform_name[128]; char device_name[128]; /* FFT library realted declarations */ clfftPlanHandle planHandle; clfftDim dim = CLFFT_3D; size_t clLengths[3] = {N0, N1, N2}; /* Setup OpenCL environment. */ err = clGetPlatformIDs( 1, &platform, NULL ); size_t ret_param_size = 0; err = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, &ret_param_size); printf("Platform found: %s\n", platform_name); err = clGetDeviceIDs( platform, CL_DEVICE_TYPE_DEFAULT, 1, &device, NULL ); err = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, &ret_param_size); printf("Device found on the above platform: %s\n", device_name); props[1] = (cl_context_properties)platform; ctx = clCreateContext( props, 1, &device, NULL, NULL, &err ); queue = clCreateCommandQueue( ctx, device, 0, &err ); /* Setup clFFT. */ clfftSetupData fftSetup; err = clfftInitSetupData(&fftSetup); err = clfftSetup(&fftSetup); /* Allocate host & initialize data. */ /* Only allocation shown for simplicity. */ size_t buffer_size = N0 * N1 * N2 * 2 * sizeof(*X); X = (float *)malloc(buffer_size); /* print input array just using the * indices to fill the array with data */ printf("\nPerforming fft on an three dimensional array of size N0 x N1 x N2 : %ld x %ld x %ld\n", N0, N1, N2); int i, j, k; i = j = k = 0; for (i=0; i<N0; ++i) { for (j=0; j<N1; ++j) { for (k=0; k<N2; ++k) { float x = 0.0f; float y = 0.0f; if (i==0 && j==0 && k==0) { x = y = 0.5f; } unsigned idx = 2*(k+j*N1+i*N0*N1); X[idx] = x; X[idx+1] = y; printf("(%f, %f) ", X[idx], X[idx+1]); } printf("\n"); } printf("\n"); } /* Prepare OpenCL memory objects and place data inside them. */ bufX = clCreateBuffer( ctx, CL_MEM_READ_WRITE, buffer_size, NULL, &err ); err = clEnqueueWriteBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL ); /* Create a default plan for a complex FFT. */ err = clfftCreateDefaultPlan(&planHandle, ctx, dim, clLengths); /* Set plan parameters. */ err = clfftSetPlanPrecision(planHandle, CLFFT_SINGLE); err = clfftSetLayout(planHandle, CLFFT_COMPLEX_INTERLEAVED, CLFFT_COMPLEX_INTERLEAVED); err = clfftSetResultLocation(planHandle, CLFFT_INPLACE); /* Bake the plan. */ err = clfftBakePlan(planHandle, 1, &queue, NULL, NULL); /* Execute the plan. */ err = clfftEnqueueTransform(planHandle, CLFFT_FORWARD, 1, &queue, 0, NULL, NULL, &bufX, NULL, NULL); /* Wait for calculations to be finished. */ err = clFinish(queue); /* Fetch results of calculations. */ err = clEnqueueReadBuffer( queue, bufX, CL_TRUE, 0, buffer_size, X, 0, NULL, NULL ); /* print output array */ printf("\n\nfft result: \n"); i = j = k = 0; for (i=0; i<N0; ++i) { for (j=0; j<N1; ++j) { for (k=0; k<N2; ++k) { unsigned idx = 2*(k+j*N1+i*N0*N1); printf("(%f, %f) ", X[idx], X[idx+1]); } printf("\n"); } printf("\n"); } printf("\n"); /* Release OpenCL memory objects. */ clReleaseMemObject( bufX ); free(X); /* Release the plan. */ err = clfftDestroyPlan( &planHandle ); /* Release clFFT library. */ clfftTeardown( ); /* Release OpenCL working objects. */ clReleaseCommandQueue( queue ); clReleaseContext( ctx ); return ret; }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Output information err = output_device_info(device_id); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); if (!kernel_pi || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err)); return EXIT_FAILURE; } // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. %d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nwork_groups * work_group_size; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
static void clFreeContext(SEXP ctx) { clReleaseContext((cl_context)R_ExternalPtrAddr(ctx)); }