int main(int argc, char** argv) { int err; // error code returned from api calls cl_platform_id platform_id; // platform id cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel size_t global[2]; // global domain size for our calculation size_t local[2]; // local domain size for our calculation char cl_platform_vendor[1001]; char cl_platform_name[1001]; cl_mem in_array; // device memory used for the input array //cl_mem synaptic_weights; // device memory used for the input array cl_mem out_array; // device memory used for the output array if (argc != 2){ printf("%s <inputfile>\n", argv[0]); return -1; } //float in_array[NO_NODES]; //float out_array[NO_NODES]; //float synaptic_weights[NO_NODES*NO_NODES]; float in_array_tb[NO_NODES]; float out_array_tb[NO_NODES]; //float synaptic_weights_tb[NO_NODES*NO_NODES]; float temp =0; int i = 0; int j = 0; int index = 0; FILE* ifp; char* mode = "r"; // // Connect to first platform // err = clGetPlatformIDs(1,&platform_id,NULL); if (err != CL_SUCCESS) { printf("Error: Failed to find an OpenCL platform!\n"); printf("Test failed\n"); return -1; } err = clGetPlatformInfo(platform_id,CL_PLATFORM_VENDOR,1000,(void *)cl_platform_vendor,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_VENDOR) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_VENDOR %s\n",cl_platform_vendor); err = clGetPlatformInfo(platform_id,CL_PLATFORM_NAME,1000,(void *)cl_platform_name,NULL); if (err != CL_SUCCESS) { printf("Error: clGetPlatformInfo(CL_PLATFORM_NAME) failed!\n"); printf("Test failed\n"); return -1; } printf("CL_PLATFORM_NAME %s\n",cl_platform_name); // Connect to a compute device // int fpga = 0; #if defined (FPGA_DEVICE) fpga = 1; #endif err = clGetDeviceIDs(platform_id, fpga ? CL_DEVICE_TYPE_ACCELERATOR : CL_DEVICE_TYPE_CPU, 1, &device_id, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to create a device group!\n"); printf("Test failed\n"); return -1; } // // Create a compute context // context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n"); printf("Test failed\n"); return -1; } //relu_1(in_array,synaptic_weights,out_array); // Fill our data sets with pattern // //int i = 0; //for(i = 0; i < DATA_SIZE; i++) { // a[i] = (int)i; // b[i] = (int)i; // results[i] = 0; //} // // Create a command commands commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n"); printf("Error: code %i\n",err); printf("Test failed\n"); return -1; } int status; // Create Program Objects // // Load binary from disk unsigned char *kernelbinary; char *xclbin=argv[1]; printf("loading %s\n", xclbin); int n_i = load_file_to_memory(xclbin, (char **) &kernelbinary); if (n_i < 0) { printf("failed to load kernel from xclbin: %s\n", xclbin); printf("Test failed\n"); return -1; } size_t n = n_i; // Create the compute program from offline program = clCreateProgramWithBinary(context, 1, &device_id, &n, (const unsigned char **) &kernelbinary, &status, &err); if ((!program) || (err!=CL_SUCCESS)) { printf("Error: Failed to create compute program from binary %d!\n", err); printf("Test failed\n"); printf("err : %d %s\n",err,err); } // Build the program executable // err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n"); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); printf("Test failed\n"); return -1; } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, "relu_1", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n"); printf("Test failed\n"); return -1; } // Create the input and output arrays in device memory for our calculation // in_array = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES, NULL, NULL); //synaptic_weights = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * NO_NODES * NO_NODES, NULL, NULL); out_array = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * NO_NODES, NULL, NULL); if (!in_array || /*!synaptic_weights ||*/ !out_array) { printf("Error: Failed to allocate device memory!\n"); printf("Test failed\n"); return -1; } ifp = fopen("/home/agandhi92/sdaccel/relu_1/input.txt",mode); if(ifp == NULL) { printf("Input file not found \n"); return -1; } while (fscanf(ifp, "%f", &temp) != EOF && index < NO_NODES) { in_array_tb[index++] = temp; } index = 0; temp = 0; //ifp = fopen("/home/agandhi92/sdaccel/relu_1/weight.txt",mode); //if(ifp == NULL) //{ // printf("Weight file not found \n"); // return -1; //} //while (fscanf(ifp, "%f", &temp) != EOF && index < (NO_NODES*NO_NODES)) { // synaptic_weights_tb[index++] = temp; //} // // Write our data set into the input array in device memory // err = clEnqueueWriteBuffer(commands, in_array, CL_TRUE, 0, sizeof(float) * NO_NODES, in_array_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array a!\n"); printf("Test failed\n"); return -1; } // Write our data set into the input array in device memory // //err = clEnqueueWriteBuffer(commands, synaptic_weights, CL_TRUE, 0, sizeof(float) * NO_NODES * NO_NODES, synaptic_weights_tb, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to write to source array b!\n"); printf("Test failed\n"); return -1; } // Set the arguments to our compute kernel // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &in_array); //err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &synaptic_weights); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &out_array); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments! %d\n", err); printf("Test failed\n"); return -1; } // Execute the kernel over the entire range of our 1d input data set // using the maximum number of work group items for this device // err = clEnqueueTask(commands, kernel, 0, NULL, NULL); if (err) { printf("Error: Failed to execute kernel! %d\n", err); printf("Test failed\n"); return -1; } // Read back the results from the device to verify the output // cl_event readevent; err = clEnqueueReadBuffer( commands, out_array, CL_TRUE, 0, sizeof(float) * NO_NODES, out_array_tb, 0, NULL, &readevent ); if (err != CL_SUCCESS) { printf("Error: Failed to read output array! %d\n", err); printf("Test failed\n"); return -1; } clWaitForEvents(1, &readevent); //printf("A\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",a[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("B\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",b[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} //printf("res\n"); //for (i=0;i<DATA_SIZE;i++) { // printf("%x ",results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // Validate our results // //correct = 0; //for(i = 0; i < DATA_SIZE; i++) //{ // int row = i/MATRIX_RANK; // int col = i%MATRIX_RANK; // int running = 0; // int index; // for (index=0;index<MATRIX_RANK;index++) { // int aIndex = row*MATRIX_RANK + index; // int bIndex = col + index*MATRIX_RANK; // running += a[aIndex] * b[bIndex]; // } // sw_results[i] = running; //} // //for (i = 0;i < DATA_SIZE; i++) // if(results[i] == sw_results[i]) // correct++; //printf("Software\n"); //for (i=0;i<DATA_SIZE;i++) { // //printf("%0.2f ",sw_results[i]); // printf("%d ",sw_results[i]); // if (((i+1) % 16) == 0) // printf("\n"); //} // // //// Print a brief summary detailing the results //// //printf("Computed '%d/%d' correct values!\n", correct, DATA_SIZE); // // Shutdown and cleanup int temp_ = 0; for (j = 0; j < NO_NODES; j++) { if (out_array_tb[j] >= 0) // || out_array_tb[j]== 0) { //printf("out_array[%d] = %f \n", j, out_array[j]); temp_++; } } clReleaseMemObject(in_array); //clReleaseMemObject(synaptic_weights); clReleaseMemObject(out_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); if (temp_ == NO_NODES) { printf("*********************************************************** \n"); printf("TEST PASSED !!!!!! The output matches the desired output. \n"); printf("*********************************************************** \n"); return EXIT_SUCCESS; } else { printf("**************************************************************** \n"); printf("TEST Failed !!!!!! The output does not match the desired output. \n"); printf("**************************************************************** \n"); return -1; } //if(correct == DATA_SIZE){ // printf("Test passed!\n"); // return EXIT_SUCCESS; //} //else{ // printf("Test failed\n"); // return -1; //} }
cl_uint SetKernelArguments() { cl_int err = CL_SUCCESS; err = clSetKernelArg(ocl.kernel, 0, sizeof(cl_mem), (void *)&ocl.Lights); if (CL_SUCCESS != err) { printf("error: Failed to set argument Lights, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 1, sizeof(cl_uint), (void *)&ocl.LightCount); if (CL_SUCCESS != err) { printf("Error: Failed to set argument LightCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 2, sizeof(cl_mem), (void *)&ocl.Shapes); if (CL_SUCCESS != err) { printf("error: Failed to set argument Shapes, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 3, sizeof(cl_uint), (void *)&ocl.ShapeCount); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 4, sizeof(cl_uint), (void *)&ocl.sampleCount); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 5, sizeof(cl_uint), (void *)&ocl.width); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 6, sizeof(cl_uint), (void *)&ocl.height); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } err = clSetKernelArg(ocl.kernel, 7, sizeof(cl_mem), (void *)&ocl.cam); if (CL_SUCCESS != err) { printf("Error: Failed to set argument ShapeCount, returned %s\n", TranslateOpenCLError(err)); return err; } return err; }
int lcs::GPUExclusiveScanForInt(int workGroupSize, int numOfBanks, cl_kernel scanKernel, cl_kernel reverseUpdateKernel, cl_mem d_arr, cl_int length, cl_command_queue commandQueue) { cl_int err; // Get the work group size size_t localWorkSize = workGroupSize; // Up-sweep and down-sweep clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr); clSetKernelArg(scanKernel, 1, sizeof(cl_int), &length); clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / numOfBanks + 1), NULL); static int records[10]; int problemSize = length; int numOfRecords = 0; cl_int d_step = 1; /// DEBUG /// printf("length = %d\n", length); for (; problemSize > 1; problemSize = (problemSize - 1) / (localWorkSize * 2) + 1) { if (numOfRecords) d_step *= localWorkSize * 2; records[numOfRecords++] = problemSize; clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step); size_t globalWorkSize = ((problemSize - 1) / (localWorkSize * 2) + 1) * localWorkSize; err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); if (err) lcs::Error("Fail to enqueue scan"); /// DEBUG /// err = clFinish(commandQueue); printf("err = %d\n", err); if (err) lcs::Error("Non-zero err in pre-scan"); } int zero = 0, sum; err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &sum, 0, NULL, NULL); if (err) lcs::Error("Fail to read d_arr[0]"); err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL); if (err) lcs::Error("Fail to clean d_arr[0]"); // Reverse updates clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr); clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &length); size_t globalWorkSize; for (int i = numOfRecords - 1; i >= 0; i--, d_step /= localWorkSize * 2) { clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step); globalWorkSize = ((records[i] - 1) / (localWorkSize * 2) + 1) * localWorkSize; err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); if (err) lcs::Error("Fail to enqueue scan"); clFinish(commandQueue); } return sum; }
int main(int argc, char const *argv[]) { /* Get platform */ cl_platform_id platform; cl_uint num_platforms; cl_int ret = clGetPlatformIDs(1, &platform, &num_platforms); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformIDs' failed\n"); exit(1); } printf("Number of platforms: %d\n", num_platforms); printf("platform=%p\n", platform); /* Get platform name */ char platform_name[100]; ret = clGetPlatformInfo(platform, CL_PLATFORM_NAME, sizeof(platform_name), platform_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetPlatformInfo' failed\n"); exit(1); } printf("platform.name='%s'\n\n", platform_name); /* Get device */ cl_device_id device; cl_uint num_devices; ret = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, &num_devices); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceIDs' failed\n"); exit(1); } printf("Number of devices: %d\n", num_devices); printf("device=%p\n", device); /* Get device name */ char device_name[100]; ret = clGetDeviceInfo(device, CL_DEVICE_NAME, sizeof(device_name), device_name, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clGetDeviceInfo' failed\n"); exit(1); } printf("device.name='%s'\n", device_name); printf("\n"); /* Create a Context Object */ cl_context context; context = clCreateContext(NULL, 1, &device, NULL, NULL, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateContext' failed\n"); exit(1); } printf("context=%p\n", context); /* Create a Command Queue Object*/ cl_command_queue command_queue; command_queue = clCreateCommandQueue(context, device, 0, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateCommandQueue' failed\n"); exit(1); } printf("command_queue=%p\n", command_queue); printf("\n"); /* Program source */ unsigned char *source_code; size_t source_length; /* Read program from 'sub_sat_short16short16.cl' */ source_code = read_buffer("sub_sat_short16short16.cl", &source_length); /* Create a program */ cl_program program; program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_length, &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateProgramWithSource' failed\n"); exit(1); } printf("program=%p\n", program); /* Build program */ ret = clBuildProgram(program, 1, &device, NULL, NULL, NULL); if (ret != CL_SUCCESS ) { size_t size; char *log; /* Get log size */ clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,0, NULL, &size); /* Allocate log and print */ log = malloc(size); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG,size, log, NULL); printf("error: call to 'clBuildProgram' failed:\n%s\n", log); /* Free log and exit */ free(log); exit(1); } printf("program built\n"); printf("\n"); /* Create a Kernel Object */ cl_kernel kernel; kernel = clCreateKernel(program, "sub_sat_short16short16", &ret); if (ret != CL_SUCCESS) { printf("error: call to 'clCreateKernel' failed\n"); exit(1); } /* Create and allocate host buffers */ size_t num_elem = 10; /* Create and init host side src buffer 0 */ cl_short16 *src_0_host_buffer; src_0_host_buffer = malloc(num_elem * sizeof(cl_short16)); for (int i = 0; i < num_elem; i++) src_0_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 0 */ cl_mem src_0_device_buffer; src_0_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_0_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_0_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create and init host side src buffer 1 */ cl_short16 *src_1_host_buffer; src_1_host_buffer = malloc(num_elem * sizeof(cl_short16)); for (int i = 0; i < num_elem; i++) src_1_host_buffer[i] = (cl_short16){{2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}}; /* Create and init device side src buffer 1 */ cl_mem src_1_device_buffer; src_1_device_buffer = clCreateBuffer(context, CL_MEM_READ_ONLY, num_elem * sizeof(cl_short16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create source buffer\n"); exit(1); } ret = clEnqueueWriteBuffer(command_queue, src_1_device_buffer, CL_TRUE, 0, num_elem * sizeof(cl_short16), src_1_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueWriteBuffer' failed\n"); exit(1); } /* Create host dst buffer */ cl_short16 *dst_host_buffer; dst_host_buffer = malloc(num_elem * sizeof(cl_short16)); memset((void *)dst_host_buffer, 1, num_elem * sizeof(cl_short16)); /* Create device dst buffer */ cl_mem dst_device_buffer; dst_device_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, num_elem *sizeof(cl_short16), NULL, &ret); if (ret != CL_SUCCESS) { printf("error: could not create dst buffer\n"); exit(1); } /* Set kernel arguments */ ret = CL_SUCCESS; ret |= clSetKernelArg(kernel, 0, sizeof(cl_mem), &src_0_device_buffer); ret |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &src_1_device_buffer); ret |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clSetKernelArg' failed\n"); exit(1); } /* Launch the kernel */ size_t global_work_size = num_elem; size_t local_work_size = num_elem; ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_work_size, &local_work_size, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueNDRangeKernel' failed\n"); exit(1); } /* Wait for it to finish */ clFinish(command_queue); /* Read results from GPU */ ret = clEnqueueReadBuffer(command_queue, dst_device_buffer, CL_TRUE,0, num_elem * sizeof(cl_short16), dst_host_buffer, 0, NULL, NULL); if (ret != CL_SUCCESS) { printf("error: call to 'clEnqueueReadBuffer' failed\n"); exit(1); } /* Dump dst buffer to file */ char dump_file[100]; sprintf((char *)&dump_file, "%s.result", argv[0]); write_buffer(dump_file, (const char *)dst_host_buffer, num_elem * sizeof(cl_short16)); printf("Result dumped to %s\n", dump_file); /* Free host dst buffer */ free(dst_host_buffer); /* Free device dst buffer */ ret = clReleaseMemObject(dst_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 0 */ free(src_0_host_buffer); /* Free device side src buffer 0 */ ret = clReleaseMemObject(src_0_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Free host side src buffer 1 */ free(src_1_host_buffer); /* Free device side src buffer 1 */ ret = clReleaseMemObject(src_1_device_buffer); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseMemObject' failed\n"); exit(1); } /* Release kernel */ ret = clReleaseKernel(kernel); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseKernel' failed\n"); exit(1); } /* Release program */ ret = clReleaseProgram(program); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseProgram' failed\n"); exit(1); } /* Release command queue */ ret = clReleaseCommandQueue(command_queue); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseCommandQueue' failed\n"); exit(1); } /* Release context */ ret = clReleaseContext(context); if (ret != CL_SUCCESS) { printf("error: call to 'clReleaseContext' failed\n"); exit(1); } return 0; }
void sum_gpu(long long *in, long long *out, unsigned int n) { size_t global_size; size_t local_size; char *kernel_src; cl_int err; cl_platform_id platform_id; cl_device_id device_id; cl_uint max_compute_units; size_t max_workgroup_size; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem d_array; cl_event event; cl_ulong start, end; /* start OpenCL */ err = clGetPlatformIDs(1, &platform_id,NULL); clErrorHandling("clGetPlatformIDs"); err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); clErrorHandling("clGetDeviceIDs"); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); clErrorHandling("clCreateContext"); commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); clErrorHandling("clCreateCommandQueue"); /* create kernel */ kernel_src = file_to_string(KERNEL_SRC); program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err); free(kernel_src); clErrorHandling("clCreateProgramWithSource"); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); clErrorHandling("clBuildProgram"); kernel = clCreateKernel(program, "matrix_mult", &err); clErrorHandling("clCreateKernel"); /* allocate memory and send to gpu */ d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err); clErrorHandling("clCreateBuffer"); err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL); clErrorHandling("clEnqueueWriteBuffer"); err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL); clErrorHandling("clGetDeviceInfo"); /* prepare kernel args */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array); err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n); /* execute */ local_size = n / max_compute_units / 8; if (local_size > max_workgroup_size) local_size = max_workgroup_size; /* * Usually it would be * global_size = local_size * max_compute_units; * but that would only be valid if local_size = n / max_compute_units; * local_size is n / max_compute_units / 8 because it obtains its hightest performance. */ for (global_size = local_size; global_size < n; global_size += local_size); err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event); clErrorHandling("clEnqueueNDRangeKernel"); clWaitForEvents(1, &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0); err = clFinish(commands); clErrorHandling("clFinish"); /* transfer back */ err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long clErrorHandling("clEnqueueReadBuffer"); /* cleanup*/ clReleaseMemObject(d_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); clReleaseEvent(event); }
int main() { // Get platform information err = clGetPlatformIDs(0, NULL, &numOfPlatforms); if (err) Error("Fail to get the number of platforms"); printf("The machine has %d platform(s) for OpenCL.\n", numOfPlatforms); platformIDs = new cl_platform_id [numOfPlatforms]; err = clGetPlatformIDs(numOfPlatforms, platformIDs, NULL); if (err) Error("Fail to get the platform list"); int cudaPlatformID = -1; for (int i = 0; i < numOfPlatforms; i++) { char platformName[50]; err = clGetPlatformInfo(platformIDs[i], CL_PLATFORM_NAME, 50, platformName, NULL); if (err) Error("Fail to get the platform name"); printf("Platform %d is %s\n", i + 1, platformName); if (!strcmp(platformName, "NVIDIA CUDA")) cudaPlatformID = i; } printf("\n"); if (cudaPlatformID == -1) Error("Fail to find an NVIDIA CUDA platform"); printf("Platform %d (NVIDIA CUDA) is chosen for use.\n", cudaPlatformID + 1); printf("\n"); // Get device information err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, 0, NULL, &numOfDevices); if (err) Error("Fail to get the number of devices"); printf("CUDA platform has %d device(s).\n", numOfDevices); deviceIDs = new cl_device_id [numOfDevices]; err = clGetDeviceIDs(platformIDs[cudaPlatformID], CL_DEVICE_TYPE_GPU, numOfDevices, deviceIDs, NULL); if (err) Error("Fail to get the device list"); for (int i = 0; i < numOfDevices; i++) { char deviceName[50]; err = clGetDeviceInfo(deviceIDs[i], CL_DEVICE_NAME, 50, deviceName, NULL); if (err) Error("Fail to get the device name"); printf("Device %d is %s\n", i + 1, deviceName); } printf("\n"); // Create a context context = clCreateContext(NULL, numOfDevices, deviceIDs, NULL, NULL, &err); if (err) Error("Fail to create a context"); printf("Device 1 is chosen for use.\n"); printf("\n"); // Create a command queue for the first device commandQueue = clCreateCommandQueue(context, deviceIDs[0], CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &err); if (err) Error("Fail to create a command queue"); // create the program cl_program program = CreateProgram(exclusiveScanKernels, "exclusive scan"); // create two kernels cl_kernel scanKernel = clCreateKernel(program, "Scan", &err); if (err) Error("Fail to create the kernel for scan"); cl_kernel reverseUpdateKernel = clCreateKernel(program, "ReverseUpdate", &err); if (err) Error("Fail to create the kernel for reverse update"); // Get the work group size size_t maxWorkGroupSize; err = clGetKernelWorkGroupInfo(scanKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); printf("maxWorkGroupSize = %d\n", maxWorkGroupSize); err = clGetKernelWorkGroupInfo(reverseUpdateKernel, deviceIDs[0], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &maxWorkGroupSize, NULL); printf("maxWorkGroupSize = %d\n", maxWorkGroupSize); // Set work group size to 64 int workGroupSize = 512; int length = 2048000; int *arr = new int [length]; for (int i = 0; i < length; i++) arr[i] = rand() % 100; int *prefixSum = new int [length]; prefixSum[0] = 0; int t0 = clock(); for (int i = 1; i < length; i++) prefixSum[i] = prefixSum[i - 1] + arr[i - 1]; int t1 = clock(); printf("time1: %lf\n", (t1 - t0) * 1.0 / CLOCKS_PER_SEC); cl_mem d_arr = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(int) * length, NULL, &err); if (err) Error("Fail to create d_arr"); err = clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, arr, 0, NULL, NULL); if (err) Error("Fail to write d_arr"); clSetKernelArg(scanKernel, 0, sizeof(cl_mem), &d_arr); cl_int d_length = length; clSetKernelArg(scanKernel, 1, sizeof(cl_int), &d_length); cl_int d_step = 1; clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step); clSetKernelArg(scanKernel, 3, sizeof(cl_int) * (workGroupSize * 2 + workGroupSize * 2 / 16 + 1), NULL); int problemSize = length; int records[10]; int num = 0; int t2 = clock(); for (; problemSize > 1; problemSize = (problemSize - 1) / (workGroupSize * 2) + 1) { if (num) d_step *= workGroupSize * 2; printf("d_step = %d\n", d_step); records[num++] = problemSize; printf("problemSize = %d\n", problemSize); clSetKernelArg(scanKernel, 2, sizeof(cl_int), &d_step); size_t globalWorkSize = ((problemSize - 1) / (workGroupSize * 2) + 1) * workGroupSize; size_t localWorkSize = workGroupSize; err = clEnqueueNDRangeKernel(commandQueue, scanKernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); if (err) Error("Fail to enqueue scan"); clFinish(commandQueue); } //CheckValues(length, d_arr); int zero = 0; clEnqueueWriteBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int), &zero, 0, NULL, NULL); printf("d_step = %d\n", d_step); //scanf("%*c"); clSetKernelArg(reverseUpdateKernel, 0, sizeof(cl_mem), &d_arr); clSetKernelArg(reverseUpdateKernel, 1, sizeof(cl_int), &d_length); for (int i = num - 1; i >= 0; i--, d_step /= workGroupSize * 2) { printf("d_step = %d\n", d_step); clSetKernelArg(reverseUpdateKernel, 2, sizeof(cl_int), &d_step); size_t globalWorkSize = ((records[i] - 1) / (workGroupSize * 2) + 1) * workGroupSize; size_t localWorkSize = workGroupSize; printf("globalWorkSize = %d, localWorkSize = %d\n", globalWorkSize, localWorkSize); err = clEnqueueNDRangeKernel(commandQueue, reverseUpdateKernel, 1, NULL, &globalWorkSize, &localWorkSize, 0, NULL, NULL); if (err) Error("Fail to enqueue scan"); clFinish(commandQueue); } int t3 = clock(); printf("time: %lf\n", (t3 - t2) * 1.0 / CLOCKS_PER_SEC); int *GPUResult = new int [length]; memset(GPUResult, 0, sizeof(int) * length); err = clEnqueueReadBuffer(commandQueue, d_arr, CL_TRUE, 0, sizeof(int) * length, GPUResult, 0, NULL, NULL); printf("err = %d\n", err); if (err) Error("Fail to read d_arr"); for (int i = 0; i < length; i++) if (GPUResult[i] != prefixSum[i]) printf("at i = %d, GPUResult[%d] = %d, prefixSum[%d] = %d\n", i, i, GPUResult[i], i, prefixSum[i]); system("pause"); return 0; }
int simpleExample() { /* Create device and determine local size */ device = create_device(); err = clGetDeviceInfo(device, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(local_size), &local_size, NULL); if(err < 0) { perror("Couldn't obtain device information"); exit(1); } /* Create a context */ context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build program */ program = build_program(context, device, PROGRAM_FILE); /* Create data buffer */ data_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, ARRAY_SIZE * sizeof(float), data, &err); sum_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Create kernels */ vector_kernel = clCreateKernel(program, KERNEL_1, &err); complete_kernel = clCreateKernel(program, KERNEL_2, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Set arguments for vector kernel */ err = clSetKernelArg(vector_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(vector_kernel, 1, local_size * 4 * sizeof(float), NULL); /* Set arguments for complete kernel */ err = clSetKernelArg(complete_kernel, 0, sizeof(cl_mem), &data_buffer); err |= clSetKernelArg(complete_kernel, 1, local_size * 4 * sizeof(float), NULL); err |= clSetKernelArg(complete_kernel, 2, sizeof(cl_mem), &sum_buffer); if(err < 0) { perror("Couldn't create a kernel argument"); exit(1); } /* Enqueue kernels */ global_size = ARRAY_SIZE/4; err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, &start_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } printf("Global size = %lu\n", global_size); /* Perform successive stages of the reduction */ while(global_size/local_size > local_size) { global_size = global_size/local_size; err = clEnqueueNDRangeKernel(queue, vector_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); printf("Global size = %lu\n", global_size); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } } global_size = global_size/local_size; err = clEnqueueNDRangeKernel(queue, complete_kernel, 1, NULL, &global_size, NULL, 0, NULL, &end_event); printf("Global size = %lu\n", global_size); /* Finish processing the queue and get profiling information */ clFinish(queue); clGetEventProfilingInfo(start_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(end_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time = time_end - time_start; /* Read the result */ err = clEnqueueReadBuffer(queue, sum_buffer, CL_TRUE, 0, sizeof(float), &sum, 0, NULL, NULL); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Check result */ actual_sum = 1.0f * (ARRAY_SIZE/2)*(ARRAY_SIZE-1); if(fabs(sum - actual_sum) > 0.01*fabs(sum)) printf("Check failed.\n"); else printf("Check passed.\n"); printf("Total time = %lu\n", total_time); /* Deallocate resources */ clReleaseEvent(start_event); clReleaseEvent(end_event); clReleaseMemObject(sum_buffer); clReleaseMemObject(data_buffer); clReleaseKernel(vector_kernel); clReleaseKernel(complete_kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
void InitOpenCL() { // 1. Get a platform. cl_platform_id platform; clGetPlatformIDs( 1, &platform, NULL ); // 2. Find a gpu device. cl_device_id device; clGetDeviceIDs( platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); // 3. Create a context and command queue on that device. cl_context context = clCreateContext( NULL, 1, &device, NULL, NULL, NULL); queue = clCreateCommandQueue( context, device, 0, NULL ); // 4. Perform runtime source compilation, and obtain kernel entry point. std::ifstream file("scene.cl"); std::string source; if (file){ while(!file.eof()){ char line[256]; file.getline(line,255); source += std::string(line) + "\n"; } } if (source.length()==0) { std::string err = "fail to load shader"; } cl_ulong maxSize; clGetDeviceInfo(device, CL_DEVICE_MAX_MEM_ALLOC_SIZE , sizeof(cl_ulong), &maxSize, 0); const char* str = source.c_str(); cl_program program = clCreateProgramWithSource( context, 1, &str, NULL, NULL ); cl_int result = clBuildProgram( program, 1, &device, NULL, NULL, NULL ); if ( result ){ char* build_log; size_t log_size; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); build_log = new char[log_size+1]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, log_size, build_log, NULL); build_log[log_size] = '\0'; if( log_size > 2 ) { std::cout << "build log: " << build_log << std::endl; } delete[] build_log; std::cout << "Error during compilation! (" << result << ")" << std::endl; } kernel = clCreateKernel( program, "tracekernel", NULL ); // 5. Create a data buffer. buffer = clCreateBuffer( context, CL_MEM_WRITE_ONLY, kWidth * kHeight *sizeof(cl_float4), NULL, 0 ); viewTransform = clCreateBuffer( context, CL_MEM_READ_WRITE, 16 *sizeof(cl_float), NULL, 0 ); worldTransforms = clCreateBuffer( context, CL_MEM_READ_WRITE, 16 *sizeof(cl_float)*2, NULL, 0 ); clSetKernelArg(kernel, 0, sizeof(buffer), (void*) &buffer); clSetKernelArg(kernel, 1, sizeof(cl_uint), (void*) &kWidth); clSetKernelArg(kernel, 2, sizeof(cl_uint), (void*) &kWidth); clSetKernelArg(kernel, 3, sizeof(viewTransform), (void*) &viewTransform); clSetKernelArg(kernel, 4, sizeof(worldTransforms), (void*) &worldTransforms); }
int main(int argc, char* argv[]) { const size_t SIZE_execution_bit = (input_length - 3*filter_length +1); const size_t SIZE_input_bit = sizeof(gint32)*(input_length+1); const size_t SIZE_settings_bit = sizeof(gint32)*4; size_t output_bit_on_counts; size_t* SIZE_execution_pointer = &SIZE_execution_bit; gint32* filtersettings = (gint32*) malloc(SIZE_settings_bit); gint32* input_vector = (gint32*) malloc(SIZE_input_bit); gint32* positions = (gint32*) malloc(SIZE_input_bit); filtersettings[0] = filter_length; filtersettings[1] = threshhold; filtersettings[2] = input_length; filtersettings[3] = 0; //GPU-Init ocl = ocl_new(CL_DEVICE_TYPE_GPU,1); context = ocl_get_context(ocl); queue = ocl_get_cmd_queues (ocl)[0]; clFinish(queue); program = ocl_create_program_from_file(ocl, "edel_kernel_secondder.cl", NULL, &errcode); OCL_CHECK_ERROR(errcode); filter1 = clCreateKernel(program, "second_filter", &errcode); OCL_CHECK_ERROR(errcode); //GPU-Buffer which can be done before the Computation settings = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, SIZE_settings_bit, filtersettings, &errcode); OCL_CHECK_ERROR(errcode); input = clCreateBuffer(context, CL_MEM_READ_ONLY, SIZE_input_bit, NULL, &errcode); OCL_CHECK_ERROR(errcode); if(debugmode != 0) { srand((unsigned) time( NULL )); counter = rand_rects(expected,1,input_length,3*filter_length,3*filter_length,3*filter_length,peak_length,base+peak, input_vector, noise, base, 0,positions); if(harddebug != 0) { for(i = 0; i < input_length;i++) { if(input_length < 10000) { printf("input_vector[%i] = %d\n",i,input_vector[i]); } else { printf("input_vector[%i] = %d\t",i,input_vector[i]); } } } printf("\n counts = %d\n", counter); printf("%lu Bits needed for Output-Vector \n", output_bit_on_counts); } output_bit_on_counts = sizeof(gint32) * safetyfactor * 2*((counter + 2)); clEnqueueWriteBuffer(queue, input, CL_TRUE, 0, SIZE_input_bit, input_vector, 0, NULL, NULL); gint32* energy_time = (gint32*)malloc(output_bit_on_counts); for(i = 0; i < safetyfactor * (2*counter+2); i++) { energy_time[i] = -9999; } output = clCreateBuffer(context, CL_MEM_WRITE_ONLY, output_bit_on_counts, NULL , &errcode); OCL_CHECK_ERROR(errcode); OCL_CHECK_ERROR(clSetKernelArg(filter1, 0, sizeof(cl_mem), &input)); OCL_CHECK_ERROR(clSetKernelArg(filter1, 1, sizeof(cl_mem), &output)); OCL_CHECK_ERROR(clSetKernelArg(filter1, 2, sizeof(cl_mem), &settings)); size_t local_item_size; size_t global_item_size = (size_t) (input_length - 3*filter_length +1); local_item_size = ocl_get_local_size(global_item_size, 2,1); if(debugmode != 0) { printf("local item size = %lu \n %lu", &local_item_size, local_item_size); if(local_item_size != 0) { printf("This works because you divide %lu / %lu \n and this is %lu", global_item_size,local_item_size, global_item_size/local_item_size); } else { FILE* attention; attention = fopen("filterlengthbad", "a+"); if(attention == NULL) { printf("error in opening debug file \n"); exit(1); } fprintf(attention, "The filterlength %d is not good for this filter, choose another filterlength ! \n", filter_length); fclose(attention); printf("There is no way to fit it evenly divided to workgroups, just let OpenCL do it \n"); } if(harddebug != 0) { getchar(); } } if(local_item_size == 0) { OCL_CHECK_ERROR(clEnqueueNDRangeKernel(queue, filter1, 1, NULL, &global_item_size, NULL, 0, NULL, NULL)); } else { OCL_CHECK_ERROR(clEnqueueNDRangeKernel(queue, filter1, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL)); } //local_item_size = NULL; clEnqueueReadBuffer(queue, output, CL_TRUE, 0, output_bit_on_counts, energy_time, 0, NULL, NULL); clEnqueueReadBuffer(queue, settings, CL_TRUE, 0, SIZE_settings_bit, filtersettings, 0, NULL, NULL); //Writing back the data for(i = 0; i < filtersettings[3]; i++) { writing_back(filemode, filename, filename_e,filename_t, energy_time,i); } if(debugmode != 0) { printf("The Positions are:\n"); for(i=0; i < counter; i++) { printf("%d\t", positions[i]); printf("note that this postion is the middle of the rect \n"); } } //Safetychanges if(filtersettings[3] > counter) { safetyfactor = safetyfactor + 5*(filtersettings[3] - counter); if(safetyfactor <= 0) { safetyfactor = 10; } notexpect = filtersettings[3] - expected; if(safemode != 0 && notexpect >= notexpect_max) { printf("The Filter found to many peaks it. It expected %d. It found %d times more than expected.\n", expected, notexpect); printf("Safemode is on. Exit program \n"); OCL_CHECK_ERROR(clReleaseMemObject(input)); OCL_CHECK_ERROR(clReleaseMemObject(output)); OCL_CHECK_ERROR(clReleaseMemObject(settings)); OCL_CHECK_ERROR(clReleaseKernel(filter1)); OCL_CHECK_ERROR(clReleaseProgram(program)); ocl_free(ocl); free(input_vector); free(energy_time); free(positions); free(filtersettings); } else { printf("The Filter found to many peaks it. It expected %d. It found %d times more than expected \n", expected, notexpect); } } OCL_CHECK_ERROR(clReleaseMemObject(input)); OCL_CHECK_ERROR(clReleaseMemObject(output)); OCL_CHECK_ERROR(clReleaseMemObject(settings)); OCL_CHECK_ERROR(clReleaseKernel(filter1)); OCL_CHECK_ERROR(clReleaseProgram(program)); ocl_free(ocl); free(input_vector); free(energy_time); free(positions); free(filtersettings); }
// host stub function void ops_par_loop_PdV_kernel_nopredict( char const *name, ops_block block, int dim, int *range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3, ops_arg arg4, ops_arg arg5, ops_arg arg6, ops_arg arg7, ops_arg arg8, ops_arg arg9, ops_arg arg10, ops_arg arg11, ops_arg arg12, ops_arg arg13, ops_arg arg14, ops_arg arg15, ops_arg arg16) { // Timing double t1, t2, c1, c2; ops_arg args[17] = {arg0, arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9, arg10, arg11, arg12, arg13, arg14, arg15, arg16}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args, 17, range, 103)) return; #endif if (OPS_diags > 1) { ops_timing_realloc(103, "PdV_kernel_nopredict"); OPS_kernels[103].count++; ops_timers_core(&c1, &t1); } // compute locally allocated range for the sub-block int start[3]; int end[3]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for (int n = 0; n < 3; n++) { start[n] = sb->decomp_disp[n]; end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; if (start[n] >= range[2 * n]) { start[n] = 0; } else { start[n] = range[2 * n] - start[n]; } if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) start[n] = range[2 * n]; if (end[n] >= range[2 * n + 1]) { end[n] = range[2 * n + 1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n] == MPI_PROC_NULL && (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); } #else for (int n = 0; n < 3; n++) { start[n] = range[2 * n]; end[n] = range[2 * n + 1]; } #endif int x_size = MAX(0, end[0] - start[0]); int y_size = MAX(0, end[1] - start[1]); int z_size = MAX(0, end[2] - start[2]); int xdim0 = args[0].dat->size[0]; int ydim0 = args[0].dat->size[1]; int xdim1 = args[1].dat->size[0]; int ydim1 = args[1].dat->size[1]; int xdim2 = args[2].dat->size[0]; int ydim2 = args[2].dat->size[1]; int xdim3 = args[3].dat->size[0]; int ydim3 = args[3].dat->size[1]; int xdim4 = args[4].dat->size[0]; int ydim4 = args[4].dat->size[1]; int xdim5 = args[5].dat->size[0]; int ydim5 = args[5].dat->size[1]; int xdim6 = args[6].dat->size[0]; int ydim6 = args[6].dat->size[1]; int xdim7 = args[7].dat->size[0]; int ydim7 = args[7].dat->size[1]; int xdim8 = args[8].dat->size[0]; int ydim8 = args[8].dat->size[1]; int xdim9 = args[9].dat->size[0]; int ydim9 = args[9].dat->size[1]; int xdim10 = args[10].dat->size[0]; int ydim10 = args[10].dat->size[1]; int xdim11 = args[11].dat->size[0]; int ydim11 = args[11].dat->size[1]; int xdim12 = args[12].dat->size[0]; int ydim12 = args[12].dat->size[1]; int xdim13 = args[13].dat->size[0]; int ydim13 = args[13].dat->size[1]; int xdim14 = args[14].dat->size[0]; int ydim14 = args[14].dat->size[1]; int xdim15 = args[15].dat->size[0]; int ydim15 = args[15].dat->size[1]; int xdim16 = args[16].dat->size[0]; int ydim16 = args[16].dat->size[1]; // build opencl kernel if not already built buildOpenCLKernels_PdV_kernel_nopredict( xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3, xdim4, ydim4, xdim5, ydim5, xdim6, ydim6, xdim7, ydim7, xdim8, ydim8, xdim9, ydim9, xdim10, ydim10, xdim11, ydim11, xdim12, ydim12, xdim13, ydim13, xdim14, ydim14, xdim15, ydim15, xdim16, ydim16); // set up OpenCL thread blocks size_t globalWorkSize[3] = { ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x, ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y, ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z}; size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y, OPS_block_size_z}; // set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); base0 = base0 + args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 * (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); base1 = base1 + args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 * (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; #endif int base2 = 1 * 1 * (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); base2 = base2 + args[2].dat->size[0] * 1 * (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); base2 = base2 + args[2].dat->size[0] * 1 * args[2].dat->size[1] * 1 * (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; #endif int base3 = 1 * 1 * (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); base3 = base3 + args[3].dat->size[0] * 1 * (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); base3 = base3 + args[3].dat->size[0] * 1 * args[3].dat->size[1] * 1 * (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d] + OPS_sub_dat_list[args[4].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[4].dat->d_m[d]; #endif int base4 = 1 * 1 * (start[0] * args[4].stencil->stride[0] - args[4].dat->base[0] - d_m[0]); base4 = base4 + args[4].dat->size[0] * 1 * (start[1] * args[4].stencil->stride[1] - args[4].dat->base[1] - d_m[1]); base4 = base4 + args[4].dat->size[0] * 1 * args[4].dat->size[1] * 1 * (start[2] * args[4].stencil->stride[2] - args[4].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d] + OPS_sub_dat_list[args[5].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[5].dat->d_m[d]; #endif int base5 = 1 * 1 * (start[0] * args[5].stencil->stride[0] - args[5].dat->base[0] - d_m[0]); base5 = base5 + args[5].dat->size[0] * 1 * (start[1] * args[5].stencil->stride[1] - args[5].dat->base[1] - d_m[1]); base5 = base5 + args[5].dat->size[0] * 1 * args[5].dat->size[1] * 1 * (start[2] * args[5].stencil->stride[2] - args[5].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d] + OPS_sub_dat_list[args[6].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[6].dat->d_m[d]; #endif int base6 = 1 * 1 * (start[0] * args[6].stencil->stride[0] - args[6].dat->base[0] - d_m[0]); base6 = base6 + args[6].dat->size[0] * 1 * (start[1] * args[6].stencil->stride[1] - args[6].dat->base[1] - d_m[1]); base6 = base6 + args[6].dat->size[0] * 1 * args[6].dat->size[1] * 1 * (start[2] * args[6].stencil->stride[2] - args[6].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d] + OPS_sub_dat_list[args[7].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[7].dat->d_m[d]; #endif int base7 = 1 * 1 * (start[0] * args[7].stencil->stride[0] - args[7].dat->base[0] - d_m[0]); base7 = base7 + args[7].dat->size[0] * 1 * (start[1] * args[7].stencil->stride[1] - args[7].dat->base[1] - d_m[1]); base7 = base7 + args[7].dat->size[0] * 1 * args[7].dat->size[1] * 1 * (start[2] * args[7].stencil->stride[2] - args[7].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d] + OPS_sub_dat_list[args[8].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[8].dat->d_m[d]; #endif int base8 = 1 * 1 * (start[0] * args[8].stencil->stride[0] - args[8].dat->base[0] - d_m[0]); base8 = base8 + args[8].dat->size[0] * 1 * (start[1] * args[8].stencil->stride[1] - args[8].dat->base[1] - d_m[1]); base8 = base8 + args[8].dat->size[0] * 1 * args[8].dat->size[1] * 1 * (start[2] * args[8].stencil->stride[2] - args[8].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d] + OPS_sub_dat_list[args[9].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[9].dat->d_m[d]; #endif int base9 = 1 * 1 * (start[0] * args[9].stencil->stride[0] - args[9].dat->base[0] - d_m[0]); base9 = base9 + args[9].dat->size[0] * 1 * (start[1] * args[9].stencil->stride[1] - args[9].dat->base[1] - d_m[1]); base9 = base9 + args[9].dat->size[0] * 1 * args[9].dat->size[1] * 1 * (start[2] * args[9].stencil->stride[2] - args[9].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d] + OPS_sub_dat_list[args[10].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[10].dat->d_m[d]; #endif int base10 = 1 * 1 * (start[0] * args[10].stencil->stride[0] - args[10].dat->base[0] - d_m[0]); base10 = base10 + args[10].dat->size[0] * 1 * (start[1] * args[10].stencil->stride[1] - args[10].dat->base[1] - d_m[1]); base10 = base10 + args[10].dat->size[0] * 1 * args[10].dat->size[1] * 1 * (start[2] * args[10].stencil->stride[2] - args[10].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d] + OPS_sub_dat_list[args[11].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[11].dat->d_m[d]; #endif int base11 = 1 * 1 * (start[0] * args[11].stencil->stride[0] - args[11].dat->base[0] - d_m[0]); base11 = base11 + args[11].dat->size[0] * 1 * (start[1] * args[11].stencil->stride[1] - args[11].dat->base[1] - d_m[1]); base11 = base11 + args[11].dat->size[0] * 1 * args[11].dat->size[1] * 1 * (start[2] * args[11].stencil->stride[2] - args[11].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d] + OPS_sub_dat_list[args[12].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[12].dat->d_m[d]; #endif int base12 = 1 * 1 * (start[0] * args[12].stencil->stride[0] - args[12].dat->base[0] - d_m[0]); base12 = base12 + args[12].dat->size[0] * 1 * (start[1] * args[12].stencil->stride[1] - args[12].dat->base[1] - d_m[1]); base12 = base12 + args[12].dat->size[0] * 1 * args[12].dat->size[1] * 1 * (start[2] * args[12].stencil->stride[2] - args[12].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d] + OPS_sub_dat_list[args[13].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[13].dat->d_m[d]; #endif int base13 = 1 * 1 * (start[0] * args[13].stencil->stride[0] - args[13].dat->base[0] - d_m[0]); base13 = base13 + args[13].dat->size[0] * 1 * (start[1] * args[13].stencil->stride[1] - args[13].dat->base[1] - d_m[1]); base13 = base13 + args[13].dat->size[0] * 1 * args[13].dat->size[1] * 1 * (start[2] * args[13].stencil->stride[2] - args[13].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[14].dat->d_m[d] + OPS_sub_dat_list[args[14].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[14].dat->d_m[d]; #endif int base14 = 1 * 1 * (start[0] * args[14].stencil->stride[0] - args[14].dat->base[0] - d_m[0]); base14 = base14 + args[14].dat->size[0] * 1 * (start[1] * args[14].stencil->stride[1] - args[14].dat->base[1] - d_m[1]); base14 = base14 + args[14].dat->size[0] * 1 * args[14].dat->size[1] * 1 * (start[2] * args[14].stencil->stride[2] - args[14].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[15].dat->d_m[d] + OPS_sub_dat_list[args[15].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[15].dat->d_m[d]; #endif int base15 = 1 * 1 * (start[0] * args[15].stencil->stride[0] - args[15].dat->base[0] - d_m[0]); base15 = base15 + args[15].dat->size[0] * 1 * (start[1] * args[15].stencil->stride[1] - args[15].dat->base[1] - d_m[1]); base15 = base15 + args[15].dat->size[0] * 1 * args[15].dat->size[1] * 1 * (start[2] * args[15].stencil->stride[2] - args[15].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[16].dat->d_m[d] + OPS_sub_dat_list[args[16].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[16].dat->d_m[d]; #endif int base16 = 1 * 1 * (start[0] * args[16].stencil->stride[0] - args[16].dat->base[0] - d_m[0]); base16 = base16 + args[16].dat->size[0] * 1 * (start[1] * args[16].stencil->stride[1] - args[16].dat->base[1] - d_m[1]); base16 = base16 + args[16].dat->size[0] * 1 * args[16].dat->size[1] * 1 * (start[2] * args[16].stencil->stride[2] - args[16].dat->base[2] - d_m[2]); ops_H_D_exchanges_device(args, 17); ops_halo_exchanges(args, 17, range); ops_H_D_exchanges_device(args, 17); if (OPS_diags > 1) { ops_timers_core(&c2, &t2); OPS_kernels[103].mpi_time += t2 - t1; } if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) { clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 0, sizeof(cl_mem), (void *)&arg0.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 1, sizeof(cl_mem), (void *)&arg1.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 2, sizeof(cl_mem), (void *)&arg2.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 3, sizeof(cl_mem), (void *)&arg3.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 4, sizeof(cl_mem), (void *)&arg4.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 5, sizeof(cl_mem), (void *)&arg5.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 6, sizeof(cl_mem), (void *)&arg6.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 7, sizeof(cl_mem), (void *)&arg7.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 8, sizeof(cl_mem), (void *)&arg8.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 9, sizeof(cl_mem), (void *)&arg9.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 10, sizeof(cl_mem), (void *)&arg10.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 11, sizeof(cl_mem), (void *)&arg11.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 12, sizeof(cl_mem), (void *)&arg12.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 13, sizeof(cl_mem), (void *)&arg13.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 14, sizeof(cl_mem), (void *)&arg14.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 15, sizeof(cl_mem), (void *)&arg15.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 16, sizeof(cl_mem), (void *)&arg16.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 17, sizeof(cl_double), (void *)&dt)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 18, sizeof(cl_int), (void *)&base0)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 19, sizeof(cl_int), (void *)&base1)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 20, sizeof(cl_int), (void *)&base2)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 21, sizeof(cl_int), (void *)&base3)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 22, sizeof(cl_int), (void *)&base4)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 23, sizeof(cl_int), (void *)&base5)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 24, sizeof(cl_int), (void *)&base6)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 25, sizeof(cl_int), (void *)&base7)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 26, sizeof(cl_int), (void *)&base8)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 27, sizeof(cl_int), (void *)&base9)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 28, sizeof(cl_int), (void *)&base10)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 29, sizeof(cl_int), (void *)&base11)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 30, sizeof(cl_int), (void *)&base12)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 31, sizeof(cl_int), (void *)&base13)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 32, sizeof(cl_int), (void *)&base14)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 33, sizeof(cl_int), (void *)&base15)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 34, sizeof(cl_int), (void *)&base16)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 35, sizeof(cl_int), (void *)&x_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 36, sizeof(cl_int), (void *)&y_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[103], 37, sizeof(cl_int), (void *)&z_size)); // call/enque opencl kernel wrapper function clSafeCall(clEnqueueNDRangeKernel( OPS_opencl_core.command_queue, OPS_opencl_core.kernel[103], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL)); } if (OPS_diags > 1) { clSafeCall(clFinish(OPS_opencl_core.command_queue)); } if (OPS_diags > 1) { ops_timers_core(&c1, &t1); OPS_kernels[103].time += t1 - t2; } ops_set_dirtybit_device(args, 17); ops_set_halo_dirtybit3(&args[6], range); ops_set_halo_dirtybit3(&args[10], range); ops_set_halo_dirtybit3(&args[13], range); if (OPS_diags > 1) { // Update kernel record ops_timers_core(&c2, &t2); OPS_kernels[103].mpi_time += t2 - t1; OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg0); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg1); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg2); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg3); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg4); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg5); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg6); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg7); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg8); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg9); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg10); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg11); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg12); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg13); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg14); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg15); OPS_kernels[103].transfer += ops_compute_transfer(dim, start, end, &arg16); } }
int main(int argc, char *argv[]) { struct ocl_ds *o_ds; cl_int err; cl_event evt; cl_mem o_in; cl_int4 *o_out; struct ocl_kernel *o_k; int len = LEN; int i; size_t workGroupSize[2], localz[2]; localz[1] = 16; localz[0] = 16; workGroupSize[0] = 1024*1024; workGroupSize[1] = 1; o_ds = create_ocl_ds(KERNELDIR KERNELS_FILE); if (o_ds == NULL){ return 1; } o_in = create_ocl_mem(o_ds, sizeof(cl_int4)*len); if (o_in == NULL){ goto free_mem_in; } o_out = malloc(sizeof(cl_int4) * len); if (o_out == NULL){ goto free_mem_out; } //bzero(o_out, len*sizeof(cl_int4)); o_k = create_ocl_kernel(o_ds, "ocl_layout"); if (o_k == NULL){ goto free_kernel; } #if 0 err = clSetKernelArg(o_k->k_kernel, 0, sizeof(cl_mem), (void *) &o_in); err |= clSetKernelArg(o_k->k_kernel, 1, sizeof(int), (void *) &len); if (err != CL_SUCCESS){ #ifdef DEBUG fprintf(stderr, "clSetKernelArg return %s\n", oclErrorString(err)); #endif goto clean_up; } err = clEnqueueNDRangeKernel(o_ds->d_cq, o_k->k_kernel, 2, NULL, workGroupSize, localz, 0, NULL, &evt); //err = clEnqueueNDRangeKernel(o_ds->d_cq, o_k->k_kernel, 3, NULL, workGroupSize, NULL, 0, NULL, &evt); if (err != CL_SUCCESS){ #ifdef DEBUG fprintf(stderr, "clEnqueueNDRangeKernel: %s\n", oclErrorString(err)); #endif goto clean_up; } clReleaseEvent(evt); clFinish(o_ds->d_cq); #endif #if 0 fprintf(stderr, "%s: pointers s:[%ld] p:<%p>\n", __func__, sizeof(cl_mem), &o_in); fprintf(stderr, "%s: pointers s:[%ld] p:<%p>\n", __func__, sizeof(int), &len); #endif #if 0 //if (run_1d_ocl_kernel(o_ds, o_k, workGroupSize, ((void*)(&(o_in))), (sizeof(o_in)), ((void*)(&(len))), (sizeof(len)), NULL) < 0){ if (run_1d_ocl_kernel(o_ds, o_k, workGroupSize, OCL_PARAM(o_in), OCL_PARAM(len), NULL) < 0){ #ifdef DEBUG fprintf(stderr, "%s: error in run kernel\n", __func__); #endif goto clean_up; } #endif //for (i=1024; i<len; i+=1024){ //if (xfer_from_ocl_mem(o_ds, o_in, sizeof(cl_int4) * i, o_out) < 0){ if (xfer_from_ocl_mem(o_ds, o_in, sizeof(cl_int4) * len, o_out) < 0){ #ifdef DEBUG fprintf(stderr, "%s: xfer from ocl error\n", __func__); #endif goto clean_up; } // } #if 0 #if 1 #ifdef DEBUG for (i=0; i<len; i++){ fprintf(stderr, "%d %d %d %d\n", o_out[i].w, o_out[i].x, o_out[i].y, o_out[i].z); } #endif #else fwrite(o_out, len, sizeof(cl_int4), stdout); #endif #endif clean_up: destroy_ocl_kernel(o_k); free_kernel: if (o_out) free(o_out); free_mem_out: destroy_ocl_mem(o_in); free_mem_in: destroy_ocl_ds(o_ds); return 0; }
int exec_trig_kernel(const char *program_source, int n, void *srcA, void *dst) { cl_context context; cl_command_queue cmd_queue; cl_device_id *devices; cl_program program; cl_kernel kernel; cl_mem memobjs[2]; size_t global_work_size[1]; size_t local_work_size[1]; size_t cb; cl_int err; float c = 7.3f; // a scalar number to test non-pointer args // create the OpenCL context on a GPU device context = poclu_create_any_context(); if (context == (cl_context)0) return -1; // get the list of GPU devices associated with context clGetContextInfo(context, CL_CONTEXT_DEVICES, 0, NULL, &cb); devices = (cl_device_id *) malloc(cb); clGetContextInfo(context, CL_CONTEXT_DEVICES, cb, devices, NULL); // create a command-queue cmd_queue = clCreateCommandQueue(context, devices[0], 0, NULL); if (cmd_queue == (cl_command_queue)0) { clReleaseContext(context); free(devices); return -1; } free(devices); // allocate the buffer memory objects memobjs[0] = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(cl_float4) * n, srcA, NULL); if (memobjs[0] == (cl_mem)0) { clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } memobjs[1] = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_float4) * n, NULL, NULL); if (memobjs[1] == (cl_mem)0) { delete_memobjs(memobjs, 1); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the program program = clCreateProgramWithSource(context, 1, (const char**)&program_source, NULL, NULL); if (program == (cl_program)0) { delete_memobjs(memobjs, 2); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // create the kernel kernel = clCreateKernel(program, "trig", NULL); if (kernel == (cl_kernel)0) { delete_memobjs(memobjs, 2); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set the args values err = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *) &memobjs[0]); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *) &memobjs[1]); err |= clSetKernelArg(kernel, 2, sizeof(float), (void *) &c); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // set work-item dimensions global_work_size[0] = n; local_work_size[0]= 2; // execute kernel err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // read output image err = clEnqueueReadBuffer(cmd_queue, memobjs[1], CL_TRUE, 0, n * sizeof(cl_float4), dst, 0, NULL, NULL); if (err != CL_SUCCESS) { delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return -1; } // release kernel, program, and memory objects delete_memobjs(memobjs, 2); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); return 0; // success... }
extern "C" void magmablas_zlacpy( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaDoubleComplex_ptr dA, size_t dA_offset, magma_int_t lda, magmaDoubleComplex_ptr dB, size_t dB_offset, magma_int_t ldb, magma_queue_t queue) { /* Note ======== - UPLO Parameter is disabled - Do we want to provide a generic function to the user with all the options? Purpose ======= ZLACPY copies all or part of a two-dimensional matrix A to another matrix B. Arguments ========= UPLO (input) INTEGER Specifies the part of the matrix A to be copied to B. = 'U': Upper triangular part = 'L': Lower triangular part Otherwise: All of the matrix A M (input) INTEGER The number of rows of the matrix A. M >= 0. N (input) INTEGER The number of columns of the matrix A. N >= 0. A (input) COMPLEX DOUBLE PRECISION array, dimension (LDA,N) The m by n matrix A. If UPLO = 'U', only the upper triangle or trapezoid is accessed; if UPLO = 'L', only the lower triangle or trapezoid is accessed. LDA (input) INTEGER The leading dimension of the array A. LDA >= max(1,M). B (output) COMPLEX DOUBLE PRECISION array, dimension (LDB,N) On exit, B = A in the locations specified by UPLO. LDB (input) INTEGER The leading dimension of the array B. LDB >= max(1,M). ===================================================================== */ size_t LocalWorkSize[1] = {64}; size_t GlobalWorkSize[1] = {(m/64+(m%64 != 0))*64}; if ( m == 0 || n == 0 ) return; if ( uplo == MagmaUpper ) { fprintf(stderr, "lacpy upper is not implemented\n"); } else if ( uplo == MagmaLower ) { fprintf(stderr, "lacpy lower is not implemented\n"); } else { cl_int ciErrNum; cl_kernel ckKernel = NULL; ckKernel = rt->KernelPool["zlacpy_kernel"]; if(!ckKernel){ printf ("Error: cannot locate kernel in line %d, file %s\n", __LINE__, __FILE__); return; } int offset_A = (int)dA_offset; int offset_B = (int)dB_offset; int nn = 0; ciErrNum = clSetKernelArg( ckKernel, nn++, sizeof(cl_int), (void*)&m); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int), (void*)&n ); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_mem), (void*)&dA); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int), (void*)&offset_A ); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int), (void*)&lda ); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_mem), (void*)&dB); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int), (void*)&offset_B ); ciErrNum |= clSetKernelArg( ckKernel, nn++, sizeof(cl_int), (void*)&ldb ); if (ciErrNum != CL_SUCCESS){ printf("Error: clSetKernelArg at %d in file %s, %s\n", __LINE__, __FILE__, rt->GetErrorCode(ciErrNum)); return; } // launch kernel ciErrNum = clEnqueueNDRangeKernel( queue, ckKernel, 1, NULL, GlobalWorkSize, LocalWorkSize, 0, NULL, NULL); if (ciErrNum != CL_SUCCESS) { printf("Error: clEnqueueNDRangeKernel at %d in file %s \"%s\"\n", __LINE__, __FILE__, rt->GetErrorCode(ciErrNum)); return; } } }
/** Purpose ------- SLACPY_Q copies all or part of a two-dimensional matrix dA to another matrix dB. This is the same as SLACPY, but adds queue argument. Arguments --------- @param[in] uplo magma_uplo_t Specifies the part of the matrix dA to be copied to dB. - = MagmaUpper: Upper triangular part - = MagmaLower: Lower triangular part Otherwise: All of the matrix dA @param[in] m INTEGER The number of rows of the matrix dA. M >= 0. @param[in] n INTEGER The number of columns of the matrix dA. N >= 0. @param[in] dA REAL array, dimension (LDDA,N) The m by n matrix dA. If UPLO = MagmaUpper, only the upper triangle or trapezoid is accessed; if UPLO = MagmaLower, only the lower triangle or trapezoid is accessed. @param[in] ldda INTEGER The leading dimension of the array dA. LDDA >= max(1,M). @param[out] dB REAL array, dimension (LDDB,N) The m by n matrix dB. On exit, dB = dA in the locations specified by UPLO. @param[in] lddb INTEGER The leading dimension of the array dB. LDDB >= max(1,M). @param[in] queue magma_queue_t Queue to execute in. @ingroup magma_saux2 ********************************************************************/ extern "C" void magmablas_slacpy( magma_uplo_t uplo, magma_int_t m, magma_int_t n, magmaFloat_const_ptr dA, size_t dA_offset, magma_int_t ldda, magmaFloat_ptr dB, size_t dB_offset, magma_int_t lddb, magma_queue_t queue ) { cl_kernel kernel; cl_int err; int i; magma_int_t info = 0; if ( m < 0 ) info = -2; else if ( n < 0 ) info = -3; else if ( ldda < max(1,m)) info = -5; else if ( lddb < max(1,m)) info = -7; if ( info != 0 ) { magma_xerbla( __func__, -(info) ); return; } if ( m == 0 || n == 0 ) return; size_t threads[2] = { BLK_X, 1 }; size_t grid[2] = { (m + BLK_X - 1)/BLK_X, (n + BLK_Y - 1)/BLK_Y }; grid[0] *= threads[0]; grid[1] *= threads[1]; if ( uplo == MagmaLower ) { kernel = g_runtime.get_kernel( "slacpy_kernel_lower" ); if ( kernel != NULL ) { err = 0; i = 0; err |= clSetKernelArg( kernel, i++, sizeof(m ), &m ); err |= clSetKernelArg( kernel, i++, sizeof(n ), &n ); err |= clSetKernelArg( kernel, i++, sizeof(dA ), &dA ); err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset ); err |= clSetKernelArg( kernel, i++, sizeof(ldda ), &ldda ); err |= clSetKernelArg( kernel, i++, sizeof(dB ), &dB ); err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset ); err |= clSetKernelArg( kernel, i++, sizeof(lddb ), &lddb ); check_error( err ); err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, grid, threads, 0, NULL, NULL ); check_error( err ); } } else if ( uplo == MagmaUpper ) { kernel = g_runtime.get_kernel( "slacpy_kernel_upper" ); if ( kernel != NULL ) { err = 0; i = 0; err |= clSetKernelArg( kernel, i++, sizeof(m ), &m ); err |= clSetKernelArg( kernel, i++, sizeof(n ), &n ); err |= clSetKernelArg( kernel, i++, sizeof(dA ), &dA ); err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset ); err |= clSetKernelArg( kernel, i++, sizeof(ldda ), &ldda ); err |= clSetKernelArg( kernel, i++, sizeof(dB ), &dB ); err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset ); err |= clSetKernelArg( kernel, i++, sizeof(lddb ), &lddb ); check_error( err ); err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, grid, threads, 0, NULL, NULL ); check_error( err ); } } else { kernel = g_runtime.get_kernel( "slacpy_kernel_full" ); if ( kernel != NULL ) { err = 0; i = 0; err |= clSetKernelArg( kernel, i++, sizeof(m ), &m ); err |= clSetKernelArg( kernel, i++, sizeof(n ), &n ); err |= clSetKernelArg( kernel, i++, sizeof(dA ), &dA ); err |= clSetKernelArg( kernel, i++, sizeof(dA_offset), &dA_offset ); err |= clSetKernelArg( kernel, i++, sizeof(ldda ), &ldda ); err |= clSetKernelArg( kernel, i++, sizeof(dB ), &dB ); err |= clSetKernelArg( kernel, i++, sizeof(dB_offset), &dB_offset ); err |= clSetKernelArg( kernel, i++, sizeof(lddb ), &lddb ); check_error( err ); err = clEnqueueNDRangeKernel( queue, kernel, 2, NULL, grid, threads, 0, NULL, NULL ); check_error( err ); } } }
int main(void) { //time meassuring struct timeval tvs; //variables int Nx=1024; int Ny=1024; int plotnum=0; int Tmax=2; int plottime=0; int plotgap=1; double Lx=1.0; double Ly=1.0; double dt=0.0; double A=0.0; double B=0.0; double Du=0.0; double Dv=0.0; //splitting coefficients double a=0.5; double b=0.5; double c=1.0; //loop counters int i=0; int j=0; int n=0; double*umax=NULL; double*vmax=NULL; parainit(&Nx,&Ny,&Tmax,&plotgap,&Lx,&Ly,&dt,&Du,&Dv,&A,&B); plottime=plotgap; vmax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); umax=(double*)malloc((Tmax/plotgap+1)*sizeof(double)); //openCL variables cl_platform_id *platform_id = NULL; cl_kernel frequencies = NULL, initialdata = NULL, linearpart=NULL; cl_kernel nonlinearpart_a=NULL, nonlinearpart_b=NULL; cl_int ret; cl_uint num_platforms; // Detect how many platforms there are. ret = clGetPlatformIDs(0, NULL, &num_platforms); // Allocate enough space for the number of platforms. platform_id = (cl_platform_id*) malloc(num_platforms*sizeof(cl_platform_id)); // Store the platforms ret = clGetPlatformIDs(num_platforms, platform_id, NULL); printf("Found %d platform(s)!\n",num_platforms); cl_uint *num_devices; num_devices=(cl_uint*) malloc(num_platforms*sizeof(cl_uint)); cl_device_id **device_id = NULL; device_id =(cl_device_id**) malloc(num_platforms*sizeof(cl_device_id*)); // Detect number of devices in the platforms for(i=0;i<num_platforms;i++){ char buf[65536]; size_t size; ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_VERSION,sizeof(buf),buf,&size); printf("%s\n",buf); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,0,NULL,num_devices); printf("Found %d device(s) on platform %d!\n", num_devices[i],i); ret = clGetPlatformInfo(platform_id[i],CL_PLATFORM_NAME,sizeof(buf),buf,&size); printf("%s ",buf); // Store numDevices from platform device_id[i]=(cl_device_id*) malloc(num_devices[i]*sizeof(device_id)); ret = clGetDeviceIDs(platform_id[i],CL_DEVICE_TYPE_ALL,num_devices[i],device_id[i],NULL); for(j=0;j<num_devices[i];j++){ ret = clGetDeviceInfo(device_id[i][j],CL_DEVICE_NAME,sizeof(buf),buf,&size); printf("%s (%d,%d)\n",buf,i,j); } } //create context and command_queue cl_context context = NULL; cl_command_queue command_queue = NULL; //Which platform and device do i choose? int chooseplatform=0; int choosedevice=0; printf("Choose platform %d and device %d!\n",chooseplatform,choosedevice); context = clCreateContext( NULL, num_devices[chooseplatform], device_id[chooseplatform], NULL, NULL, &ret); if(ret!=CL_SUCCESS){printf("createContext ret:%d\n",ret); exit(1); } command_queue = clCreateCommandQueue(context, device_id[chooseplatform][choosedevice], 0, &ret); if(ret!=CL_SUCCESS){printf("createCommandQueue ret:%d\n",ret); exit(1); } //OpenCL arrays cl_mem cl_u = NULL,cl_v = NULL; cl_mem cl_uhat = NULL, cl_vhat = NULL; cl_mem cl_kx = NULL, cl_ky = NULL; //FFT clfftPlanHandle planHandle; cl_mem tmpBuffer = NULL; fftinit(&planHandle,&context, &command_queue, &tmpBuffer, Nx, Ny); //allocate gpu memory/ cl_u=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_v=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx* Ny* sizeof(double), NULL, &ret); cl_uhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_vhat=clCreateBuffer(context, CL_MEM_READ_WRITE, 2*Nx * Ny* sizeof(double), NULL, &ret); cl_kx = clCreateBuffer(context, CL_MEM_READ_WRITE, Nx * sizeof(double), NULL, &ret); cl_ky = clCreateBuffer(context, CL_MEM_READ_WRITE, Ny * sizeof(double), NULL, &ret); printf("allocated space\n"); //load the kernels loadKernel(&frequencies,&context,&device_id[chooseplatform][choosedevice],"frequencies"); loadKernel(&initialdata,&context,&device_id[chooseplatform][choosedevice],"initialdata"); loadKernel(&linearpart,&context,&device_id[chooseplatform][choosedevice],"linearpart"); loadKernel(&nonlinearpart_a,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_a"); loadKernel(&nonlinearpart_b,&context,&device_id[chooseplatform][choosedevice],"nonlinearpart_b"); size_t global_work_size[1] = {Nx*Ny}; size_t global_work_size_X[1] = {Nx}; size_t global_work_size_Y[1] = {Ny}; //frequencies ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_kx); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Lx); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Nx); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_X, NULL, 0, NULL, NULL); ret = clFinish(command_queue); ret = clSetKernelArg(frequencies, 0, sizeof(cl_mem),(void *)&cl_ky); ret = clSetKernelArg(frequencies, 1, sizeof(double),(void* )&Ly); ret = clSetKernelArg(frequencies, 2, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, frequencies, 1, NULL, global_work_size_Y, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //printCL(&cl_kx,&command_queue,Nx,1); //printCL(&cl_ky,&command_queue,1,Ny); //inintial data ret = clSetKernelArg(initialdata, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(initialdata, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(initialdata, 2, sizeof(int),(void* )&Nx); ret = clSetKernelArg(initialdata, 3, sizeof(int),(void* )&Ny); ret = clSetKernelArg(initialdata, 4, sizeof(double),(void* )&Lx); ret = clSetKernelArg(initialdata, 5, sizeof(double),(void* )&Ly); ret = clEnqueueNDRangeKernel(command_queue, initialdata, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //make output writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); printf("Got initial data, starting timestepping\n"); mtime_s(&tvs); for(n=0;n<=Tmax;n++){ //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //linear fft2dfor(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dfor(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //printf("A%f,B%f\n",A,B); ret = clSetKernelArg(linearpart, 0, sizeof(cl_mem),(void *)&cl_uhat); ret = clSetKernelArg(linearpart, 1, sizeof(cl_mem),(void *)&cl_vhat); ret = clSetKernelArg(linearpart, 2, sizeof(cl_mem),(void* )&cl_kx); ret = clSetKernelArg(linearpart, 3, sizeof(cl_mem),(void* )&cl_ky); ret = clSetKernelArg(linearpart, 4, sizeof(double),(void* )&Du); ret = clSetKernelArg(linearpart, 5, sizeof(double),(void* )&Dv); ret = clSetKernelArg(linearpart, 6, sizeof(double),(void* )&A); ret = clSetKernelArg(linearpart, 7, sizeof(double),(void* )&B); ret = clSetKernelArg(linearpart, 8, sizeof(double),(void* )&dt); ret = clSetKernelArg(linearpart, 9, sizeof(double),(void* )&c); ret = clSetKernelArg(linearpart, 10, sizeof(int),(void* )&Nx); ret = clSetKernelArg(linearpart, 11, sizeof(int),(void* )&Ny); ret = clEnqueueNDRangeKernel(command_queue, linearpart, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); fft2dback(&cl_u, &cl_uhat,&planHandle,&command_queue,&tmpBuffer); fft2dback(&cl_v, &cl_vhat,&planHandle,&command_queue,&tmpBuffer); //nonlinearpart_b ret = clSetKernelArg(nonlinearpart_b, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_b, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_b, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_b, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_b, 4, sizeof(double),(void* )&b); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_b, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); //nonlinearpart_a ret = clSetKernelArg(nonlinearpart_a, 0, sizeof(cl_mem),(void *)&cl_u); ret = clSetKernelArg(nonlinearpart_a, 1, sizeof(cl_mem),(void* )&cl_v); ret = clSetKernelArg(nonlinearpart_a, 2, sizeof(double),(void* )&A); ret = clSetKernelArg(nonlinearpart_a, 3, sizeof(double),(void* )&dt); ret = clSetKernelArg(nonlinearpart_a, 4, sizeof(double),(void* )&a); ret = clEnqueueNDRangeKernel(command_queue, nonlinearpart_a, 1, NULL, global_work_size, NULL, 0, NULL, NULL); ret = clFinish(command_queue); // done if(n==plottime){ printf("time:%f, step:%d,%d,umax:%f,vmax:%f\n",n*dt,n,plotnum,umax[plotnum],vmax[plotnum]); plottime=plottime+plotgap; plotnum=plotnum+1; writedata_C(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); writedata_C(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); umax[plotnum]=writeimage(&cl_u, &command_queue,Nx,Ny,plotnum,"u"); vmax[plotnum]=writeimage(&cl_v, &command_queue,Nx,Ny,plotnum,"v"); } }//end timestepping printf("Finished time stepping\n"); mtime_e(&tvs,"Programm took:"); writearray(umax,(Tmax/plotgap)+1,"u"); writearray(vmax,(Tmax/plotgap)+1,"v"); free(umax); free(vmax); clReleaseMemObject(cl_u); clReleaseMemObject(cl_v); clReleaseMemObject(cl_uhat); clReleaseMemObject(cl_vhat); clReleaseMemObject(cl_kx); clReleaseMemObject(cl_ky); ret = clReleaseKernel(initialdata); ret = clReleaseKernel(frequencies); ret = clReleaseKernel(linearpart); ret = clReleaseKernel(nonlinearpart_a); ret = clReleaseKernel(nonlinearpart_b); fftdestroy(&planHandle, &tmpBuffer); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); for(i=0;i<num_platforms;i++){free(device_id[i]);} free(device_id); free(platform_id); free(num_devices); printf("Program execution complete\n"); return 0; }
//--------------------------------------------------------------------- // this function computes the norm of the difference between the // computed solution and the exact solution //--------------------------------------------------------------------- void error_norm(double rms[5]) { int i, m, d; cl_kernel k_error_norm; cl_mem m_rms; double (*g_rms)[5]; size_t local_ws, global_ws, temp, wg_num, buf_size; cl_int ecode; int d0 = grid_points[0]; int d1 = grid_points[1]; int d2 = grid_points[2]; for (m = 0; m < 5; m++) { rms[m] = 0.0; } temp = d2 / max_compute_units; local_ws = temp == 0 ? 1 : temp; global_ws = clu_RoundWorkSize((size_t)d2, local_ws); wg_num = global_ws / local_ws; buf_size = sizeof(double) * 5 * wg_num; m_rms = clCreateBuffer(context, CL_MEM_READ_WRITE, buf_size, NULL, &ecode); clu_CheckError(ecode, "clCreateBuffer()"); k_error_norm = clCreateKernel(p_error, "error_norm", &ecode); clu_CheckError(ecode, "clCreateKernel()"); ecode = clSetKernelArg(k_error_norm, 0, sizeof(cl_mem), &m_u); ecode |= clSetKernelArg(k_error_norm, 1, sizeof(cl_mem), &m_ce); ecode |= clSetKernelArg(k_error_norm, 2, sizeof(cl_mem), &m_rms); ecode |= clSetKernelArg(k_error_norm, 3, sizeof(double)*5*local_ws, NULL); ecode |= clSetKernelArg(k_error_norm, 4, sizeof(int), &d0); ecode |= clSetKernelArg(k_error_norm, 5, sizeof(int), &d1); ecode |= clSetKernelArg(k_error_norm, 6, sizeof(int), &d2); clu_CheckError(ecode, "clSetKernelArg()"); ecode = clEnqueueNDRangeKernel(cmd_queue, k_error_norm, 1, NULL, &global_ws, &local_ws, 0, NULL, NULL); clu_CheckError(ecode, "clEnqueueNDRangeKernel()"); g_rms = (double (*)[5])malloc(buf_size); ecode = clEnqueueReadBuffer(cmd_queue, m_rms, CL_TRUE, 0, buf_size, g_rms, 0, NULL, NULL); clu_CheckError(ecode, "clReadBuffer()"); // reduction for (i = 0; i < wg_num; i++) { for (m = 0; m < 5; m++) { rms[m] += g_rms[i][m]; } } for (m = 0; m < 5; m++) { for (d = 0; d < 3; d++) { rms[m] = rms[m] / (double)(grid_points[d]-2); } rms[m] = sqrt(rms[m]); } free(g_rms); clReleaseMemObject(m_rms); clReleaseKernel(k_error_norm); }
int main(int argc, char **argv) { if (find_option(argc, argv, "-h") >= 0) { printf("Options:\n"); printf("-h to see this help\n"); printf("-n <int> to set the number of particles\n"); printf("-o <filename> to specify the output file name\n"); printf("-s <filename> to specify the summary output file name\n"); return 0; } int n = read_int(argc, argv, "-n", 1000); char *savename = read_string(argc, argv, "-o", NULL); char *sumname = read_string(argc, argv, "-s", NULL); // For return values. cl_int ret; // OpenCL stuff. // Loading kernel files. FILE *kernelFile; char *kernelSource; size_t kernelSize; kernelFile = fopen("simulationKernel.cl", "r"); if (!kernelFile) { fprintf(stderr, "No file named simulationKernel.cl was found\n"); exit(-1); } kernelSource = (char*)malloc(MAX_SOURCE_SIZE); kernelSize = fread(kernelSource, 1, MAX_SOURCE_SIZE, kernelFile); fclose(kernelFile); // Getting platform and device information cl_platform_id platformId = NULL; cl_device_id deviceID = NULL; cl_uint retNumDevices; cl_uint retNumPlatforms; ret = clGetPlatformIDs(1, &platformId, &retNumPlatforms); // Different types of devices to pick from. At the moment picks the default opencl device. //CL_DEVICE_TYPE_GPU //CL_DEVICE_TYPE_ACCELERATOR //CL_DEVICE_TYPE_DEFAULT //CL_DEVICE_TYPE_CPU ret = clGetDeviceIDs(platformId, CL_DEVICE_TYPE_ACCELERATOR, 1, &deviceID, &retNumDevices); // Max workgroup size size_t max_available_local_wg_size; ret = clGetDeviceInfo(deviceID, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_available_local_wg_size, NULL); // Creating context. cl_context context = clCreateContext(NULL, 1, &deviceID, NULL, NULL, &ret); // Creating command queue cl_command_queue commandQueue = clCreateCommandQueueWithProperties (context, deviceID, 0, &ret); // Build program cl_program program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, (const size_t *)&kernelSize, &ret); // printf("program = ret %i \n", ret); ret = clBuildProgram(program, 1, &deviceID, NULL, NULL, NULL); // printf("clBuildProgram: ret %i \n", ret); // Create kernels cl_kernel forceKernel = clCreateKernel(program, "compute_forces_gpu", &ret); cl_kernel moveKernel = clCreateKernel(program, "move_gpu", &ret); cl_kernel binInitKernel = clCreateKernel(program, "bin_init_gpu", &ret); cl_kernel binKernel = clCreateKernel(program, "bin_gpu", &ret); FILE *fsave = savename ? fopen(savename, "w") : NULL; FILE *fsum = sumname ? fopen(sumname, "a") : NULL; particle_t *particles = (particle_t*)malloc(n * sizeof(particle_t)); // GPU particle data structure cl_mem d_particles = clCreateBuffer(context, CL_MEM_READ_WRITE, n * sizeof(particle_t), NULL, &ret); // Set size set_size(n); init_particles(n, particles); double copy_time = read_timer(); // Copy particles to device. ret = clEnqueueWriteBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, NULL); copy_time = read_timer() - copy_time; // Calculating thread and thread block counts. // sizes size_t globalItemSize; size_t localItemSize; // Global item size if (n <= NUM_THREADS) { globalItemSize = NUM_THREADS; localItemSize = 16; } else if (n % NUM_THREADS != 0) { globalItemSize = (n / NUM_THREADS + 1) * NUM_THREADS; } else { globalItemSize = n; } // Local item size localItemSize = globalItemSize / NUM_THREADS; // Bins and bin sizes. // Because of uniform distribution we will know that bins size is amortized. Therefore I picked the value of 10. // There will never be 10 particles in one bin. int maxParticles = 10; // Calculating the number of bins. int numberOfBins = (int)ceil(size/(2*cutoff)) + 2; // Bins will only exist on the device. particle_t* bins; // How many particles are there in each bin - also only exists on the device. volatile int* binSizes; // Number of bins to be initialized. size_t clearAmt = numberOfBins*numberOfBins; // Allocate memory for bins on the device. cl_mem d_binSizes = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * sizeof(volatile int), NULL, &ret); cl_mem d_bins = clCreateBuffer(context, CL_MEM_READ_WRITE, numberOfBins * numberOfBins * maxParticles * sizeof(particle_t), NULL, &ret); // SETTING ARGUMENTS FOR THE KERNELS // Set arguments for the init / clear kernel ret = clSetKernelArg(binInitKernel, 0, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binInitKernel, 1, sizeof(int), &numberOfBins); // Set arguments for the binning kernel ret = clSetKernelArg(binKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(binKernel, 1, sizeof(int), &n); ret = clSetKernelArg(binKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(binKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(binKernel, 4, sizeof(int), &numberOfBins); // Set arguments for force kernel. ret = clSetKernelArg(forceKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(forceKernel, 1, sizeof(int), &n); ret = clSetKernelArg(forceKernel, 2, sizeof(cl_mem), (void *)&d_bins); ret = clSetKernelArg(forceKernel, 3, sizeof(cl_mem), (void *)&d_binSizes); ret = clSetKernelArg(forceKernel, 4, sizeof(int), &numberOfBins); // Set arguments for move kernel ret = clSetKernelArg(moveKernel, 0, sizeof(cl_mem), (void *)&d_particles); ret = clSetKernelArg(moveKernel, 1, sizeof(int), &n); ret = clSetKernelArg(moveKernel, 2, sizeof(double), &size); // Variable to check if kernel execution is done. cl_event kernelDone; double simulation_time = read_timer(); int step = 0; for (step = 0; step < NSTEPS; step++) { // Execute bin initialization (clearing after first iteration) ret = clEnqueueNDRangeKernel(commandQueue, binInitKernel, 1, NULL, &clearAmt, NULL, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute binning kernel ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); // ret = clEnqueueNDRangeKernel(commandQueue, binKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute force kernel ret = clEnqueueNDRangeKernel(commandQueue, forceKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); // Execute move kernel ret = clEnqueueNDRangeKernel(commandQueue, moveKernel, 1, NULL, &globalItemSize, &localItemSize, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); if (fsave && (step%SAVEFREQ) == 0) { // Copy the particles back to the CPU ret = clEnqueueReadBuffer(commandQueue, d_particles, CL_TRUE, 0, n * sizeof(particle_t), particles, 0, NULL, &kernelDone); ret = clWaitForEvents(1, &kernelDone); save(fsave, n, particles); } } simulation_time = read_timer() - simulation_time; printf("CPU-GPU copy time = %g seconds\n", copy_time); printf("n = %d, simulation time = %g seconds\n", n, simulation_time); if (fsum) fprintf(fsum, "%d %lf \n", n, simulation_time); if (fsum) fclose(fsum); free(particles); if (fsave) fclose(fsave); ret = clFlush(commandQueue); ret = clFinish(commandQueue); ret = clReleaseCommandQueue(commandQueue); ret = clReleaseKernel(forceKernel); ret = clReleaseKernel(moveKernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(d_particles); ret = clReleaseContext(context); return 0; }
int main( int argc, char* argv[] ) { // Length of vectors unsigned int n = 100000; // Host input vectors double *h_a; double *h_b; // Host output vector double *h_c; // Device input buffers cl_mem d_a; cl_mem d_b; // Device output buffer cl_mem d_c; cl_platform_id cpPlatform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel kernel; // kernel // Size, in bytes, of each vector size_t bytes = n*sizeof(double); // Allocate memory for each vector on host h_a = (double*)malloc(bytes); h_b = (double*)malloc(bytes); h_c = (double*)malloc(bytes); // Initialize vectors on host int i; for( i = 0; i < n; i++ ) { h_a[i] = sinf(i)*sinf(i); h_b[i] = cosf(i)*cosf(i); } size_t globalSize, localSize; cl_int err; // Number of work items in each local work group localSize = 64; // Number of total work items - localSize must be devisor globalSize = ceil(n/(float)localSize)*localSize; // Bind to platform err = clGetPlatformIDs(1, &cpPlatform, NULL); // Get ID for the device err = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); // Create a command queue queue = clCreateCommandQueue(context, device_id, 0, &err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelSource, NULL, &err); // Build the program executable clBuildProgram(program, 0, NULL, NULL, NULL, NULL); // Create the compute kernel in the program we wish to run kernel = clCreateKernel(program, "vecAdd", &err); // Create the input and output arrays in device memory for our calculation d_a = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY, bytes, NULL, NULL); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, bytes, NULL, NULL); // Write our data set into the input array in device memory err = clEnqueueWriteBuffer(queue, d_a, CL_TRUE, 0, bytes, h_a, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, d_b, CL_TRUE, 0, bytes, h_b, 0, NULL, NULL); // Set the arguments to our compute kernel err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_c); err |= clSetKernelArg(kernel, 3, sizeof(unsigned int), &n); // Execute the kernel over the entire range of the data set err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &globalSize, &localSize, 0, NULL, NULL); // Wait for the command queue to get serviced before reading back results clFinish(queue); // Read the results from the device clEnqueueReadBuffer(queue, d_c, CL_TRUE, 0, bytes, h_c, 0, NULL, NULL ); //Sum up vector c and print result divided by n, this should equal 1 within error double sum = 0; for(i=0; i<n; i++) sum += h_c[i]; printf("final result: %f\n", sum/n); // release OpenCL resources clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseContext(context); //release host memory free(h_a); free(h_b); free(h_c); return 0; }
int btocl_invdetcholeskyDMATRIX1(cl_mem buffer, DMATRIX* dm, double* det) { // need extra buffer to store new diagonal int i,j, k,diagidx, subdiagidx, idx, idx1, idx2,n; double diag, *m; double* diagArray; //OCL variables cl_kernel kernel1, kernel2; cl_int err; cl_context context; cl_command_queue queue; cl_mem buffer_diag; size_t global_work_size; *det = 0; n = dm->nrows; m = dm->m; diagArray = (double*)malloc(sizeof(double)*n); context = btocl_getContext(); queue = btocl_getCommandQueue(); kernel1 = btocl_getKernel(BTOCL_CHOLUPDCOL); if (kernel1 == NULL) { printf("Error: Couldn't load kernel %s\n",btocl_getKernelName(BTOCL_CHOLUPDCOL)); exit(1); } else { printf("loaded %s\n",btocl_getKernelName(BTOCL_CHOLUPDCOL)); } kernel2 = btocl_getKernel(BTOCL_CHOLUPDMAT); if (kernel1 == NULL) { printf("Error: Couldn't load kernel %s\n",btocl_getKernelName(BTOCL_CHOLUPDMAT)); exit(1); } else { printf("loaded %s\n",btocl_getKernelName(BTOCL_CHOLUPDMAT)); } // buffer to store diagonal buffer_diag = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(double), NULL, &err); if (err < 0) { printf("Couldn't create diagonal buffer\n"); exit(1); } //btdebug_enter("btoclcopy1"); // copy matrix to buffer - may not need this - probably copied during creation clEnqueueWriteBuffer(queue,buffer,CL_TRUE,0,n*n*sizeof(double),&m[0],0,0,NULL); //btdebug_exit("btoclcopy1"); diagidx = 0; diag = m[diagidx]; // first diagonal global_work_size = n; // size of column to be updated for(i=0; i < n; i++) { //printf("Column %d\n",i); //printf("new diag %lf\n",diag); // diag = m[diagidx]; -- computed in previous iteration if (diag < 0) { printf("Error index %d negative diagonal %lf\n",diagidx,diag); return 1; } global_work_size--; diag = sqrt(diag); //printf("sq root diag %lf\n",diag); diagArray[i] = diag; // square root of diagonal if (diag < 0) *det += log(-diag); else *det += log(diag); // call update column kernel //perhaps it could be extended to have three phases: // update, copy to local and do multiplication //idx = diagidx; //for(j=i+1; j < n; j++) { // idx++; // m[idx] = m[idx]/diag; //} //printf("before update - diag %lf\n",diag); //btlapack_printDMATRIX(dm); // copy parameters // Pass Arguments if (global_work_size > 0) { if ((err = clSetKernelArg(kernel1,0,sizeof(cl_mem), &buffer)) < 0) { printf("Couldnt set first argument\n"); exit(1); } if ((err = clSetKernelArg(kernel1,1,sizeof(diagidx), &diagidx)) < 0) { printf("Couldnt set second argument\n"); exit(1); } if ((err = clSetKernelArg(kernel1,2,sizeof(diag), &diag)) < 0) { printf("Couldnt set third argument\n"); exit(1); } // copy colum to buffer //btdebug_enter("btoclkernel1"); // schedule kernel clEnqueueNDRangeKernel(queue,kernel1,1,NULL,&global_work_size,NULL,0,NULL,NULL); // copy back //btdebug_exit("btoclkernel1"); if ((err = clSetKernelArg(kernel2,0,sizeof(cl_mem), &buffer)) < 0) { printf("Couldnt set first argument\n"); exit(1); } if ((err = clSetKernelArg(kernel2,1,sizeof(cl_mem), &buffer_diag)) < 0) { printf("Couldnt set second argument\n"); exit(1); } if ((err = clSetKernelArg(kernel2,2,sizeof(diagidx), &diagidx)) < 0) { printf("Couldnt set third argument\n"); exit(1); } if ((err = clSetKernelArg(kernel2,3,sizeof(n), &n)) < 0) { // lda printf("Couldnt set fifth argument\n"); exit(1); } if ((err = clSetKernelArg(kernel2,4,sizeof(global_work_size), &global_work_size)) < 0) { printf("Couldnt set forth argument\n"); exit(1); } //btdebug_enter("btoclkernel2"); clEnqueueNDRangeKernel(queue,kernel2,1,NULL,&global_work_size,NULL,0,NULL,NULL); //btdebug_exit("btoclkernel2"); // copy diagonal_buffer to diag //btdebug_enter("btocldiag"); clEnqueueReadBuffer(queue,buffer_diag,CL_TRUE,0,sizeof(double),&diag,0,0,NULL); //btdebug_exit("btocldiag"); } //printf("after update, global_work size %d\n",global_work_size); //btlapack_printDMATRIX(dm); diagidx += n+1; // update diagonal index } // copy matrix from device to host clEnqueueReadBuffer(queue,buffer,CL_TRUE,0,n*n*sizeof(double),&m[0],0,0,NULL); // copy diagonals diagidx=0; for(j = 0; j < n; j++) { m[diagidx]=diagArray[j]; diagidx += (n+1); } *det *= 2.0; return 0; }
void OpenCLDevice::COM_clAttachOutputMemoryBufferToKernelParameter(cl_kernel kernel, int parameterIndex, cl_mem clOutputMemoryBuffer) { cl_int error; error = clSetKernelArg(kernel, parameterIndex, sizeof(cl_mem), &clOutputMemoryBuffer); if (error != CL_SUCCESS) { printf("CLERROR[%d]: %s\n", error, clewErrorString(error)); } }
// host stub function void ops_par_loop_flux_calc_kernely(char const *name, ops_block block, int dim, int *range, ops_arg arg0, ops_arg arg1, ops_arg arg2, ops_arg arg3) { // Timing double t1, t2, c1, c2; ops_arg args[4] = {arg0, arg1, arg2, arg3}; #ifdef CHECKPOINTING if (!ops_checkpointing_before(args, 4, range, 107)) return; #endif if (OPS_diags > 1) { ops_timing_realloc(107, "flux_calc_kernely"); OPS_kernels[107].count++; ops_timers_core(&c1, &t1); } // compute locally allocated range for the sub-block int start[3]; int end[3]; #ifdef OPS_MPI sub_block_list sb = OPS_sub_block_list[block->index]; if (!sb->owned) return; for (int n = 0; n < 3; n++) { start[n] = sb->decomp_disp[n]; end[n] = sb->decomp_disp[n] + sb->decomp_size[n]; if (start[n] >= range[2 * n]) { start[n] = 0; } else { start[n] = range[2 * n] - start[n]; } if (sb->id_m[n] == MPI_PROC_NULL && range[2 * n] < 0) start[n] = range[2 * n]; if (end[n] >= range[2 * n + 1]) { end[n] = range[2 * n + 1] - sb->decomp_disp[n]; } else { end[n] = sb->decomp_size[n]; } if (sb->id_p[n] == MPI_PROC_NULL && (range[2 * n + 1] > sb->decomp_disp[n] + sb->decomp_size[n])) end[n] += (range[2 * n + 1] - sb->decomp_disp[n] - sb->decomp_size[n]); } #else for (int n = 0; n < 3; n++) { start[n] = range[2 * n]; end[n] = range[2 * n + 1]; } #endif int x_size = MAX(0, end[0] - start[0]); int y_size = MAX(0, end[1] - start[1]); int z_size = MAX(0, end[2] - start[2]); int xdim0 = args[0].dat->size[0]; int ydim0 = args[0].dat->size[1]; int xdim1 = args[1].dat->size[0]; int ydim1 = args[1].dat->size[1]; int xdim2 = args[2].dat->size[0]; int ydim2 = args[2].dat->size[1]; int xdim3 = args[3].dat->size[0]; int ydim3 = args[3].dat->size[1]; // build opencl kernel if not already built buildOpenCLKernels_flux_calc_kernely(xdim0, ydim0, xdim1, ydim1, xdim2, ydim2, xdim3, ydim3); // set up OpenCL thread blocks size_t globalWorkSize[3] = { ((x_size - 1) / OPS_block_size_x + 1) * OPS_block_size_x, ((y_size - 1) / OPS_block_size_y + 1) * OPS_block_size_y, ((z_size - 1) / OPS_block_size_z + 1) * OPS_block_size_z}; size_t localWorkSize[3] = {OPS_block_size_x, OPS_block_size_y, OPS_block_size_z}; // set up initial pointers int d_m[OPS_MAX_DIM]; #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d] + OPS_sub_dat_list[args[0].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[0].dat->d_m[d]; #endif int base0 = 1 * 1 * (start[0] * args[0].stencil->stride[0] - args[0].dat->base[0] - d_m[0]); base0 = base0 + args[0].dat->size[0] * 1 * (start[1] * args[0].stencil->stride[1] - args[0].dat->base[1] - d_m[1]); base0 = base0 + args[0].dat->size[0] * 1 * args[0].dat->size[1] * 1 * (start[2] * args[0].stencil->stride[2] - args[0].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d] + OPS_sub_dat_list[args[1].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[1].dat->d_m[d]; #endif int base1 = 1 * 1 * (start[0] * args[1].stencil->stride[0] - args[1].dat->base[0] - d_m[0]); base1 = base1 + args[1].dat->size[0] * 1 * (start[1] * args[1].stencil->stride[1] - args[1].dat->base[1] - d_m[1]); base1 = base1 + args[1].dat->size[0] * 1 * args[1].dat->size[1] * 1 * (start[2] * args[1].stencil->stride[2] - args[1].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d] + OPS_sub_dat_list[args[2].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[2].dat->d_m[d]; #endif int base2 = 1 * 1 * (start[0] * args[2].stencil->stride[0] - args[2].dat->base[0] - d_m[0]); base2 = base2 + args[2].dat->size[0] * 1 * (start[1] * args[2].stencil->stride[1] - args[2].dat->base[1] - d_m[1]); base2 = base2 + args[2].dat->size[0] * 1 * args[2].dat->size[1] * 1 * (start[2] * args[2].stencil->stride[2] - args[2].dat->base[2] - d_m[2]); #ifdef OPS_MPI for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d] + OPS_sub_dat_list[args[3].dat->index]->d_im[d]; #else for (int d = 0; d < dim; d++) d_m[d] = args[3].dat->d_m[d]; #endif int base3 = 1 * 1 * (start[0] * args[3].stencil->stride[0] - args[3].dat->base[0] - d_m[0]); base3 = base3 + args[3].dat->size[0] * 1 * (start[1] * args[3].stencil->stride[1] - args[3].dat->base[1] - d_m[1]); base3 = base3 + args[3].dat->size[0] * 1 * args[3].dat->size[1] * 1 * (start[2] * args[3].stencil->stride[2] - args[3].dat->base[2] - d_m[2]); ops_H_D_exchanges_device(args, 4); ops_halo_exchanges(args, 4, range); ops_H_D_exchanges_device(args, 4); if (OPS_diags > 1) { ops_timers_core(&c2, &t2); OPS_kernels[107].mpi_time += t2 - t1; } if (globalWorkSize[0] > 0 && globalWorkSize[1] > 0 && globalWorkSize[2] > 0) { clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 0, sizeof(cl_mem), (void *)&arg0.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 1, sizeof(cl_mem), (void *)&arg1.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 2, sizeof(cl_mem), (void *)&arg2.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 3, sizeof(cl_mem), (void *)&arg3.data_d)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 4, sizeof(cl_double), (void *)&dt)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 5, sizeof(cl_int), (void *)&base0)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 6, sizeof(cl_int), (void *)&base1)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 7, sizeof(cl_int), (void *)&base2)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 8, sizeof(cl_int), (void *)&base3)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 9, sizeof(cl_int), (void *)&x_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 10, sizeof(cl_int), (void *)&y_size)); clSafeCall(clSetKernelArg(OPS_opencl_core.kernel[107], 11, sizeof(cl_int), (void *)&z_size)); // call/enque opencl kernel wrapper function clSafeCall(clEnqueueNDRangeKernel( OPS_opencl_core.command_queue, OPS_opencl_core.kernel[107], 3, NULL, globalWorkSize, localWorkSize, 0, NULL, NULL)); } if (OPS_diags > 1) { clSafeCall(clFinish(OPS_opencl_core.command_queue)); } if (OPS_diags > 1) { ops_timers_core(&c1, &t1); OPS_kernels[107].time += t1 - t2; } ops_set_dirtybit_device(args, 4); ops_set_halo_dirtybit3(&args[0], range); if (OPS_diags > 1) { // Update kernel record ops_timers_core(&c2, &t2); OPS_kernels[107].mpi_time += t2 - t1; OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg0); OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg1); OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg2); OPS_kernels[107].transfer += ops_compute_transfer(dim, start, end, &arg3); } }
int main() { typedef float ScalarType; ///////////////////////////////////////////////////////////////////////////////////////////////////////// //////////////////////// Part 1: Set up a custom context and perform a sample operation. //////////////// //////////////////////// This is rather lengthy due to the OpenCL framework. //////////////// //////////////////////// The following does essentially the same as the //////////////// //////////////////////// 'custom_kernels'-tutorial! //////////////// ///////////////////////////////////////////////////////////////////////////////////////////////////////// //manually set up a custom OpenCL context: std::vector<cl_device_id> device_id_array; //get all available devices viennacl::ocl::platform pf; std::cout << "Platform info: " << pf.info() << std::endl; std::vector<viennacl::ocl::device> devices = pf.devices(CL_DEVICE_TYPE_DEFAULT); std::cout << devices[0].name() << std::endl; std::cout << "Number of devices for custom context: " << devices.size() << std::endl; //set up context using all found devices: for (size_t i=0; i<devices.size(); ++i) { device_id_array.push_back(devices[i].id()); } std::cout << "Creating context..." << std::endl; cl_int err; cl_context my_context = clCreateContext(0, device_id_array.size(), &(device_id_array[0]), NULL, NULL, &err); VIENNACL_ERR_CHECK(err); //create two Vectors: unsigned int vector_size = 10; std::vector<ScalarType> vec1(vector_size); std::vector<ScalarType> vec2(vector_size); std::vector<ScalarType> result(vector_size); // // fill the operands vec1 and vec2: // for (unsigned int i=0; i<vector_size; ++i) { vec1[i] = static_cast<ScalarType>(i); vec2[i] = static_cast<ScalarType>(vector_size-i); } // // create memory in OpenCL context: // cl_mem mem_vec1 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec1[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_vec2 = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(vec2[0]), &err); VIENNACL_ERR_CHECK(err); cl_mem mem_result = clCreateBuffer(my_context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, vector_size * sizeof(ScalarType), &(result[0]), &err); VIENNACL_ERR_CHECK(err); // // create a command queue for each device: // std::vector<cl_command_queue> queues(devices.size()); for (size_t i=0; i<devices.size(); ++i) { queues[i] = clCreateCommandQueue(my_context, devices[i].id(), 0, &err); VIENNACL_ERR_CHECK(err); } // // create and build a program in the context: // size_t source_len = std::string(my_compute_program).length(); cl_program my_prog = clCreateProgramWithSource(my_context, 1, &my_compute_program, &source_len, &err); err = clBuildProgram(my_prog, 0, NULL, NULL, NULL, NULL); /* char buffer[1024]; cl_build_status status; clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_STATUS, sizeof(cl_build_status), &status, NULL); clGetProgramBuildInfo(my_prog, devices[1].id(), CL_PROGRAM_BUILD_LOG, sizeof(char)*1024, &buffer, NULL); std::cout << "Build Scalar: Err = " << err << " Status = " << status << std::endl; std::cout << "Log: " << buffer << std::endl;*/ VIENNACL_ERR_CHECK(err); // // create a kernel from the program: // const char * kernel_name = "elementwise_prod"; cl_kernel my_kernel = clCreateKernel(my_prog, kernel_name, &err); VIENNACL_ERR_CHECK(err); // // Execute elementwise_prod kernel on first queue: result = vec1 .* vec2; // err = clSetKernelArg(my_kernel, 0, sizeof(cl_mem), (void*)&mem_vec1); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 1, sizeof(cl_mem), (void*)&mem_vec2); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 2, sizeof(cl_mem), (void*)&mem_result); VIENNACL_ERR_CHECK(err); err = clSetKernelArg(my_kernel, 3, sizeof(unsigned int), (void*)&vector_size); VIENNACL_ERR_CHECK(err); size_t global_size = vector_size; size_t local_size = vector_size; err = clEnqueueNDRangeKernel(queues[0], my_kernel, 1, NULL, &global_size, &local_size, 0, NULL, NULL); VIENNACL_ERR_CHECK(err); // // Read and output result: // err = clEnqueueReadBuffer(queues[0], mem_vec1, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(vec1[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); err = clEnqueueReadBuffer(queues[0], mem_result, CL_TRUE, 0, sizeof(ScalarType)*vector_size, &(result[0]), 0, NULL, NULL); VIENNACL_ERR_CHECK(err); std::cout << "vec1 : "; for (size_t i=0; i<vec1.size(); ++i) std::cout << vec1[i] << " "; std::cout << std::endl; std::cout << "vec2 : "; for (size_t i=0; i<vec2.size(); ++i) std::cout << vec2[i] << " "; std::cout << std::endl; std::cout << "result: "; for (size_t i=0; i<result.size(); ++i) std::cout << result[i] << " "; std::cout << std::endl; //////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////// Part 2: Let ViennaCL use the already created context: ////////////////////////// //////////////////////////////////////////////////////////////////////////////////////////////////////// //Tell ViennaCL to use the previously created context. //This context is assigned an id '0' when using viennacl::ocl::switch_context(). viennacl::ocl::setup_context(0, my_context, device_id_array, queues); viennacl::ocl::switch_context(0); //activate the new context (only mandatory with context-id not equal to zero) // // Proof that ViennaCL really uses the new context: // std::cout << "Existing context: " << my_context << std::endl; std::cout << "ViennaCL uses context: " << viennacl::ocl::current_context().handle().get() << std::endl; // // Wrap existing OpenCL objects into ViennaCL: // viennacl::vector<ScalarType> vcl_vec1(mem_vec1, vector_size); viennacl::vector<ScalarType> vcl_vec2(mem_vec2, vector_size); viennacl::vector<ScalarType> vcl_result(mem_result, vector_size); viennacl::scalar<ScalarType> vcl_s = 2.0; std::cout << "Standard vector operations within ViennaCL:" << std::endl; vcl_result = vcl_s * vcl_vec1 + vcl_vec2; std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; // // We can also reuse the existing elementwise_prod kernel. // Therefore, we first have to make the existing program known to ViennaCL // For more details on the three lines, see tutorial 'custom-kernels' // std::cout << "Using existing kernel within the OpenCL backend of ViennaCL:" << std::endl; viennacl::ocl::program & my_vcl_prog = viennacl::ocl::current_context().add_program(my_prog, "my_compute_program"); viennacl::ocl::kernel & my_vcl_kernel = my_vcl_prog.add_kernel("elementwise_prod"); viennacl::ocl::enqueue(my_vcl_kernel(vcl_vec1, vcl_vec2, vcl_result, static_cast<cl_uint>(vcl_vec1.size()))); //Note that size_t might differ between host and device. Thus, a cast to cl_uint is necessary here. std::cout << "vec1 : "; std::cout << vcl_vec1 << std::endl; std::cout << "vec2 : "; std::cout << vcl_vec2 << std::endl; std::cout << "result: "; std::cout << vcl_result << std::endl; // // Since a linear piece of memory can be interpreted in several ways, // we will now create a 3x3 row-major matrix out of the linear memory in mem_vec1/ // The first three entries in vcl_vec2 and vcl_result are used to carry out matrix-vector products: // viennacl::matrix<ScalarType> vcl_matrix(mem_vec1, 3, 3); vcl_vec2.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result.resize(3); //note that the resize operation leads to new memory, thus vcl_vec2 is now at a different memory location (values are copied) vcl_result = viennacl::linalg::prod(vcl_matrix, vcl_vec2); std::cout << "result of matrix-vector product: "; std::cout << vcl_result << std::endl; // // That's it. // std::cout << "!!!! TUTORIAL COMPLETED SUCCESSFULLY !!!!" << std::endl; return 0; }
int main() { cl_int num_rand = 4096*256; /* The number of random numbers generated using one generator */ int count_all, i, num_generator = sizeof(mts)/sizeof(mts[0]); /* The number of generators */ double pi; cl_platform_id platform_id = NULL; cl_uint ret_num_platforms; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_program program = NULL; cl_kernel kernel_mt = NULL, kernel_pi = NULL; size_t kernel_code_size; char *kernel_src_str; cl_uint *result; cl_int ret; FILE *fp; cl_mem rand, count; size_t global_item_size[3], local_item_size[3]; cl_mem dev_mts; cl_event ev_mt_end, ev_pi_end, ev_copy_end; cl_ulong prof_start, prof_mt_end, prof_pi_end, prof_copy_end; clGetPlatformIDs(1, &platform_id, &ret_num_platforms); clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); result = (cl_uint*)malloc(sizeof(cl_uint)*num_generator); command_queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &ret); fp = fopen("mt.cl", "r"); kernel_src_str = (char*)malloc(MAX_SOURCE_SIZE); kernel_code_size = fread(kernel_src_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); /* Create output buffer */ rand = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_rand*num_generator, NULL, &ret); count = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_uint)*num_generator, NULL, &ret); /* Build Program*/ program = clCreateProgramWithSource(context, 1, (const char **)&kernel_src_str, (const size_t *)&kernel_code_size, &ret); clBuildProgram(program, 1, &device_id, "", NULL, NULL); kernel_mt = clCreateKernel(program, "genrand", &ret); kernel_pi = clCreateKernel(program, "calc_pi", &ret); /* Create input parameter */ dev_mts = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(mts), NULL, &ret); clEnqueueWriteBuffer(command_queue, dev_mts, CL_TRUE, 0, sizeof(mts), mts, 0, NULL, NULL); /* Set Kernel Arguments */ clSetKernelArg(kernel_mt, 0, sizeof(cl_mem), (void*)&rand); /* Random numbers (output of genrand) */ clSetKernelArg(kernel_mt, 1, sizeof(cl_mem), (void*)&dev_mts); /* MT parameter (input to genrand) */ clSetKernelArg(kernel_mt, 2, sizeof(num_rand), &num_rand); /* Number of random numbers to generate */ clSetKernelArg(kernel_pi, 0, sizeof(cl_mem), (void*)&count); /* Counter for points within circle (output of calc_pi) */ clSetKernelArg(kernel_pi, 1, sizeof(cl_mem), (void*)&rand); /* Random numbers (input to calc_pi) */ clSetKernelArg(kernel_pi, 2, sizeof(num_rand), &num_rand); /* Number of random numbers used */ global_item_size[0] = num_generator; global_item_size[1] = 1; global_item_size[2] = 1; local_item_size[0] = num_generator; local_item_size[1] = 1; local_item_size[2] = 1; /* Create a random number array */ clEnqueueNDRangeKernel(command_queue, kernel_mt, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_mt_end); /* Compute PI */ clEnqueueNDRangeKernel(command_queue, kernel_pi, 1, NULL, global_item_size, local_item_size, 0, NULL, &ev_pi_end); /* Get result */ clEnqueueReadBuffer(command_queue, count, CL_TRUE, 0, sizeof(cl_uint)*num_generator, result, 0, NULL, &ev_copy_end); /* Average the values of PI */ count_all = 0; for (i=0; i < num_generator; i++) { count_all += result[i]; } pi = ((double)count_all)/(num_rand * num_generator) * 4; printf("pi = %f\n", pi); /* Get execution time info */ clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_QUEUED, sizeof(cl_ulong), &prof_start, NULL); clGetEventProfilingInfo(ev_mt_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_mt_end, NULL); clGetEventProfilingInfo(ev_pi_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_pi_end, NULL); clGetEventProfilingInfo(ev_copy_end, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &prof_copy_end, NULL); printf(" mt: %f[ms]\n" " pi: %f[ms]\n" " copy: %f[ms]\n", (prof_mt_end - prof_start)/(1000000.0), (prof_pi_end - prof_mt_end)/(1000000.0), (prof_copy_end - prof_pi_end)/(1000000.0)); clReleaseEvent(ev_mt_end); clReleaseEvent(ev_pi_end); clReleaseEvent(ev_copy_end); clReleaseMemObject(rand); clReleaseMemObject(count); clReleaseKernel(kernel_mt); clReleaseKernel(kernel_pi); clReleaseProgram(program); clReleaseCommandQueue(command_queue); clReleaseContext(context); free(kernel_src_str); free(result); return 0; }
int main(void) { const size_t ARRAY_BYTES = ARRAY_SIZE * sizeof(float); // Generate the input array on the host. float h_a[ARRAY_SIZE]; float h_b[ARRAY_SIZE]; for (int i = 0; i < ARRAY_SIZE; i++) { h_a[i] = (float)i; h_b[i] = (float)(2 * i); } float h_c[ARRAY_SIZE]; FILE *fp; char *source_str; size_t source_size; fp = fopen("vectors_cl.cl", "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } source_str = (char *)malloc(MAX_SOURCE_SIZE); source_size = fread(source_str, 1, MAX_SOURCE_SIZE, fp); fclose(fp); // Get platform and device information cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); // Create an OpenCL context cl_context context = clCreateContext(NULL, 1, &device_id, NULL, NULL, &ret); // Create a command queue cl_command_queue command_queue = clCreateCommandQueue(context, device_id, 0, &ret); // Create memory buffers on the device for each vector cl_mem a_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem b_mem_obj = clCreateBuffer(context, CL_MEM_READ_ONLY, ARRAY_BYTES, NULL, &ret); cl_mem c_mem_obj = clCreateBuffer(context, CL_MEM_WRITE_ONLY, ARRAY_BYTES, NULL, &ret); // Copy h_a and h_b to memory buffer ret = clEnqueueWriteBuffer(command_queue, a_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_a, 0, NULL, NULL); ret = clEnqueueWriteBuffer(command_queue, b_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_b, 0, NULL, NULL); // Create a program from the kernel source cl_program program = clCreateProgramWithSource(context, 1, (const char **)&source_str, (const size_t *)&source_size, &ret); if (ret != 0) { printf("clCreateProgramWithSource returned non-zero status %d\n\n", ret); exit(1); } // Build the program ret = clBuildProgram(program, 1, &device_id, NULL, NULL, NULL); if (ret != 0) { printf("clBuildProgram returned non-zero status %d: ", ret); if (ret == CL_INVALID_PROGRAM) { printf("invalid program\n"); } else if (ret == CL_INVALID_VALUE) { printf("invalid value\n"); } else if (ret == CL_INVALID_DEVICE) { printf("invalid device\n"); } else if (ret == CL_INVALID_BINARY) { printf("invalid binary\n"); } else if (ret == CL_INVALID_BUILD_OPTIONS) { printf("invalid build options\n"); } else if (ret == CL_INVALID_OPERATION) { printf("invalid operation\n"); } else if (ret == CL_COMPILER_NOT_AVAILABLE) { printf("compiler not available\n"); } else if (ret == CL_BUILD_PROGRAM_FAILURE) { printf("build program failure\n"); // Determine the size of the log size_t log_size; clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &log_size); // Allocate memory for the log char *log = (char *) malloc(log_size); // Get the log clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, log_size, log, NULL); // Print the log printf("%s\n", log); } else if (ret == CL_OUT_OF_HOST_MEMORY) { printf("out of host memory\n"); } exit(1); } // Create the OpenCL kernel cl_kernel kernel = clCreateKernel(program, "add", &ret); // Set the arguments of the kernel ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&a_mem_obj); ret = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&b_mem_obj); ret = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void *)&c_mem_obj); size_t array_size = ARRAY_SIZE; ret = clSetKernelArg(kernel, 3, sizeof(const size_t), (void *)&array_size); // Execute the OpenCL kernel on the list size_t global_item_size = ARRAY_SIZE; // Process the entire lists size_t local_item_size = 1; // Divide work items into groups of 64 ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, &global_item_size, &local_item_size, 0, NULL, NULL); // Read the memory buffer C on the device to the local variable C ret = clEnqueueReadBuffer(command_queue, c_mem_obj, CL_TRUE, 0, ARRAY_BYTES, h_c, 0, NULL, NULL); // Print out the resulting array. for (int i = 0; i < 8; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } printf("...\n"); for (int i = ARRAY_SIZE - 8; i < ARRAY_SIZE; i++) { printf("%d + %d = %d", (int)h_a[i], (int)h_b[i], (int)h_c[i]); printf(((i % 4) != 3) ? "\t" : "\n"); } // Clean up ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(a_mem_obj); ret = clReleaseMemObject(b_mem_obj); ret = clReleaseMemObject(c_mem_obj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); return 0; }
// main() for simple buffer and sub-buffer example // int main(int argc, char** argv) { cl_int errNum; cl_uint numPlatforms; cl_uint numDevices; cl_platform_id * platformIDs; cl_device_id * deviceIDs; cl_context context; cl_program program; std::vector<cl_kernel> kernels; std::vector<cl_command_queue> queues; std::vector<cl_mem> buffers; int * inputOutput; std::cout << "Simple buffer and sub-buffer Example" << std::endl; // First, select an OpenCL platform to run on. errNum = clGetPlatformIDs(0, NULL, &numPlatforms); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); platformIDs = (cl_platform_id *)alloca(sizeof(cl_platform_id) * numPlatforms); std::cout << "Number of platforms: \t" << numPlatforms << std::endl; errNum = clGetPlatformIDs(numPlatforms, platformIDs, NULL); checkErr( (errNum != CL_SUCCESS) ? errNum : (numPlatforms <= 0 ? -1 : CL_SUCCESS), "clGetPlatformIDs"); std::ifstream srcFile("simple.cl"); checkErr(srcFile.is_open() ? CL_SUCCESS : -1, "reading simple.cl"); std::string srcProg( std::istreambuf_iterator<char>(srcFile), (std::istreambuf_iterator<char>())); const char * src = srcProg.c_str(); size_t length = srcProg.length(); deviceIDs = NULL; DisplayPlatformInfo( platformIDs[PLATFORM_INDEX], CL_PLATFORM_VENDOR, "CL_PLATFORM_VENDOR"); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, 0, NULL, &numDevices); if (errNum != CL_SUCCESS && errNum != CL_DEVICE_NOT_FOUND){ checkErr(errNum, "clGetDeviceIDs"); } deviceIDs = (cl_device_id *)alloca( sizeof(cl_device_id) * numDevices); errNum = clGetDeviceIDs( platformIDs[PLATFORM_INDEX], CL_DEVICE_TYPE_ALL, numDevices, &deviceIDs[0], NULL); checkErr(errNum, "clGetDeviceIDs"); cl_context_properties contextProperties[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)platformIDs[PLATFORM_INDEX], 0 }; context = clCreateContext( contextProperties, numDevices, deviceIDs, NULL, NULL, &errNum); checkErr(errNum, "clCreateContext"); // Create program from source program = clCreateProgramWithSource( context, 1, &src, &length, &errNum); checkErr(errNum, "clCreateProgramWithSource"); // Build program errNum = clBuildProgram( program, numDevices, deviceIDs, "-I.", NULL, NULL); if (errNum != CL_SUCCESS){ // Determine the reason for the error char buildLog[16384]; clGetProgramBuildInfo( program, deviceIDs[0], CL_PROGRAM_BUILD_LOG, sizeof(buildLog), buildLog, NULL); std::cerr << "Error in OpenCL C source: " << std::endl; std::cerr << buildLog; checkErr(errNum, "clBuildProgram"); } // create buffers and sub-buffers inputOutput = new int[NUM_BUFFER_ELEMENTS * numDevices]; for (unsigned int i = 0; i < NUM_BUFFER_ELEMENTS * numDevices; i++) { inputOutput[i] = i; } // create a single buffer to cover all the input data cl_mem buffer = clCreateBuffer( context, CL_MEM_READ_WRITE, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, NULL, &errNum); checkErr(errNum, "clCreateBuffer"); buffers.push_back(buffer); // now for all devices other than the first create a sub-buffer for (unsigned int i = 1; i < numDevices; i++) { cl_buffer_region region = { NUM_BUFFER_ELEMENTS * i * sizeof(int), NUM_BUFFER_ELEMENTS * sizeof(int) }; buffer = clCreateSubBuffer( buffers[0], CL_MEM_READ_WRITE, CL_BUFFER_CREATE_TYPE_REGION, ®ion, &errNum); checkErr(errNum, "clCreateSubBuffer"); buffers.push_back(buffer); } // Create command queues for (int i = 0; i < numDevices; i++) { InfoDevice<cl_device_type>::display(deviceIDs[i], CL_DEVICE_TYPE, "CL_DEVICE_TYPE"); cl_command_queue queue = clCreateCommandQueue( context, deviceIDs[i], 0, &errNum); checkErr(errNum, "clCreateCommandQueue"); queues.push_back(queue); cl_kernel kernel = clCreateKernel( program, "square", &errNum); checkErr(errNum, "clCreateKernel(square)"); errNum = clSetKernelArg( kernel, 0, sizeof(cl_mem), (void *)&buffers[i]); checkErr(errNum, "clSetKernelArg(square)"); kernels.push_back(kernel); // Write input data clEnqueueWriteBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); std::vector<cl_event> events; // call kernel for each device for (int i = 0; i < queues.size(); i++) { cl_event event; size_t gWI = NUM_BUFFER_ELEMENTS; errNum = clEnqueueNDRangeKernel( queues[i], kernels[i], 1, NULL, (const size_t*)&gWI, (const size_t*)NULL, 0, 0, &event); events.push_back(event); } // Technically don't need this as we are doing a blocking read // with in-order queue. clWaitForEvents(events.size(), events.data()); // Read back computed data clEnqueueReadBuffer( queues[0], buffers[0], CL_TRUE, 0, sizeof(int) * NUM_BUFFER_ELEMENTS * numDevices, (void*)inputOutput, 0, NULL, NULL); // Display output in rows for (unsigned i = 0; i < numDevices; i++) { for (unsigned elems = i * NUM_BUFFER_ELEMENTS; elems < ((i+1) * NUM_BUFFER_ELEMENTS); elems++) { std::cout << " " << inputOutput[elems]; } std::cout << std::endl; } std::cout << "Program completed successfully" << std::endl; return 0; } }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int err; /* Data and events */ char *kernel_msg; float data[4096]; cl_mem data_buffer; cl_event kernel_event, read_event; /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a write-only buffer to hold the output data */ data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(data), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Create a command queue */ queue = clCreateCommandQueue(context, device, 0, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, &kernel_event); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } /* Read the buffer */ err = clEnqueueReadBuffer(queue, data_buffer, CL_FALSE, 0, sizeof(data), &data, 0, NULL, &read_event); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } /* Set event handling routines */ kernel_msg = "The kernel finished successfully.\n\0"; err = clSetEventCallback(kernel_event, CL_COMPLETE, &kernel_complete, kernel_msg); if(err < 0) { perror("Couldn't set callback for event"); exit(1); } clSetEventCallback(read_event, CL_COMPLETE, &read_complete, data); /* Deallocate resources */ clReleaseMemObject(data_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
int main( int argc, char **argv ) { struct pb_Parameters *params; params = pb_ReadParameters(&argc, argv); if ((params->inpFiles[0] == NULL) || (params->inpFiles[1] != NULL)) { fprintf(stderr, "Expecting one input filename\n"); exit(-1); } int err = 0; if(argc != 3) err |= 1; else { char* numend; N = strtol(argv[1], &numend, 10); if(numend == argv[1]) err |= 2; B = strtol(argv[2], &numend, 10); if(numend == argv[2]) err |= 4; } if(err) { fprintf(stderr, "Expecting two integers for N and B\n"); exit(-1); } //8*1024*1024; int n_bytes = N * B* sizeof(float2); int nthreads = T; struct pb_TimerSet timers; pb_InitializeTimerSet(&timers); pb_AddSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); float *shared_source =(float *)malloc(n_bytes); float2 *source = (float2 *)malloc( n_bytes ); float2 *result = (float2 *)calloc( N*B, sizeof(float2) ); inputData(params->inpFiles[0],(float*)source,N*B*2); // OpenCL Code cl_int clErrNum; pb_Context* pb_context; pb_context = pb_InitOpenCLContext(params); if (pb_context == NULL) { fprintf (stderr, "Error: No OpenCL platform/device can be found."); return -1; } cl_int clStatus; cl_device_id clDevice = (cl_device_id) pb_context->clDeviceId; cl_platform_id clPlatform = (cl_platform_id) pb_context->clPlatformId; cl_context clContext = (cl_context) pb_context->clContext; cl_command_queue clCommandQueue; cl_program clProgram; cl_kernel fft_kernel; cl_mem d_source, d_work; //float2 *d_source, *d_work; cl_mem d_shared_source; //float *d_shared_source; clCommandQueue = clCreateCommandQueue(clContext, clDevice, CL_QUEUE_PROFILING_ENABLE, &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SetOpenCL(&clContext, &clCommandQueue); pb_SwitchToSubTimer(&timers, oclOverhead, pb_TimerID_KERNEL); const char *source_path = "src/opencl_nvidia/fft_kernel.cl"; char *sourceCode; sourceCode = readFile(source_path); if (sourceCode == NULL) { fprintf(stderr, "Could not load program source of '%s'\n", source_path); exit(1); } clProgram = clCreateProgramWithSource(clContext, 1, (const char **)&sourceCode, NULL, &clErrNum); OCL_ERRCK_VAR(clErrNum); free(sourceCode); /* char compileOptions[1024]; // -cl-nv-verbose // Provides register info for NVIDIA devices // Set all Macros referenced by kernels sprintf(compileOptions, "\ -D PRESCAN_THREADS=%u\ -D KB=%u -D UNROLL=%u\ -D BINS_PER_BLOCK=%u -D BLOCK_X=%u", prescanThreads, lmemKB, UNROLL, bins_per_block, blockX ); */ OCL_ERRCK_RETVAL ( clBuildProgram(clProgram, 1, &clDevice, NULL /*compileOptions*/, NULL, NULL) ); char *build_log; size_t ret_val_size; OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, 0, NULL, &ret_val_size) ); build_log = (char *)malloc(ret_val_size+1); OCL_ERRCK_RETVAL ( clGetProgramBuildInfo(clProgram, clDevice, CL_PROGRAM_BUILD_LOG, ret_val_size, build_log, NULL) ); // to be careful, terminate with \0 build_log[ret_val_size] = '\0'; fprintf(stderr, "%s\n", build_log ); fft_kernel = clCreateKernel(clProgram, "GPU_FftShMem", &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SwitchToTimer(&timers, pb_TimerID_COPY); // allocate & copy device memory d_shared_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, shared_source, &clErrNum); OCL_ERRCK_VAR(clErrNum); d_source = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, source, &clErrNum); OCL_ERRCK_VAR(clErrNum); //result is initially zero'd out d_work = clCreateBuffer(clContext, CL_MEM_COPY_HOST_PTR, n_bytes, result, &clErrNum); OCL_ERRCK_VAR(clErrNum); pb_SwitchToTimer(&timers, pb_TimerID_KERNEL); size_t block[1] = { nthreads }; size_t grid[1] = { B*block[0] }; OCL_ERRCK_RETVAL( clSetKernelArg(fft_kernel, 0, sizeof(cl_mem), (void *)&d_source) ); OCL_ERRCK_RETVAL ( clEnqueueNDRangeKernel(clCommandQueue, fft_kernel, 1, 0, grid, block, 0, 0, 0) ); pb_SwitchToTimer(&timers, pb_TimerID_COPY); // copy device memory to host OCL_ERRCK_RETVAL( clEnqueueReadBuffer(clCommandQueue, d_source, CL_TRUE, 0, // Offset in bytes n_bytes, // Size of data to read result, // Host Source 0, NULL, NULL) ); if (params->outFile) { /* Write result to file */ pb_SwitchToTimer(&timers, pb_TimerID_IO); outputData(params->outFile, (float*)result, N*B*2); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); } OCL_ERRCK_RETVAL ( clReleaseMemObject(d_source) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(d_work) ); OCL_ERRCK_RETVAL ( clReleaseMemObject(d_shared_source) ); pb_SwitchToTimer(&timers, pb_TimerID_COMPUTE); free(shared_source); free(source); free(result); pb_SwitchToTimer(&timers, pb_TimerID_NONE); pb_PrintTimerSet(&timers); pb_DestroyTimerSet(&timers); return 0; }
int main() { cl_platform_id platform_id = NULL; cl_device_id device_id = NULL; cl_context context = NULL; cl_command_queue command_queue = NULL; cl_mem memobj = NULL; cl_program program = NULL; cl_kernel kernel = NULL; cl_uint ret_num_devices; cl_uint ret_num_platforms; cl_int ret; float mem[MEM_SIZE]; FILE *fp; char fileName[] = "./kernel.clbin"; size_t binary_size; char *binary_buf; cl_int binary_status; cl_int i; /* カーネルを含むオブジェクトファイルをロード */ fp = fopen(fileName, "r"); if (!fp) { fprintf(stderr, "Failed to load kernel.\n"); exit(1); } binary_buf = (char *)malloc(MAX_BINARY_SIZE); binary_size = fread( binary_buf, 1, MAX_BINARY_SIZE, fp ); fclose( fp ); /* データを初期化 */ for( i = 0; i < MEM_SIZE; i++ ) { mem[i] = i; } /* プラットフォーム・デバイスの情報の取得 */ ret = clGetPlatformIDs(1, &platform_id, &ret_num_platforms); ret = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_DEFAULT, 1, &device_id, &ret_num_devices); /* OpenCLコンテキストの作成 */ context = clCreateContext( NULL, 1, &device_id, NULL, NULL, &ret); /* コマンドキューの作成 */ command_queue = clCreateCommandQueue(context, device_id, 0, &ret); /* メモリバッファの作成 */ memobj = clCreateBuffer(context, CL_MEM_READ_WRITE, MEM_SIZE * sizeof(float), NULL, &ret); /* メモリバッファにデータを転送 */ ret = clEnqueueWriteBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 読み込んだバイナリからカーネルプログラムを作成 */ program = clCreateProgramWithBinary(context, 1, &device_id, (const size_t *)&binary_size, (const unsigned char **)&binary_buf, &binary_status, &ret); /* OpenCLカーネルの作成 */ kernel = clCreateKernel(program, "vecAdd", &ret); printf("err:%d\n", ret); /* OpenCLカーネル引数の設定 */ ret = clSetKernelArg(kernel, 0, sizeof(cl_mem), (void *)&memobj); size_t global_work_size[3] = {MEM_SIZE, 0, 0}; size_t local_work_size[3] = {MEM_SIZE, 0, 0}; /* OpenCLカーネルを実行 */ ret = clEnqueueNDRangeKernel(command_queue, kernel, 1, NULL, global_work_size, local_work_size, 0, NULL, NULL); /* メモリバッファから結果を取得 */ ret = clEnqueueReadBuffer(command_queue, memobj, CL_TRUE, 0, MEM_SIZE * sizeof(float), mem, 0, NULL, NULL); /* 結果の表示 */ for(i=0; i<MEM_SIZE; i++) { printf("mem[%d] : %f\n", i, mem[i]); } /* 終了処理 */ ret = clFlush(command_queue); ret = clFinish(command_queue); ret = clReleaseKernel(kernel); ret = clReleaseProgram(program); ret = clReleaseMemObject(memobj); ret = clReleaseCommandQueue(command_queue); ret = clReleaseContext(context); free(binary_buf); return 0; }
void b2CLNarrowPhase::UpdateContactPairs(int contactNum, int *pContactNums, int maxContactNum/*, b2ContactListener* listener*/) { //// for debug //int *fill_data = new int[contactNum]; //for (int i=0; i<contactNum; i++) // fill_data[i] = 231; //b2CLDevice::instance().copyArrayToDevice(b2CLCommonData::instance().manifoldBinaryBitListBuffer, fill_data, 0, sizeof(int) * contactNum); //delete [] fill_data; //// for debug //int* globalIndices = new int[50*4]; //b2CLDevice::instance().copyArrayFromDevice(globalIndices, b2CLCommonData::instance().globalIndicesBuffer, 0, sizeof(int)*4*50, true); //int* indices = new int[200]; //b2CLDevice::instance().copyArrayFromDevice(indices, b2CLCommonData::instance().pairIndicesBuffer, 0, sizeof(int)*200, true); //int *test = new int[contactNum]; // memset(test, 0, sizeof(int)*contactNum); //for (int i=0; i<contactNum; i++) // test[indices[i]] = 1; //for (int i=0; i<contactNum; i++) // if (test[i]==0) // int a = 1; //delete [] test; //delete [] indices; //delete [] globalIndices; for (int contactType=0; contactType<b2Shape::contact_type_num; contactType++) { if (pContactNums[0]!=0) int a = 0; if (pContactNums[contactType]>0) { unsigned int a = 0; cl_kernel collideKernel; switch (contactType) { case 0: // circle-circle collideKernel = collideCirclesKernel; break; case 1: // circle-polygon collideKernel = collidePolygonAndCircleKernel; break; case 2: // polygon-polygon collideKernel = collidePolygonsKernel; break; case 3: // edge-circle collideKernel = collideEdgeAndCircleKernel; break; case 4: // edge-polygon collideKernel = collideEdgeAndPolygonKernel; break; default: printf("Error! Unsupported contact type: %d!\n", contactType); exit(0); } int err = CL_SUCCESS; err |= clSetKernelArg(collideKernel, a++, sizeof(cl_mem), &(b2CLCommonData::instance().manifoldListBuffers[b2CLCommonData::instance().currentManifoldBuffer])); #if defined(SCAN_OPENCL) err |= clSetKernelArg(collideKernel, a++, sizeof(cl_mem), &(b2CLCommonData::instance().manifoldBinaryBitListBuffer)); #endif err |= clSetKernelArg(collideKernel, a++, sizeof(cl_mem), &(b2CLCommonData::instance().shapeListBuffer)); err |= clSetKernelArg(collideKernel, a++, sizeof(cl_mem), &(b2CLCommonData::instance().xfListBuffer)); err |= clSetKernelArg(collideKernel, a++, sizeof(cl_mem), &(b2CLCommonData::instance().globalIndicesBuffer)); err |= clSetKernelArg(collideKernel, a++, sizeof(cl_mem), &(b2CLCommonData::instance().pairIndicesBuffer)); err |= clSetKernelArg(collideKernel, a++, sizeof(int), &maxContactNum); err |= clSetKernelArg(collideKernel, a++, sizeof(int), pContactNums+contactType); if (err != CL_SUCCESS) { printf("Error: %s: Failed to set kernel arguments!\n", (char *) collidePolygonsKernel); return; } int group_num = (pContactNums[contactType] + kernel_work_group_size-1)/kernel_work_group_size; size_t global = group_num * kernel_work_group_size; //cout << contactNum << ", " << group_num << endl; err = CL_SUCCESS; err |= clEnqueueNDRangeKernel(b2CLDevice::instance().GetCommandQueue(), collideKernel, 1, NULL, &global, &kernel_work_group_size, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Collide Kernel: Failed to execute kernel!\n"); return; } #ifdef _DEBUG //// for debug //b2clManifold *testManifold = new b2clManifold[contactNum]; //b2CLDevice::instance().copyArrayFromDevice(testManifold, b2CLCommonData::instance().manifoldListBuffers[b2CLCommonData::instance().currentManifoldBuffer], 0, sizeof(b2clManifold)*contactNum,true); //int* input = new int[contactNum]; //b2CLDevice::instance().copyArrayFromDevice(input, b2CLCommonData::instance().manifoldBinaryBitListBuffer, 0, sizeof(int)*contactNum, true); //for (int i=0; i<contactNum; i++) //{ // if (input[i]!=0 && input[i]!=1) // int a = 0; //} //delete [] testManifold; //delete [] input; #endif } } }
void* RenderDisplay(void* arguments) { int err = 0; struct PASSING_OCL *ocl_info = (struct PASSING_OCL *)arguments; cl_command_queue commandQueue; cl_uint RandomSeeds[WorkAmount * 2]; cl_mem Pixels; cl_mem Seeds; unsigned int PixelData[WorkAmount]; for(int i = 0; i< WorkAmount*2; i++) { RandomSeeds[i] = rand(); if (RandomSeeds[i] < 2) RandomSeeds[i] = 2; } commandQueue = clCreateCommandQueue(ocl_info->ocl.context, ocl_info->ocl.device[ocl_info->device_index], CL_QUEUE_PROFILING_ENABLE, &err); if(err != CL_SUCCESS) { printf("Error: Failed clCreateCommandQueue\n"); return NULL; } Pixels = clCreateBuffer(ocl_info->ocl.context, CL_MEM_WRITE_ONLY | CL_MEM_ALLOC_HOST_PTR, sizeof(cl_uint) * WorkAmount, NULL, &err); if(err != CL_SUCCESS) { printf("Error: Failed pixels create buffer\n"); return NULL; } Seeds = clCreateBuffer(ocl_info->ocl.context, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, sizeof(cl_uint) * WorkAmount*2, RandomSeeds, &err); if(err != CL_SUCCESS) { printf("Error: Failed Seeds create buffer\n"); return NULL; } for(int i = ocl_info->workNDRangeStart; i<ocl_info->workNDRangeEnd; i++) { size_t globalWorkSize[1] = {WorkAmount}; size_t localWorkSize[1] = {ocl_info->localSize}; err = clSetKernelArg(ocl_info->ocl.kernel, 8, sizeof(cl_uint), (void*)&Seeds); if(CL_SUCCESS != err) { printf("Error: Failed to set argument Seeds\n"); return NULL; } err = clSetKernelArg(ocl_info->ocl.kernel, 9, sizeof(cl_mem), (void*)&Pixels); if(CL_SUCCESS != err) { printf("Error: Failed to set argument Pixels\n"); return NULL; } err = clSetKernelArg(ocl_info->ocl.kernel, 10, sizeof(cl_uint), (void*)&i); if(CL_SUCCESS != err) { printf("Error: Failed to set argument i\n"); return NULL; } err = clEnqueueNDRangeKernel(commandQueue, ocl_info->ocl.kernel, 1, NULL, globalWorkSize, localWorkSize, NULL, 0, NULL); if(CL_SUCCESS != err) { printf("Error: Failed to run kernel to run\n"); return NULL; } clEnqueueReadBuffer(commandQueue, Pixels, CL_TRUE, 0, WorkAmount * sizeof(cl_uint), PixelData, 0, NULL, NULL); for(int j=0; j<WorkAmount; j++) { ImagePixel[j + i*WorkAmount] = PixelData[j]; } } if(clFinish(commandQueue) != CL_SUCCESS) { printf("Error: Failed to Finish"); return NULL; } return NULL; }