int main(int argc, char** argv) { cl_platform_id pf[MAX_PLATFORMS]; cl_uint nb_platforms = 0; cl_int err; // error code returned from api calls cl_device_type device_type = CL_DEVICE_TYPE_ALL; // Filter args // argv++; while (argc > 1) { if(!strcmp(*argv, "-g") || !strcmp(*argv, "--gpu-only")) { if(device_type != CL_DEVICE_TYPE_ALL) error("--gpu-only and --cpu-only can not be specified at the same time\n"); device_type = CL_DEVICE_TYPE_GPU; } else if(!strcmp(*argv, "-c") || !strcmp(*argv, "--cpu-only")) { if(device_type != CL_DEVICE_TYPE_ALL) error("--gpu-only and --cpu-only can not be specified at the same time\n"); device_type = CL_DEVICE_TYPE_CPU; } else if(!strcmp(*argv, "-s") || !strcmp(*argv, "--size")) { unsigned i; int r; char c; r = sscanf(argv[1], "%u%[mMkK]", &SIZE, &c); if (r == 2) { if (c == 'k' || c == 'K') SIZE *= 1024; else if (c == 'm' || c == 'M') SIZE *= 1024 * 1024; } argc--; argv++; } else break; argc--; argv++; } if(argc > 1) TILE = atoi(*argv); // Get list of OpenCL platforms detected // err = clGetPlatformIDs(3, pf, &nb_platforms); check(err, "Failed to get platform IDs"); printf("%d OpenCL platforms detected\n", nb_platforms); // For each platform do // for (cl_int p = 0; p < nb_platforms; p++) { cl_uint num; int platform_valid = 1; char name[1024], vendor[1024]; cl_device_id devices[MAX_DEVICES]; cl_uint nb_devices = 0; cl_context context; // compute context cl_program program; // compute program cl_kernel kernel; err = clGetPlatformInfo(pf[p], CL_PLATFORM_NAME, 1024, name, NULL); check(err, "Failed to get Platform Info"); err = clGetPlatformInfo(pf[p], CL_PLATFORM_VENDOR, 1024, vendor, NULL); check(err, "Failed to get Platform Info"); printf("Platform %d: %s - %s\n", p, name, vendor); // Get list of devices // err = clGetDeviceIDs(pf[p], device_type, MAX_DEVICES, devices, &nb_devices); printf("nb devices = %d\n", nb_devices); if(nb_devices == 0) continue; // Create compute context with "device_type" devices // context = clCreateContext (0, nb_devices, devices, NULL, NULL, &err); check(err, "Failed to create compute context"); // Load program source into memory // const char *opencl_prog; opencl_prog = file_load(KERNEL_FILE); // Attach program source to context // program = clCreateProgramWithSource(context, 1, &opencl_prog, NULL, &err); check(err, "Failed to create program"); // Compile program // { char flags[1024]; sprintf (flags, "-cl-mad-enable -cl-fast-relaxed-math -DSIZE=%d -DTILE=%d -DTYPE=%s", SIZE, TILE, "float"); err = clBuildProgram (program, 0, NULL, flags, NULL, NULL); if(err != CL_SUCCESS) { size_t len; // Display compiler log // clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, 0, NULL, &len); { char buffer[len+1]; fprintf(stderr, "--- Compiler log ---\n"); clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "%s\n", buffer); fprintf(stderr, "--------------------\n"); } if(err != CL_SUCCESS) error("Failed to build program!\n"); } } // Create the compute kernel in the program we wish to run // kernel = clCreateKernel(program, KERNEL_NAME, &err); check(err, "Failed to create compute kernel"); // Allocate and initialize input data // alloc_buffers_and_user_data(context); // Iterate over devices // for(cl_int dev = 0; dev < nb_devices; dev++) { cl_command_queue queue; char name[1024]; cl_device_type dtype; err = clGetDeviceInfo(devices[dev], CL_DEVICE_NAME, 1024, name, NULL); check(err, "Cannot get type of device"); err = clGetDeviceInfo(devices[dev], CL_DEVICE_TYPE, sizeof(cl_device_type), &dtype, NULL); check(err, "Cannot get type of device"); printf("\tDevice %d : %s [%s]\n", dev, (dtype == CL_DEVICE_TYPE_GPU) ? "GPU" : "CPU", name); // Create a command queue // queue = clCreateCommandQueue(context, devices[dev], CL_QUEUE_PROFILING_ENABLE, &err); check(err,"Failed to create command queue"); // Write our data set into device buffer // send_input(queue); // Execute kernel // { cl_event prof_event; cl_ulong start, end; struct timeval t1,t2; double timeInMicroseconds; size_t global[2] = { SIZE, SIZE }; // global domain size for our calculation size_t local[2] = { TILE, TILE }; // local domain size for our calculation printf("\t%dx%d Threads in workgroups of %dx%d\n", global[0], global[1], local[0], local[1]); // Set kernel arguments // err = 0; err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_buffer); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_buffer); check(err, "Failed to set kernel arguments"); gettimeofday (&t1, NULL); for (unsigned iter = 0; iter < ITERATIONS; iter++) { err = clEnqueueNDRangeKernel(queue, kernel, 2, NULL, global, local, 0, NULL, &prof_event); check(err, "Failed to execute kernel"); } // Wait for the command commands to get serviced before reading back results // clFinish(queue); gettimeofday (&t2,NULL); // Check performance // timeInMicroseconds = (double)TIME_DIFF(t1, t2) / ITERATIONS; printf("\tComputation performed in %lf µs over device #%d\n", timeInMicroseconds, dev); clReleaseEvent(prof_event); } // Read back the results from the device to verify the output // retrieve_output(queue); // Validate computation // check_output_data(); clReleaseCommandQueue(queue); } // Cleanup // free_buffers_and_user_data(); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseContext(context); } return 0; }
// Example use of the double-precision Xtrsm routine DTRSM, solving A*X = alpha*B, storing the // result in the memory of matrix B. Uses row-major storage (C-style). int main() { // OpenCL platform/device settings const auto platform_id = 0; const auto device_id = 0; // Example TRSM arguments const size_t m = 4; const size_t n = 3; const double alpha = 1.0; const auto a_ld = m; const auto b_ld = n; // Initializes the OpenCL platform auto platforms = std::vector<cl::Platform>(); cl::Platform::get(&platforms); if (platforms.size() == 0 || platform_id >= platforms.size()) { return 1; } auto platform = platforms[platform_id]; // Initializes the OpenCL device auto devices = std::vector<cl::Device>(); platform.getDevices(CL_DEVICE_TYPE_ALL, &devices); if (devices.size() == 0 || device_id >= devices.size()) { return 1; } auto device = devices[device_id]; // Creates the OpenCL context, queue, and an event auto device_as_vector = std::vector<cl::Device>{device}; auto context = cl::Context(device_as_vector); auto queue = cl::CommandQueue(context, device); auto event = cl_event{nullptr}; // Populate host matrices with some example data auto host_a = std::vector<double>({1.0, 2.0, 1.0, -2.0, 0.0, -1.0, -2.0, 0.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0, -1.0}); auto host_b = std::vector<double>({-1.0, -1.0, 3.0, 1.0, -3.0, 2.0, 1.0, 1.0, -1.0, 4.0, -1.0, -2.0}); // Expected result: // 8 -5 2 // -11 3 4 // 5 0 -3 // -4 1 2 // Copy the matrices to the device auto device_a = cl::Buffer(context, CL_MEM_READ_WRITE, host_a.size()*sizeof(double)); auto device_b = cl::Buffer(context, CL_MEM_READ_WRITE, host_b.size()*sizeof(double)); queue.enqueueWriteBuffer(device_a, CL_TRUE, 0, host_a.size()*sizeof(double), host_a.data()); queue.enqueueWriteBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data()); // Call the DTRSM routine. Note that the type of alpha and beta (double) determine the precision. auto queue_plain = queue(); auto status = clblast::Trsm(clblast::Layout::kRowMajor, clblast::Side::kLeft, clblast::Triangle::kUpper, clblast::Transpose::kNo, clblast::Diagonal::kNonUnit, m, n, alpha, device_a(), 0, a_ld, device_b(), 0, b_ld, &queue_plain, &event); // Retrieves the results if (status == clblast::StatusCode::kSuccess) { clWaitForEvents(1, &event); clReleaseEvent(event); } queue.enqueueReadBuffer(device_b, CL_TRUE, 0, host_b.size()*sizeof(double), host_b.data()); // Example completed. See "clblast.h" for status codes (0 -> success). printf("Completed TRSM with status %d and results:\n", static_cast<int>(status)); for (auto i = size_t{0}; i < m; ++i) { for (auto j = size_t{0}; j < n; ++j) { printf("%3.0f ", host_b[i * b_ld + j]); } printf("\n"); } return 0; }
/** * \related cl_Mem_Object_t * * This fucntion unmaps previously mapped memory for OpenCL buffer. * * @param[in,out] self pointer to structure, in which 'Unmap' function pointer * is defined to point on this function. * @param[out] p_mapped_ptr pointer to pointer, that was returned as the * result of mapping operation. * @param[in] time_mode enumeration, that denotes how time measurement should be * performed. * @param[out] evt_to_generate pointer to OpenCL event that will be generated * at the end of operation. * * @return CL_SUCCESS in case of success, error code of type 'ret_code' otherwise. * * @see cl_err_codes.h for detailed error description. * @see 'cl_Error_t' structure for error handling. */ static ret_code Mem_Object_Unmap( scow_Mem_Object *self, cl_bool blocking_map, void **p_mapped_ptr, TIME_STUDY_MODE time_mode, cl_event *evt_to_generate, cl_command_queue explicit_queue) { cl_int ret; cl_event *p_unmapping_ready; OCL_CHECK_EXISTENCE(self, INVALID_BUFFER_GIVEN); OCL_CHECK_EXISTENCE(self->mapped_to_region, MEM_OBJ_NOT_MAPPED); if (p_mapped_ptr) { OCL_CHECK_EXISTENCE(*p_mapped_ptr, INVALID_BUFFER_GIVEN); /* Check if we are trying to unmap pointer, that was mapped from different * Memory Object. */ if (self->mapped_to_region != *p_mapped_ptr) { OCL_DIE_ON_ERROR(WRONG_PARENT_OBJECT, CL_SUCCESS, NULL, WRONG_PARENT_OBJECT); } } /* We generate event in any case - because later we may want to wait for * unmapping completion. */ (evt_to_generate != NULL) ? (p_unmapping_ready = evt_to_generate) : (p_unmapping_ready = &self->unmap_evt); cl_command_queue q = (explicit_queue == NULL) ? (self->parent_thread->q_data_htod) : (explicit_queue); ret = clEnqueueUnmapMemObject(q, self->cl_mem_object, self->mapped_to_region, 0, NULL, p_unmapping_ready); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret); self->mapped_to_region = NULL; self->row_pitch = 0; if (p_mapped_ptr != NULL) { *p_mapped_ptr = NULL; } switch (time_mode) { case MEASURE: self->timer->current_time_device = Gather_Time_uS(p_unmapping_ready); self->timer->total_time_device += self->timer->current_time_device; break; case DONT_MEASURE: break; default: if (blocking_map) { ret = clWaitForEvents(1, p_unmapping_ready); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret); } break; } if (p_unmapping_ready != evt_to_generate){ clReleaseEvent(*p_unmapping_ready); } return ret; }
void run1(int N, char *fileName) { puts("Matrix Vector Multiplication Naive\n"); int i,j; float *A; A = (float*)malloc(sizeof(float)*N*N); for( i = 0; i < N ; ++i ) { for( j = 0; j < N ; ++j ) { A[i*N + j] = 1.f; } } float *B; B = (float*)malloc(sizeof(float)*N); for( i = 0; i < N ; ++i ) { B[i] = 1.f; } float *C; C = (float*)malloc(sizeof(float)*N); #ifdef DEBUG puts("A"); check_2d_f(A,N,N); puts("B"); check_1d_f(B,N); #endif int NumK = 1; int NumE = 1; double gpuTime; cl_ulong gstart, gend; //------------------------------------------------ // OpenCL //------------------------------------------------ cl_int err; cl_platform_id platform; // OpenCL platform cl_device_id device_id; // device ID cl_context context; // context cl_command_queue queue; // command queue cl_program program; // program cl_kernel *kernel = (cl_kernel*)malloc(sizeof(cl_kernel)*NumK); cl_event *event = (cl_event*)malloc(sizeof(cl_event)*NumE); // read kernel file //char *fileName = "transpose_kernel.cl"; char *kernelSource; size_t size; FILE *fh = fopen(fileName, "rb"); if(!fh) { printf("Error: Failed to open kernel file!\n"); exit(1); } fseek(fh,0,SEEK_END); size=ftell(fh); fseek(fh,0,SEEK_SET); kernelSource = malloc(size+1); size_t result; result = fread(kernelSource,1,size,fh); if(result != size){ fputs("Reading error", stderr);exit(1);} kernelSource[size] = '\0'; // Bind to platform err = clGetPlatformIDs(1, &platform, NULL); OCL_CHECK(err); // Get ID for the device err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); OCL_CHECK(err); // Create a context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); OCL_CHECK(err); // Create a command queue queue = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); OCL_CHECK(err); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **)&kernelSource, NULL, &err); OCL_CHECK(err); // turn on optimization for kernel char *options="-cl-mad-enable -cl-fast-relaxed-math -cl-no-signed-zeros -cl-unsafe-math-optimizations -cl-finite-math-only"; err = clBuildProgram(program, 1, &device_id, options, NULL, NULL); if(err != CL_SUCCESS) printCompilerOutput(program, device_id); OCL_CHECK(err); #ifdef SAVEBIN // Calculate size of binaries size_t binary_size; err = clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, sizeof(size_t), &binary_size, NULL); OCL_CHECK(err); unsigned char* bin; bin = (unsigned char*)malloc(sizeof(unsigned char)*binary_size); err = clGetProgramInfo(program, CL_PROGRAM_BINARIES, sizeof(unsigned char*), &bin, NULL); OCL_CHECK(err); // Print the binary out to the output file fh = fopen("kernel_mv_1.bin", "wb"); fwrite(bin, 1, binary_size, fh); fclose(fh); #endif kernel[0] = clCreateKernel(program, "mv_1", &err); OCL_CHECK(err); // memory on device cl_mem A_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N*N, NULL, NULL); cl_mem B_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); cl_mem C_d = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*N, NULL, NULL); // Initialize device memory err = clEnqueueWriteBuffer(queue, A_d, CL_TRUE, 0, sizeof(float)*N*N, A, 0, NULL , NULL); OCL_CHECK(err); err = clEnqueueWriteBuffer(queue, B_d, CL_TRUE, 0, sizeof(float)*N, B, 0, NULL , NULL); OCL_CHECK(err); size_t localsize = 64; size_t globalsize = N; err = clSetKernelArg(kernel[0], 0, sizeof(cl_mem), &A_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 1, sizeof(cl_mem), &B_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 2, sizeof(cl_mem), &C_d); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clSetKernelArg(kernel[0], 3, sizeof(int), &N); if(err != 0) { printf("%d\n",err); OCL_CHECK(err); exit(1);} err = clEnqueueNDRangeKernel(queue, kernel[0], 1, NULL, &globalsize, &localsize, 0, NULL, &event[0]); OCL_CHECK(err); clFinish(queue); clEnqueueReadBuffer(queue, C_d, CL_TRUE, 0, sizeof(float)*N, C , 0, NULL , NULL ); err = clWaitForEvents(1,&event[0]); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &gstart, NULL); OCL_CHECK(err); err = clGetEventProfilingInfo (event[0], CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &gend, NULL); OCL_CHECK(err); gpuTime = (double)(gend -gstart)/1000000000.0; //check_1d_f(sum, blks+1); #ifdef DEBUG puts("C = A * B"); check_1d_f(C,N); #endif printf("oclTime = %lf (s)\n", gpuTime ); // free clReleaseMemObject(A_d); clReleaseMemObject(B_d); clReleaseMemObject(C_d); clReleaseProgram(program); clReleaseContext(context); clReleaseCommandQueue(queue); for(i=0;i<NumK;++i){ clReleaseKernel(kernel[i]); } for(i=0;i<NumE;++i){ clReleaseEvent(event[i]); } free(kernelSource); #ifdef SAVEBIN free(bin); #endif free(A); free(B); free(C); return; }
cl_mem parallelRemap1( cl_mem a_buffer, cl_mem v_buffer, cl_mem b_buffer, uint asize, uint bsize, real max_a, real min_val, real min_diff, double *time ) { cl_int error = 0; uint temp_size = (uint)((max_a - min_val)/min_diff); cl_mem temp_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, temp_size*sizeof(int), NULL, &error); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); size_t global_work_size[1]; size_t local_work_size[1]; local_work_size[0] = TILE_SIZE; global_work_size[0] = ((asize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; /****************** * Hash Kernel ******************/ error = clSetKernelArg(cHash_kernel, 0, sizeof(real), &min_val); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(cHash_kernel, 1, sizeof(real), &min_diff); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(cHash_kernel, 2, sizeof(cl_uint), &asize); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(cHash_kernel, 3, sizeof(cl_mem), (void*)&a_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(cHash_kernel, 4, sizeof(cl_mem), (void*)&temp_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); global_work_size[0] = ((asize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; cl_event hash_kernel_event; error = clEnqueueNDRangeKernel(queue, cHash_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &hash_kernel_event); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); /***************** * Remap Kernel *****************/ cl_mem remap_buffer = clCreateBuffer(context, CL_MEM_READ_WRITE, bsize*sizeof(real), NULL, &error); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 0, sizeof(real), &min_val); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 1, sizeof(real), &min_diff); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 2, sizeof(cl_uint), &temp_size); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 3, sizeof(cl_uint), &bsize); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 4, sizeof(cl_mem), (void*)&a_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 5, sizeof(cl_mem), (void*)&v_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 6, sizeof(cl_mem), (void*)&b_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 7, sizeof(cl_mem), (void*)&temp_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); error = clSetKernelArg(remap1_kernel, 8, sizeof(cl_mem), (void*)&remap_buffer); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); global_work_size[0] = ((bsize+local_work_size[0]-1)/local_work_size[0])*local_work_size[0]; cl_event remap_event; error = clEnqueueNDRangeKernel(queue, remap1_kernel, 1, 0, global_work_size, local_work_size, 0, NULL, &remap_event); if (error != CL_SUCCESS) printf("Error is %d at line %d\n",error,__LINE__); long gpu_time_start, gpu_time_end, gpu_time=0; clWaitForEvents(1, &remap_event); clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL); clGetEventProfilingInfo(hash_kernel_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL); gpu_time += gpu_time_end - gpu_time_start; clReleaseEvent(hash_kernel_event); clGetEventProfilingInfo(remap_event, CL_PROFILING_COMMAND_START, sizeof(gpu_time_start), &gpu_time_start, NULL); clGetEventProfilingInfo(remap_event, CL_PROFILING_COMMAND_END, sizeof(gpu_time_end), &gpu_time_end, NULL); gpu_time += gpu_time_end - gpu_time_start; clReleaseEvent(remap_event); clReleaseMemObject(temp_buffer); *time = gpu_time*1.0e-9; return remap_buffer; }
int main() { /* OpenCL data structures */ cl_device_id device; cl_context context; cl_command_queue queue; cl_program program; cl_kernel kernel; cl_int i, err, num_vectors; /* Data and events */ char data[NUM_BYTES]; cl_mem data_buffer; cl_event prof_event; cl_ulong time_start, time_end, total_time; void* mapped_memory; /* Create a device and context */ device = create_device(); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); if(err < 0) { perror("Couldn't create a context"); exit(1); } /* Build the program and create a kernel */ program = build_program(context, device, PROGRAM_FILE); kernel = clCreateKernel(program, KERNEL_FUNC, &err); if(err < 0) { perror("Couldn't create a kernel"); exit(1); }; /* Create a buffer to hold data */ data_buffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(data), NULL, &err); if(err < 0) { perror("Couldn't create a buffer"); exit(1); }; /* Create kernel argument */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &data_buffer); if(err < 0) { perror("Couldn't set a kernel argument"); exit(1); }; /* Tell kernel number of char16 vectors */ num_vectors = NUM_BYTES/16; clSetKernelArg(kernel, 1, sizeof(num_vectors), &num_vectors); /* Create a command queue */ queue = clCreateCommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE, &err); if(err < 0) { perror("Couldn't create a command queue"); exit(1); }; total_time = 0.0f; for(i=0; i<NUM_ITERATIONS; i++) { /* Enqueue kernel */ err = clEnqueueTask(queue, kernel, 0, NULL, NULL); if(err < 0) { perror("Couldn't enqueue the kernel"); exit(1); } #ifdef PROFILE_READ /* Read the buffer */ err = clEnqueueReadBuffer(queue, data_buffer, CL_TRUE, 0, sizeof(data), data, 0, NULL, &prof_event); if(err < 0) { perror("Couldn't read the buffer"); exit(1); } #else /* Create memory map */ mapped_memory = clEnqueueMapBuffer(queue, data_buffer, CL_TRUE, CL_MAP_READ, 0, sizeof(data), 0, NULL, &prof_event, &err); if(err < 0) { perror("Couldn't map the buffer to host memory"); exit(1); } #endif /* Get profiling information */ clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_START, sizeof(time_start), &time_start, NULL); clGetEventProfilingInfo(prof_event, CL_PROFILING_COMMAND_END, sizeof(time_end), &time_end, NULL); total_time += time_end - time_start; #ifndef PROFILE_READ /* Unmap the buffer */ err = clEnqueueUnmapMemObject(queue, data_buffer, mapped_memory, 0, NULL, NULL); if(err < 0) { perror("Couldn't unmap the buffer"); exit(1); } #endif } #ifdef PROFILE_READ printf("Average read time: %lu\n", total_time/NUM_ITERATIONS); #else printf("Average map time: %lu\n", total_time/NUM_ITERATIONS); #endif /* Deallocate resources */ clReleaseEvent(prof_event); clReleaseMemObject(data_buffer); clReleaseKernel(kernel); clReleaseCommandQueue(queue); clReleaseProgram(program); clReleaseContext(context); return 0; }
/*------------------------------------------------------ ** ForwardSub() -- Forward substitution of Gaussian ** elimination. **------------------------------------------------------ */ void ForwardSub(cl_context context, float *a, float *b, float *m, int size,int timing){ // 1. set up kernels cl_kernel fan1_kernel,fan2_kernel; cl_int status=0; cl_program gaussianElim_program; cl_event writeEvent,kernelEvent,readEvent; float writeTime=0,readTime=0,kernelTime=0; float writeMB=0,readMB=0; gaussianElim_program = cl_compileProgram( (char *)"gaussianElim_kernels.cl",NULL); fan1_kernel = clCreateKernel( gaussianElim_program, "Fan1", &status); status = cl_errChk(status, (char *)"Error Creating Fan1 kernel",true); if(status)exit(1); fan2_kernel = clCreateKernel( gaussianElim_program, "Fan2", &status); status = cl_errChk(status, (char *)"Error Creating Fan2 kernel",true); if(status)exit(1); // 2. set up memory on device and send ipts data to device cl_mem a_dev, b_dev, m_dev; cl_int error=0; a_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*size*size, NULL, &error); b_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float)*size, NULL, &error); m_dev = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * size * size, NULL, &error); command_queue = cl_getCommandQueue(); error = clEnqueueWriteBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size*size, a, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size, b, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); error = clEnqueueWriteBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float)*size*size, m, 0, NULL, &writeEvent); if (timing) writeTime+=eventTime(writeEvent,command_queue); clReleaseEvent(writeEvent); writeMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); // 3. Determine block sizes size_t globalWorksizeFan1[1]; size_t globalWorksizeFan2[2]; size_t localWorksizeFan1Buf[1]={BLOCK_SIZE_0}; size_t localWorksizeFan2Buf[2]={BLOCK_SIZE_1_X, BLOCK_SIZE_1_Y}; size_t *localWorksizeFan1=NULL; size_t *localWorksizeFan2=NULL; globalWorksizeFan1[0] = size; globalWorksizeFan2[0] = size; globalWorksizeFan2[1] = size; if(localWorksizeFan1Buf[0]){ localWorksizeFan1=localWorksizeFan1Buf; globalWorksizeFan1[0]=(int)ceil(globalWorksizeFan1[0]/(double)localWorksizeFan1Buf[0])*localWorksizeFan1Buf[0]; } if(localWorksizeFan2Buf[0]){ localWorksizeFan2=localWorksizeFan2Buf; globalWorksizeFan2[0]=(int)ceil(globalWorksizeFan2[0]/(double)localWorksizeFan2Buf[0])*localWorksizeFan2Buf[0]; globalWorksizeFan2[1]=(int)ceil(globalWorksizeFan2[1]/(double)localWorksizeFan2Buf[1])*localWorksizeFan2Buf[1]; } int t; // 4. Setup and Run kernels for (t=0; t<(size-1); t++) { // kernel args cl_int argchk; argchk = clSetKernelArg(fan1_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan1_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan1_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan1_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan1_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk,"ERROR in Setting Fan1 kernel args",true); //printf("localWorksizeFan1:%u, globalWorksizeFan1:%u\n", localWorksizeFan1Buf[0], globalWorksizeFan1[0]); #pragma dividend local_work_group_size localWorksizeFan1 dim 1 dim1(2:64:2:64) //This lws will be used to profile the OpenCL kernel with id 1 size_t _dividend_lws_localWorksizeFan1_k1[2]; { _dividend_lws_localWorksizeFan1_k1[0] = getLWSValue("DIVIDEND_LWS1_D0",DIVIDEND_LWS1_D0_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_localWorksizeFan1_k1[1] = 1; } // launch kernel error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)( command_queue, fan1_kernel, 1, 0, globalWorksizeFan1, _dividend_lws_localWorksizeFan1_k1, 0, NULL, NULL); cl_errChk(error,"ERROR in Executing Fan1 Kernel",true); //fprintf(stderr, "AFTER THIS\n"); argchk = clSetKernelArg(fan2_kernel, 0, sizeof(cl_mem), (void *)&m_dev); argchk |= clSetKernelArg(fan2_kernel, 1, sizeof(cl_mem), (void *)&a_dev); argchk |= clSetKernelArg(fan2_kernel, 2, sizeof(cl_mem), (void *)&b_dev); argchk |= clSetKernelArg(fan2_kernel, 3, sizeof(int), (void *)&size); argchk |= clSetKernelArg(fan2_kernel, 4, sizeof(int), (void *)&t); cl_errChk(argchk,"ERROR in Setting Fan2 kernel args",true); size_t local_work_size[] = {128, 128}; //printf("localWorksizeFan2:%u, globalWorksizeFan2[0]:%u, globalWorksizeFan2[1]:%u\n", localWorksizeFan2Buf[0], globalWorksizeFan2[0], globalWorksizeFan2[1]); #pragma dividend local_work_group_size local_work_size dim 2 dim1(8:64:2:64) dim2(8:64:2:64) //This lws will be used to profile the OpenCL kernel with id 2 size_t _dividend_lws_local_work_size_k2[3]; { _dividend_lws_local_work_size_k2[0] = getLWSValue("DIVIDEND_LWS2_D0",DIVIDEND_LWS2_D0_DEFAULT_VAL); _dividend_lws_local_work_size_k2[1] = getLWSValue("DIVIDEND_LWS2_D1",DIVIDEND_LWS2_D1_DEFAULT_VAL); //Dividend extension: store the kernel id as the last element _dividend_lws_local_work_size_k2[2] = 2; } // launch kernel error = DIVIDEND_CL_WRAP(clEnqueueNDRangeKernel)( command_queue, fan2_kernel, 2, 0, globalWorksizeFan2, _dividend_lws_local_work_size_k2, 0, NULL, NULL); cl_errChk(error,"ERROR in Executing Fan2 Kernel",true); if (timing) { // printf("here2a\n"); // kernelTime+=eventTime(kernelEvent,command_queue); // printf("here2b\n"); } clReleaseEvent(kernelEvent); //Fan2<<<dimGridXY,dimBlockXY>>>(m_cuda,a_cuda,b_cuda,Size,Size-t,t); //cudaThreadSynchronize(); } // 5. transfer data off of device error = clEnqueueReadBuffer(command_queue, a_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, a, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, b_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size, b, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); error = clEnqueueReadBuffer(command_queue, m_dev, 1, // change to 0 for nonblocking write 0, // offset sizeof(float) * size * size, m, 0, NULL, &readEvent); cl_errChk(error,"ERROR with clEnqueueReadBuffer",true); if (timing) readTime+=eventTime(readEvent,command_queue); clReleaseEvent(readEvent); readMB = (float)(sizeof(float) * size * (size + size + 1) / 1e6); if (timing) { printf("Matrix Size\tWrite(s) [size]\t\tKernel(s)\tRead(s) [size]\t\tTotal(s)\n"); printf("%dx%d \t",size,size); printf("%f [%.2fMB]\t",writeTime,writeMB); printf("%f\t",kernelTime); printf("%f [%.2fMB]\t",readTime,readMB); printf("%f\n\n",writeTime+kernelTime+readTime); } }
int main(int argc, char *argv[]) { // selected platform and device number cl_uint pn = 0, dn = 0; // OpenCL error cl_int error; // generic iterator cl_uint i; // major/minor version of the platform OpenCL version cl_uint ocl_major, ocl_minor; // set platform/device num from command line if (argc > 1) pn = atoi(argv[1]); if (argc > 2) dn = atoi(argv[2]); error = clGetPlatformIDs(0, NULL, &np); CHECK_ERROR("getting amount of platform IDs"); printf("%u platforms found\n", np); if (pn >= np) { fprintf(stderr, "there is no platform #%u\n" , pn); exit(1); } // only allocate for IDs up to the intended one platform = calloc(pn+1,sizeof(*platform)); // if allocation failed, next call will bomb. rely on this error = clGetPlatformIDs(pn+1, platform, NULL); CHECK_ERROR("getting platform IDs"); // choose platform p = platform[pn]; error = clGetPlatformInfo(p, CL_PLATFORM_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform name"); printf("using platform %u: %s\n", pn, strbuf); error = clGetPlatformInfo(p, CL_PLATFORM_VERSION, BUFSZ, strbuf, NULL); CHECK_ERROR("getting platform version"); // we need 1.2 at least i = sscanf(strbuf, "OpenCL %u.%u ", &ocl_major, &ocl_minor); if (i != 2) { fprintf(stderr, "%s:%u: unable to determine platform OpenCL version\n", __func__, __LINE__); exit(1); } if (ocl_major == 1 && ocl_minor < 2) { fprintf(stderr, "%s:%u: Platform version %s is not at least 1.2\n", __func__, __LINE__, strbuf); exit(1); } error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, 0, NULL, &nd); CHECK_ERROR("getting amount of device IDs"); printf("%u devices found\n", nd); if (dn >= nd) { fprintf(stderr, "there is no device #%u\n", dn); exit(1); } // only allocate for IDs up to the intended one device = calloc(dn+1,sizeof(*device)); // if allocation failed, next call will bomb. rely on this error = clGetDeviceIDs(p, CL_DEVICE_TYPE_ALL, dn+1, device, NULL); CHECK_ERROR("getting device IDs"); // choose device d = device[dn]; error = clGetDeviceInfo(d, CL_DEVICE_NAME, BUFSZ, strbuf, NULL); CHECK_ERROR("getting device name"); printf("using device %u: %s\n", dn, strbuf); error = clGetDeviceInfo(d, CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(gmem), &gmem, NULL); CHECK_ERROR("getting device global memory size"); error = clGetDeviceInfo(d, CL_DEVICE_MAX_MEM_ALLOC_SIZE, sizeof(alloc_max), &alloc_max, NULL); CHECK_ERROR("getting device max memory allocation size"); // create context ctx_prop[1] = (cl_context_properties)p; ctx = clCreateContext(ctx_prop, 1, &d, NULL, NULL, &error); CHECK_ERROR("creating context"); // create queue q = clCreateCommandQueue(ctx, d, CL_QUEUE_PROFILING_ENABLE, &error); CHECK_ERROR("creating queue"); // create program pg = clCreateProgramWithSource(ctx, sizeof(src)/sizeof(*src), src, NULL, &error); CHECK_ERROR("creating program"); // build program error = clBuildProgram(pg, 1, &d, NULL, NULL, NULL); CHECK_ERROR("building program"); // get kernel k = clCreateKernel(pg, "add", &error); CHECK_ERROR("creating kernel"); error = clGetKernelWorkGroupInfo(k, d, CL_KERNEL_PREFERRED_WORK_GROUP_SIZE_MULTIPLE, sizeof(wgm), &wgm, NULL); CHECK_ERROR("getting preferred workgroup size multiple"); // number of elements on which kernel will be launched. it's ok if we don't // cover every byte of the buffers nels = alloc_max/sizeof(cl_float); gws = ROUND_MUL(nels, wgm); printf("will use %zu workitems grouped by %zu to process %u elements\n", gws, wgm, nels); // we will try and allocate at least one buffer more than needed to fill // the device memory, and no less than 3 anyway nbuf = gmem/alloc_max + 1; if (nbuf < 3) nbuf = 3; #define MB (1024*1024.0) printf("will try allocating %u host buffers of %gMB each to overcommit %gMB\n", nbuf, alloc_max/MB, gmem/MB); hostbuf = calloc(nbuf, sizeof(cl_mem)); if (!hostbuf) { fprintf(stderr, "could not prepare support for %u buffers\n", nbuf); exit(1); } // allocate ‘host’ buffers for (i = 0; i < nbuf; ++i) { hostbuf[i] = clCreateBuffer(ctx, CL_MEM_ALLOC_HOST_PTR | CL_MEM_READ_ONLY, alloc_max, NULL, &error); CHECK_ERROR("allocating host buffer"); printf("host buffer %u allocated\n", i); error = clEnqueueMigrateMemObjects(q, 1, hostbuf + i, CL_MIGRATE_MEM_OBJECT_HOST | CL_MIGRATE_MEM_OBJECT_CONTENT_UNDEFINED, 0, NULL, NULL); CHECK_ERROR("migrating buffer to host"); printf("buffer %u migrated to host\n", i); } // allocate ‘device’ buffers for (i = 0; i < 2; ++i) { devbuf[i] = clCreateBuffer(ctx, CL_MEM_READ_WRITE | CL_MEM_HOST_NO_ACCESS, alloc_max, NULL, &error); CHECK_ERROR("allocating devbuffer"); printf("dev buffer %u allocated\n", i); if (i == 0) { float patt = 0; error = clEnqueueFillBuffer(q, devbuf[0], &patt, sizeof(patt), 0, nels*sizeof(patt), 0, NULL, &mem_evt); CHECK_ERROR("enqueueing memset"); } } error = clWaitForEvents(1, &mem_evt); CHECK_ERROR("waiting for buffer fill"); clReleaseEvent(mem_evt); mem_evt = NULL; // use the buffers for (i = 0; i < nbuf; ++i) { printf("testing buffer %u\n", i); // for each buffer, we do a setup on CPU and then use it as second // argument for the kernel hbuf = clEnqueueMapBuffer(q, hostbuf[i], CL_TRUE, CL_MAP_WRITE_INVALIDATE_REGION, 0, alloc_max, 0, NULL, NULL, &error); CHECK_ERROR("mapping buffer"); for (e = 0; e < nels; ++e) hbuf[e] = i; error = clEnqueueUnmapMemObject(q, hostbuf[i], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer"); hbuf = NULL; // copy ‘host’ to ‘device’ buffer clEnqueueCopyBuffer(q, hostbuf[i], devbuf[1], 0, 0, alloc_max, 0, NULL, NULL); // make sure all pending actions are completed error = clFinish(q); CHECK_ERROR("settling down"); clSetKernelArg(k, 0, sizeof(cl_mem), devbuf); clSetKernelArg(k, 1, sizeof(cl_mem), devbuf + 1); clSetKernelArg(k, 2, sizeof(nels), &nels); error = clEnqueueNDRangeKernel(q, k, 1, NULL, &gws, &wgm, 0, NULL, &krn_evt); CHECK_ERROR("enqueueing kernel"); error = clEnqueueCopyBuffer(q, devbuf[0], hostbuf[0], 0, 0, alloc_max, 1, &krn_evt, &mem_evt); CHECK_ERROR("copying data to host"); expected = i*(i+1)/2.0f; hbuf = clEnqueueMapBuffer(q, hostbuf[0], CL_TRUE, CL_MAP_READ, 0, alloc_max, 1, &mem_evt, NULL, &error); CHECK_ERROR("mapping buffer 0"); for (e = 0; e < nels; ++e) if (hbuf[e] != expected) { fprintf(stderr, "mismatch @ %u: %g instead of %g\n", e, hbuf[e], expected); exit(1); } error = clEnqueueUnmapMemObject(q, hostbuf[0], hbuf, 0, NULL, NULL); CHECK_ERROR("unmapping buffer 0"); hbuf = NULL; clReleaseEvent(krn_evt); clReleaseEvent(mem_evt); krn_evt = mem_evt = NULL; } for (i = 1; i <= 2; ++i) { clReleaseMemObject(devbuf[2 - i]); printf("dev buffer %u freed\n", nbuf - i); } for (i = 1; i <= nbuf; ++i) { clReleaseMemObject(hostbuf[nbuf - i]); printf("host buffer %u freed\n", nbuf - i); } return 0; }
void cpu_to_opencl_opencl_func(void *buffers[], void *args) { STARPU_SKIP_IF_VALGRIND; (void) args; int id, devid, ret; cl_int err; cl_kernel kernel; cl_command_queue queue; cl_event event; unsigned n = STARPU_MULTIFORMAT_GET_NX(buffers[0]); cl_mem src = (cl_mem) STARPU_MULTIFORMAT_GET_CPU_PTR(buffers[0]); cl_mem dst = (cl_mem) STARPU_MULTIFORMAT_GET_OPENCL_PTR(buffers[0]); id = starpu_worker_get_id(); devid = starpu_worker_get_devid(id); ret = starpu_opencl_load_opencl_from_file(KERNEL_LOCATION, &opencl_conversion_program, NULL); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file"); err = starpu_opencl_load_kernel(&kernel, &queue, &opencl_conversion_program, "cpu_to_opencl_opencl", devid); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 0, sizeof(src), &src); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(dst), &dst); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(n), &n); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); { size_t global=n; size_t local; size_t s; cl_device_id device; starpu_opencl_get_device(devid, &device); err = clGetKernelWorkGroupInfo (kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(local), &local, &s); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); if (local > global) local = global; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, &local, 0, NULL, &event); if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err); } clFinish(queue); starpu_opencl_collect_stats(event); clReleaseEvent(event); starpu_opencl_release_kernel(kernel); ret = starpu_opencl_unload_opencl(&opencl_conversion_program); STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_unload_opencl"); }
void mat_mul_opencl_1d(float *M_A, float *M_B, float *M_C, size_t ROW_A, size_t COL_A, size_t COL_B) { cl_platform_id *platform; cl_device_type dev_type; cl_device_id dev; cl_context context; cl_command_queue cmd_queue; cl_program program; cl_kernel kernel; cl_mem mem_A, mem_B, mem_C; cl_event ev_kernel; cl_int err; cl_uint num_platforms; cl_uint num_dev = 0; int i; // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device dev_type = get_device_type(); for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 1, &dev, &num_dev); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); if (num_dev == 1) break; } if (num_dev < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } print_device_name(dev); free(platform); // Context context = clCreateContext(NULL, 1, &dev, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queue = clCreateCommandQueue(context, dev, CL_QUEUE_PROFILING_ENABLE, &err); CHECK_ERROR(err); // Create a program. char *source_code = get_source_code("./kernel_1d.cl"); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, NULL, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-DROW_A=%lu -DCOL_A=%lu -DCOL_B=%lu", ROW_A, COL_A, COL_B); err = clBuildProgram(program, 1, &dev, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, dev); CHECK_ERROR(err); } // Kernel kernel = clCreateKernel(program, "mat_mul", &err); CHECK_ERROR(err); // Buffers mem_A = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * ROW_A * COL_A, M_A, &err); CHECK_ERROR(err); mem_B = clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(float) * COL_A * COL_B, NULL, &err); CHECK_ERROR(err); err = clEnqueueWriteBuffer(cmd_queue, mem_B, CL_FALSE, 0, sizeof(float) * COL_A * COL_B, M_B, 0, NULL, NULL); CHECK_ERROR(err) mem_C = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(float) * ROW_A * COL_B, NULL, &err); CHECK_ERROR(err); // Set the arguments. err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &mem_A); CHECK_ERROR(err); err = clSetKernelArg(kernel, 1, sizeof(cl_mem), &mem_B); CHECK_ERROR(err); err = clSetKernelArg(kernel, 2, sizeof(cl_mem), &mem_C); CHECK_ERROR(err); // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1]; gws[0] = (size_t)ceil((double)ROW_A / lws[0]) * lws[0]; err = clEnqueueNDRangeKernel(cmd_queue, kernel, 1, NULL, gws, lws, 0, NULL, &ev_kernel); CHECK_ERROR(err); // Read the result. err = clEnqueueReadBuffer(cmd_queue, mem_C, CL_TRUE, 0, sizeof(float) * ROW_A * COL_B, M_C, 0, NULL, NULL); CHECK_ERROR(err); // Read the profiling info. cl_ulong start_time, end_time; err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start_time, NULL); CHECK_ERROR(err); err = clGetEventProfilingInfo(ev_kernel, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end_time, NULL); CHECK_ERROR(err); printf("Kernel time : %lf sec\n", (double)(end_time - start_time) / 10e9); // Release clReleaseEvent(ev_kernel); clReleaseMemObject(mem_A); clReleaseMemObject(mem_B); clReleaseMemObject(mem_C); clReleaseKernel(kernel); clReleaseProgram(program); clReleaseCommandQueue(cmd_queue); clReleaseContext(context); }
void mandelbrot(int m, int n) { cl_platform_id *platform; cl_device_type dev_type = CL_DEVICE_TYPE_GPU; cl_device_id *devs = NULL; cl_context context; cl_command_queue *cmd_queues; cl_program program; cl_kernel *kernels; cl_mem *mem_R; cl_mem *mem_G; cl_mem *mem_B; cl_int err; cl_uint num_platforms; cl_uint num_devs = 0; cl_event *ev_kernels; int count_max = COUNT_MAX; int i, j, jhi, jlo; char *output_filename = "mandelbrot.ppm"; FILE *output_unit; double wtime; float x_max = 1.25; float x_min = - 2.25; // float x; // float x1; // float x2; float y_max = 1.75; float y_min = - 1.75; //float y; //float y1; //float y2; size_t size_color; size_color = sizeof(int) * m * n; int (*r)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*g)[n] = (int (*)[n])calloc(m * n, sizeof(int)); int (*b)[n] = (int (*)[n])calloc(m * n, sizeof(int)); printf( " Sequential C version\n" ); printf( "\n" ); printf( " Create an ASCII PPM image of the Mandelbrot set.\n" ); printf( "\n" ); printf( " For each point C = X + i*Y\n" ); printf( " with X range [%g,%g]\n", x_min, x_max ); printf( " and Y range [%g,%g]\n", y_min, y_max ); printf( " carry out %d iterations of the map\n", count_max ); printf( " Z(n+1) = Z(n)^2 + C.\n" ); printf( " If the iterates stay bounded (norm less than 2)\n" ); printf( " then C is taken to be a member of the set.\n" ); printf( "\n" ); printf( " An ASCII PPM image of the set is created using\n" ); printf( " M = %d pixels in the X direction and\n", m ); printf( " N = %d pixels in the Y direction.\n", n ); timer_init(); timer_start(0); // Platform err = clGetPlatformIDs(0, NULL, &num_platforms); CHECK_ERROR(err); if (num_platforms == 0) { fprintf(stderr, "[%s:%d] ERROR: No OpenCL platform\n", __FILE__,__LINE__); exit(EXIT_FAILURE); } printf("Number of platforms: %u\n", num_platforms); platform = (cl_platform_id *)malloc(sizeof(cl_platform_id) * num_platforms); err = clGetPlatformIDs(num_platforms, platform, NULL); CHECK_ERROR(err); // Device for (i = 0; i < num_platforms; i++) { err = clGetDeviceIDs(platform[i], dev_type, 0, NULL, &num_devs); if (err != CL_DEVICE_NOT_FOUND) CHECK_ERROR(err); num_devs = 1; //** if (num_devs >= 1) { devs = (cl_device_id*)malloc(sizeof(cl_device_id) * num_devs); err = clGetDeviceIDs(platform[i], dev_type, num_devs, devs, NULL); break; } } if ( devs == NULL || num_devs < 1) { fprintf(stderr, "[%s:%d] ERROR: No device\n", __FILE__, __LINE__); exit(EXIT_FAILURE); } for( i = 0; i < num_devs; ++i ) { printf("dev[%d] : ", i); print_device_name(devs[i]); } // Context context = clCreateContext(NULL, num_devs, devs, NULL, NULL, &err); CHECK_ERROR(err); // Command queue cmd_queues = (cl_command_queue*)malloc(sizeof(cl_command_queue)*num_devs); for( i = 0; i < num_devs; ++i) { cmd_queues[i] = clCreateCommandQueue(context, devs[i], 0, &err); CHECK_ERROR(err); } // Create a program. size_t source_len; char *source_code = get_source_code("./mandelbrot_kernel.cl", &source_len); program = clCreateProgramWithSource(context, 1, (const char **)&source_code, &source_len, &err); free(source_code); CHECK_ERROR(err); // Build the program. char build_opts[200]; sprintf(build_opts, "-Dm=%d -Dn=%d -Dnum_devs=%d", m, n, num_devs); err = clBuildProgram(program, num_devs, devs, build_opts, NULL, NULL); if (err != CL_SUCCESS) { print_build_log(program, devs[0]); CHECK_ERROR(err); } // Kernel kernels = (cl_kernel*)malloc(sizeof(cl_kernel)*num_devs); for (i = 0; i < num_devs; i++) { kernels[i] = clCreateKernel(program, "mandelbrot_kernel", NULL); } // Buffers mem_R = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_G = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); mem_B = (cl_mem*)malloc(sizeof(cl_mem)*num_devs); for(i = 0; i < num_devs; i++) { mem_R[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_G[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); mem_B[i] = clCreateBuffer(context, CL_MEM_READ_WRITE, size_color / num_devs, NULL, NULL); } /* // Write to Buffers for(i = 0; i < num_devs; i++) { clEnqueueWriteBuffer(cmd_queues[i], mem_CHECK[i], CL_FALSE, 0, size_CHECK / num_devs, (CHECK + (N / num_devs) * i), 0, NULL, NULL); } */ // Set the arguments. for(i = 0; i < num_devs; i++) { // flag = i * (m * n / num_devs); clSetKernelArg(kernels[i], 0, sizeof(cl_mem), (void*) &mem_R[i]); clSetKernelArg(kernels[i], 1, sizeof(cl_mem), (void*) &mem_G[i]); clSetKernelArg(kernels[i], 2, sizeof(cl_mem), (void*) &mem_B[i]); clSetKernelArg(kernels[i], 3, sizeof(int), &count_max); clSetKernelArg(kernels[i], 4, sizeof(float), &x_max); clSetKernelArg(kernels[i], 5, sizeof(float), &x_min); clSetKernelArg(kernels[i], 6, sizeof(float), &y_max); clSetKernelArg(kernels[i], 7, sizeof(float), &y_min); } // Enqueue the kernel. size_t lws[1] = {256}; size_t gws[1] = { m * n /num_devs }; gws[0] = (size_t)ceil((double)m * n / lws[0]) * lws[0]; ev_kernels = (cl_event*)malloc(sizeof(cl_event)*num_devs); for(i = 0; i < num_devs; i++) { err = clEnqueueNDRangeKernel(cmd_queues[i], kernels[i], 1, NULL, gws, lws, 0, NULL, &ev_kernels[i]); CHECK_ERROR(err); } // Read the result. for(i = 0; i < num_devs; i++) { err = clEnqueueReadBuffer(cmd_queues[i], mem_R[i], CL_TRUE, 0, size_color / num_devs, r, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_G[i], CL_TRUE, 0, size_color / num_devs, g, 1, &ev_kernels[i], NULL); err = clEnqueueReadBuffer(cmd_queues[i], mem_B[i], CL_TRUE, 0, size_color / num_devs, b, 1, &ev_kernels[i], NULL); } // Release for( i = 0; i < num_devs; ++i ) { clFinish(cmd_queues[i]); clReleaseMemObject(mem_R[i]); clReleaseMemObject(mem_G[i]); clReleaseMemObject(mem_B[i]); clReleaseKernel(kernels[i]); clReleaseCommandQueue(cmd_queues[i]); clReleaseEvent(ev_kernels[i]); } clReleaseProgram(program); clReleaseContext(context); free(mem_R); free(mem_G); free(mem_B); free(cmd_queues); free(kernels); free(devs); free(ev_kernels); free(platform); timer_stop(0); wtime = timer_read(0); printf( "\n" ); printf( " Time = %lf seconds.\n", wtime ); // Write data to an ASCII PPM file. output_unit = fopen( output_filename, "wt" ); fprintf( output_unit, "P3\n" ); fprintf( output_unit, "%d %d\n", n, m ); fprintf( output_unit, "%d\n", 255 ); for ( i = 0; i < m; i++ ) { for ( jlo = 0; jlo < n; jlo = jlo + 4 ) { jhi = MIN( jlo + 4, n ); for ( j = jlo; j < jhi; j++ ) { fprintf( output_unit, " %d %d %d", r[i][j], g[i][j], b[i][j] ); } fprintf( output_unit, "\n" ); } } fclose( output_unit ); printf( "\n" ); printf( " Graphics data written to \"%s\".\n\n", output_filename ); // Terminate. free(r); free(g); free(b); }
int main (int argc, const char **argv) { OclPlatform *ocl; cl_program program; cl_device_id *devices; cl_command_queue *queues; cl_kernel kernel; cl_int errcode; int num_devices; GTimer *timer; ocl = ocl_new_from_args (argc, argv, CL_QUEUE_PROFILING_ENABLE); program = ocl_create_program_from_source (ocl, source, NULL, &errcode); OCL_CHECK_ERROR (errcode); kernel = clCreateKernel (program, "touch", &errcode); OCL_CHECK_ERROR (errcode); num_devices = ocl_get_num_devices (ocl); devices = ocl_get_devices (ocl); queues = ocl_get_cmd_queues (ocl); timer = g_timer_new (); for (int i = 0; i < num_devices; i++) { char name[256]; cl_event event; size_t size = 16; const int NUM_RUNS = 50000; unsigned long total_wait = 0; unsigned long total_execution = 0; double wall_clock = 0.0; for (int r = 0; r < NUM_RUNS; r++) { unsigned long wait; unsigned long execution; g_timer_start (timer); OCL_CHECK_ERROR (clEnqueueNDRangeKernel (queues[i], kernel, 1, NULL, &size, NULL, 0, NULL, &event)); clWaitForEvents (1, &event); g_timer_stop (timer); wall_clock += g_timer_elapsed (timer, NULL); get_event_times (event, &wait, &execution); clReleaseEvent (event); total_wait += wait; total_execution += execution; } OCL_CHECK_ERROR (clGetDeviceInfo (devices[i], CL_DEVICE_NAME, 256, name, NULL)); /* all times in nano seconds */ printf ("%s %f %f %f\n", name, total_wait / ((double) NUM_RUNS), total_execution / ((double) NUM_RUNS), wall_clock / NUM_RUNS * 1000 * 1000 * 1000); } g_timer_destroy (timer); clReleaseKernel (kernel); clReleaseProgram (program); ocl_free (ocl); }
int task(cl_context context, cl_device_id device, cl_command_queue queue, void* data_) { const TaskData* data = (const TaskData*) data_; cl_int err; if (data->points % data->points_per_work_item) check_error(CLQMC_INVALID_VALUE, "points must be a multiple of points_per_work_item"); if (data->replications % data->replications_per_work_item) check_error(CLQMC_INVALID_VALUE, "replications must be a multiple of replications_per_work_item"); // Lattice buffer size_t pointset_size; // gen_vec is given in common.c clqmcLatticeRule* pointset = clqmcLatticeRuleCreate(data->points, DIMENSION, gen_vec, &pointset_size, &err); check_error(err, NULL); cl_mem pointset_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, pointset_size, pointset, &err); check_error(err, "cannot create point set buffer"); // Shifts buffer clqmc_fptype* shifts = (clqmc_fptype*) malloc(data->replications * DIMENSION * sizeof(clqmc_fptype)); // populate random shifts using a random stream clrngMrg31k3pStream* stream = clrngMrg31k3pCreateStreams(NULL, 1, NULL, &err); check_error(err, NULL); for (cl_uint i = 0; i < data->replications; i++) for (cl_uint j = 0; j < DIMENSION; j++) shifts[i * DIMENSION + j] = clrngMrg31k3pRandomU01(stream); err = clrngMrg31k3pDestroyStreams(stream); check_error(err, NULL); cl_mem shifts_buf = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_HOST_WRITE_ONLY | CL_MEM_COPY_HOST_PTR, data->replications * DIMENSION * sizeof(clqmc_fptype), shifts, &err); check_error(err, "cannot create shifts buffer"); // Output buffer size_t points_block_count = data->points / data->points_per_work_item; cl_mem output_buf = clCreateBuffer(context, CL_MEM_WRITE_ONLY | CL_MEM_HOST_READ_ONLY, data->replications * points_block_count * sizeof(clqmc_fptype), NULL, &err); check_error(err, "cannot create output buffer"); // OpenCL kernel cl_program program = build_program_from_file(context, device, "client/DocsTutorial/example4_kernel.cl", NULL); check_error(err, NULL); cl_kernel kernel = clCreateKernel(program, "simulateWithRQMC", &err); check_error(err, "cannot create kernel"); int iarg = 0; err = clSetKernelArg(kernel, iarg++, sizeof(pointset_buf), &pointset_buf); err |= clSetKernelArg(kernel, iarg++, sizeof(shifts_buf), &shifts_buf); err |= clSetKernelArg(kernel, iarg++, sizeof(data->points_per_work_item), &data->points_per_work_item); err |= clSetKernelArg(kernel, iarg++, sizeof(data->replications), &data->replications); err |= clSetKernelArg(kernel, iarg++, sizeof(output_buf), &output_buf); check_error(err, "cannot set kernel arguments"); // Execution cl_event ev; size_t global_size = (data->replications / data->replications_per_work_item) * points_block_count; err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global_size, NULL, 0, NULL, &ev); check_error(err, "cannot enqueue kernel"); err = clWaitForEvents(1, &ev); check_error(err, "error waiting for events"); clqmc_fptype* output = (clqmc_fptype*) malloc(data->replications * points_block_count * sizeof(clqmc_fptype)); err = clEnqueueReadBuffer(queue, output_buf, CL_TRUE, 0, data->replications * points_block_count * sizeof(clqmc_fptype), output, 0, NULL, NULL); check_error(err, "cannot read output buffer"); printf("\nAdvanced randomized quasi-Monte Carlo integration:\n\n"); err = clqmcLatticeRuleWriteInfo(pointset, stdout); check_error(err, NULL); printf("\n"); rqmcReport(data->replications, data->points, points_block_count, output); // Clean up clReleaseEvent(ev); clReleaseMemObject(output_buf); clReleaseMemObject(pointset_buf); clReleaseKernel(kernel); clReleaseProgram(program); free(output); err = clqmcLatticeRuleDestroy(pointset); check_error(err, NULL); return EXIT_SUCCESS; }
int main(int argc, char **argv) { cl_platform_id platforms[100]; cl_uint platforms_n = 0; CL_CHECK(clGetPlatformIDs(100, platforms, &platforms_n)); printf("=== %d OpenCL platform(s) found: ===\n", platforms_n); for (int i=0; i<platforms_n; i++) { char buffer[10240]; printf(" -- %d --\n", i); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_PROFILE, 10240, buffer, NULL)); printf(" PROFILE = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VERSION, 10240, buffer, NULL)); printf(" VERSION = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_NAME, 10240, buffer, NULL)); printf(" NAME = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_VENDOR, 10240, buffer, NULL)); printf(" VENDOR = %s\n", buffer); CL_CHECK(clGetPlatformInfo(platforms[i], CL_PLATFORM_EXTENSIONS, 10240, buffer, NULL)); printf(" EXTENSIONS = %s\n", buffer); } cl_device_id devices[100]; cl_uint devices_n = 0; // CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_ALL, 100, devices, &devices_n)); CL_CHECK(clGetDeviceIDs(NULL, CL_DEVICE_TYPE_GPU, 100, devices, &devices_n)); printf("=== %d OpenCL device(s) found on platform:\n", platforms_n); for (int i=0; i<devices_n; i++) { char buffer[10240]; cl_uint buf_uint; cl_ulong buf_ulong; printf(" -- %d --\n", i); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_NAME, sizeof(buffer), buffer, NULL)); printf(" DEVICE_NAME = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VENDOR, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VENDOR = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_VERSION, sizeof(buffer), buffer, NULL)); printf(" DEVICE_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DRIVER_VERSION, sizeof(buffer), buffer, NULL)); printf(" DRIVER_VERSION = %s\n", buffer); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_COMPUTE_UNITS = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_MAX_CLOCK_FREQUENCY, sizeof(buf_uint), &buf_uint, NULL)); printf(" DEVICE_MAX_CLOCK_FREQUENCY = %u\n", (unsigned int)buf_uint); CL_CHECK(clGetDeviceInfo(devices[i], CL_DEVICE_GLOBAL_MEM_SIZE, sizeof(buf_ulong), &buf_ulong, NULL)); printf(" DEVICE_GLOBAL_MEM_SIZE = %llu\n", (unsigned long long)buf_ulong); } if (devices_n == 0) return 1; cl_context context; context = CL_CHECK_ERR(clCreateContext(NULL, 1, devices, &pfn_notify, NULL, &_err)); const char *program_source[] = { "__kernel void simple_demo(__global int *src, __global int *dst, int factor)\n", "{\n", " int i = get_global_id(0);\n", " dst[i] = src[i] * factor;\n", "}\n" }; cl_program program; program = CL_CHECK_ERR(clCreateProgramWithSource(context, sizeof(program_source)/sizeof(*program_source), program_source, NULL, &_err)); if (clBuildProgram(program, 1, devices, "", NULL, NULL) != CL_SUCCESS) { char buffer[10240]; clGetProgramBuildInfo(program, devices[0], CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL); fprintf(stderr, "CL Compilation failed:\n%s", buffer); abort(); } CL_CHECK(clUnloadCompiler()); cl_mem input_buffer; input_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_READ_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); cl_mem output_buffer; output_buffer = CL_CHECK_ERR(clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(int)*NUM_DATA, NULL, &_err)); int factor = 2; cl_kernel kernel; kernel = CL_CHECK_ERR(clCreateKernel(program, "simple_demo", &_err)); CL_CHECK(clSetKernelArg(kernel, 0, sizeof(input_buffer), &input_buffer)); CL_CHECK(clSetKernelArg(kernel, 1, sizeof(output_buffer), &output_buffer)); CL_CHECK(clSetKernelArg(kernel, 2, sizeof(factor), &factor)); cl_command_queue queue; queue = CL_CHECK_ERR(clCreateCommandQueue(context, devices[0], 0, &_err)); for (int i=0; i<NUM_DATA; i++) { CL_CHECK(clEnqueueWriteBuffer(queue, input_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &i, 0, NULL, NULL)); } cl_event kernel_completion; size_t global_work_size[1] = { NUM_DATA }; CL_CHECK(clEnqueueNDRangeKernel(queue, kernel, 1, NULL, global_work_size, NULL, 0, NULL, &kernel_completion)); CL_CHECK(clWaitForEvents(1, &kernel_completion)); CL_CHECK(clReleaseEvent(kernel_completion)); printf("Result:"); for (int i=0; i<NUM_DATA; i++) { int data; CL_CHECK(clEnqueueReadBuffer(queue, output_buffer, CL_TRUE, i*sizeof(int), sizeof(int), &data, 0, NULL, NULL)); printf(" %d", data); } printf("\n"); CL_CHECK(clReleaseMemObject(input_buffer)); CL_CHECK(clReleaseMemObject(output_buffer)); CL_CHECK(clReleaseKernel(kernel)); CL_CHECK(clReleaseProgram(program)); CL_CHECK(clReleaseContext(context)); return 0; }
static void fill_cc_ocl_gpu_wrapper_helmholtzbem3d(void *data, uint kernel) { merge_data_nf *mdata = (merge_data_nf *) data; cl_int res; cl_uint nq2; cl_uint triangles; uint current_device; uint current_queue; uint current_kernel_id; uint current_thread; cl_uint num_devices = ocl_system.num_devices; cl_uint num_queues = ocl_system.queues_per_device; cl_event h2d[2], calc; cl_kernel oclkernel; cl_command_queue queue; size_t global_off[] = { 0 }; size_t local_size[] = { 128 }; size_t global_size[] = { ((mdata->pos + local_size[0] - 1) / local_size[0]) * local_size[0] }; field k = mdata->bem->k; field alpha = mdata->bem->alpha; /**************************************************** * Determine device, queue and kernel. ****************************************************/ current_device = omp_get_thread_num() / num_queues; current_queue = omp_get_thread_num() % num_queues; current_thread = current_queue + current_device * num_queues; current_kernel_id = current_queue + current_device * num_queues + kernel * num_devices * num_queues; oclkernel = ocl_bem3d.kernels[current_kernel_id]; queue = ocl_system.queues[current_thread]; /**************************************************** * Transfer input data. ****************************************************/ res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_ridx[current_thread], CL_FALSE, 0, mdata->pos * sizeof(uint), mdata->ridx, 0, NULL, h2d + 0); CL_CHECK(res) res = clEnqueueWriteBuffer(queue, ocl_bem3d.mem_cidx[current_thread], CL_FALSE, 0, mdata->pos * sizeof(uint), mdata->cidx, 0, NULL, h2d + 1); CL_CHECK(res) /**************************************************** * Setup kernel arguments for 'assemble_xxx_cc_list_z' ****************************************************/ triangles = ocl_bem3d.triangles; if (kernel == 0 || kernel == 4) { nq2 = ocl_bem3d.nq; res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem), &ocl_bem3d.mem_q_xw[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2); } else { nq2 = ocl_bem3d.nq2; res = clSetKernelArg(oclkernel, 0, sizeof(cl_mem), &ocl_bem3d.mem_q2_xw[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 1, sizeof(cl_uint), &nq2); } CL_CHECK(res) res = clSetKernelArg(oclkernel, 2, sizeof(cl_mem), &ocl_bem3d.mem_gr_t[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 3, sizeof(cl_mem), &ocl_bem3d.mem_gr_x[current_device]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 4, sizeof(cl_uint), &triangles); CL_CHECK(res) res = clSetKernelArg(oclkernel, 5, sizeof(cl_mem), &ocl_bem3d.mem_ridx[current_thread]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 6, sizeof(cl_mem), &ocl_bem3d.mem_cidx[current_thread]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 7, sizeof(cl_mem), &ocl_bem3d.mem_N[current_thread]); CL_CHECK(res) res = clSetKernelArg(oclkernel, 8, sizeof(cl_uint), &mdata->pos); CL_CHECK(res) res = clSetKernelArg(oclkernel, 9, sizeof(field), &k); CL_CHECK(res) res = clSetKernelArg(oclkernel, 10, sizeof(field), &alpha); CL_CHECK(res) /**************************************************** * Invoke the kernel 'assemble_xxx_cc_list_z' on the GPU ****************************************************/ res = clEnqueueNDRangeKernel(queue, oclkernel, 1, global_off, global_size, local_size, 2, h2d, &calc); CL_CHECK(res) /**************************************************** * Copy results back. ****************************************************/ res = clEnqueueReadBuffer(queue, ocl_bem3d.mem_N[current_thread], CL_TRUE, 0, mdata->pos * sizeof(field), mdata->N, 1, &calc, NULL); CL_CHECK(res) clReleaseEvent(h2d[0]); clReleaseEvent(h2d[1]); clReleaseEvent(calc); }
int NBody::runCLKernels() { cl_int status; cl_event events[1]; /* * Enqueue a kernel run call. */ size_t globalThreads[] = {numBodies}; size_t localThreads[] = {groupSize}; if(localThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize) { std::cout << "Unsupported: Device" "does not support requested number of work items."; return SDK_FAILURE; } status = clEnqueueNDRangeKernel( commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) { return SDK_FAILURE; } status = clFinish(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clFinish failed.")) { return SDK_FAILURE; } /* Copy data from new to old */ status = clEnqueueCopyBuffer(commandQueue, newPos, currPos, 0, 0, sizeof(cl_float4) * numBodies, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueCopyBuffer failed.(newPos->oldPos)")) { return SDK_FAILURE; } status = clEnqueueCopyBuffer(commandQueue, newVel, currVel, 0, 0, sizeof(cl_float4) * numBodies, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueCopyBuffer failed.(newVel->oldVels)")) { return SDK_FAILURE; } status = clFinish(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clFinish failed.")) { return SDK_FAILURE; } /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, currPos, CL_TRUE, 0, numBodies* sizeof(cl_float4), pos, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[0]); return SDK_SUCCESS; }
float sgemmMain(int rowa,int cola,int colb) { cl_context context = 0; cl_command_queue commandQueue = 0; cl_program program = 0; cl_device_id device = 0; cl_kernel kernel = 0; const unsigned int numberOfMemoryObjects = 3; cl_mem memoryObjectsa = 0; cl_mem memoryObjectsb = 0; cl_mem memoryObjectsc = 0; cl_int errorNumber; cl_uint clrowa = rowa; cl_uint clcola = cola; cl_uint clcolb = colb; int err; err = createContext(&context); LOGD("create context"); err = createCommandQueue(context, &commandQueue, &device); err = createProgram(context, device, "/mnt/sdcard/kernel/sgemm.cl", &program); kernel = clCreateKernel(program, "sgemm", &errorNumber); LOGD("createKernel code %d",errorNumber); LOGD("start computing"); float alpha = 1; float beta = 0.1; /* Create the matrices. */ size_t matrixSizea = rowa * cola; size_t matrixSizeb = cola * colb; size_t matrixSizec = rowa * colb; /* As all the matrices have the same size, the buffer size is common. */ size_t bufferSizea = matrixSizea * sizeof(float); size_t bufferSizeb = matrixSizeb * sizeof(float); size_t bufferSizec = matrixSizec * sizeof(float); /* Create buffers for the matrices used in the kernel. */ int createMemoryObjectsSuccess = 0; memoryObjectsa = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizea, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsb = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, bufferSizeb, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; memoryObjectsc = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR, bufferSizec, NULL, &errorNumber); createMemoryObjectsSuccess &= errorNumber; LOGD("create memory err %d",createMemoryObjectsSuccess); int mapMemoryObjectsSuccess = 0; cl_float* matrixA = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsa, CL_TRUE, CL_MAP_WRITE, 0, bufferSizea, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixB = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsb, CL_TRUE, CL_MAP_WRITE, 0, bufferSizeb, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; cl_float* matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_WRITE, 0, bufferSizec, 0, NULL, NULL, &errorNumber); mapMemoryObjectsSuccess &= errorNumber; LOGD("map memory err %d",mapMemoryObjectsSuccess); sgemmInitialize(rowa,cola,colb, matrixA, matrixB, matrixC); LOGD("data initial finish"); int unmapMemoryObjectsSuccess = 0; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsa, matrixA, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsb, matrixB, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; errorNumber = clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("memory code %d",errorNumber); unmapMemoryObjectsSuccess &= errorNumber; LOGD("unmap memory err %d",unmapMemoryObjectsSuccess); int setKernelArgumentsSuccess = 0; errorNumber = clSetKernelArg(kernel, 0, sizeof(cl_mem), &memoryObjectsa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 1, sizeof(cl_mem), &memoryObjectsb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 2, sizeof(cl_mem), &memoryObjectsc); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 3, sizeof(cl_uint), &clrowa); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 4, sizeof(cl_uint), &clcola); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 5, sizeof(cl_uint), &clcolb); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 6, sizeof(cl_float), &alpha); setKernelArgumentsSuccess &= errorNumber; errorNumber = clSetKernelArg(kernel, 7, sizeof(cl_float), &beta); setKernelArgumentsSuccess &= errorNumber; LOGD("setKernel err %d",setKernelArgumentsSuccess); LOGD("start running kernel"); clock_t start_t,end_t; float cost_time; start_t = clock(); cl_event event = 0; size_t globalWorksize[2] = {rowa, colb}; errorNumber = clEnqueueNDRangeKernel(commandQueue, kernel, 2, NULL, globalWorksize, NULL, 0, NULL, &event); //LOGD("Enqueue err code %d",errorNumber); errorNumber = clFinish(commandQueue); end_t = clock(); cost_time = (float)(end_t-start_t)/CLOCKS_PER_SEC*1000; LOGD("Finish err code %d",errorNumber); float time; time = printProfilingInfo(event); LOGT("using CPU clock: %f ms",cost_time); LOGT("using GPU clock: %f ms",time); clReleaseEvent(event); matrixC = (cl_float*)clEnqueueMapBuffer(commandQueue, memoryObjectsc, CL_TRUE, CL_MAP_READ, 0, bufferSizec, 0, NULL, NULL, &errorNumber); clEnqueueUnmapMemObject(commandQueue, memoryObjectsc, matrixC, 0, NULL, NULL); LOGD("read out matrixC finish"); LOGD("matrixC value C(0,0): %f",matrixC[0]); cleanUpOpenCL(context, commandQueue, program, kernel, memoryObjectsa, memoryObjectsb,memoryObjectsc,numberOfMemoryObjects); LOGD("RUNNING finsh"); return time; }
int BoxFilterGLSeparable::runCLKernels() { cl_int status; cl_event events[2]; /* Set appropriate arguments to the kernel */ /* input buffer image */ status = clSetKernelArg( horizontalKernel, 0, sizeof(cl_mem), &inputImageBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (inputImageBuffer)")) { return SDK_FAILURE; } /* outBuffer imager */ status = clSetKernelArg( horizontalKernel, 1, sizeof(cl_mem), &tempImageBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (outputImageBuffer)")) { return SDK_FAILURE; } /* filter width */ status = clSetKernelArg( horizontalKernel, 2, sizeof(cl_int), &filterWidth); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (filterWidth)")) { return SDK_FAILURE; } #ifdef USE_LDS /* shared memory */ status = clSetKernelArg( horizontalKernel, 3, (GROUP_SIZE + filterWidth - 1) * sizeof(cl_uchar4), 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (local memory)")) { return SDK_FAILURE; } #endif /* * Enqueue a kernel run call. */ size_t globalThreads[] = {width, height}; size_t localThreads[] = {blockSizeX, blockSizeY}; status = clEnqueueNDRangeKernel( commandQueue, horizontalKernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) { return SDK_FAILURE; } status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; /* Do vertical pass */ /* Set appropriate arguments to the kernel */ /* input buffer image */ status = clSetKernelArg( verticalKernel, 0, sizeof(cl_mem), &tempImageBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (inputImageBuffer)")) { return SDK_FAILURE; } // Acquire GL buffer status = clEnqueueAcquireGLObjects(commandQueue, 1, &outputImageBuffer, 0, 0, NULL); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueAcquireGLObjects failed.")) return SDK_FAILURE; /* outBuffer imager */ status = clSetKernelArg( verticalKernel, 1, sizeof(cl_mem), &outputImageBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (outputImageBuffer)")) { return SDK_FAILURE; } /* filter width */ status = clSetKernelArg( verticalKernel, 2, sizeof(cl_int), &filterWidth); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (filterWidth)")) { return SDK_FAILURE; } /* * Enqueue a kernel run call. */ //size_t globalThreads[] = {width, height}; //size_t localThreads[] = {blockSizeX, blockSizeY}; status = clEnqueueNDRangeKernel( commandQueue, verticalKernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) { return SDK_FAILURE; } status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; /* Read buffer only if verification flag is true */ if(verify) { /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, outputImageBuffer, CL_TRUE, 0, width * height * pixelSize, outputImageData, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; } /* Now OpenGL gets control of outputImageBuffer */ status = clEnqueueReleaseGLObjects(commandQueue, 1, &outputImageBuffer, 0, 0, 0); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueReleaseGLObjects failed.")) return SDK_FAILURE; status = clFinish(commandQueue); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clFinish failed.")) return SDK_FAILURE; clReleaseEvent(events[0]); return SDK_SUCCESS; }
int BinomialOption::runCLKernels() { cl_int status; /* * This algorithm reduces each group of work-items to a single value * on OpenCL device */ /* Set appropriate arguments to the kernel */ /* number of steps */ status = clSetKernelArg(kernel, 0, sizeof(int), (void*)&numSteps); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (numSteps)")) { return SDK_FAILURE; } /* randBuffer */ status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void*)&randBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (randBuffer)")) { return SDK_FAILURE; } /* outBuffer */ status = clSetKernelArg(kernel, 2, sizeof(cl_mem), (void*)&outBuffer); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (outBuffer)")) { return SDK_FAILURE; } /* local memory callA */ status = clSetKernelArg(kernel, 3, (numSteps + 1) * sizeof(cl_float4), NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (callA)")) { return SDK_FAILURE; } /* local memory callB */ status = clSetKernelArg(kernel, 4, numSteps * sizeof(cl_float4), NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clSetKernelArg failed. (callB)")) { return SDK_FAILURE; } /** * Enqueue a kernel run call. */ size_t globalThreads[] = {numSamples * (numSteps + 1)}; size_t localThreads[] = {numSteps + 1}; if(localThreads[0] > maxWorkItemSizes[0] || localThreads[0] > maxWorkGroupSize) { std::cout << "Unsupported: Device does not support" "requested number of work items."; return SDK_FAILURE; } status = clGetKernelWorkGroupInfo(kernel, devices[deviceId], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &usedLocalMemory, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clGetKernelWorkGroupInfo" "CL_KERNEL_LOCAL_MEM_SIZE failed.")) { return SDK_FAILURE; } if(usedLocalMemory > totalLocalMemory) { std::cout << "Unsupported: Insufficient local memory on device." << std::endl; return SDK_FAILURE; } status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, globalThreads, localThreads, 0, NULL, NULL); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) { return SDK_FAILURE; } status = clFinish(commandQueue); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clFinish failed.")) { return SDK_FAILURE; } cl_event events[1]; /* Enqueue readBuffer*/ status = clEnqueueReadBuffer(commandQueue, outBuffer, CL_TRUE, 0, numSamples * sizeof(cl_float4), output, 0, NULL, &events[0]); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) { return SDK_FAILURE; } /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal(status, CL_SUCCESS, "clWaitForEvents failed.")) { return SDK_FAILURE; } clReleaseEvent(events[0]); return SDK_SUCCESS; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufX, bufAsum, scratchBuff; cl_event event = NULL; int ret = 0; int lenX = 1 + (N-1)*abs(incx); /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, (lenX*sizeof(cl_float)), NULL, &err); bufAsum = clCreateBuffer(ctx, CL_MEM_WRITE_ONLY, (sizeof(cl_float)), NULL, &err); // Allocate minimum of N elements scratchBuff = clCreateBuffer(ctx, CL_MEM_READ_WRITE, (N*sizeof(cl_float)), NULL, &err); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, (lenX*sizeof(cl_float)), X, 0, NULL, NULL); /* Call clblas function. */ err = clblasSasum( N, bufAsum, 0, bufX, 0, incx, scratchBuff, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasSasum() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufAsum, CL_TRUE, 0, sizeof(cl_float), &asum, 0, NULL, NULL); printf("Result : %f\n", asum); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufAsum); clReleaseMemObject(scratchBuff); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
// Main function // ********************************************************************* int main(int argc, char **argv) { shrQAStart(argc, argv); int NUM_BLOCKS = 10; shrSetLogFileName ("Barrier_Centralized.txt"); while(NUM_BLOCKS<=120) { int iNumElements = NUM_BLOCKS* NUM_THREADS; // total num of threads // BARRIER GOAL int goal_val = NUM_BLOCKS; // get command line arg for quick test, if provided bNoPrompt = shrCheckCmdLineFlag(argc, (const char**)argv, "noprompt"); // start logs cExecutableName = argv[0]; shrSetLogFileName ("Barrier.txt"); shrLog("%s Starting...\n\n# of THREADS \t= %i\n", argv[0], iNumElements); // set and log Global and Local work size dimensions szLocalWorkSize = NUM_THREADS ; szGlobalWorkSize = shrRoundUp((int)szLocalWorkSize, iNumElements); // rounded up to the nearest multiple of the LocalWorkSize shrLog("Global Work Size \t\t= %u\nLocal Work Size \t\t= %u\n# of Work Groups \t\t= %u\n\n", szGlobalWorkSize, szLocalWorkSize, (szGlobalWorkSize % szLocalWorkSize + szGlobalWorkSize/szLocalWorkSize)); //Get an OpenCL platform ciErr1 = clGetPlatformIDs(1, &cpPlatform, NULL); shrLog("clGetPlatformID...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetPlatformID, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Get the devices ciErr1 = clGetDeviceIDs(cpPlatform, CL_DEVICE_TYPE_GPU, 1, &cdDevice, NULL); shrLog("clGetDeviceIDs...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clGetDeviceIDs, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } //Create the context cxGPUContext = clCreateContext(0, 1, &cdDevice, NULL, NULL, &ciErr1); shrLog("clCreateContext...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateContext, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create a command-queue cqCommandQueue = clCreateCommandQueue(cxGPUContext, cdDevice, CL_QUEUE_PROFILING_ENABLE, &ciErr1); shrLog("clCreateCommandQueue...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateCommandQueue, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Read the OpenCL kernel in from source file shrLog("oclLoadProgSource (%s)...\n", cSourceFile); cPathAndName = shrFindFilePath(cSourceFile, argv[0]); cSourceCL = oclLoadProgSource(cPathAndName, "", &szKernelLength); // Create the program cpProgram = clCreateProgramWithSource(cxGPUContext, 1, (const char **)&cSourceCL, &szKernelLength, &ciErr1); shrLog("clCreateProgramWithSource...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateProgramWithSource, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Build the program with 'mad' Optimization option #ifdef MAC char* flags = "-cl-fast-relaxed-math -DMAC"; #else char* flags = "-cl-fast-relaxed-math"; #endif ciErr1 = clBuildProgram(cpProgram, 0, NULL, NULL, NULL, NULL); shrLog("clBuildProgram...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clBuildProgram, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Create the kernel ckKernel = clCreateKernel(cpProgram, "Barrier", &ciErr1); shrLog("clCreateKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Allocate and initialize host arrays shrLog( "Allocate and Init Host Mem...\n"); input = (int *)malloc(sizeof(int) * NUM_BLOCKS); for(int i =0; i<=NUM_BLOCKS; i++) { input[i]=0; } // Allocate the OpenCL buffer memory objects for source and result on the device GMEM array_in = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); array_out = clCreateBuffer(cxGPUContext, CL_MEM_READ_WRITE, sizeof(int)* NUM_BLOCKS, NULL, &ciErr1); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clCreateBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Set the Argument values ciErr1 = clSetKernelArg(ckKernel, 0, sizeof(cl_int), (void*)&goal_val); ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_mem), (void*)&array_in); ciErr1 |= clSetKernelArg(ckKernel, 2, sizeof(cl_mem), (void*)&array_out); // ciErr1 |= clSetKernelArg(ckKernel, 1, sizeof(cl_int), (void*)&iNumElements); shrLog("clSetKernelArg 0 - 2...\n\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clSetKernelArg, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // -------------------------------------------------------- // Start Core sequence... copy input data to GPU, compute, copy results back ciErr1 = clEnqueueWriteBuffer(cqCommandQueue, array_in, CL_FALSE, 0, sizeof(int) * NUM_BLOCKS,(void*) input, 0, NULL, NULL); shrLog("clEnqueueWriteBuffer (SrcA and SrcB)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueWriteBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } // Launch kernel ciErr1 = clEnqueueNDRangeKernel(cqCommandQueue, ckKernel, 1, NULL, &szGlobalWorkSize, &szLocalWorkSize, 0, NULL, &ceEvent); shrLog("clEnqueueNDRangeKernel (Barrier)...\n"); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueNDRangeKernel, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); } /*ciErr1 = clEnqueueReadBuffer(cqCommandQueue, global_mutex, CL_TRUE, 0, sizeof(cl_int), &original_goal, 0, NULL, NULL); shrLog("clEnqueueReadBuffer (Dst)...%d \n\n", original_goal); if (ciErr1 != CL_SUCCESS) { shrLog("Error in clEnqueueReadBuffer, Line %u in file %s !!!\n\n", __LINE__, __FILE__); Cleanup(argc, argv, EXIT_FAILURE); }*/ //GPU_PROFILING ciErr1=clWaitForEvents(1, &ceEvent); if (ciErr1 != CL_SUCCESS) { shrLog("Error 1 !\n\n"); Cleanup(argc, argv, EXIT_FAILURE); } cl_ulong start, end; ciErr1 = clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); ciErr1 |= clGetEventProfilingInfo(ceEvent, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); //oclCheckErrorEX(ciErrNum, CL_SUCCESS, pCleanup); if (ciErr1 != CL_SUCCESS) { shrLog("Error 2 !\n\n"); Cleanup(argc, argv, EXIT_FAILURE); } double dSeconds = 1.0e-9 * (double)(end - start); shrLog("Done! time taken %ul \n",end - start ); // shrLog("Done! Kernel execution time: %.5f s\n\n", dSeconds); // Release event clReleaseEvent(ceEvent); ceEvent = 0; Cleanup (argc, argv, EXIT_SUCCESS); NUM_BLOCKS = NUM_BLOCKS+10; } shrQAFinishExit(argc, (const char **)argv, QA_PASSED); }
int URNG::runCLKernels() { cl_int status; cl_event events[2]; /* Set appropriate arguments to the kernel */ /* input buffer image */ status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputImageBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (inputImageBuffer)")) { return SDK_FAILURE; } /* outBuffer imager */ status = clSetKernelArg(kernel, 1, sizeof(cl_mem),&outputImageBuffer); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (outputImageBuffer)")) { return SDK_FAILURE; } /* input buffer image */ status = clSetKernelArg(kernel, 2, sizeof(factor), &factor); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clSetKernelArg failed. (factor)")) { return SDK_FAILURE; } /* * Enqueue a kernel run call. */ size_t globalThreads[] = {width, height}; size_t localThreads[] = {blockSizeX, blockSizeY}; status = clEnqueueNDRangeKernel( commandQueue, kernel, 2, NULL, globalThreads, localThreads, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueNDRangeKernel failed.")) { return SDK_FAILURE; } status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; /* Enqueue readBuffer*/ status = clEnqueueReadBuffer( commandQueue, outputImageBuffer, CL_TRUE, 0, width * height * pixelSize, outputImageData, 0, NULL, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clEnqueueReadBuffer failed.")) return SDK_FAILURE; /* Wait for the read buffer to finish execution */ status = clWaitForEvents(1, &events[0]); if(!sampleCommon->checkVal( status, CL_SUCCESS, "clWaitForEvents failed.")) return SDK_FAILURE; clReleaseEvent(events[0]); return SDK_SUCCESS; }
int main(void) { cl_int err; cl_platform_id platform = 0; cl_device_id device = 0; cl_context_properties props[3] = { CL_CONTEXT_PLATFORM, 0, 0 }; cl_context ctx = 0; cl_command_queue queue = 0; cl_mem bufA, bufX; cl_event event = NULL; int ret = 0; /* Setup OpenCL environment. */ err = clGetPlatformIDs(1, &platform, NULL); if (err != CL_SUCCESS) { printf( "clGetPlatformIDs() failed with %d\n", err ); return 1; } err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); if (err != CL_SUCCESS) { printf( "clGetDeviceIDs() failed with %d\n", err ); return 1; } props[1] = (cl_context_properties)platform; ctx = clCreateContext(props, 1, &device, NULL, NULL, &err); if (err != CL_SUCCESS) { printf( "clCreateContext() failed with %d\n", err ); return 1; } queue = clCreateCommandQueue(ctx, device, 0, &err); if (err != CL_SUCCESS) { printf( "clCreateCommandQueue() failed with %d\n", err ); clReleaseContext(ctx); return 1; } /* Setup clblas. */ err = clblasSetup(); if (err != CL_SUCCESS) { printf("clblasSetup() failed with %d\n", err); clReleaseCommandQueue(queue); clReleaseContext(ctx); return 1; } /* Prepare OpenCL memory objects and place matrices inside them. */ bufA = clCreateBuffer(ctx, CL_MEM_READ_WRITE, N * lda * sizeof(cl_float2), NULL, &err); bufX = clCreateBuffer(ctx, CL_MEM_READ_ONLY, N * sizeof(cl_float2), NULL, &err); err = clEnqueueWriteBuffer(queue, bufA, CL_TRUE, 0, N * lda * sizeof(cl_float2), A, 0, NULL, NULL); err = clEnqueueWriteBuffer(queue, bufX, CL_TRUE, 0, N * sizeof(cl_float2), X, 0, NULL, NULL); err = clblasCher(order, uplo, N, alpha, bufX, 0 /*offx */, incx, bufA, 0 /*offa */, lda, 1, &queue, 0, NULL, &event); if (err != CL_SUCCESS) { printf("clblasCher() failed with %d\n", err); ret = 1; } else { /* Wait for calculations to be finished. */ err = clWaitForEvents(1, &event); /* Fetch results of calculations from GPU memory. */ err = clEnqueueReadBuffer(queue, bufA, CL_TRUE, 0, (N * lda * sizeof(cl_float2)), A, 0, NULL, NULL); /* At this point you will get the result of CHER placed in A array. */ printResult(); } /* Release OpenCL events. */ clReleaseEvent(event); /* Release OpenCL memory objects. */ clReleaseMemObject(bufX); clReleaseMemObject(bufA); /* Finalize work with clblas. */ clblasTeardown(); /* Release OpenCL working objects. */ clReleaseCommandQueue(queue); clReleaseContext(ctx); return ret; }
void filter_OpenCL_impl(ComputeEnv *env, Buffer *packed_input_buf, Buffer *packed_output_buf, int nInputPlanes, int nOutputPlanes, const float *fbiases, const float *weight, int w, int h, int nJob) { cl_int err; int dev_id = 0; OpenCLDev *dev = &env->cl_dev_list[dev_id]; size_t in_size = sizeof(float) * w * h * nInputPlanes; cl_context context = dev->context; cl_mem cl_packed_input = packed_input_buf->get_read_ptr_cl(env, dev_id, in_size); cl_mem cl_packed_output = packed_output_buf->get_write_ptr_cl(env, dev_id); cl_mem cl_fbiases = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, sizeof(float) * nOutputPlanes, (void*)fbiases, &err ); enum filter_type { FILTER_GENERIC, FILTER_IN1, FILTER_IN3, FILTER_OUT1, FILTER_OUT3, } type = FILTER_GENERIC; cl_kernel ker = dev->ker_filter; bool static_nplane = false; if (nInputPlanes == 1 && nOutputPlanes == 32) { type = FILTER_IN1; ker = dev->ker_filter_in1_out32; } else if (nInputPlanes == 3 && nOutputPlanes == 32) { type = FILTER_IN3; ker = dev->ker_filter_in3_out32; static_nplane = true; } else if (nOutputPlanes == 1 && nInputPlanes == 128) { type = FILTER_OUT1; ker = dev->ker_filter_in128_out1; } else if (nOutputPlanes == 3 && nInputPlanes == 128) { type = FILTER_OUT3; ker = dev->ker_filter_in128_out3; static_nplane = true; } size_t weight_size; if (type == FILTER_GENERIC) { weight_size = sizeof(float) * GPU_VEC_WIDTH * nInputPlanes * 9; } else { weight_size = sizeof(float) * nOutputPlanes * nInputPlanes * 9; } cl_mem cl_weight = clCreateBuffer(context, CL_MEM_READ_ONLY|CL_MEM_COPY_HOST_PTR, weight_size, (void*)weight, &err ); int ai = 0; clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_packed_input); if (! static_nplane) { clSetKernelArg(ker, ai++, sizeof(cl_int), &nInputPlanes); } clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_packed_output); if (! static_nplane) { clSetKernelArg(ker, ai++, sizeof(cl_int), &nOutputPlanes); } clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_fbiases); clSetKernelArg(ker, ai++, sizeof(cl_int), &h); clSetKernelArg(ker, ai++, sizeof(cl_int), &w); clSetKernelArg(ker, ai++, sizeof(cl_mem), &cl_weight); size_t local_size = 0; //local_size += sizeof(float) * 256; //local_size += sizeof(float) * GPU_VEC_WIDTH; if (type == FILTER_GENERIC) { local_size += sizeof(float) * nInputPlanes * (GPU_BLOCK_SIZE+2) * 3; clSetKernelArg(ker, ai++, local_size, nullptr); } cl_event event; size_t gws[3] = {1, 1, 1}; size_t lws[3] = {1, 1, 1}; if (type == FILTER_GENERIC) { gws[0] = h * nOutputPlanes; lws[0] = nOutputPlanes; } else if (type == FILTER_IN1) { gws[0] = h * 256; lws[0] = 256; } else if (type == FILTER_OUT1 || type == FILTER_OUT3) { gws[0] = h*128; lws[0] = 128; } else if (type == FILTER_IN3) { gws[0] = h * 192; lws[0] = 192; } err = clEnqueueNDRangeKernel(dev->queue, ker, 3, nullptr, gws, lws, 0, nullptr, &event); if (err != CL_SUCCESS) { printf("enqueue ndrange error : %d\n", err); exit(1); } err = clWaitForEvents(1, &event); if (err != CL_SUCCESS) { printf("wait ndrange error : %d\n", err); exit(1); } if (err != CL_SUCCESS) { printf("read buffer error : %d\n", err); exit(1); } clReleaseMemObject(cl_fbiases); clReleaseMemObject(cl_weight); clReleaseEvent(event); }
int LDSBandwidth::bandwidth(cl_kernel &kernel) { cl_int status; // Check group size against kernelWorkGroupSize status = clGetKernelWorkGroupInfo(kernel, devices[sampleArgs->deviceId], CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &kernelWorkGroupSize, 0); CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo failed."); if(localThreads > kernelWorkGroupSize) { localThreads = kernelWorkGroupSize; } // Set appropriate arguments to the kernel size_t size = (NUM_READS + localThreads) * vectorSize * sizeof(cl_float); // Local memory status = clSetKernelArg(kernel, 0, size, 0); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(local memory)"); // Output buffer status = clSetKernelArg(kernel, 1, sizeof(cl_mem), (void *)&outputBuffer); CHECK_OPENCL_ERROR(status, "clSetKernelArg failed.(outputBuffer)"); // Get used local memory status = clGetKernelWorkGroupInfo(kernel, devices[sampleArgs->deviceId], CL_KERNEL_LOCAL_MEM_SIZE, sizeof(cl_ulong), &usedLocalMemory, NULL); CHECK_OPENCL_ERROR(status, "clGetKernelWorkGroupInfo CL_KERNEL_LOCAL_MEM_SIZE failed."); if(usedLocalMemory > deviceInfo.localMemSize) { std::cout << "Unsupported: Insufficient local memory on device." << std::endl; return SDK_FAILURE; } double sec = 0; if(sampleArgs->deviceType.compare("cpu") == 0) { iterations = 10; } // Run the kernel for a number of iterations for(int i = 0; i < iterations; i++) { // Enqueue a kernel run call cl_event ndrEvt; status = clEnqueueNDRangeKernel(commandQueue, kernel, 1, NULL, &globalThreads, &localThreads, 0, NULL, &ndrEvt); CHECK_OPENCL_ERROR(status, "clEnqueueNDRangeKernel failed."); // wait for the kernel call to finish execution status = clWaitForEvents(1, &ndrEvt); CHECK_OPENCL_ERROR(status, "clWaitForEvents failed."); // Calculate performance cl_ulong startTime; cl_ulong endTime; // Get kernel profiling info status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &startTime, 0); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(startTime)"); status = clGetEventProfilingInfo(ndrEvt, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &endTime, 0); CHECK_OPENCL_ERROR(status, "clGetEventProfilingInfo failed.(endTime)"); // Cumulate time for each iteration sec += 1e-9 * (endTime - startTime); status = clReleaseEvent(ndrEvt); CHECK_OPENCL_ERROR(status, "clReleaseEvent failed.(endTime)"); } // Copy bytes int bytesPerThread = 0; if(vec3 == true) { bytesPerThread = NUM_READS * 3 * sizeof(cl_float); } else { bytesPerThread = NUM_READS * vectorSize * sizeof(cl_float); } double bytes = (double)(iterations * bytesPerThread); double perf = (bytes / sec) * 1e-9; perf *= globalThreads; std::cout << ": " << perf << " GB/s" << std::endl; return SDK_SUCCESS; }
int main(int argc, const char** argv) { size_t x = 512, y = 250000; //y has to be a multiple of ciDeviceCount! struct svm_node* px = (struct svm_node*)malloc((x+1)*sizeof(struct svm_node)); gen_data(px, x, 1, 3); struct svm_node* py = (struct svm_node*)malloc((x+1)*y*sizeof(struct svm_node)); for(size_t i = 0; i < y; ++i) { struct svm_node* tmp = py+i*(x+1); gen_data(tmp, x, 3,2); } dtype* result = (dtype*)malloc(y*sizeof(dtype)); int* pyLength = (int*)malloc(y*sizeof(int)); for(size_t i = 0; i < y; ++i) { for(size_t j = 0; py[i*(x+1)+j].index >= 0; ++j) pyLength[i] = py[i*(x+1)+j].index; ++pyLength[i]; } cl_int err = CL_SUCCESS; // cl_platform_id platform = NULL; // cl_uint ciDeviceCount = 0; // cl_device_id *device = NULL; // retrieve devices cl_platform_id platform; err = clGetPlatformIDs(1, &platform, NULL); cl_device_id device; err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_CPU, 1, &device, NULL); size_t localDim = 256l; size_t globalDim = localDim*y; /* device = (cl_device_id *)malloc(ciDeviceCount * sizeof(cl_device_id) ); err = clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, ciDeviceCount, device, NULL); if (err != CL_SUCCESS) { printf("Failed to get devices:\n%s\n", oclErrorString(err)); return -1; } */ //Create the context cl_context context1 = clCreateContext(0, 1, &device, NULL, NULL, &err); if(err != CL_SUCCESS) { printf("Context creation failed:\n%d\n", err); return -1; } // create a command queue for first device the context reported cl_command_queue queue = clCreateCommandQueue(context1, device, 0, 0); // load program from disk char *tmp = strdup(argv[0]); char* my_dir = dirname(tmp); // size_t program_length; char path[256]; snprintf(path, PATH_MAX - 1, "%s/vecops.cl", my_dir); cl_program vecops = load_kernel(path, context1); if(err != CL_SUCCESS) { printf("Program creation failed:\n%d\n", (err)); return -1; } err = clBuildProgram(vecops, 0, NULL, "-I.", NULL, NULL); if(err != CL_SUCCESS) { err = clGetProgramBuildInfo(vecops, device, CL_PROGRAM_BUILD_LOG, 8192, buffer, NULL); if(err != CL_SUCCESS) printf("Cannot get build info: %d\n", (err)); printf("Build log:\n%s\n", buffer); } // create kernel cl_kernel sparsedot_kernel; #if version == 1 sparsedot_kernel = clCreateKernel(vecops, "sparsedot1_kernel", &err); #endif #if version == 2 sparsedot_kernel = clCreateKernel(vecops, "sparsedot4_kernel", &err); #endif #if version == 3 sparsedot_kernel = clCreateKernel(vecops, "sparsedot3_kernel", &err); #endif if (err != CL_SUCCESS) { printf("Kernel creation failed:\n%d\n", (err)); return -1; } // allocate memory on the devices cl_mem px_d, py_d, result_d, pyLength_d; #if version == 1 px_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, (x+1) * sizeof(struct svm_node), 0, &err); #endif #if version == 2 || version == 3 //unpack px int size = px[x-1].index+1; for(size_t i = 0; i < y; ++i) size = size > pyLength[i] ? size : pyLength[i]; dtype* px_u = (dtype*)calloc(size, sizeof(dtype)); unpack(px, px_u); printf("px size: %d\n", size); #endif #if version == 3 size_t height, width; clGetDeviceInfo(device, CL_DEVICE_IMAGE2D_MAX_HEIGHT, sizeof(size_t), &height, 0); clGetDeviceInfo(Device, CL_DEVICE_IMAGE2D_MAX_WIDTH, sizeof(size_t), &width, 0); size_t region[3]; region[2] = 1; region[0] = min(4, size); region[1] = (size+2-1) / 4; cl_image_format px_format; px_format.image_channel_order = CL_R; px_format.image_channel_data_type = CL_FLOAT; #endif #if version == 2 px_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, size * sizeof(dtype), 0, &err); #endif #if version == 3 px_d = clCreateImage2D(context1, CL_MEM_READ_ONLY, &px_format, region[0], region[1], 0, 0, &err); #endif if(err != CL_SUCCESS) { printf("Failed to allocate px:\n%d\n", (err)); return -1; } py_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, (x+1) * y * sizeof(struct svm_node), 0, &err); if(err != CL_SUCCESS) { printf("Failed to allocate px:\n%d\n", (err)); return -1; } result_d = clCreateBuffer(context1, CL_MEM_WRITE_ONLY, y * sizeof(dtype), 0, 0); pyLength_d = clCreateBuffer(context1, CL_MEM_READ_ONLY, y * sizeof(int), 0, 0); #if bench //start time measurement start_timer(0); #endif // copy host vectors to device err = CL_SUCCESS; err |= clEnqueueWriteBuffer(queue, py_d, CL_FALSE, 0, (x+1) * y * sizeof(struct svm_node), py, 0, NULL, NULL); err |= clEnqueueWriteBuffer(queue, pyLength_d, CL_FALSE, 0, y * sizeof(int), pyLength, 0, NULL, NULL); #if version == 1 err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, (x+1) * sizeof(struct svm_node), px, 0, NULL, NULL); #endif #if version == 2 err |= clEnqueueWriteBuffer(queue, px_d, CL_FALSE, 0, size * sizeof(dtype), px_u, 0, NULL, NULL); #endif #if version == 3 size_t offset[] = {0,0,0}; err |= clEnqueueWriteImage(queue, px_d, CL_TRUE, offset, region, sizeof(dtype), 0, px_u, 0, 0, NULL); #endif clFinish(queue); if(err != CL_SUCCESS) { printf("Data transfer to GPU failed:\n%d\n", (err)); return -1; } #if bench stop_timer(0); start_timer(1); #endif // set kernel arguments clSetKernelArg(sparsedot_kernel, 0, sizeof(cl_mem), (void *) &px_d); clSetKernelArg(sparsedot_kernel, 1, sizeof(cl_mem), (void *) &py_d); clSetKernelArg(sparsedot_kernel, 2, sizeof(cl_mem), (void *) &result_d); clSetKernelArg(sparsedot_kernel, 3, sizeof(cl_mem), (void *) &pyLength_d); clSetKernelArg(sparsedot_kernel, 4, sizeof(cl_ulong), (void *) &x); clSetKernelArg(sparsedot_kernel, 5, sizeof(cl_ulong), (void *) &y); // clSetKernelArg(sparsedot_kernel, 6, sizeof(cl_float8)*localDim, 0); #if version == 3 clSetKernelArg(sparsedot_kernel, 7, sizeof(cl_long), (void *) ®ion[1]) ; clSetKernelArg(sparsedot_kernel, 8, sizeof(cl_long), (void *) ®ion[0]) ; #endif clFlush(queue); // start kernel err = clEnqueueNDRangeKernel(queue, sparsedot_kernel, 1, 0, &globalDim, &localDim, 0, NULL, 0); if(err != CL_SUCCESS) { printf("Kernel launch failed:\n%d\n", (err)); return -1; } clFinish(queue); #if bench stop_timer(1); start_timer(2); #endif cl_event result_gather; // Non-blocking copy of result from device to host err = clEnqueueReadBuffer(queue, result_d, CL_FALSE, 0, y * sizeof(dtype), result, 0, NULL, &result_gather); if(err != CL_SUCCESS) { printf("Reading result failed:\n%d\n", (err)); return -1; } // CPU sync with GPU clWaitForEvents(1, &result_gather); #if bench // stop GPU time measurement stop_timer(2); #endif //check result /* for(size_t i = 0; i < y; ++i) { printf("%f ", result[i]); } printf("\n"); */ #if bench start_timer(3); #endif bool correct = validate(px, py, result, x, y); #if bench stop_timer(3); printf("v%i; x: %lu, y: %lu\n", version, x, y); printf("CPU: %f, upcpy: %f DeviceCalc: %f, downcpy: %f\n", get_secs(3), get_secs(0), get_secs(1), get_secs(2)); #endif if(correct) printf("SUCCESS!\n"); //cleenup clReleaseKernel(sparsedot_kernel); clReleaseCommandQueue(queue); clReleaseEvent(result_gather); clReleaseMemObject(px_d); clReleaseMemObject(py_d); clReleaseMemObject(result_d); clReleaseMemObject(pyLength_d); // clReleaseDevice(device); free(px); #if version == 2 || version == 3 free(px_u); #endif free(py); free(result); return 0; }
/** * \related cl_Mem_Object_t * * This function map OpenCL Image into Host-accessible memory & returns pointer * to mapped memory region * @param[in,out] self pointer to structure, in which 'Map' function pointer * is defined to point on this function. * @param[in] blocking_map flag of type 'cl_bool' that denotes, should operation * be blocking or not. * @param [in] map_flags mapping flags, that denotes how memory object should be * mapped * @param[in] time_mode enumeration, that denotes how time measurement should be * performed * @param[out] evt_to_generate pointer to OpenCL event that will be generated * at the end of operation. * * @return pointer to Host-accessible region of memory in case of success, NULL * pointer otherwise. In that case function sets error value, which is available * through cl_Error_t structure, defined by pointer 'self->error' * * @see cl_err_codes.h for detailed error description. * @see 'cl_Error_t' structure for error handling. */ static void* Image_Map( scow_Mem_Object *self, cl_bool blocking_map, cl_map_flags map_flags, TIME_STUDY_MODE time_mode, cl_event *evt_to_generate, cl_command_queue explicit_queue) { cl_int ret; cl_event mapping_ready, *p_mapping_ready; const size_t origin[3] = { 0, 0, 0 }, region[3] = { self->width, self->height, 1 }; OCL_CHECK_EXISTENCE(self, NULL); if (blocking_map > CL_TRUE) { self->error->Set_Last_Code(self->error, INVALID_BLOCKING_FLAG); return NULL; } (evt_to_generate != NULL) ? (p_mapping_ready = evt_to_generate) : (p_mapping_ready = &mapping_ready); // We can't map the object, that is already mapped if (self->mapped_to_region != NULL) { self->error->Set_Last_Code(self->error, BUFFER_IN_USE); return VOID_MEM_OBJ_PTR; } cl_command_queue q = (explicit_queue == NULL) ? (self->parent_thread->q_data_dtoh) : (explicit_queue); /* Save mapped pointer inside a structure in case if memory object is being * destroyed without unmapping it at first. */ self->mapped_to_region = clEnqueueMapImage(q, self->cl_mem_object, blocking_map, map_flags, origin, region, &self->row_pitch, NULL, 0, NULL, p_mapping_ready, &ret); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, self->error->Set_Last_Code(self->error, ret), NULL); switch (time_mode) { case MEASURE: self->timer->current_time_device = Gather_Time_uS(p_mapping_ready); self->timer->total_time_device += self->timer->current_time_device; break; case DONT_MEASURE: break; default: break; } if (p_mapping_ready != evt_to_generate){ clReleaseEvent(*p_mapping_ready); } return self->mapped_to_region; }
void OCLAcceleratorMatrixHYB<ValueType>::Apply(const BaseVector<ValueType> &in, BaseVector<ValueType> *out) const { if (this->get_nnz() > 0) { assert(in. get_size() >= 0); assert(out->get_size() >= 0); assert(in. get_size() == this->get_ncol()); assert(out->get_size() == this->get_nrow()); const OCLAcceleratorVector<ValueType> *cast_in = dynamic_cast<const OCLAcceleratorVector<ValueType>*> (&in) ; OCLAcceleratorVector<ValueType> *cast_out = dynamic_cast< OCLAcceleratorVector<ValueType>*> (out) ; assert(cast_in != NULL); assert(cast_out!= NULL); // ELL if (this->get_ell_nnz() > 0) { int nrow = this->get_nrow(); int ncol = this->get_ncol(); int max_row = this->get_ell_max_row(); cl_int err; cl_event ocl_event; size_t localWorkSize[1]; size_t globalWorkSize[1]; err = clSetKernelArg( CL_KERNEL_ELL_SPMV, 0, sizeof(int), (void *) &nrow ); err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 1, sizeof(int), (void *) &ncol ); err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 2, sizeof(int), (void *) &max_row ); err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 3, sizeof(cl_mem), (void *) this->mat_.ELL.col ); err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 4, sizeof(cl_mem), (void *) this->mat_.ELL.val ); err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 5, sizeof(cl_mem), (void *) cast_in->vec_ ); err |= clSetKernelArg( CL_KERNEL_ELL_SPMV, 6, sizeof(cl_mem), (void *) cast_out->vec_ ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); localWorkSize[0] = this->local_backend_.OCL_max_work_group_size; localWorkSize[0] /= 0.5; globalWorkSize[0] = ( size_t( nrow / localWorkSize[0] ) + 1 ) * localWorkSize[0]; err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue, CL_KERNEL_ELL_SPMV, 1, NULL, &globalWorkSize[0], &localWorkSize[0], 0, NULL, &ocl_event); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); err = clWaitForEvents( 1, &ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); err = clReleaseEvent( ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); } // COO if (this->get_coo_nnz() > 0) { // do not support super small matrices assert(this->get_coo_nnz() > OPENCL_WARPSIZE); // ---------------------------------------------------------- // Modified and adopted from CUSP 0.3.1, // http://code.google.com/p/cusp-library/ // NVIDIA, APACHE LICENSE 2.0 // ---------------------------------------------------------- // see __spmv_coo_flat(...) // ---------------------------------------------------------- // CHANGELOG // - adopted interface // ---------------------------------------------------------- const unsigned int BLOCK_SIZE = this->local_backend_.OCL_max_work_group_size; // const unsigned int MAX_BLOCKS = this->local_backend_.GPU_max_blocks; const unsigned int MAX_BLOCKS = 32; // cusp::detail::device::arch::max_active_blocks(spmv_coo_flat_kernel<IndexType, ValueType, BLOCK_SIZE, UseCache>, BLOCK_SIZE, (size_t) 0); const unsigned int WARPS_PER_BLOCK = BLOCK_SIZE / OPENCL_WARPSIZE; const unsigned int num_units = this->get_coo_nnz() / OPENCL_WARPSIZE; const unsigned int num_warps = std::min(num_units, WARPS_PER_BLOCK * MAX_BLOCKS); const unsigned int num_blocks = (num_warps + (WARPS_PER_BLOCK-1)) / WARPS_PER_BLOCK; // (N + (granularity - 1)) / granularity const unsigned int num_iters = (num_units + (num_warps-1)) / num_warps; const unsigned int interval_size = OPENCL_WARPSIZE * num_iters; const int tail = num_units * OPENCL_WARPSIZE; // do the last few nonzeros separately (fewer than this->local_backend_.GPU_wrap elements) const unsigned int active_warps = (interval_size == 0) ? 0 : ((tail + (interval_size-1))/interval_size); cl_mem *temp_rows = NULL; cl_mem *temp_vals = NULL; allocate_ocl<int> (active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_rows); allocate_ocl<ValueType>(active_warps, OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_context, &temp_vals); cl_int err; cl_event ocl_event; size_t localWorkSize[1]; size_t globalWorkSize[1]; ValueType scalar = 1.0; // Set arguments for kernel call err = clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 0, sizeof(int), (void *) &tail ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 1, sizeof(int), (void *) &interval_size ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 2, sizeof(cl_mem), (void *) this->mat_.COO.row ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 3, sizeof(cl_mem), (void *) this->mat_.COO.col ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 4, sizeof(cl_mem), (void *) this->mat_.COO.val ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 5, sizeof(ValueType), (void *) &scalar ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 6, sizeof(cl_mem), (void *) cast_in->vec_ ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 7, sizeof(cl_mem), (void *) cast_out->vec_ ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 8, sizeof(cl_mem), (void *) temp_rows ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_FLAT, 9, sizeof(cl_mem), (void *) temp_vals ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Determine local work size for kernel call localWorkSize[0] = BLOCK_SIZE; // Determine global work size for kernel call globalWorkSize[0] = num_blocks * localWorkSize[0]; // Start kernel run err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue, CL_KERNEL_COO_SPMV_FLAT, 1, NULL, &globalWorkSize[0], &localWorkSize[0], 0, NULL, &ocl_event); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Wait for kernel run to finish err = clWaitForEvents( 1, &ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Release event when kernel run finished err = clReleaseEvent( ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Set arguments for kernel call err = clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 0, sizeof(int), (void *) &active_warps ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 1, sizeof(cl_mem), (void *) temp_rows ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 2, sizeof(cl_mem), (void *) temp_vals ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 3, sizeof(cl_mem), (void *) cast_out->vec_ ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Determine global work size for kernel call globalWorkSize[0] = localWorkSize[0]; // Start kernel run err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue, CL_KERNEL_COO_SPMV_REDUCE_UPDATE, 1, NULL, &globalWorkSize[0], &localWorkSize[0], 0, NULL, &ocl_event); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Wait for kernel run to finish err = clWaitForEvents( 1, &ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Release event when kernel run finished err = clReleaseEvent( ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); int nnz = this->get_coo_nnz(); // Set arguments for kernel call err = clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 0, sizeof(int), (void *) &nnz ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 1, sizeof(cl_mem), (void *) this->mat_.COO.row ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 2, sizeof(cl_mem), (void *) this->mat_.COO.col ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 3, sizeof(cl_mem), (void *) this->mat_.COO.val ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 4, sizeof(ValueType), (void *) &scalar ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 5, sizeof(cl_mem), (void *) cast_in->vec_ ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 6, sizeof(cl_mem), (void *) cast_out->vec_ ); err |= clSetKernelArg( CL_KERNEL_COO_SPMV_SERIAL, 7, sizeof(int), (void *) &tail ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Determine local work size for kernel call localWorkSize[0] = 1; // Determine global work size for kernel call globalWorkSize[0] = 1; // Start kernel run err = clEnqueueNDRangeKernel( OCL_HANDLE(this->local_backend_.OCL_handle)->OCL_cmdQueue, CL_KERNEL_COO_SPMV_SERIAL, 1, NULL, &globalWorkSize[0], &localWorkSize[0], 0, NULL, &ocl_event); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Wait for kernel run to finish err = clWaitForEvents( 1, &ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); // Release event when kernel run finished err = clReleaseEvent( ocl_event ); CHECK_OCL_ERROR( err, __FILE__, __LINE__ ); free_ocl(&temp_rows); free_ocl(&temp_vals); } } }
/** * \related cl_Mem_Object_t * * This function copies content of one OpenCL buffer memory object into another. * * @param[in,out] self pointer to structure, in which 'Copy' function pointer * is defined to point on this function. * @param[out] dest pointer to another Memory Object structure, where the data * from 'self' will be copied to. * @param[in] blocking_flag flag, that denotes, should operation be blocking or not. * @param[in] time_mode enumeration, that denotes how time measurement should be * performed. * @param[out] evt_to_generate pointer to OpenCL event that will be generated * at the end of operation. * * @return CL_SUCCESS in case of success, error code of type 'ret_code' otherwise. * * @see cl_err_codes.h for detailed error description. * @see 'cl_Error_t' structure for error handling. */ static ret_code Buffer_Copy( scow_Mem_Object *self, scow_Mem_Object *dest, cl_bool blocking_flag, TIME_STUDY_MODE time_mode, cl_event *evt_to_generate, cl_command_queue explicit_queue) { cl_int ret = CL_SUCCESS; cl_event copy_ready, *p_copy_ready = (cl_event*) 0x0; OCL_CHECK_EXISTENCE(self, INVALID_BUFFER_GIVEN); OCL_CHECK_EXISTENCE(dest, INVALID_BUFFER_GIVEN); // Can't copy distinct memory objects if (self->obj_mem_type != dest->obj_mem_type) { return DISTINCT_MEM_OBJECTS; } // Can't copy bigger object into smaller one if (self->size > dest->size) { return INVALID_BUFFER_SIZE; } // If src & dest are the same, no need to copy at all, just reset timer. if (self == dest) { self->timer->current_time_device = 0; return CL_SUCCESS; } (evt_to_generate == NULL) ? (p_copy_ready = ©_ready) : (p_copy_ready = evt_to_generate); cl_command_queue q = (explicit_queue == NULL) ? (self->parent_thread->q_data_dtod) : (explicit_queue); ret = clEnqueueCopyBuffer(q, self->cl_mem_object, dest->cl_mem_object, 0, 0, self->size, 0, NULL, p_copy_ready); OCL_DIE_ON_ERROR(ret, CL_SUCCESS, NULL, ret); switch (time_mode) { case MEASURE: self->timer->current_time_device = Gather_Time_uS(p_copy_ready); self->timer->total_time_device += self->timer->current_time_device; break; default: break; } if (p_copy_ready != evt_to_generate){ clReleaseEvent(*p_copy_ready); } return ret; }
void sum_gpu(long long *in, long long *out, unsigned int n) { size_t global_size; size_t local_size; char *kernel_src; cl_int err; cl_platform_id platform_id; cl_device_id device_id; cl_uint max_compute_units; size_t max_workgroup_size; cl_context context; cl_command_queue commands; cl_program program; cl_kernel kernel; cl_mem d_array; cl_event event; cl_ulong start, end; /* start OpenCL */ err = clGetPlatformIDs(1, &platform_id,NULL); clErrorHandling("clGetPlatformIDs"); err = clGetDeviceIDs(platform_id, CL_DEVICE_TYPE_GPU, 1, &device_id, NULL); clErrorHandling("clGetDeviceIDs"); context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); clErrorHandling("clCreateContext"); commands = clCreateCommandQueue(context, device_id, CL_QUEUE_PROFILING_ENABLE, &err); clErrorHandling("clCreateCommandQueue"); /* create kernel */ kernel_src = file_to_string(KERNEL_SRC); program = clCreateProgramWithSource(context, 1, (const char**) &kernel_src, NULL, &err); free(kernel_src); clErrorHandling("clCreateProgramWithSource"); err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); clErrorHandling("clBuildProgram"); kernel = clCreateKernel(program, "matrix_mult", &err); clErrorHandling("clCreateKernel"); /* allocate memory and send to gpu */ d_array = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(long long) * n, NULL, &err); clErrorHandling("clCreateBuffer"); err = clEnqueueWriteBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long) * n, in, 0, NULL, NULL); clErrorHandling("clEnqueueWriteBuffer"); err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(cl_uint), &max_compute_units, NULL); err |= clGetDeviceInfo(device_id, CL_DEVICE_MAX_WORK_GROUP_SIZE, sizeof(size_t), &max_workgroup_size, NULL); clErrorHandling("clGetDeviceInfo"); /* prepare kernel args */ err = clSetKernelArg(kernel, 0, sizeof(cl_mem), &d_array); err |= clSetKernelArg(kernel, 1, sizeof(unsigned int), &n); /* execute */ local_size = n / max_compute_units / 8; if (local_size > max_workgroup_size) local_size = max_workgroup_size; /* * Usually it would be * global_size = local_size * max_compute_units; * but that would only be valid if local_size = n / max_compute_units; * local_size is n / max_compute_units / 8 because it obtains its hightest performance. */ for (global_size = local_size; global_size < n; global_size += local_size); err = clEnqueueNDRangeKernel(commands, kernel, 1, NULL, &global_size, &local_size, 0, NULL, &event); clErrorHandling("clEnqueueNDRangeKernel"); clWaitForEvents(1, &event); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &start, NULL); clGetEventProfilingInfo(event, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &end, NULL); fprintf(stderr, "Time for event (ms): %10.5f \n", (end - start) / 1000000.0); err = clFinish(commands); clErrorHandling("clFinish"); /* transfer back */ err = clEnqueueReadBuffer(commands, d_array, CL_TRUE, 0, sizeof(long long), out, 0, NULL, NULL); // a single long long clErrorHandling("clEnqueueReadBuffer"); /* cleanup*/ clReleaseMemObject(d_array); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); clReleaseEvent(event); }