static void runallbench(Benchmark *b) { for (; b->f; b++) { runbench(b); } }
extern "C" void mixbenchGPU(cl_device_id dev_id, double *c, long size, bool block_strided, bool host_allocated, size_t workgroupsize, unsigned int elements_per_wi, unsigned int fusion_degree) { const char *benchtype; if(block_strided) benchtype = "Workgroup"; else benchtype = "NDRange"; printf("Workitem stride: %s\n", benchtype); const char *buffer_allocation = host_allocated ? "Host allocated" : "Device allocated"; printf("Buffer allocation: %s\n", buffer_allocation); // Set context properties cl_platform_id p_id; OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL) ); size_t length; OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length) ); char *extensions = (char*)alloca(length); OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL) ); bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL; cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)p_id, 0 }; cl_int errno; // Create context cl_context context = clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &errno); OCL_SAFE_CALL(errno); cl_mem_flags buf_flags = CL_MEM_READ_WRITE; if( host_allocated ) buf_flags |= CL_MEM_ALLOC_HOST_PTR; cl_mem c_buffer = clCreateBuffer(context, buf_flags, size*sizeof(double), NULL, &errno); OCL_SAFE_CALL(errno); // Create command queue cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, CL_QUEUE_PROFILING_ENABLE, &errno); OCL_SAFE_CALL(errno); // Set data on device memory cl_int *mapped_data = (cl_int*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, size*sizeof(double), 0, NULL, NULL, &errno); OCL_SAFE_CALL(errno); for(int i=0; i<size; i++) mapped_data[i] = 0; clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL); // Load source, create program and all kernels printf("Loading kernel source file...\n"); const char c_param_format_str[] = "-cl-std=CL1.1 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s %s"; const char *c_empty = ""; const char *c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty; const char *c_enable_dp = "-DENABLE_DP"; char c_build_params[256]; const char *c_kernel_source = {ReadFile("mix_kernels_ro.cl")}; printf("Precompilation of kernels... "); sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, c_striding, c_empty); cl_kernel kernel_warmup = BuildKernel(context, dev_id, c_kernel_source, c_build_params); show_progress_init(compute_iterations_len); cl_kernel kernels[kdt_double+1][compute_iterations_len]; for(int i=0; i<compute_iterations_len; i++) { show_progress_step(0, '\\'); sprintf(c_build_params, c_param_format_str, "float", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty); //printf("%s\n",c_build_params); kernels[kdt_float][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); show_progress_step(0, '|'); sprintf(c_build_params, c_param_format_str, "int", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty); //printf("%s\n",c_build_params); kernels[kdt_int][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); if( enable_dp ) { show_progress_step(0, '/'); sprintf(c_build_params, c_param_format_str, "double", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_enable_dp); //printf("%s\n",c_build_params); kernels[kdt_double][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); } else kernels[kdt_double][i] = 0; show_progress_step(1, '>'); } show_progress_done(); free((char*)c_kernel_source); runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize); // Synchronize in order to wait for memory operations to finish OCL_SAFE_CALL( clFinish(cmd_queue) ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); for(int i=0; i<compute_iterations_len; i++) runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, workgroupsize, elements_per_wi, fusion_degree); printf("------------------------------------------------------------------------------------------------------------------------------\n"); // Copy results back to host memory OCL_SAFE_CALL( clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, size*sizeof(double), c, 0, NULL, NULL) ); // Release kernels and program ReleaseKernelNProgram(kernel_warmup); for(int i=0; i<compute_iterations_len; i++) { ReleaseKernelNProgram(kernels[kdt_float][i]); ReleaseKernelNProgram(kernels[kdt_int][i]); if( enable_dp ) ReleaseKernelNProgram(kernels[kdt_double][i]); } // Release buffer OCL_SAFE_CALL( clReleaseMemObject(c_buffer) ); }