extern "C" void mixbenchGPU(cl_device_id dev_id, double *c, long size, bool block_strided, bool host_allocated, size_t workgroupsize, unsigned int elements_per_wi, unsigned int fusion_degree) { const char *benchtype; if(block_strided) benchtype = "Workgroup"; else benchtype = "NDRange"; printf("Workitem stride: %s\n", benchtype); const char *buffer_allocation = host_allocated ? "Host allocated" : "Device allocated"; printf("Buffer allocation: %s\n", buffer_allocation); // Set context properties cl_platform_id p_id; OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_PLATFORM, sizeof(p_id), &p_id, NULL) ); size_t length; OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, 0, NULL, &length) ); char *extensions = (char*)alloca(length); OCL_SAFE_CALL( clGetDeviceInfo(dev_id, CL_DEVICE_EXTENSIONS, length, extensions, NULL) ); bool enable_dp = strstr(extensions, "cl_khr_fp64") != NULL; cl_context_properties ctxProps[] = { CL_CONTEXT_PLATFORM, (cl_context_properties)p_id, 0 }; cl_int errno; // Create context cl_context context = clCreateContext(ctxProps, 1, &dev_id, NULL, NULL, &errno); OCL_SAFE_CALL(errno); cl_mem_flags buf_flags = CL_MEM_READ_WRITE; if( host_allocated ) buf_flags |= CL_MEM_ALLOC_HOST_PTR; cl_mem c_buffer = clCreateBuffer(context, buf_flags, size*sizeof(double), NULL, &errno); OCL_SAFE_CALL(errno); // Create command queue cl_command_queue cmd_queue = clCreateCommandQueue(context, dev_id, CL_QUEUE_PROFILING_ENABLE, &errno); OCL_SAFE_CALL(errno); // Set data on device memory cl_int *mapped_data = (cl_int*)clEnqueueMapBuffer(cmd_queue, c_buffer, CL_TRUE, CL_MAP_WRITE, 0, size*sizeof(double), 0, NULL, NULL, &errno); OCL_SAFE_CALL(errno); for(int i=0; i<size; i++) mapped_data[i] = 0; clEnqueueUnmapMemObject(cmd_queue, c_buffer, mapped_data, 0, NULL, NULL); // Load source, create program and all kernels printf("Loading kernel source file...\n"); const char c_param_format_str[] = "-cl-std=CL1.1 -cl-mad-enable -Dclass_T=%s -Dblockdim=" SIZE_T_FORMAT " -DCOMPUTE_ITERATIONS=%d -DELEMENTS_PER_THREAD=%d -DFUSION_DEGREE=%d %s %s"; const char *c_empty = ""; const char *c_striding = block_strided ? "-DBLOCK_STRIDED" : c_empty; const char *c_enable_dp = "-DENABLE_DP"; char c_build_params[256]; const char *c_kernel_source = {ReadFile("mix_kernels_ro.cl")}; printf("Precompilation of kernels... "); sprintf(c_build_params, c_param_format_str, "short", workgroupsize, 0, 1, 1, c_striding, c_empty); cl_kernel kernel_warmup = BuildKernel(context, dev_id, c_kernel_source, c_build_params); show_progress_init(compute_iterations_len); cl_kernel kernels[kdt_double+1][compute_iterations_len]; for(int i=0; i<compute_iterations_len; i++) { show_progress_step(0, '\\'); sprintf(c_build_params, c_param_format_str, "float", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty); //printf("%s\n",c_build_params); kernels[kdt_float][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); show_progress_step(0, '|'); sprintf(c_build_params, c_param_format_str, "int", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_empty); //printf("%s\n",c_build_params); kernels[kdt_int][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); if( enable_dp ) { show_progress_step(0, '/'); sprintf(c_build_params, c_param_format_str, "double", workgroupsize, compute_iterations[i], elements_per_wi, fusion_degree, c_striding, c_enable_dp); //printf("%s\n",c_build_params); kernels[kdt_double][i] = BuildKernel(context, dev_id, c_kernel_source, c_build_params); } else kernels[kdt_double][i] = 0; show_progress_step(1, '>'); } show_progress_done(); free((char*)c_kernel_source); runbench_warmup(cmd_queue, kernel_warmup, c_buffer, size, workgroupsize); // Synchronize in order to wait for memory operations to finish OCL_SAFE_CALL( clFinish(cmd_queue) ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); for(int i=0; i<compute_iterations_len; i++) runbench(compute_iterations, i, cmd_queue, kernels, c_buffer, size, workgroupsize, elements_per_wi, fusion_degree); printf("------------------------------------------------------------------------------------------------------------------------------\n"); // Copy results back to host memory OCL_SAFE_CALL( clEnqueueReadBuffer(cmd_queue, c_buffer, CL_TRUE, 0, size*sizeof(double), c, 0, NULL, NULL) ); // Release kernels and program ReleaseKernelNProgram(kernel_warmup); for(int i=0; i<compute_iterations_len; i++) { ReleaseKernelNProgram(kernels[kdt_float][i]); ReleaseKernelNProgram(kernels[kdt_int][i]); if( enable_dp ) ReleaseKernelNProgram(kernels[kdt_double][i]); } // Release buffer OCL_SAFE_CALL( clReleaseMemObject(c_buffer) ); }
extern "C" void mixbenchGPU(double *c, long size){ const char *benchtype = "compute with global memory (block strided)"; printf("Trade-off type: %s\n", benchtype); double *cd; CUDA_SAFE_CALL( hipMalloc((void**)&cd, size*sizeof(double)) ); // Copy data to device memory CUDA_SAFE_CALL( hipMemset(cd, 0, size*sizeof(double)) ); // initialize to zeros // Synchronize in order to wait for memory operations to finish CUDA_SAFE_CALL( hipDeviceSynchronize() ); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); printf("Experiment ID, Single Precision ops,,,, Double precision ops,,,, Integer operations,,, \n"); printf("Compute iters, Flops/byte, ex.time, GFLOPS, GB/sec, Flops/byte, ex.time, GFLOPS, GB/sec, Iops/byte, ex.time, GIOPS, GB/sec\n"); runbench_warmup(cd, size); runbench<32>(cd, size); runbench<31>(cd, size); runbench<30>(cd, size); runbench<29>(cd, size); runbench<28>(cd, size); runbench<27>(cd, size); runbench<26>(cd, size); runbench<25>(cd, size); runbench<24>(cd, size); runbench<23>(cd, size); runbench<22>(cd, size); runbench<21>(cd, size); runbench<20>(cd, size); runbench<19>(cd, size); runbench<18>(cd, size); runbench<17>(cd, size); runbench<16>(cd, size); runbench<15>(cd, size); runbench<14>(cd, size); runbench<13>(cd, size); runbench<12>(cd, size); runbench<11>(cd, size); runbench<10>(cd, size); runbench<9>(cd, size); runbench<8>(cd, size); runbench<7>(cd, size); runbench<6>(cd, size); runbench<5>(cd, size); runbench<4>(cd, size); runbench<3>(cd, size); runbench<2>(cd, size); runbench<1>(cd, size); runbench<0>(cd, size); printf("---------------------------------------------------------- CSV data ----------------------------------------------------------\n"); // Copy results back to host memory CUDA_SAFE_CALL( hipMemcpy(c, cd, size*sizeof(double), hipMemcpyDeviceToHost) ); CUDA_SAFE_CALL( hipFree(cd) ); CUDA_SAFE_CALL( hipDeviceReset() ); }