void runbench(double *cd, long size){ if( memory_ratio>UNROLL_ITERATIONS ){ fprintf(stderr, "ERROR: memory_ratio exceeds UNROLL_ITERATIONS\n"); exit(1); } const long compute_grid_size = size/(UNROLLED_MEMORY_ACCESSES)/2; const int BLOCK_SIZE = 256; const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE; const long long computations = 2*(long long)(COMP_ITERATIONS)*REGBLOCK_SIZE*compute_grid_size; const long long memoryoperations = (long long)(COMP_ITERATIONS)*compute_grid_size; dim3 dimBlock(BLOCK_SIZE, 1, 1); dim3 dimGrid(TOTAL_BLOCKS, 1, 1); hipEvent_t start, stop; initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd); float kernel_time_mad_sp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd); float kernel_time_mad_dp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, memory_ratio >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd); float kernel_time_mad_int = finalizeEvents(start, stop); const double memaccesses_ratio = (double)(memory_ratio)/UNROLL_ITERATIONS; const double computations_ratio = 1.0-memaccesses_ratio; printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", UNROLL_ITERATIONS-memory_ratio, (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(float)), kernel_time_mad_sp, (computations_ratio*(double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000), (memaccesses_ratio*(double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.), (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(double)), kernel_time_mad_dp, (computations_ratio*(double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), (memaccesses_ratio*(double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), (computations_ratio*(double)computations)/(memaccesses_ratio*(double)memoryoperations*sizeof(int)), kernel_time_mad_int, (computations_ratio*(double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), (memaccesses_ratio*(double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) ); }
void runbench(double *cd, long size){ const long compute_grid_size = size/ELEMENTS_PER_THREAD; const int BLOCK_SIZE = 256; const int TOTAL_BLOCKS = compute_grid_size/BLOCK_SIZE; const long long computations = ELEMENTS_PER_THREAD*(long long)compute_grid_size+(2*ELEMENTS_PER_THREAD*compute_iterations)*(long long)compute_grid_size; const long long memoryoperations = size; dim3 dimBlock(BLOCK_SIZE, 1, 1); dim3 dimGrid(TOTAL_BLOCKS, 1, 1); hipEvent_t start, stop; initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< float, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0f, (float*)cd); float kernel_time_mad_sp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< double, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1.0, cd); float kernel_time_mad_dp = finalizeEvents(start, stop); initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(benchmark_func< int, BLOCK_SIZE, ELEMENTS_PER_THREAD, compute_iterations >), dim3(dimGrid), dim3(dimBlock ), 0, 0, 1, (int*)cd); float kernel_time_mad_int = finalizeEvents(start, stop); printf(" %4d, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f, %8.3f,%8.2f,%8.2f,%7.2f\n", compute_iterations, ((double)computations)/((double)memoryoperations*sizeof(float)), kernel_time_mad_sp, ((double)computations)/kernel_time_mad_sp*1000./(double)(1000*1000*1000), ((double)memoryoperations*sizeof(float))/kernel_time_mad_sp*1000./(1000.*1000.*1000.), ((double)computations)/((double)memoryoperations*sizeof(double)), kernel_time_mad_dp, ((double)computations)/kernel_time_mad_dp*1000./(double)(1000*1000*1000), ((double)memoryoperations*sizeof(double))/kernel_time_mad_dp*1000./(1000.*1000.*1000.), ((double)computations)/((double)memoryoperations*sizeof(int)), kernel_time_mad_int, ((double)computations)/kernel_time_mad_int*1000./(double)(1000*1000*1000), ((double)memoryoperations*sizeof(int))/kernel_time_mad_int*1000./(1000.*1000.*1000.) ); }
int test_amdgcn_wave_lshift_1 (const int n, const int blockSize, const int launch_iter=1, const int shfl_iter=1, const bool verify=true) { const int WIDTH = 64; const int DELTA = 1; std::vector<int> input(n); std::future<void> inputFuture = std::async([&]() { std::default_random_engine generator; std::uniform_int_distribution<int> input_dist(0, WIDTH-1); std::generate(std::begin(input), std::end(input),[&]() { return input_dist(generator); }); }); inputFuture.wait(); int* gpuInput; hipMalloc(&gpuInput, n * sizeof(int)); hipMemcpy(gpuInput, input.data(), n * sizeof(int), hipMemcpyHostToDevice); int* gpuOutput; hipMalloc(&gpuOutput, n * sizeof(int)); // warm up { hipEvent_t start, stop; initializeEvents(&start, &stop); hipLaunchKernel(HIP_KERNEL_NAME(run_amdgcn_wave_lshift_1) , dim3(n/blockSize), dim3(blockSize), 0, 0 , gpuInput, gpuOutput, shfl_iter); float time_ms = finalizeEvents(start, stop); } // measure the performance hipEvent_t start, stop; initializeEvents(&start, &stop); for (int i = 0; i < launch_iter; i++) { hipLaunchKernel(HIP_KERNEL_NAME(run_amdgcn_wave_lshift_1) , dim3(n/blockSize), dim3(blockSize), 0, 0 , gpuInput, gpuOutput, shfl_iter); } float time_ms = finalizeEvents(start, stop); std::vector<int> output(n); hipMemcpy(output.data(), gpuOutput, n * sizeof(int), hipMemcpyDeviceToHost); // verification int errors = 0; if (verify) { for (int i = 0; i < n; i+=WIDTH) { int local_output[WIDTH]; for (int j = 0; j < shfl_iter; j++) { for (int k = 0; k < WIDTH; k++) { unsigned int lane = ((k+(int)DELTA)<WIDTH)?(k+DELTA):k; local_output[k] = input[i+lane]; } for (int k = 0; k < WIDTH; k++) { input[i+k] = local_output[k]; } } for (int k = 0; k < WIDTH; k++) { if (input[i+k] != output[i+k]) { errors++; } } } } std::cout << __FUNCTION__ << "<" << DELTA << "," << WIDTH << "> total(" << launch_iter << " launches, " << shfl_iter << " wavefront_shift_left/lane/kernel): " << time_ms << "ms, " << time_ms/(double)launch_iter << " ms/kernel, " << errors << " errors" << std::endl; hipFree(gpuInput); hipFree(gpuOutput); return errors; }