static int lrun(lua_State *L) {
    // Assumption: check_schedule() returns the scheduler object; the original
    // line discarded its result and used an undeclared 'S' below.
    struct schedule *S = check_schedule(L);
    int threads = schedule_threads(S);
    struct worker w[threads];
    struct thread t[threads];
    struct workshop ws;
    ws.threads = threads;
    ws.w = w;
    int i;
    for (i=0;i<threads;i++) {
        w[i].id = i;
        w[i].suspend = 0;
        w[i].workshop = &ws;
        thread_event_create(&w[i].event);
        t[i].func = worker_func;
        t[i].ud = &w[i];
    }
    // thread_join() is assumed to both spawn the worker threads and wait for
    // all of them to finish.
    thread_join(t, threads);
    for (i=0;i<threads;i++) {
        thread_event_release(&w[i].event);
    }
    return 0;
}
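// --------------------------------------------------------------------------
// Illustration (not part of the scheduler above): a minimal, self-contained
// C++ sketch of the suspend/wake-up pattern that lrun() appears to set up with
// 'struct worker', 'suspend' and 'thread_event'. Everything below (WorkerEvent,
// worker_loop, the use of std::condition_variable) is an assumption made for
// this example only; it is not the API of the library lrun() belongs to.
// Compile it as its own program. One event per worker keeps wake-ups targeted,
// which mirrors the per-worker thread_event created in the loop above.
// --------------------------------------------------------------------------
#include <condition_variable>
#include <cstdio>
#include <mutex>
#include <thread>
#include <vector>

struct WorkerEvent {                 // hypothetical stand-in for 'struct thread_event'
    std::mutex mutex;
    std::condition_variable cond;
    bool ready = false;              // set when work (or a shutdown request) is available
    bool quit = false;
};

static void worker_loop(int id, WorkerEvent *ev) {
    for (;;) {
        std::unique_lock<std::mutex> lock(ev->mutex);
        ev->cond.wait(lock, [ev]{ return ev->ready; });   // "suspend" until signalled
        ev->ready = false;
        std::printf("worker %d woke up and ran one job\n", id);
        if (ev->quit) return;                             // shut down after the last job
    }
}

int main() {
    const int threads = 4;
    std::vector<WorkerEvent> events(threads);
    std::vector<std::thread> pool;
    for (int i = 0; i < threads; i++) {
        pool.emplace_back(worker_loop, i, &events[i]);
    }
    // Hand every worker exactly one "job" and ask it to quit afterwards.
    for (int i = 0; i < threads; i++) {
        {
            std::lock_guard<std::mutex> g(events[i].mutex);
            events[i].ready = true;
            events[i].quit = true;
        }
        events[i].cond.notify_one();
    }
    for (auto &t : pool) t.join();   // analogous to thread_join(t, threads)
    return 0;
}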
//////////////////////////////////
// Main entry function of the GPU cache model
//////////////////////////////////
int main(int argc, char** argv) {
    srand(time(0));
    std::cout << SPLIT_STRING << std::endl;
    message("");

    // Flush messages as soon as possible
    std::cout.setf(std::ios_base::unitbuf);

    // Read the hardware settings from file
    Settings hardware = get_settings();

    // Print the cache configuration
    message("Cache configuration:");
    std::cout << "### \t Cache size: ~" << hardware.cache_bytes/1024 << "KB" << std::endl;
    std::cout << "### \t Line size: " << hardware.line_size << " bytes" << std::endl;
    std::cout << "### \t Layout: " << hardware.cache_ways << " ways, " << hardware.cache_sets << " sets" << std::endl;
    message("");

    // Parse the input arguments and make sure that there are exactly two
    if (argc != 3) {
        message("Error: provide exactly two arguments (a benchmark name and a suite name)");
        message("");
        std::cout << SPLIT_STRING << std::endl;
        exit(1);
    }
    std::string benchname = argv[1];
    std::string suitename = argv[2];

    // Loop over all found traces in the folder (one trace per kernel)
    for (unsigned kernel_id = 0; kernel_id < 20; kernel_id++) {
        std::vector<Thread> threads(MAX_THREADS);
        for (unsigned t=0; t<MAX_THREADS; t++) {
            threads[t] = Thread();
        }

        // Set the kernelname and include a counter
        std::string kernelname;
        if (kernel_id < 10) { kernelname = benchname+"_0"+std::to_string(kernel_id); }
        else                { kernelname = benchname+"_" +std::to_string(kernel_id); }

        // Load a memory access trace from a file
        Dim3 blockdim = read_file(threads, kernelname, benchname, suitename);
        unsigned blocksize = blockdim.x*blockdim.y*blockdim.z;

        // Not a single trace could be found - exit with an error
        if (blocksize == 0 && kernel_id == 0) {
            std::cout << "### Error: could not read file 'output/" << benchname << "/" << kernelname << ".trc'" << std::endl;
            message("");
            std::cout << SPLIT_STRING << std::endl;
            exit(1);
        }

        // The final tracefile is already processed, exit the loop
        if (blocksize == 0) {
            break;
        }

        // Assign threads to warps, threadblocks and GPU cores
        message("");
        std::cout << "### Assigning threads to warps/blocks/cores...";
        unsigned num_blocks = ceil(threads.size()/(float)(blocksize));
        unsigned num_warps_per_block = ceil(blocksize/(float)(hardware.warp_size));
        std::vector<std::vector<unsigned>> warps(num_warps_per_block*num_blocks);
        std::vector<std::vector<unsigned>> blocks(num_blocks);
        std::vector<std::vector<unsigned>> cores(hardware.num_cores);
        schedule_threads(threads, warps, blocks, cores, hardware, blocksize);
        std::cout << "done" << std::endl;

        // Model only a single core; modelling multiple cores would require a loop over 'cid'
        unsigned cid = 0;

        // Compute the number of active blocks on this core
        unsigned hardware_max_active_blocks = std::min(hardware.max_active_threads/blocksize, hardware.max_active_blocks);
        unsigned active_blocks = std::min((unsigned)cores[cid].size(), hardware_max_active_blocks);

        // Start the computation of the reuse distance profile
        message("");
        std::cout << "### [core " << cid << "]:" << std::endl;
        std::cout << "### Running " << active_blocks << " block(s) at a time" << std::endl;
        std::cout << "### Calculating the reuse distances";

        // Create a random-number generator for the Gaussian memory latency model
        std::random_device random;
        std::mt19937 gen(random());

        // Compute the reuse distance profile for the 4 different cases
        std::vector<map_type<unsigned,unsigned>> distances(NUM_CASES);
        for (unsigned runs = 0; runs < NUM_CASES; runs++) {
            std::cout << "...";
            unsigned sets, ways;
            unsigned ml, ms, nml;
            unsigned mshr;

            // CASE 0 | Normal - full model
            sets = hardware.cache_sets;
            ways = hardware.cache_ways;
            ml = hardware.mem_latency;
            ms = hardware.mem_latency_stddev;
            nml = NON_MEM_LATENCY;
            mshr = hardware.num_mshr;

            // CASE 1 | Only 1 set: don't model associativity
            if (runs == 1) { sets = 1; ways = hardware.cache_ways*hardware.cache_sets; }

            // CASE 2 | Memory latency to 0: don't model latencies
            if (runs == 2) { ml = 0; ms = 0; nml = 0; }

            // CASE 3 | MSHR count to infinite: don't model MSHRs
            if (runs == 3) { mshr = INF; }

            // Calculate the reuse distance profile
            std::normal_distribution<> distribution(0,ms);
            reuse_distance(cores[cid], blocks, warps, threads, distances[runs], active_blocks, hardware,
                           sets, ways, ml, nml, mshr, gen, distribution);
        }
        std::cout << "done" << std::endl;

        // Process the reuse distance profile to obtain the cache hit/miss rate
        message("");
        output_miss_rate(distances, kernelname, benchname, suitename, hardware);

        // Display the cache hit/miss rate from the output of the verifier (if available)
        message("");
        verify_miss_rate(kernelname, benchname);
        message("");
    }

    // End of the program
    std::cout << SPLIT_STRING << std::endl;
    return 0;
}
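// --------------------------------------------------------------------------
// Illustration (separate from the model code above): how a reuse-distance
// histogram such as 'distances[runs]' can be turned into a cache miss rate.
// It uses the standard reuse-distance argument: for a fully-associative LRU
// cache with C lines, an access whose reuse distance is >= C (or that has no
// earlier use at all) cannot hit. The function name, the plain std::map
// histogram and the UINT_MAX sentinel for cold misses are assumptions for
// this sketch only; the model's own post-processing lives in
// output_miss_rate(). Compile it as its own program.
// --------------------------------------------------------------------------
#include <cstdio>
#include <limits>
#include <map>

// Estimate the miss rate from a reuse-distance histogram (distance -> count).
// 'cache_lines' is the number of cache lines, e.g. cache_ways * cache_sets.
static double estimate_miss_rate(const std::map<unsigned, unsigned> &histogram,
                                 unsigned cache_lines) {
    const unsigned COLD = std::numeric_limits<unsigned>::max(); // cold (compulsory) misses
    unsigned long long total = 0, misses = 0;
    for (const auto &bin : histogram) {
        total += bin.second;
        if (bin.first == COLD || bin.first >= cache_lines) {
            misses += bin.second;   // distance too large (or never reused): a miss
        }
    }
    return (total == 0) ? 0.0 : (double)misses / (double)total;
}

int main() {
    // Toy histogram: 60 accesses with reuse distance 2, 30 with distance 100,
    // and 10 cold misses; with 64 cache lines this gives a 0.40 miss rate.
    std::map<unsigned, unsigned> histogram = {
        {2, 60}, {100, 30}, {std::numeric_limits<unsigned>::max(), 10}
    };
    std::printf("estimated miss rate: %.2f\n", estimate_miss_rate(histogram, 64));
    return 0;
}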