//////////////////////////////////
// Main entry function of the GPU cache model
//////////////////////////////////
int main(int argc, char** argv) {
  srand(time(0));
  std::cout << SPLIT_STRING << std::endl;
  message("");

  // Flush messages as soon as possible
  std::cout.setf(std::ios_base::unitbuf);

  // Read the hardware settings from file
  Settings hardware = get_settings();

  // Print cache statistics
  message("Cache configuration:");
  std::cout << "### \t Cache size: ~" << hardware.cache_bytes/1024 << "KB" << std::endl;
  std::cout << "### \t Line size: " << hardware.line_size << " bytes" << std::endl;
  std::cout << "### \t Layout: " << hardware.cache_ways << " ways, " << hardware.cache_sets << " sets" << std::endl;
  message("");

  // Parse the input arguments and make sure that there are exactly two
  if (argc != 3) {
    message("Error: provide two arguments (a benchmark name and a suite name)");
    message("");
    std::cout << SPLIT_STRING << std::endl;
    exit(1);
  }
  std::string benchname = argv[1];
  std::string suitename = argv[2];

  // Loop over all found traces in the folder (one trace per kernel)
  for (unsigned kernel_id = 0; kernel_id < 20; kernel_id++) {
    std::vector<Thread> threads(MAX_THREADS);
    for (unsigned t = 0; t < MAX_THREADS; t++) {
      threads[t] = Thread();
    }

    // Set the kernelname and include a counter
    std::string kernelname;
    if (kernel_id < 10) { kernelname = benchname+"_0"+std::to_string(kernel_id); }
    else                { kernelname = benchname+"_" +std::to_string(kernel_id); }

    // Load a memory access trace from a file
    Dim3 blockdim = read_file(threads, kernelname, benchname, suitename);
    unsigned blocksize = blockdim.x*blockdim.y*blockdim.z;

    // There was not a single trace that could be found - exit with an error
    if (blocksize == 0 && kernel_id == 0) {
      std::cout << "### Error: could not read file 'output/" << benchname << "/" << kernelname << ".trc'" << std::endl;
      message("");
      std::cout << SPLIT_STRING << std::endl;
      exit(1);
    }

    // The final tracefile is already processed, exit the loop
    if (blocksize == 0) { break; }

    // Assign threads to warps, threadblocks and GPU cores
    message("");
    std::cout << "### Assigning threads to warps/blocks/cores...";
    unsigned num_blocks = ceil(threads.size()/(float)(blocksize));
    unsigned num_warps_per_block = ceil(blocksize/(float)(hardware.warp_size));
    std::vector<std::vector<unsigned>> warps(num_warps_per_block*num_blocks);
    std::vector<std::vector<unsigned>> blocks(num_blocks);
    std::vector<std::vector<unsigned>> cores(hardware.num_cores);
    schedule_threads(threads, warps, blocks, cores, hardware, blocksize);
    std::cout << "done" << std::endl;

    // Model only a single core; modelling multiple cores requires a loop over 'cid'
    unsigned cid = 0;

    // Compute the number of active blocks on this core
    unsigned hardware_max_active_blocks = std::min(hardware.max_active_threads/blocksize, hardware.max_active_blocks);
    unsigned active_blocks = std::min((unsigned)cores[cid].size(), hardware_max_active_blocks);

    // Start the computation of the reuse distance profile
    message("");
    std::cout << "### [core " << cid << "]:" << std::endl;
    std::cout << "### Running " << active_blocks << " block(s) at a time" << std::endl;
    std::cout << "### Calculating the reuse distances";

    // Create a Gaussian distribution to model memory latencies
    std::random_device random;
    std::mt19937 gen(random());

    // Compute the reuse distance for 4 different cases
    std::vector<map_type<unsigned,unsigned>> distances(NUM_CASES);
    for (unsigned runs = 0; runs < NUM_CASES; runs++) {
      std::cout << "...";
      unsigned sets, ways;
      unsigned ml, ms, nml;
      unsigned mshr;

      // CASE 0 | Normal - full model
      sets = hardware.cache_sets;
      ways = hardware.cache_ways;
      ml   = hardware.mem_latency;
      ms   = hardware.mem_latency_stddev;
      nml  = NON_MEM_LATENCY;
      mshr = hardware.num_mshr;

      // CASE 1 | Only 1 set: don't model associativity
      if (runs == 1) {
        sets = 1;
        ways = hardware.cache_ways*hardware.cache_sets;
      }

      // CASE 2 | Memory latency to 0: don't model latencies
      if (runs == 2) {
        ml = 0; ms = 0; nml = 0;
      }

      // CASE 3 | MSHR count to infinity: don't model MSHRs
      if (runs == 3) {
        mshr = INF;
      }

      // Calculate the reuse distance profile
      std::normal_distribution<> distribution(0, ms);
      reuse_distance(cores[cid], blocks, warps, threads, distances[runs], active_blocks,
                     hardware, sets, ways, ml, nml, mshr, gen, distribution);
    }
    std::cout << "done" << std::endl;

    // Process the reuse distance profile to obtain the cache hit/miss rate
    message("");
    output_miss_rate(distances, kernelname, benchname, suitename, hardware);

    // Display the cache hit/miss rate from the output of the verifier (if available)
    message("");
    verify_miss_rate(kernelname, benchname);
    message("");
  }

  // End of the program
  std::cout << SPLIT_STRING << std::endl;
  return 0;
}
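//////////////////////////////////
// Illustrative sketch (not part of the model above): how a reuse distance
// profile is turned into a cache hit rate. Under an LRU stack-distance model,
// an access hits iff its reuse distance is smaller than the cache capacity in
// lines (sets * ways, as in CASE 1 above). This is a minimal stand-alone
// example; 'std::map' and 'capacity_lines' are assumptions standing in for
// the model's own 'map_type' histogram and hardware settings.
//////////////////////////////////
#include <map>

static double hit_rate_from_profile(const std::map<unsigned,unsigned>& histogram,
                                    unsigned capacity_lines) {
  unsigned long long hits = 0, total = 0;
  for (const auto& bin : histogram) {   // bin.first: reuse distance, bin.second: access count
    total += bin.second;
    if (bin.first < capacity_lines) {   // distance fits in the cache -> hit under LRU
      hits += bin.second;
    }
  }
  return (total == 0) ? 0.0 : static_cast<double>(hits) / static_cast<double>(total);
}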
typename EMSubpixelCorrelatorView<ImagePixelT>::prerasterize_type
EMSubpixelCorrelatorView<ImagePixelT>::prerasterize(BBox2i const& bbox) const {
  vw_out(InfoMessage, "stereo") << "EMSubpixelCorrelatorView: rasterizing image block " << bbox << ".\n";

  // Find the range of disparity values for this patch.
  // int num_good; // not used
  BBox2i search_range;
  try {
    search_range = get_disparity_range(crop(m_course_disparity, bbox));
  } catch (const std::exception& e) {
    search_range = BBox2i();
  }

#ifdef USE_GRAPHICS
  ImageWindow window;
  if (debug_level >= 0) {
    window = vw_create_window("disparity");
  }
#endif

  // The area in the right image that we'll be searching is
  // determined by the bbox of the left image plus the search range.
  BBox2i left_crop_bbox(bbox);
  BBox2i right_crop_bbox(bbox.min() + search_range.min(),
                         bbox.max() + search_range.max());

  // The correlator requires the images to be the same size. The
  // search bbox will always be larger than the given left image
  // bbox, so we just make the left bbox the same size as the right bbox.
  left_crop_bbox.max() = left_crop_bbox.min() + Vector2i(right_crop_bbox.width(), right_crop_bbox.height());

  // Finally, we must adjust both bounding boxes to account for
  // the size of the kernel itself.
  right_crop_bbox.min() -= Vector2i(m_kernel_size[0], m_kernel_size[1]);
  right_crop_bbox.max() += Vector2i(m_kernel_size[0], m_kernel_size[1]);
  left_crop_bbox.min()  -= Vector2i(m_kernel_size[0], m_kernel_size[1]);
  left_crop_bbox.max()  += Vector2i(m_kernel_size[0], m_kernel_size[1]);

  // We crop the images to the expanded bounding box and edge
  // extend in case the new bbox extends past the image bounds.
  ImageView<ImagePixelT> left_image_patch, right_image_patch;
  ImageView<disparity_pixel> disparity_map_patch_in;
  ImageView<result_type> disparity_map_patch_out;

  left_image_patch = crop(edge_extend(m_left_image, ZeroEdgeExtension()), left_crop_bbox);
  right_image_patch = crop(edge_extend(m_right_image, ZeroEdgeExtension()), right_crop_bbox);
  disparity_map_patch_in = crop(edge_extend(m_course_disparity, ZeroEdgeExtension()), left_crop_bbox);
  disparity_map_patch_out.set_size(disparity_map_patch_in.cols(), disparity_map_patch_in.rows());

  // Adjust the disparities to be relative to the cropped image pixel locations
  for (int v = 0; v < disparity_map_patch_in.rows(); ++v) {
    for (int u = 0; u < disparity_map_patch_in.cols(); ++u) {
      if (disparity_map_patch_in(u,v).valid()) {
        disparity_map_patch_in(u,v).child().x() -= search_range.min().x();
        disparity_map_patch_in(u,v).child().y() -= search_range.min().y();
      }
    }
  }

  double blur_sigma_progressive = .5; // 3*sigma = 1.5 pixels

  // Create the pyramid first
  std::vector<ImageView<ImagePixelT> > left_pyramid(pyramid_levels), right_pyramid(pyramid_levels);
  std::vector<BBox2i> regions_of_interest(pyramid_levels);
  std::vector<ImageView<Matrix2x2> > warps(pyramid_levels);
  std::vector<ImageView<disparity_pixel> > disparity_map_pyramid(pyramid_levels);

  // Initialize the pyramid at level 0
  left_pyramid[0] = channels_to_planes(left_image_patch);
  right_pyramid[0] = channels_to_planes(right_image_patch);
  disparity_map_pyramid[0] = disparity_map_patch_in;
  regions_of_interest[0] = BBox2i(m_kernel_size[0], m_kernel_size[1], bbox.width(), bbox.height());

  // Downsample the disparity map and the image pair to initialize the intermediate levels
  for (int i = 1; i < pyramid_levels; i++) {
    left_pyramid[i] = subsample(gaussian_filter(left_pyramid[i-1], blur_sigma_progressive), 2);
    right_pyramid[i] = subsample(gaussian_filter(right_pyramid[i-1],
                                                blur_sigma_progressive), 2);
    disparity_map_pyramid[i] = detail::subsample_disp_map_by_two(disparity_map_pyramid[i-1]);
    regions_of_interest[i] = BBox2i(regions_of_interest[i-1].min()/2, regions_of_interest[i-1].max()/2);
  }

  // Initialize warps at the lowest resolution level
  warps[pyramid_levels-1].set_size(left_pyramid[pyramid_levels-1].cols(),
                                   left_pyramid[pyramid_levels-1].rows());
  for (int y = 0; y < warps[pyramid_levels-1].rows(); y++) {
    for (int x = 0; x < warps[pyramid_levels-1].cols(); x++) {
      warps[pyramid_levels-1](x, y).set_identity();
    }
  }

#ifdef USE_GRAPHICS
  vw_initialize_graphics(0, NULL);
  if (debug_level >= 0) {
    for (int i = 0; i < pyramid_levels; i++) {
      vw_show_image(window, left_pyramid[i]);
      usleep((int)(.2*1000*1000));
    }
  }
#endif

  // Go up the pyramid; first run refinement, then upsample the result for the next level
  for (int i = pyramid_levels-1; i >= 0; i--) {
    vw_out() << "processing pyramid level " << i << " of " << pyramid_levels-1 << std::endl;

    if (debug_level >= 0) {
      std::stringstream stream;
      stream << "pyramid_level_" << i << ".tif";
      write_image(stream.str(), disparity_map_pyramid[i]);
    }

    ImageView<ImagePixelT> process_left_image = left_pyramid[i];
    ImageView<ImagePixelT> process_right_image = right_pyramid[i];

    if (i > 0) {
      // In this case, refine the upsampled disparity map from the previous level,
      // and upsample for the next level
      m_subpixel_refine(edge_extend(process_left_image, ZeroEdgeExtension()),
                        edge_extend(process_right_image, ZeroEdgeExtension()),
                        disparity_map_pyramid[i], disparity_map_pyramid[i],
                        warps[i], regions_of_interest[i], false, debug_level == i);

      // Upsample the warps and the refined map for the next level of processing
      int up_width = left_pyramid[i-1].cols();
      int up_height = left_pyramid[i-1].rows();
      warps[i-1] = copy(resize(warps[i], up_width, up_height,
                               ConstantEdgeExtension(), NearestPixelInterpolation())); // upsample affine transforms
      disparity_map_pyramid[i-1] = copy(detail::upsample_disp_map_by_two(disparity_map_pyramid[i], up_width, up_height));
    } else {
      // Here there is no next level, so we refine directly to the output patch
      m_subpixel_refine(edge_extend(process_left_image, ZeroEdgeExtension()),
                        edge_extend(process_right_image, ZeroEdgeExtension()),
                        disparity_map_pyramid[i], disparity_map_patch_out,
                        warps[i], regions_of_interest[i], true, debug_level == i);
    }
  }

#ifdef USE_GRAPHICS
  if (debug_level >= 0) {
    vw_show_image(window, .5 + select_plane(channels_to_planes(disparity_map_patch_out)/6., 0));
    usleep(10*1000*1000);
  }
#endif

  // Undo the above adjustment
  for (int v = 0; v < disparity_map_patch_out.rows(); ++v) {
    for (int u = 0; u < disparity_map_patch_out.cols(); ++u) {
      if (disparity_map_patch_out(u,v).valid()) {
        disparity_map_patch_out(u,v).child().x() += search_range.min().x();
        disparity_map_patch_out(u,v).child().y() += search_range.min().y();
      }
    }
  }

#ifdef USE_GRAPHICS
  if (debug_level >= 0) {
    vw_destroy_window(window);
  }
#endif

  return crop(disparity_map_patch_out,
              BBox2i(m_kernel_size[0]-bbox.min().x(),
                     m_kernel_size[1]-bbox.min().y(),
                     m_left_image.cols(),
                     m_left_image.rows()));
}
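// Illustrative sketch (not Vision Workbench code): the coordinate bookkeeping
// behind a routine like detail::subsample_disp_map_by_two. When an image is
// decimated by two, a disparity vector measured in pixels must also be halved
// so that it still points at the same feature at the coarser pyramid level.
// The 'Disparity' struct and nearest-neighbour decimation below are
// assumptions for illustration, not the library's actual implementation.
#include <vector>

struct Disparity { float dx, dy; bool valid; };

static std::vector<Disparity> subsample_disparity_by_two(const std::vector<Disparity>& in,
                                                         int cols, int rows) {
  const int out_cols = cols / 2, out_rows = rows / 2;
  std::vector<Disparity> out(out_cols * out_rows, Disparity{0.f, 0.f, false});
  for (int y = 0; y < out_rows; ++y) {
    for (int x = 0; x < out_cols; ++x) {
      const Disparity& src = in[(2*y)*cols + (2*x)];  // nearest-neighbour pick
      if (src.valid) {
        out[y*out_cols + x] = Disparity{src.dx/2.f, src.dy/2.f, true};  // halve the vector
      }
    }
  }
  return out;
}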