Ejemplo n.º 1
0
//////////////////////////////////
// Main entry function of the GPU cache model
//
// Usage: <program> <benchname> <suitename>
//
// Prints the cache configuration, then processes up to 20 kernel traces named
// "<benchname>_NN" (NN = 00..19). For every trace that read_file() reports a
// non-zero block size for, the threads are scheduled, the reuse distance
// profile of core 0 is computed for NUM_CASES model variants, and the
// resulting miss rates are reported and verified.
//
// Returns 0 on success, 1 when the argument count is wrong or when the very
// first trace yields a block size of zero.
//////////////////////////////////
int main(int argc, char** argv) {
    srand(time(0));
    std::cout << SPLIT_STRING << std::endl;
    message("");

    // Flush messages as soon as possible
    std::cout.setf(std::ios_base::unitbuf);

    // Read the hardware settings from file
    Settings hardware = get_settings();

    // Print the cache configuration
    message("Cache configuration:");
    std::cout << "### \t Cache size: ~" << hardware.cache_bytes/1024 << "KB" << std::endl;
    std::cout << "### \t Line size: " << hardware.line_size << " bytes" << std::endl;
    std::cout << "### \t Layout: " << hardware.cache_ways << " ways, " << hardware.cache_sets << " sets" << std::endl;
    message("");

    // Exactly two arguments are required: the benchmark name and the suite name
    if (argc != 3) {
        message("Error: provide exactly two arguments (the benchmark name and the suite name)");
        message("");
        std::cout << SPLIT_STRING << std::endl;
        return 1;
    }
    std::string benchname = argv[1];
    std::string suitename = argv[2];

    // Loop over all found traces in the folder (one trace per kernel)
    for (unsigned kernel_id = 0; kernel_id < 20; kernel_id++) {
        // The sized constructor already value-initialises every Thread, so no
        // explicit reset loop is needed before handing the vector to read_file()
        std::vector<Thread> threads(MAX_THREADS);

        // Set the kernelname and include a zero-padded two-digit counter
        std::string kernelname = benchname + (kernel_id < 10 ? "_0" : "_") + std::to_string(kernel_id);

        // Load a memory access trace from a file
        Dim3 blockdim = read_file(threads, kernelname, benchname, suitename);
        unsigned blocksize = blockdim.x*blockdim.y*blockdim.z;

        // A block size of zero means no trace was loaded for this kernel
        if (blocksize == 0) {
            // Not even the first trace could be found - exit with an error
            if (kernel_id == 0) {
                std::cout << "### Error: could not read file 'output/" << benchname << "/" << kernelname << ".trc'" << std::endl;
                message("");
                std::cout << SPLIT_STRING << std::endl;
                return 1;
            }

            // The final tracefile is already processed, exit the loop
            break;
        }

        // Assign threads to warps, threadblocks and GPU cores
        message("");
        std::cout << "### Assigning threads to warps/blocks/cores...";
        unsigned num_blocks = ceil(threads.size()/(float)(blocksize));
        unsigned num_warps_per_block = ceil(blocksize/(float)(hardware.warp_size));
        std::vector<std::vector<unsigned>> warps(num_warps_per_block*num_blocks);
        std::vector<std::vector<unsigned>> blocks(num_blocks);
        std::vector<std::vector<unsigned>> cores(hardware.num_cores);
        schedule_threads(threads, warps, blocks, cores, hardware, blocksize);
        std::cout << "done" << std::endl;

        // Model only a single core, modelling multiple cores requires a loop over 'cid'
        unsigned cid = 0;

        // Compute the number of active blocks on this core: limited by the
        // hardware thread/block limits and by the blocks assigned to the core
        unsigned hardware_max_active_blocks = std::min(hardware.max_active_threads/blocksize, hardware.max_active_blocks);
        unsigned active_blocks = std::min(static_cast<unsigned>(cores[cid].size()), hardware_max_active_blocks);

        // Start the computation of the reuse distance profile
        message("");
        std::cout << "### [core " << cid << "]:" << std::endl;
        std::cout << "### Running " << active_blocks << " block(s) at a time" << std::endl;
        std::cout << "### Calculating the reuse distances";

        // Random engine handed to reuse_distance() together with the
        // per-case normal distribution built below
        std::random_device seed_source;
        std::mt19937 gen(seed_source());

        // Compute the reuse distance for the different model variants
        std::vector<map_type<unsigned,unsigned>> distances(NUM_CASES);
        for (unsigned runs = 0; runs < NUM_CASES; runs++) {
            std::cout << "...";

            // CASE 0 | Normal - full model (the baseline every other case starts from)
            unsigned sets = hardware.cache_sets;
            unsigned ways = hardware.cache_ways;
            unsigned ml = hardware.mem_latency;
            unsigned ms = hardware.mem_latency_stddev;
            unsigned nml = NON_MEM_LATENCY;
            unsigned mshr = hardware.num_mshr;

            // CASE 1 | Only 1 set: don't model associativity
            if (runs == 1) {
                sets = 1;
                ways = hardware.cache_ways*hardware.cache_sets;
            }

            // CASE 2 | Memory latency to 0: don't model latencies
            if (runs == 2) {
                ml = 0;
                ms = 0;
                nml = 0;
            }

            // CASE 3 | MSHR count to infinite: don't model MSHRs
            if (runs == 3) {
                mshr = INF;
            }

            // Calculate the reuse distance profile.
            // NOTE(review): std::normal_distribution requires stddev > 0; case 2
            // (and any configuration with mem_latency_stddev == 0) passes 0 here.
            // Confirm how reuse_distance() samples it before changing this.
            std::normal_distribution<> distribution(0,ms);
            reuse_distance(cores[cid], blocks, warps, threads, distances[runs], active_blocks, hardware,
                           sets, ways, ml, nml, mshr, gen, distribution);
        }
        std::cout << "done" << std::endl;

        // Process the reuse distance profile to obtain the cache hit/miss rate
        message("");
        output_miss_rate(distances, kernelname, benchname,suitename, hardware);

        // Display the cache hit/miss rate from the output of the verifier (if available)
        message("");
        verify_miss_rate(kernelname, benchname);
        message("");
    }

    // End of the program
    std::cout << SPLIT_STRING << std::endl;
    return 0;
}
    // Produces the refined disparity result for the image block `bbox`.
    //
    // Steps, in order:
    //   1. Determine the disparity search range of the coarse disparity map
    //      inside `bbox`.
    //   2. Crop left/right images and the coarse disparity map to boxes that
    //      are enlarged by the search range and by the kernel size.
    //   3. Build `pyramid_levels`-level pyramids of the images, the disparity
    //      map and the regions of interest.
    //   4. Run m_subpixel_refine from the coarsest level down to level 0,
    //      upsampling the warps and the disparity map between levels.
    //   5. Shift the refined disparities back by the search range minimum and
    //      return a crop of the output patch.
    typename EMSubpixelCorrelatorView<ImagePixelT>::prerasterize_type
    EMSubpixelCorrelatorView<ImagePixelT>::prerasterize(BBox2i const& bbox) const {
      vw_out(InfoMessage, "stereo") << "EMSubpixelCorrelatorView: rasterizing image block " << bbox << ".\n";

      // Find the range of disparity values for this patch.
      // If get_disparity_range throws a std::exception, fall back to a
      // default-constructed BBox2i instead of propagating the error.
      // int num_good; // not used
      BBox2i search_range;
      try {
        search_range = get_disparity_range(crop(m_course_disparity, bbox));
      }
      catch (const std::exception& e) {
        search_range = BBox2i();
      }


#ifdef USE_GRAPHICS
      // Debug visualization: a window is only created when debug_level >= 0.
      ImageWindow window;
      if(debug_level >= 0) {
        window = vw_create_window("disparity");
      }
#endif

      // The area in the right image that we'll be searching is
      // determined by the bbox of the left image plus the search
      // range.
      BBox2i left_crop_bbox(bbox);
      BBox2i right_crop_bbox(bbox.min() + search_range.min(),
                             bbox.max() + search_range.max());

      // The correlator requires the images to be the same size. The
      // search bbox will always be larger than the given left image
      // bbox, so we just make the left bbox the same size as the
      // right bbox.
      left_crop_bbox.max() = left_crop_bbox.min() + Vector2i(right_crop_bbox.width(), right_crop_bbox.height());

      // Finally, we must adjust both bounding boxes to account for
      // the size of the kernel itself: each box grows by the kernel
      // size on every side.
      right_crop_bbox.min() -= Vector2i(m_kernel_size[0], m_kernel_size[1]);
      right_crop_bbox.max() += Vector2i(m_kernel_size[0], m_kernel_size[1]);
      left_crop_bbox.min() -= Vector2i(m_kernel_size[0], m_kernel_size[1]);
      left_crop_bbox.max() += Vector2i(m_kernel_size[0], m_kernel_size[1]);

      // We crop the images to the expanded bounding box and edge
      // extend in case the new bbox extends past the image bounds.
      ImageView<ImagePixelT> left_image_patch, right_image_patch;
      ImageView<disparity_pixel> disparity_map_patch_in;
      ImageView<result_type> disparity_map_patch_out;


      left_image_patch = crop(edge_extend(m_left_image, ZeroEdgeExtension()),
                              left_crop_bbox);
      right_image_patch = crop(edge_extend(m_right_image, ZeroEdgeExtension()),
                               right_crop_bbox);
      // The input disparity patch uses the left crop box; the output patch
      // is allocated with the same dimensions.
      disparity_map_patch_in = crop(edge_extend(m_course_disparity, ZeroEdgeExtension()),
                                    left_crop_bbox);
      disparity_map_patch_out.set_size(disparity_map_patch_in.cols(), disparity_map_patch_in.rows());


      // Adjust the disparities to be relative to the cropped
      // image pixel locations. Only valid pixels are shifted; the shift is
      // undone on the output patch before returning.
      for (int v = 0; v < disparity_map_patch_in.rows(); ++v) {
        for (int u = 0; u < disparity_map_patch_in.cols(); ++u) {
          if (disparity_map_patch_in(u,v).valid())  {
            disparity_map_patch_in(u,v).child().x() -= search_range.min().x();
            disparity_map_patch_in(u,v).child().y() -= search_range.min().y();
          }
        }
      }


      // Sigma of the Gaussian filter applied before each 2x subsample below.
      double blur_sigma_progressive = .5; // 3*sigma = 1.5 pixels

      // create the pyramid first: one entry per level, where index 0 is the
      // full-resolution patch and index pyramid_levels-1 is the coarsest level
      std::vector<ImageView<ImagePixelT> > left_pyramid(pyramid_levels), right_pyramid(pyramid_levels);
      std::vector<BBox2i> regions_of_interest(pyramid_levels);
      std::vector<ImageView<Matrix2x2> > warps(pyramid_levels);
      std::vector<ImageView<disparity_pixel> > disparity_map_pyramid(pyramid_levels);


      // initialize the pyramid at level 0
      left_pyramid[0] = channels_to_planes(left_image_patch);
      right_pyramid[0] = channels_to_planes(right_image_patch);
      disparity_map_pyramid[0] = disparity_map_patch_in;
      // NOTE(review): presumably BBox2i(min x, min y, width, height), i.e. the
      // area of the original bbox inside the kernel-padded patch - confirm
      // against the BBox2i constructor.
      regions_of_interest[0] = BBox2i(m_kernel_size[0], m_kernel_size[1],
                                      bbox.width(),bbox.height());


      // downsample the disparity map and the image pair to initialize the intermediate levels;
      // the region of interest is halved (min and max divided by 2) at each level
      for(int i = 1; i < pyramid_levels; i++) {
        left_pyramid[i] = subsample(gaussian_filter(left_pyramid[i-1], blur_sigma_progressive), 2);
        right_pyramid[i] = subsample(gaussian_filter(right_pyramid[i-1], blur_sigma_progressive), 2);

        disparity_map_pyramid[i] = detail::subsample_disp_map_by_two(disparity_map_pyramid[i-1]);
        regions_of_interest[i] = BBox2i(regions_of_interest[i-1].min()/2, regions_of_interest[i-1].max()/2);
      }

      // initialize warps at the lowest resolution level: every pixel starts
      // with an identity matrix
      warps[pyramid_levels-1].set_size(left_pyramid[pyramid_levels-1].cols(),
                                       left_pyramid[pyramid_levels-1].rows());
      for(int y = 0; y < warps[pyramid_levels-1].rows(); y++) {
        for(int x = 0; x < warps[pyramid_levels-1].cols(); x++) {
          warps[pyramid_levels-1](x, y).set_identity();
        }
      }

#ifdef USE_GRAPHICS
      // Debug visualization: show each level of the left pyramid for 0.2 s.
      vw_initialize_graphics(0, NULL);
      if(debug_level >= 0) {
        for(int i = 0; i < pyramid_levels; i++) {
          vw_show_image(window, left_pyramid[i]);
          usleep((int)(.2*1000*1000));
        }
      }
#endif

      // go up the pyramid (from the coarsest level pyramid_levels-1 to level 0);
      // first run refinement, then upsample result for the next level
      for(int i = pyramid_levels-1; i >=0; i--) {
        vw_out() << "processing pyramid level "
                  << i << " of " << pyramid_levels-1 << std::endl;

        // When debug_level >= 0, dump this level's input disparity map to
        // "pyramid_level_<i>.tif" (this is not guarded by USE_GRAPHICS).
        if(debug_level >= 0) {
          std::stringstream stream;
          stream << "pyramid_level_" << i << ".tif";
          write_image(stream.str(), disparity_map_pyramid[i]);
        }

        ImageView<ImagePixelT> process_left_image = left_pyramid[i];
        ImageView<ImagePixelT> process_right_image = right_pyramid[i];

        if(i > 0) { // in this case refine the upsampled disparity map from the previous level,
          // and upsample for the next level.
          // The same map is passed as input and output, so refinement is in place.
          // The meaning of the two trailing boolean arguments is defined by
          // m_subpixel_refine, which is not visible here.
          m_subpixel_refine(edge_extend(process_left_image, ZeroEdgeExtension()), edge_extend(process_right_image, ZeroEdgeExtension()),
                            disparity_map_pyramid[i], disparity_map_pyramid[i], warps[i],
                            regions_of_interest[i], false, debug_level == i);

          // upsample the warps and the refined map for the next level of processing;
          // the target size is taken from the next finer level of the left pyramid
          int up_width = left_pyramid[i-1].cols();
          int up_height = left_pyramid[i-1].rows();
          warps[i-1] = copy(resize(warps[i], up_width , up_height, ConstantEdgeExtension(), NearestPixelInterpolation())); //upsample affine transforms
          disparity_map_pyramid[i-1] = copy(detail::upsample_disp_map_by_two(disparity_map_pyramid[i], up_width, up_height));
        }
        else { // here there is no next level so we refine directly to the output patch
          m_subpixel_refine(edge_extend(process_left_image, ZeroEdgeExtension()), edge_extend(process_right_image, ZeroEdgeExtension()),
                            disparity_map_pyramid[i], disparity_map_patch_out, warps[i],
                            regions_of_interest[i], true, debug_level == i);
        }
      }

#ifdef USE_GRAPHICS
      // Debug visualization: show the first plane of the output patch
      // (scaled by 1/6 and offset by 0.5) for 10 s.
      if(debug_level >= 0) {
        vw_show_image(window, .5 + select_plane(channels_to_planes(disparity_map_patch_out)/6., 0));
        usleep(10*1000*1000);
      }
#endif

      // Undo the earlier adjustment: shift valid output disparities back by
      // the search range minimum
      for (int v = 0; v < disparity_map_patch_out.rows(); ++v) {
        for (int u = 0; u < disparity_map_patch_out.cols(); ++u) {
          if (disparity_map_patch_out(u,v).valid())  {
            disparity_map_patch_out(u,v).child().x() += search_range.min().x();
            disparity_map_patch_out(u,v).child().y() += search_range.min().y();
          }
        }
      }

#ifdef USE_GRAPHICS
      if(debug_level >= 0 ) {
        vw_destroy_window(window);
      }
#endif

      // NOTE(review): presumably this crop re-bases the patch so that it can
      // be indexed with full-image coordinates (offset = kernel size minus
      // bbox.min(), extent = left image size) - confirm against crop/BBox2i
      // semantics.
      return crop(disparity_map_patch_out, BBox2i(m_kernel_size[0]-bbox.min().x(),
                                                  m_kernel_size[1]-bbox.min().y(),
                                                  m_left_image.cols(),
                                                  m_left_image.rows()));
    }