Пример #1
0
void getCanonicalSize(const Halide::Buffer<>& buffer, int* width, int* height,
                      int* channels, int* batch)
{
    CV_Assert(buffer.dimensions() == 4);
    *width = buffer.extent(0);
    *height = buffer.extent(1);
    *channels = buffer.extent(2);
    *batch = buffer.extent(3);
}
Пример #2
0
void print(Halide::Buffer<T> buf) {
    for (int j = 0; j < std::min(buf.height(), 10); j++) {
        std::stringstream oss;
        for (int i = 0; i < std::min(buf.width(), 10); i++) {
            oss << " [";
            for (int k = 0; k < buf.channels(); k++) {
                oss << std::fixed << std::setprecision(1);
                if (k > 0) {
                    oss << std::setw(4);
                }
                oss << +buf(i, j, k);
            }
            oss << "]";
        }
        LOGI("%s", oss.str().c_str());
    }
}
Пример #3
0
void Closure::found_buffer_ref(const string &name, Type type,
                               bool read, bool written, Halide::Buffer<> image) {
    if (!ignore.contains(name)) {
        debug(3) << "Adding buffer " << name << " to closure\n";
        Buffer &ref = buffers[name];
        ref.type = type.element_of(); // TODO: Validate type is the same as existing refs?
        ref.read = read;
        ref.write = written;

        // If reading an image/buffer, compute the size.
        if (image.defined()) {
            ref.size = image.size_in_bytes();
            ref.dimensions = image.dimensions();
        }
    } else {
        debug(3) << "Not adding " << name << " to closure\n";
    }
}
Пример #4
0
HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base,
                                           const MatShape& shape)
    : BackendWrapper(DNN_BACKEND_HALIDE, base->targetId)
{
    managesDevMemory = false;
    Halide::Buffer<float> baseBuffer = halideBuffer(base);
    buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host,
                                   getBufferShape(shape));
    if (baseBuffer.has_device_allocation())
    {
        buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device;
        buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface;
        buffer.set_device_dirty();
    }
    else
    {
        buffer.set_host_dirty();  // Indicate that data is on CPU.
        CV_Assert(targetId == DNN_TARGET_CPU);
    }
}
Пример #5
0
HalideBackendWrapper::HalideBackendWrapper(const Ptr<BackendWrapper>& base,
                                           const MatShape& shape)
    : BackendWrapper(DNN_BACKEND_HALIDE, base->targetId)
{
    int w, h, c, n;
    getCanonicalSize(shape, &w, &h, &c, &n);
    Halide::Buffer<float> baseBuffer = halideBuffer(base);
    buffer = Halide::Buffer<float>((float*)baseBuffer.raw_buffer()->host,
                                   {w, h, c, n});
    if (baseBuffer.has_device_allocation())
    {
        buffer.raw_buffer()->device = baseBuffer.raw_buffer()->device;
        buffer.raw_buffer()->device_interface = baseBuffer.raw_buffer()->device_interface;
        buffer.set_device_dirty();
    }
    else
    {
        buffer.set_host_dirty();  // Indicate that data is on CPU.
        CV_Assert(targetId == DNN_TARGET_CPU);
    }
}
Пример #6
0
int main(int, char**)
{
    std::vector<std::chrono::duration<double, std::milli>> duration_vector;

    Halide::Buffer<uint8_t> input = Halide::Tools::load_image("../rgb.png");

    Halide::Buffer<float> kernel(3, 3);
    kernel(0,0) = 0; kernel(0,1) = 1.0f/5; kernel(0,2) = 0;
    kernel(1,0) = 1.0f/5; kernel(1,1) = 1.0f/5; kernel(1,2) = 1.0f/5;
    kernel(2,0) = 0; kernel(2,1) = 1; kernel(2,2) = 0;

    // Small size discrepancy with Halide benchmark:
    Halide::Buffer<uint8_t> output(input.width(), input.height(), input.channels());

    std::cout << "Dimensions : " << std::endl;
    std::cout << "input.extent(0): " << input.extent(0) << std::endl;  // Rows
    std::cout << "input.extent(1): " << input.extent(1) << std::endl;  // Cols
    std::cout << "input.extent(2): " << input.extent(2) << std::endl;  // Colors

    #ifdef __PROFILE_CUDA__
    cudaProfilerStop();
    #endif

    // Warm up
    pencil_convolution(input.extent(0), input.extent(1), input.extent(1),
                       (uint8_t *) input.raw_buffer()->host,
                       (float *) kernel.raw_buffer()->host,
                       (uint8_t *) output.raw_buffer()->host);

    #ifdef __PROFILE_CUDA__
    cudaProfilerStart();
    #endif

    // Tiramisu
    for (int i = 0; i < 100; i++) {
        auto start = std::chrono::high_resolution_clock::now();
        pencil_convolution(input.extent(0), input.extent(1), input.extent(1),
                           (uint8_t *) input.raw_buffer()->host,
                           (float *) kernel.raw_buffer()->host,
                           (uint8_t *) output.raw_buffer()->host);

        auto end = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double, std::milli> duration = end - start;
        duration_vector.push_back(duration);
    }

    std::cout << "time: " << median(duration_vector) << std::endl;

    return 0;
}
Пример #7
0
std::chrono::duration<double, std::milli> run_halide(Halide::Buffer<uint8_t> &in, Halide::Buffer<uint8_t> &y, Halide::Buffer<uint8_t> &u, Halide::Buffer<uint8_t> &v)
{
    in(0, 0, 0) = in(0, 0, 0);
    auto start = std::chrono::high_resolution_clock::now();
    rgbyuv420gpu_ref(in.raw_buffer(), y.raw_buffer(), u.raw_buffer(), v.raw_buffer());
    halide_copy_to_host(nullptr, y.raw_buffer());
    halide_copy_to_host(nullptr, u.raw_buffer());
    halide_copy_to_host(nullptr, v.raw_buffer());
    halide_device_free(nullptr, y.raw_buffer());
    halide_device_free(nullptr, u.raw_buffer());
    halide_device_free(nullptr, v.raw_buffer());
    halide_device_free(nullptr, in.raw_buffer());
    auto end = std::chrono::high_resolution_clock::now();
    return end - start;
}
Пример #8
0
int main(int, char**)
{
    std::vector<std::chrono::duration<double,std::milli>> duration_vector_1;
    std::vector<std::chrono::duration<double,std::milli>> duration_vector_2;
    Halide::Buffer<uint8_t> input = Halide::Tools::load_image("./utils/images/rgb.png");
    Halide::Buffer<int32_t> size(2);
    size(0) = input.extent(0);
    size(1) = input.extent(1);

    Halide::Buffer<uint8_t> output_ref_y(input.width(), input.height());
    Halide::Buffer<uint8_t> output_ref_u(input.width()/2, input.height()/2);
    Halide::Buffer<uint8_t> output_ref_v(input.width()/2, input.height()/2);

    Halide::Buffer<uint8_t> output_tiramisu_y(input.width(), input.height());
    Halide::Buffer<uint8_t> output_tiramisu_u(input.width()/2, input.height()/2);
    Halide::Buffer<uint8_t> output_tiramisu_v(input.width()/2, input.height()/2);

    std::cout << "STARTING TEST\n";
    std::cout << "y size (width, height): " << output_tiramisu_y.width() << ", " << output_tiramisu_y.height() << "\n";
    std::cout << "u size (width, height): " << output_tiramisu_u.width() << ", " << output_tiramisu_u.height() << "\n";
    std::cout << "v size (width, height): " << output_tiramisu_v.width() << ", " << output_tiramisu_v.height() << "\n";
    // Warm up
    rgbyuv420gpu_tiramisu(size.raw_buffer(), input.raw_buffer(), output_tiramisu_y.raw_buffer(), output_tiramisu_u.raw_buffer(), output_tiramisu_v.raw_buffer());
    run_halide(input, output_ref_y, output_ref_u, output_ref_v);

    // Tiramisu
    for (int i=0; i<NB_TESTS; i++)
    {
        auto start1 = std::chrono::high_resolution_clock::now();
        rgbyuv420gpu_tiramisu(size.raw_buffer(), input.raw_buffer(), output_tiramisu_y.raw_buffer(), output_tiramisu_u.raw_buffer(), output_tiramisu_v.raw_buffer());
        auto end1 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double,std::milli> duration1 = end1 - start1;
        duration_vector_1.push_back(duration1);
    }

    // Reference
    for (int i=0; i<NB_TESTS; i++)
    {
        duration_vector_2.push_back(run_halide(input, output_ref_y, output_ref_u, output_ref_v));
    }

    print_time("performance_CPU.csv", "rgbyuv420gpu",
               {"Tiramisu", "Halide"},
               {median(duration_vector_1), median(duration_vector_2)});

    Halide::Tools::save_image(output_tiramisu_y, "./build/rgbyuv420gpu_y_tiramisu.png");
    Halide::Tools::save_image(output_tiramisu_u, "./build/rgbyuv420gpu_u_tiramisu.png");
    Halide::Tools::save_image(output_tiramisu_v, "./build/rgbyuv420gpu_v_tiramisu.png");
    Halide::Tools::save_image(output_ref_y, "./build/rgbyuv420gpu_y_ref.png");
    Halide::Tools::save_image(output_ref_u, "./build/rgbyuv420gpu_u_ref.png");
    Halide::Tools::save_image(output_ref_v, "./build/rgbyuv420gpu_v_ref.png");

    if (CHECK_CORRECTNESS)
    {
        std::cout << "Compare y buffer\n";
    	compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_y, output_ref_y);
        std::cout << "Compare u buffer\n";
    	compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_u, output_ref_u);
        std::cout << "Compare v buffer\n";
    	compare_buffers("benchmark_rgbyuv420gpu", output_tiramisu_y, output_ref_y);
    }

    return 0;
}
Пример #9
0
int main(int, char**)
{
    std::vector<std::chrono::duration<double,std::milli>> duration_vector_1;
    std::vector<std::chrono::duration<double,std::milli>> duration_vector_2;

    Halide::Buffer<uint8_t> input = Halide::Tools::load_image("./utils/images/rgb.png");

    Halide::Buffer<uint8_t> output_ref_f(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_ref_g(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_ref_h(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_ref_k(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_tiramisu_f(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_tiramisu_g(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_tiramisu_h(input.width(), input.height(), input.channels());
    Halide::Buffer<uint8_t> output_tiramisu_k(input.width(), input.height(), input.channels());

    // Warm up
    fusion_tiramisu(input.raw_buffer(), output_tiramisu_f.raw_buffer(),
		    output_tiramisu_g.raw_buffer(), output_tiramisu_h.raw_buffer(),
		    output_tiramisu_k.raw_buffer());
    fusion_ref(input.raw_buffer(), output_ref_f.raw_buffer(), output_ref_g.raw_buffer(),
	       output_ref_h.raw_buffer(), output_ref_k.raw_buffer());

    // Tiramisu
    for (int i=0; i<NB_TESTS; i++)
    {
        auto start1 = std::chrono::high_resolution_clock::now();
        fusion_tiramisu(input.raw_buffer(), output_tiramisu_f.raw_buffer(),
			output_tiramisu_g.raw_buffer(), output_tiramisu_h.raw_buffer(),
			output_tiramisu_k.raw_buffer());
        auto end1 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double,std::milli> duration1 = end1 - start1;
        duration_vector_1.push_back(duration1);
    }

    // Reference
    for (int i=0; i<NB_TESTS; i++)
    {
        auto start2 = std::chrono::high_resolution_clock::now();
        fusion_ref(input.raw_buffer(), output_ref_f.raw_buffer(), output_ref_g.raw_buffer(),
		   output_ref_h.raw_buffer(), output_ref_k.raw_buffer());
        auto end2 = std::chrono::high_resolution_clock::now();
        std::chrono::duration<double,std::milli> duration2 = end2 - start2;
        duration_vector_2.push_back(duration2);
    }

    print_time("performance_CPU.csv", "fusion",
               {"Tiramisu", "Halide"},
               {median(duration_vector_1), median(duration_vector_2)});

    Halide::Tools::save_image(output_tiramisu_h, "./build/fusion_h_tiramisu.png");
    Halide::Tools::save_image(output_tiramisu_k, "./build/fusion_k_tiramisu.png");
    Halide::Tools::save_image(output_ref_h, "./build/fusion_h_ref.png");
    Halide::Tools::save_image(output_ref_k, "./build/fusion_k_ref.png");

    if (CHECK_CORRECTNESS)
	compare_buffers("Fusion",  output_ref_k, output_tiramisu_k);

    return 0;
}