Ejemplo n.º 1
0
// Builds a Func, indexed by (x, y), that linearly rescales `image` to the
// range 0..255 and packs each result into a 32-bit integer with the 8-bit
// value replicated in the three low bytes (0x00VVVVVV).
//
// The image-wide minimum and maximum are scheduled compute_root so they are
// evaluated once instead of once per output pixel.
Halide::Func ImageConverter(Halide::ImageParam image) {
	// Reduction domain covering every pixel of the image.
	RDom r(0, image.width(), 0, image.height());

	// Image-wide minimum and maximum.
	Func imgmin;
	imgmin() = minimum(image(r.x, r.y));
	Func imgmax;
	imgmax() = maximum(image(r.x, r.y));

	// A constant image has max == min. Dividing by that zero range makes the
	// scale infinite and the rescaled value NaN before the integer cast below.
	// Substitute a range of 1 so such an image maps to 0 everywhere.
	Expr range = imgmax() - imgmin();
	Expr scale = 1.0f / select(range == 0, 1, range);

	// Rescale to 0..255 (rounding to nearest via the +0.5f) and replicate the
	// byte into the three low bytes of the packed integer.
	Func rescaled;
	Var x, y;
	Expr val = cast<uint32_t>(255.0f * (image(x, y) - imgmin()) * scale + 0.5f);
	Expr scaled = val * cast<uint32_t>(0x010101);
	rescaled(x, y) = scaled;

	imgmin.compute_root();
	imgmax.compute_root();

	// Optional tiled schedule, currently disabled:
	//Var xo, yo, xi, yi;
	//rescaled.tile(x, y, xo, yo, xi, yi, 32, 8);
	//rescaled.vectorize(xi);
	//rescaled.unroll(yi);

	return rescaled;
}
Ejemplo n.º 2
0
// Blurs `f` with performBlur, transposes the result, and divides out the
// attenuation introduced by the zero boundary condition. Returns the
// scheduled output Func.
Func blur_then_transpose(Func f, Func coeff, Expr size, Expr sigma) {
    Func blurred = performBlur(f, coeff, size, sigma);

    // Blur an all-ones image the same way to measure how much the zero
    // boundary condition attenuates the signal near the edges. Dividing by
    // it is equivalent to reweighting the kernel there. (TODO: add a
    // generator param to select different boundary conditions.)
    Func ones;
    ones(x, y) = 1.0f;
    Func attenuation = performBlur(ones, coeff, size, sigma);

    // Only column 0 of the attenuation is read: it is the same for every
    // row/channel, so a single column is enough. Store its reciprocal so the
    // correction is a multiply.
    Func inverse_attenuation;
    inverse_attenuation(y) = 1.0f / attenuation(0, y);

    // Swap the two coordinates.
    Func transposed;
    transposed(x, y) = blurred(y, x);

    // Apply the boundary correction.
    Func out;
    out(x, y) = transposed(x, y) * inverse_attenuation(x);

    // ---- Schedule ----
    Var inner_y, inner_x, innermost_y, innermost_x;

    attenuation.compute_root();
    inverse_attenuation.compute_root().vectorize(y, 8);

    // Two levels of tiling on the output: 8x32 tiles, each cut into 8x8
    // sub-tiles that are vectorized along x and unrolled along y.
    out.compute_root()
        .tile(x, y, inner_x, inner_y, 8, 32)
        .tile(inner_x, inner_y, innermost_x, innermost_y, 8, 8)
        .vectorize(innermost_x)
        .unroll(innermost_y)
        .parallel(y);

    blurred.compute_at(out, y);
    transposed.compute_at(out, inner_x).vectorize(y).unroll(x);

    // Every update stage of the blur is vectorized over x; stages that have
    // a reduction domain also get x moved inside the reduction loop.
    const int update_count = blurred.num_update_definitions();
    for (int stage = 0; stage < update_count; ++stage) {
        RDom rdom = blurred.reduction_domain(stage);
        if (rdom.defined()) {
            blurred.update(stage).reorder(x, rdom);
        }
        blurred.update(stage).vectorize(x, 8).unroll(x);
    }

    return out;
}
Ejemplo n.º 3
0
// Applies a 3x4 color matrix to the three channels of `input`.
//
// The matrix is obtained by interpolating between two calibrated matrices
// (`matrix_3200`, `matrix_7000`) linearly in inverse color temperature, then
// converted to Q8.8 fixed point. Column 3 of the matrix is an additive
// offset. The result is a three-channel Func of int16 values.
Func color_correct(Func input, ImageParam matrix_3200, ImageParam matrix_7000, Param<float> kelvin) {
    // alpha is 0 at 3200 K and 1 at 7000 K.
    Func matrix;
    Expr alpha = (1.0f/kelvin - 1.0f/3200) / (1.0f/7000 - 1.0f/3200);
    // Weight matrix_3200 by (1 - alpha) and matrix_7000 by alpha so that
    // kelvin == 3200 yields matrix_3200 and kelvin == 7000 yields
    // matrix_7000. (The weights were previously swapped, which selected the
    // opposite calibration matrix at each end point.)
    Expr val =  (matrix_3200(x, y) * (1 - alpha) + matrix_7000(x, y) * alpha);
    matrix(x, y) = cast<int16_t>(val * 256.0f); // Q8.8 fixed point
    matrix.compute_root();

    // Widen the inputs to 32 bits so the fixed-point products below do not
    // overflow before the division by 256.
    Func corrected;
    Expr ir = cast<int32_t>(input(x, y, 0));
    Expr ig = cast<int32_t>(input(x, y, 1));
    Expr ib = cast<int32_t>(input(x, y, 2));

    // One matrix row per output channel; matrix(3, row) is the offset term.
    Expr r = matrix(3, 0) + matrix(0, 0) * ir + matrix(1, 0) * ig + matrix(2, 0) * ib;
    Expr g = matrix(3, 1) + matrix(0, 1) * ir + matrix(1, 1) * ig + matrix(2, 1) * ib;
    Expr b = matrix(3, 2) + matrix(0, 2) * ir + matrix(1, 2) * ig + matrix(2, 2) * ib;

    // Drop the 8 fractional bits and narrow back to 16 bits.
    r = cast<int16_t>(r/256);
    g = cast<int16_t>(g/256);
    b = cast<int16_t>(b/256);
    corrected(x, y, c) = select(c == 0, r,
                                c == 1, g,
                                        b);

    return corrected;
}
Ejemplo n.º 4
0
    // Now we define methods that give our pipeline several different
    // schedules.
    void schedule_for_cpu() {
        // Compute the look-up-table ahead of time.
        lut.compute_root();

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Look-up-tables don't vectorize well, so just parallelize
        // curved in slices of 16 scanlines.
        Var yo, yi;
        curved.split(y, yo, yi, 16)
              .parallel(yo);

        // Compute sharpen as needed per scanline of curved.
        sharpen.compute_at(curved, yi);

        // Vectorize the sharpen. It's 16-bit so we'll vectorize it 8-wide.
        sharpen.vectorize(x, 8);

        // Compute the padded input as needed per scanline of curved,
        // reusing previous values computed within the same strip of
        // 16 scanlines.
        padded.store_at(curved, yo)
              .compute_at(curved, yi);

        // Also vectorize the padding. It's 8-bit, so we'll vectorize
        // 16-wide.
        padded.vectorize(x, 16);

        // JIT-compile the pipeline for the CPU.
        curved.compile_jit();
    }
Ejemplo n.º 5
0
// Builds a chain of ten stencil stages that all read from `staged`, which in
// turn reads a compute_root Func.
// NOTE(review): this definition is truncated in this listing -- the code
// that consumes `use_shared`, the return statement and the closing brace are
// not visible here.
Func build(bool use_shared) {
    Func host;
    Var x, y;
    host(x, y) = x + y;
    host.compute_root();

    // We'll either inline this (and hopefully use the GPU's L1 cache)
    // or stage it into shared.
    Func staged;
    staged(x, y) = host(x, y);

    // Now we just need to access the Func staged a bunch.
    const int stages = 10;
    Func f[stages];
    for (int i = 0; i < stages; i++) {
        // Value of the previous stage at this pixel (0 for the first stage).
        // It feeds the selects below, so the stencil offsets depend on data.
        Expr prev = (i == 0) ? Expr(0) : Expr(f[i-1](x, y));
        // Sum of nine reads of staged: a 3x3 neighbourhood when prev <= 0,
        // or the centre pixel nine times when prev > 0.
        Expr stencil = 0;
        for (int dy = -1; dy <= 1; dy++) {
            for (int dx = -1; dx <= 1; dx++) {
                stencil += staged(select(prev > 0, x, x+dx),
                                  select(prev > 0, y, y+dy));
            }
        }
        // Each stage accumulates on top of the previous one.
        if (i == 0) {
            f[i](x, y) = stencil;
        } else {
            f[i](x, y) = f[i-1](x, y) + stencil;
        }
    }

    Func final = f[stages-1];

    // Only the last stage is scheduled here: compute_root, in 8x8 GPU tiles.
    final.compute_root().gpu_tile(x, y, 8, 8);
Ejemplo n.º 6
0
// Sorts 100 samples of sin(x) through the extern stage "sort_buffer" and
// compares the result with a reference sorted by std::sort.
// Returns 0 on success and -1 if the two disagree.
int main(int argc, char **argv) {
    const int N = 100;  // number of samples sorted and compared

    Func data;
    Var x;
    data(x) = sin(x);
    data.compute_root();

    // The extern stage takes `data` as its only argument and produces a
    // one-dimensional float32 output.
    Func sorted;
    std::vector<ExternFuncArgument> args;
    args.push_back(data);
    sorted.define_extern("sort_buffer", args, Float(32), 1);
    Buffer<float> output = sorted.realize(N);

    // Check the output
    Buffer<float> reference = lambda(x, sin(x)).realize(N);
    // Form the end iterator by pointer arithmetic from element 0 instead of
    // indexing element N, which lies one past the end of the buffer.
    float *first = &reference(0);
    std::sort(first, first + N);

    // Sum of absolute differences over the whole reference domain.
    RDom r(reference);
    float error = evaluate_may_gpu<float>(sum(abs(reference(r) - output(r))));

    if (error != 0) {
        printf("Output incorrect\n");
        return -1;
    }

    printf("Success!\n");
    return 0;
}
// Computes a rotated basis at order (iXo, iYo, iTo, iCo) for the given angle.
//
// The result `work(x, y, t)` is a weighted sum of tuple components of
// `stBasis(x, y, iCo, t)`; the component for each term is chosen by
// Mgetfilterindex and the weights are built from binomial coefficients and
// powers of cos/sin of the (remapped) angle. The returned Func is scheduled
// compute_root.
Func ColorMgetfilter(Func stBasis, float angle, uint8_t iXo, uint8_t iYo, uint8_t iTo, uint8_t iCo ) {
    // temporary setting
    uint8_t numSTB = 63;
    uint8_t numSB = 21;

    angle = -1*angle - M_PI/2;

    Func work; // work: rotated basis at a particular spatio-temporal order
    work(x,y,t) = cast<float>(0.0f);

    // One weight per possible order, zero-initialised. iXo and iYo are 8-bit,
    // so iXo+iYo+1 never exceeds 511: a fixed-size local array replaces the
    // former calloc/free pair, whose result was dereferenced without a null
    // check.
    float weights[2 * 255 + 1] = {0.0f};

    // compute weights for possible orders
    for (int i = 0; i <= iXo; i++)
        for (int j = 0; j <= iYo; j++)
            weights[iXo+iYo-i-j] += float(combination(iXo,i))*float(combination(iYo,j))*pow((-1.0f),float(i))*pow(cos(angle),float(iXo-i+j))*pow(sin(angle),float(iYo+i-j));

    // Accumulate the weighted basis components for this order and angle.
    // Terms with a non-positive index or a zero weight are skipped.
    for (int k=0; k<=(iXo+iYo); k++) {
        int index = Mgetfilterindex(iXo+iYo-k,k,iTo,numSTB,numSB);
        if ((index > 0) && (weights[iXo+iYo-k] != 0))
            work(x,y,t) += weights[iXo+iYo-k]*stBasis(x,y,iCo,t)[index];
    }

    work.compute_root();

    return work;
}
Ejemplo n.º 8
0
// Assembles the full pipeline -- hot-pixel suppression, deinterleave,
// demosaic, color correction and curve -- into `processed`, then schedules
// it for the current `target`.
// `processed`, `target`, `x`, `y`, `c`, `yo` and `yi` are not declared in
// this function; they come from the enclosing scope.
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp,
             Param<float> gamma, Param<float> contrast, Param<int> blackLevel, Param<int> whiteLevel) {

    Var yii, xi;

    // Algorithm: each stage consumes the previous one.
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast, blackLevel, whiteLevel);

    processed(x, y, c) = curved(x, y, c);

    // Schedule
    Expr out_width = processed.output_buffer().width();
    Expr out_height = processed.output_buffer().height();

    // Strips of 32 rows. The vector width defaults to the target's natural
    // width for 16-bit values and is overridden for the two HVX modes.
    int strip_size = 32;
    int vec = target.natural_vector_size(UInt(16));
    if (target.has_feature(Target::HVX_64)) {
        vec = 32;
    } else if (target.has_feature(Target::HVX_128)) {
        vec = 64;
    }
    // denoised and deinterleaved are computed per yi iteration of processed
    // but stored per yo strip, with their storage folded along y.
    denoised.compute_at(processed, yi).store_at(processed, yo)
        .fold_storage(y, 8)
        .vectorize(x, vec);
    deinterleaved.compute_at(processed, yi).store_at(processed, yo)
        .fold_storage(y, 4)
        .vectorize(x, 2*vec, TailStrategy::RoundUp)
        .reorder(c, x, y)
        .unroll(c);
    corrected.compute_at(processed, x)
        .vectorize(x, vec)
        .reorder(c, x, y)
        .unroll(c);
    // Output: strips of strip_size rows run in parallel; inside a strip,
    // pairs of rows (yii) and vectors of 2*vec pixels (xi).
    processed.compute_root()
        .split(y, yo, yi, strip_size)
        .split(yi, yi, yii, 2)
        .split(x, x, xi, 2*vec, TailStrategy::RoundUp)
        .reorder(xi, c, yii, x, yi, yo)
        .vectorize(xi, 2*vec)
        .parallel(yo);

    // On HVX targets, offload the output stage to Hexagon and align the
    // intermediates' storage to the vector width.
    if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
        processed.hexagon();
        denoised.align_storage(x, vec);
        deinterleaved.align_storage(x, vec);
        corrected.align_storage(x, vec);
    }

    // We can generate slightly better code if we know the splits divide the extent.
    // The x and y bounds below round the output size down to a multiple of
    // the split factors.
    processed
        .bound(c, 0, 3)
        .bound(x, 0, ((out_width)/(2*vec))*(2*vec))
        .bound(y, 0, (out_height/strip_size)*strip_size);

    return processed;
}
Ejemplo n.º 9
0
// Exercises rfactor applied to a specialized update stage.
// With compile_module set, the call graph of the lowered module is compared
// with the expected one; otherwise the pipeline is realized with the
// specialization parameter on each side of its threshold and the output
// image is checked. Returns 0 on success, -1 on failure.
int simple_rfactor_with_specialize_test(bool compile_module) {
    Func f("f"), g("g");
    Var x("x"), y("y");

    f(x, y) = x + y;
    f.compute_root();

    g(x, y) = 40;
    RDom r(10, 20, 30, 40);
    g(r.x, r.y) = min(f(r.x, r.y) + 2, g(r.x, r.y));

    // Factor r.y out of the update stage, but only in the specialization
    // selected when p >= 10.
    Param<int> p;
    Var u("u");
    Func intm = g.update(0).specialize(p >= 10).rfactor(r.y, u);
    intm.compute_root();
    intm.vectorize(u, 8);
    intm.update(0).vectorize(r.x, 2);

    if (!compile_module) {
        // The expected image is the same whichever specialization runs.
        auto expected_value = [](int x, int y, int z) {
            const bool inside = (10 <= x && x <= 29) && (30 <= y && y <= 69);
            return inside ? std::min(x + y + 2, 40) : 40;
        };
        // First below the threshold (p = 0), then above it (p = 20).
        const int param_values[] = {0, 20};
        for (int value : param_values) {
            p.set(value);
            Image<int> im = g.realize(80, 80);
            if (check_image(im, expected_value)) {
                return -1;
            }
        }
        return 0;
    }

    p.set(20);
    // Check the call graphs.
    Module m = g.compile_to_module({g.infer_arguments()});
    CheckCalls checker;
    m.functions().front().body.accept(&checker);

    CallGraphs expected = {
        {g.name(), {}},
        {g.update(0).name(), {f.name(), intm.name(), g.name()}},
        {intm.name(), {}},
        {intm.update(0).name(), {f.name(), intm.name()}},
        {f.name(), {}},
    };
    return (check_call_graphs(checker.calls, expected) != 0) ? -1 : 0;
}
Ejemplo n.º 10
0
// Lowers `f` (scheduled compute_root) for the JIT target with bounds queries
// disabled, walks the lowered statement with a CountHostAlignmentAsserts
// visitor constructed from `m`, and returns the visitor's count.
int count_host_alignment_asserts(Func f, std::map<string, int> m) {
    Target target = get_jit_target_from_environment();
    target.set_feature(Target::NoBoundsQuery);

    f.compute_root();
    Stmt lowered = Internal::lower({f.function()}, f.name(), target);

    CountHostAlignmentAsserts counter(m);
    lowered.accept(&counter);
    return counter.count;
}
Ejemplo n.º 11
0
// Returns the identity ramp in(x) = x (scheduled compute_root) passed
// through upsample() twice.
Func build() {
    Func ramp;
    ramp(x) = x;
    ramp.compute_root();

    return upsample(upsample(ramp));
}
Ejemplo n.º 12
0
// Lowers `f` (scheduled compute_root) for the JIT target with bounds queries
// and asserts disabled, walks the lowered statement with a CountInterleaves
// visitor, and returns the visitor's result.
int count_interleaves(Func f) {
    Target target = get_jit_target_from_environment();
    target.set_feature(Target::NoBoundsQuery);
    target.set_feature(Target::NoAsserts);

    f.compute_root();
    Stmt lowered = Internal::lower({f.function()}, f.name(), target);

    CountInterleaves counter;
    lowered.accept(&counter);
    return counter.result;
}
Ejemplo n.º 13
0
int main(int argc, char **argv) {
    // Define a pipeline that dumps some squares to a file using an
    // external consumer stage.
    Func source;
    Var x;
    source(x) = x*x;

    Param<int> min, extent;
    Param<const char *> filename;

    Func sink;
    std::vector<ExternFuncArgument> args;
    args.push_back(source);
    args.push_back(filename);
    args.push_back(min);
    args.push_back(extent);
    sink.define_extern("dump_to_file", args, Int(32), 0);

    source.compute_root();

    sink.compile_jit();

    // Dump the first 10 squares to a file
    filename.set("halide_test_extern_consumer.txt");
    min.set(0);
    extent.set(10);
    sink.realize();

    if (!check_result())
        return -1;

    // Test ImageParam ExternFuncArgument via passed in image.
    Image<int32_t> buf = source.realize(10);
    ImageParam passed_in(Int(32), 1);
    passed_in.set(buf);

    Func sink2;
    std::vector<ExternFuncArgument> args2;
    args2.push_back(passed_in);
    args2.push_back(filename);
    args2.push_back(min);
    args2.push_back(extent);
    sink2.define_extern("dump_to_file", args2, Int(32), 0);

    sink2.realize();

    if (!check_result())
        return -1;

    printf("Success!\n");
    return 0;

}
Ejemplo n.º 14
0
// Assembles the pipeline -- hot-pixel suppression, deinterleave, demosaic,
// color correction and curve -- into `processed` and applies one of three
// schedules selected by `schedule`.
// `processed`, `schedule`, `x`, `y`, `c`, `tx` and `ty` are not declared in
// this function; they come from the enclosing scope.
Func process(Func raw, Type result_type,
             ImageParam matrix_3200, ImageParam matrix_7000, Param<float> color_temp,
             Param<float> gamma, Param<float> contrast) {

    Var xi, yi;

    // Algorithm: each stage consumes the previous one.
    Func denoised = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced = demosaic(deinterleaved);
    Func corrected = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved = apply_curve(corrected, result_type, gamma, contrast);

    processed(tx, ty, c) = curved(tx, ty, c);

    // The color loop always covers exactly channels 0..2.
    processed.bound(c, 0, 3);

    if (schedule == 0) {
        // 32x32 tiles; the intermediates are computed per tile and
        // vectorized.
        denoised.compute_at(processed, tx);
        denoised.vectorize(x, 8);

        deinterleaved.compute_at(processed, tx);
        deinterleaved.vectorize(x, 8).reorder(c, x, y).unroll(c);

        corrected.compute_at(processed, tx);
        corrected.vectorize(x, 4).reorder(c, x, y).unroll(c);

        processed.tile(tx, ty, xi, yi, 32, 32)
            .reorder(xi, yi, c, tx, ty)
            .parallel(ty);
    } else if (schedule == 1) {
        // Same structure with 128x128 tiles and no vectorization
        // (sse is bad at interleaved 16-bit ops).
        denoised.compute_at(processed, tx);
        deinterleaved.compute_at(processed, tx);
        corrected.compute_at(processed, tx);

        processed.tile(tx, ty, xi, yi, 128, 128)
            .reorder(xi, yi, c, tx, ty)
            .parallel(ty);
    } else {
        // Every stage is computed at the root.
        denoised.compute_root();
        deinterleaved.compute_root();
        corrected.compute_root();
        processed.compute_root();
    }

    return processed;
}
Ejemplo n.º 15
0
// Runs a 3x3 dilation (separable max filter) on a random 8-bit image and
// verifies the interior pixels against a direct computation.
// Returns 0 on success, -1 on the first mismatch.
int main(int argc, char **argv) {
    // Generate random input image.
    const int W = 128, H = 48;
    Buffer<uint8_t> in(W, H);
    for (int y = 0; y < H; y++) {
        for (int x = 0; x < W; x++) {
            in(x, y) = rand() & 0xff;
        }
    }

    Var x("x"), y("y");

    // Apply the boundary condition up-front.
    Func input = BoundaryConditions::repeat_edge(in);
    input.compute_root();

    // Define the dilate algorithm: a horizontal 3-tap max followed by a
    // vertical 3-tap max.
    Func max_x("max_x");
    Func dilate3x3("dilate3x3");
    max_x(x, y) = max3(input(x-1, y), input(x, y), input(x+1, y));
    dilate3x3(x, y) = max3(max_x(x, y-1), max_x(x, y), max_x(x, y+1));

    // Schedule.
    Target target = get_jit_target_from_environment();
    if (target.has_gpu_feature()) {
        dilate3x3.gpu_tile(x, y, 16, 16);
    } else if (target.features_any_of({Target::HVX_64, Target::HVX_128})) {
        dilate3x3.hexagon().vectorize(x, 64);
    } else {
        dilate3x3.vectorize(x, target.natural_vector_size<uint8_t>());
    }

    // Run the pipeline and verify the results are correct.
    Buffer<uint8_t> out = dilate3x3.realize(W, H, target);

    // Only the interior is checked, so the reference needs no boundary
    // handling.
    for (int y = 1; y < H-1; y++) {
        for (int x = 1; x < W-1; x++) {
            uint16_t correct = max3(max3(in(x-1, y-1), in(x, y-1), in(x+1, y-1)),
                                    max3(in(x-1, y  ), in(x, y  ), in(x+1, y  )),
                                    max3(in(x-1, y+1), in(x, y+1), in(x+1, y+1)));

            if (out(x, y) != correct) {
                // A uint8_t streams as a character; widen it so the pixel
                // value is printed as a number.
                std::cout << "out(" << x << ", " << y << ") = " << static_cast<int>(out(x, y))
                          << " instead of " << correct << "\n";
                return -1;
            }
        }
    }

    std::cout << "Success!\n";
    return 0;
}
Ejemplo n.º 16
0
// Exercises rfactor on a reduction whose domain carries where() predicates.
// With compile_module set, the call graph of the lowered module is compared
// with the expected one; otherwise the pipeline is realized and the output
// image is checked. Returns 0 on success, -1 on failure.
int rdom_with_predicate_rfactor_test(bool compile_module) {
    Func f("f"), g("g");
    Var x("x"), y("y"), z("z");

    f(x, y, z) = x + y + z;
    f.compute_root();

    // Accumulate f into g over the part of r that satisfies both predicates.
    g(x, y, z) = 1;
    RDom r(5, 10, 5, 10, 0, 20);
    r.where(r.x < r.y);
    r.where(r.x + 2*r.y <= r.z);
    g(r.x, r.y, r.z) += f(r.x, r.y, r.z);

    // Factor r.y and r.x out into the pure variables u and v.
    Var u("u"), v("v");
    Func intm = g.update(0).rfactor({{r.y, u}, {r.x, v}});
    intm.compute_root();
    Var ui("ui"), vi("vi"), t("t");
    intm.tile(u, v, ui, vi, 2, 2).fuse(u, v, t).parallel(t);
    intm.update(0).vectorize(r.z, 2);

    if (!compile_module) {
        Image<int> im = g.realize(20, 20, 20);
        // Inside the reduction domain and where both predicates hold, the
        // initial 1 has x + y + z added to it; elsewhere it stays 1.
        auto expected_value = [](int x, int y, int z) {
            const bool in_domain = (5 <= x && x <= 14) && (5 <= y && y <= 14) &&
                                   (0 <= z && z <= 19);
            const bool predicates_hold = (x < y) && (x + 2*y <= z);
            return (in_domain && predicates_hold) ? x + y + z + 1 : 1;
        };
        return check_image(im, expected_value) ? -1 : 0;
    }

    // Check the call graphs.
    Module m = g.compile_to_module({g.infer_arguments()});
    CheckCalls checker;
    m.functions().front().body.accept(&checker);

    CallGraphs expected = {
        {g.name(), {}},
        {g.update(0).name(), {intm.name(), g.name()}},
        {intm.name(), {}},
        {intm.update(0).name(), {f.name(), intm.name()}},
        {f.name(), {}},
    };
    return (check_call_graphs(checker.calls, expected) != 0) ? -1 : 0;
}
// Realizes a reduction over a 4096 x 4096 x 256 compute_root Func and
// expects the installed error handler to have fired.
// Returns 0 if an error was reported, -1 otherwise.
int main(int argc, char **argv) {
    Var x, y, z;
    RDom r(0, 4096, 0, 4096, 0, 256);
    Func big;
    big(x, y, z) = cast<uint8_t>(42);
    big.set_error_handler(&halide_error);
    big.compute_root();

    Func grand_total;
    grand_total() = cast<uint8_t>(sum(big(r.x, r.y, r.z)));
    grand_total.set_error_handler(&halide_error);

    Image<uint8_t> result = grand_total.realize();

    // Checked explicitly rather than with assert(): assert compiles away
    // under NDEBUG, which would make this test pass unconditionally.
    if (!error_occurred) {
        printf("Expected an error, but none was reported\n");
        return -1;
    }

    printf("Success!\n");
    return 0;
}
Ejemplo n.º 18
0
// Builds a multi-scale noise Func over (x, y, c).
//
// At depth 0 (or any non-positive depth) the result is plain random_float()
// noise. Each further level averages four half-resolution samples of the
// level below and mixes in a quarter-weight of fresh noise. Every level is
// scheduled compute_root.
Func make_noise(int depth) {
    Func f;
    Var x, y, c;
    // "<= 0" rather than "== 0": a negative depth would otherwise recurse
    // without ever reaching the base case.
    if (depth <= 0) {
        f(x, y, c) = random_float();
    } else {
        Func g = make_noise(depth - 1);
        // The weights (1 + 1 + 1 + 1 + 0.25) sum to 4.25, hence the divisor.
        f(x, y, c) = (g(x/2, y/2, c) +
                      g((x+1)/2, y/2, c) +
                      g(x/2, (y+1)/2, c) +
                      g((x+1)/2, (y+1)/2, c) +
                      0.25f * random_float()) / 4.25f;
    }
    f.compute_root();
    return f;
}
int main(int argc, char **argv) {
    ImageParam input(Float(32), 2);

    Var x, y;

    Func g;
    g(x, y) = input(x, y) * 2;
    g.compute_root();

    Func f;
    f(x, y) = g(x, y);

    f.parallel(y);
    f.trace_stores();
    f.compile_to_file("user_context_insanity", input, user_context_param());
    return 0;
}
Ejemplo n.º 20
0
// Returns a Func that runs a first-order low-pass IIR filter along the
// columns of `input` (one pass down, one pass back up) and then transposes
// the result.
Func blur_cols_transpose(Func input, Expr height, Expr alpha) {
    Func filtered;

    // Weight given to the previously filtered sample.
    Expr carry = 1 - alpha;

    // Pure definition: left undefined, every value comes from the updates.
    filtered(x, y, c) = undef<float>();
    // Update 0: the top row is copied straight from the input.
    filtered(x, 0, c) = input(x, 0, c);
    // Update 1: forward pass, filtering down each column.
    RDom down(1, height - 1);
    filtered(x, down, c) =
        carry*filtered(x, down - 1, c) + alpha*input(x, down, c);
    // Update 2: backward pass, filtering back up each column over the
    // result of the forward pass.
    Expr up = height - down - 1;
    filtered(x, up, c) =
        carry*filtered(x, up + 1, c) + alpha*filtered(x, up, c);

    // Swap x and y.
    Func transposed;
    transposed(x, y, c) = filtered(y, x, c);

    // Schedule:
    // Cut the transpose into 8x8 tiles, vectorize within a tile, and run
    // rows of tiles and channels in parallel (Halide supports nested
    // parallelism).
    Var tile_x, tile_y;
    transposed.compute_root()
        .tile(x, y, tile_x, tile_y, x, y, 8, 8)
        .vectorize(x)
        .parallel(tile_y)
        .parallel(c);

    // The filter is evaluated once per row of tiles, which corresponds to a
    // strip of columns of the input.
    filtered.compute_at(transposed, tile_y);

    // Within a strip, both filtering passes are vectorized across x.
    filtered.update(1)
        .reorder(x, down)
        .vectorize(x);
    filtered.update(2)
        .reorder(x, down)
        .vectorize(x);

    return transposed;
}
Ejemplo n.º 21
0
// Sorts nine samples of sin(x) with bitonic_sort at the expression level,
// realizes them, prints them, and checks that they are strictly increasing.
// Returns 0 on success, -1 if two neighbours are out of order.
int main(int argc, char **argv) {
    Func f;
    Var x;
    f(x) = sin(x);
    f.compute_root();

    const int N = 9;

    // One expression per sample, then sorted as expressions.
    std::vector<Expr> exprs;
    for (int idx = 0; idx < N; ++idx) {
        exprs.push_back(f(idx));
    }
    exprs = bitonic_sort(exprs);

    std::cout << exprs.size() << "\n";

    // Write each sorted expression into its own slot of another Func via
    // update definitions so the result can be inspected. Nothing is shared
    // explicitly between slots: each one gets the full min/max expression
    // for its element. llvm should lift out common subexpressions though.
    Func g;
    g(x) = undef<float>();
    for (int slot = 0; slot < N; ++slot) {
        g(slot) = exprs[slot];
    }

    Buffer<float> result = g.realize(N);

    for (int slot = 0; slot < N; ++slot) {
        printf("%f ", result(slot));
    }
    printf("\n");

    // Every element must be strictly smaller than its successor.
    for (int slot = 0; slot + 1 < N; ++slot) {
        const bool ordered = result(slot) < result(slot + 1);
        if (!ordered) {
            printf("Results were not in order\n");
            return -1;
        }
    }

    return 0;
}
Ejemplo n.º 22
0
// Inverse 2-D FFT, complex to real.
//
// `input` holds complex values with the real part at index 0 and the
// imaginary part at index 1 of its fourth dimension. The transform is scaled
// by 1/(W*H) and the resulting Func is scheduled compute_root.
Func ifft2_c2r(Func input, int W, int H) {

    Target target = get_target_from_environment();

    // Multiply in float: the int product W*H can overflow for large sizes.
    // For products that fit in an int the value is unchanged.
    Fft2dDesc inv_desc;
    inv_desc.gain = 1.0f/(static_cast<float>(W)*static_cast<float>(H));

    // Repack the (re, im) pairs as a complex-valued Func.
    ComplexFunc input_complex;
    input_complex(x, y, c) = {input(x, y, c, 0), input(x, y, c, 1)};

    // Compute the inverse DFT
    Func res = fft2d_c2r(input_complex, W, H, target, inv_desc);

    // Schedule
    res.compute_root();

    return res;
}
Ejemplo n.º 23
0
// Blurs `input` with a recursive (IIR) filter of strength `sigma` by calling
// blur_then_transpose twice: once with `height`, then once with `width` on
// the transposed result.
Func blur(Func input, Expr sigma, Expr width, Expr height) {

    // IIR coefficients, following the method of Young and Van Vliet.
    // q depends on sigma, with a different formula below sigma = 2.5.
    Expr q = select(sigma < 2.5f,
                    3.97156f - 4.14554f*sqrt(1 - 0.26891f*sigma),
                    0.98711f*sigma - 0.96330f);
    Expr norm = 1.57825f + 2.44413f*q + 1.4281f*q*q + 0.422205f*q*q*q;

    // coeff(1..3) are the three coefficients; coeff(0) is chosen so that
    // all four sum to one.
    Func coeff;
    coeff(x) = undef<float>();
    coeff(1) = (2.44413f*q + 2.85619f*q*q + 1.26661f*q*q*q)/norm;
    coeff(2) = -(1.4281f*q*q + 1.26661f*q*q*q)/norm;
    coeff(3) = (0.422205f*q*q*q)/norm;
    coeff(0) = 1 - (coeff(1) + coeff(2) + coeff(3));
    coeff.compute_root();

    // Each pass transposes its output, so the second pass works along the
    // other axis and its result is back in the original orientation.
    Func first_pass = blur_then_transpose(input, coeff, height, sigma);
    return blur_then_transpose(first_pass, coeff, width, sigma);
}
Ejemplo n.º 24
0
// Expected-failure test: schedules a five-dimensional Func on the GPU.
// Every path that can currently be reached returns -1; the verification
// loop at the bottom is deliberately unreachable until the feature works.
int main(int argc, char **argv) {

    // Move this test to correctness once we can support >4d buffer_ts on the gpu

    if (!get_jit_target_from_environment().has_gpu_feature()) {
        printf("No gpu target enabled. Skipping test.\n");
        // This test is currently expected to error out.
        printf("Error: pretending that there was an error\n");
        return -1;
    }


    Func f;
    Var v0, v1, v2, v3, v4;

    // Each variable contributes one bit of the value when its coordinate is
    // 0 or 1.
    f(v0, v1, v2, v3, v4) = v0 + 2*v1 + 4*v2 + 8*v3 + 16*v4;

    f.compute_root().gpu_blocks(v3, v4).gpu_threads(v1, v2);

    // Linearize into an output buffer
    // The five coordinates are the five low bits of v0, so g(v0) == v0 for
    // v0 in 0..31.
    Func g;
    g(v0) = f(v0 % 2, (v0 / 2) % 2, (v0 / 4) % 2, (v0 / 8) % 2, (v0 / 16) % 2);

    Image<int> result = g.realize(32);

    // Delete this code once this test works.
    printf("Error: I should not have successfully compiled.\n");
    return -1;

    // Unreachable for now (see the early return above).
    for (int i = 0; i < result.width(); i++) {
        if (i != result(i)) {
            printf("result(%d) = %d instead of %d\n",
                   i, result(i), i);
            return -1;
        }
    }

    printf("Success!\n");
    return 0;
}
Ejemplo n.º 25
0
/* Do n unrolled iterations of game of life on a torus.
 *
 * Returns a Func over (x, y) holding the board after n generations, as
 * u8(1) for a live cell and u8(0) for a dead one. For n > 1 the previous
 * generation is built recursively and scheduled compute_root. Values of n
 * below 1 are treated like n == 1. */
Func gameOfLife(ImageParam input, int n) {
    Var x, y;
    Func in;
    // "<= 1" rather than "== 1": n < 1 would otherwise recurse without ever
    // reaching the base case.
    if (n <= 1) {
        in(x, y) = input(x, y);
    } else {
        in = gameOfLife(input, n-1);
        in.compute_root();
    }

    // Neighbour coordinates, wrapped around the board edges. Adding w (or h)
    // before subtracting 1 keeps the left operand of % non-negative.
    Expr w = input.width(), h = input.height();
    Expr W = (x+w-1) % w, E = (x+1) % w, N = (y+h-1) % h, S = (y+1) % h;
    Expr livingNeighbors = (in(W, N) + in(x, N) +
                            in(E, N) + in(W, y) +
                            in(E, y) + in(W, S) +
                            in(x, S) + in(E, S));
    Expr alive = in(x, y) != 0;
    // A cell lives on with exactly three live neighbours, or with exactly
    // two if it is already alive.
    Func output;
    output(x, y) = select(livingNeighbors == 3 || (alive && livingNeighbors == 2), u8(1), u8(0));

    return output;
}
Ejemplo n.º 26
0
// Adds in(x, y) to f only at points selected by a circular predicate, traces
// the loads from `in`, and compares the number of traced loads with the
// number of grid points inside the circle. A mismatch is reported but does
// not fail the test. Always returns 0.
int main(int argc, char **argv) {

    Func f;
    Var x, y;

    Func in;
    in(x, y) = x + y;
    in.compute_root();

    // Start from zero everywhere.
    f(x, y) = 0;

    // Add in(x, y) to f, but only for points within the circle.
    Expr t = cast<int>(ceil(sqrt(10*10 - y*y)));
    f(x, y) += select(x > -t && x < t, in(x, y), 0);

    in.trace_loads();
    f.set_custom_trace(my_trace);
    f.realize(20, 20);

    // Count the points of the 20x20 grid that lie inside the circle.
    int expected_loads = 0;
    for (int row = 0; row < 20; ++row) {
        for (int col = 0; col < 20; ++col) {
            expected_loads += (col*col + row*row < 10*10) ? 1 : 0;
        }
    }

    if (count != expected_loads) {
        printf("Func 'in' should only have been loaded from at points "
               "within the circle x*x + y*y < 10*10. It was loaded %d "
               "times, but there are %d points within that circle\n", count, expected_loads);
        printf("Passing for now. TODO: re-enable this test once trim-no-ops is in.\n");
    }

    printf("Success!\n");

    return 0;
}
Ejemplo n.º 27
0
// Checks that applying rfactor to a subtraction reduction gives the same
// image as the same reduction left unscheduled.
// Returns 0 when both images agree, -1 otherwise.
int subtraction_rfactor_test() {
    Func f("f"), g("g"), ref("ref");
    Var x("x"), y("y");

    f(x, y) = x + y;
    f.compute_root();

    // The reduction extents are runtime parameters.
    Param<int> inner_extent, outer_extent;
    RDom r(10, inner_extent, 30, outer_extent);
    inner_extent.set(20);
    outer_extent.set(40);

    // Reference: the reduction with no scheduling applied.
    ref(x, y) = 40;
    ref(x, y) -= f(r.x, r.y);

    // Same reduction, to be split and factored.
    g(x, y) = 40;
    g(x, y) -= f(r.x, r.y);

    RVar rxi("rxi"), rxo("rxo");
    g.update(0).split(r.x, rxo, rxi, 2);

    // Factor the outer half of the split into the pure variable u.
    Var u("u");
    Func intm = g.update(0).rfactor(rxo, u);
    intm.compute_root();
    intm.update(0).vectorize(u, 2);

    Image<int> im_ref = ref.realize(80, 80);
    Image<int> im = g.realize(80, 80);

    // The expected value at each point is whatever the reference produced.
    auto reference_value = [&im_ref](int x, int y, int z) {
        return im_ref(x, y);
    };
    return check_image(im, reference_value) ? -1 : 0;
}
Ejemplo n.º 28
0
// Runs the game of life three ways from the same random 32x32 board and
// checks that the results agree:
//   board1 - single-generation pipeline applied twice per loop iteration,
//   board2 - two-generation pipeline applied once per loop iteration,
//   board3 - the whole simulation expressed as one Halide reduction.
// Returns 0 on success, -1 on the first disagreement.
int main(int argc, char **argv) {

    Image<uint8_t> board1(32, 32), board2(32, 32), board3(32, 32);
    
    // All three boards start from the same random pattern.
    for (int y = 0; y < 32; y++) {
        for (int x = 0; x < 32; x++) {
            uint8_t val = ((rand() & 0xff) < 128) ? 1 : 0;
            board1(x, y) = val;
            board2(x, y) = val;
            board3(x, y) = val;
        }
    }
    
    ImageParam input(UInt(8), 2);

    {
        // Outer loop in C

        Func oneIteration = gameOfLife(input, 1);
        Func twoIterations = gameOfLife(input, 2);
        
        // Each pass advances both boards by two generations: board1 by two
        // single steps, board2 by one double step.
        for (int i = 0; i < 10; i++) {
            input.set(board1);
            board1 = oneIteration.realize(32, 32);
            input.set(board1);
            board1 = oneIteration.realize(32, 32);
            input.set(board2);
            board2 = twoIterations.realize(32, 32);
            
            /*
            for (int y = 0; y < 32; y++) {
                for (int x = 0; x < 32; x++) {
                    printf(board1(x, y) ? "#" : " ");
                }
                printf("|");
                for (int x = 0; x < 32; x++) {
                    printf(board2(x, y) ? "#" : " ");
                }
                printf("\n");
            }
            */

            for (int y = 0; y < 32; y++) {
                for (int x = 0; x < 32; x++) {
                    if (board1(x, y) != board2(x, y)) {
                        printf("At timestep %d, boards one and two disagree at %d, %d: %d vs %d\n", 
                               i, x, y, board1(x, y), board2(x, y));
                        return -1;
                    }
                }
            }
        }
    }

    {
        // Outer loop in Halide using a reduction
        Func life;

        // Initialize step
        // Both z slices start as a copy of the input board.
        Var x, y, z;
        life(x, y, z) = input(x, y);

        // Update step
        // t.z is the generation counter. Generation t.z is written to slice
        // t.z%2 while reading the previous generation from the other slice.
        Expr w = input.width(), h = input.height();
        RDom t(0, w, 0, h, 0, 21);
        Expr lastT = (t.z+1)%2;
        Expr W = (t.x+w-1) % w, E = (t.x+1) % w, N = (t.y+h-1) % h, S = (t.y+1) % h;
        Expr alive = life(t.x, t.y, lastT) != u8(0);
        Expr livingNeighbors = (life(W, N, lastT) + life(t.x, N, lastT) +
                                life(E, N, lastT) + life(W, t.y, lastT) + 
                                life(E, t.y, lastT) + life(W, S, lastT) +
                                life(t.x, S, lastT) + life(E, S, lastT));            
        life(t.x, t.y, t.z%2) = select(livingNeighbors == 3 || (alive && livingNeighbors == 2), u8(1), u8(0));
        life.compute_root();

        // Slice 1 is last written at t.z == 19, i.e. after 20 generations,
        // which matches the 10 x 2 generations applied to board1 above.
        Func output;
        output(x, y) = life(x, y, 1);        

        input.set(board3);
        output.realize(board3);

        /*
        for (int y = 0; y < 32; y++) {
            for (int x = 0; x < 32; x++) {
                printf(board1(x, y) ? "#" : " ");
            }
            printf("|");
            for (int x = 0; x < 32; x++) {
                printf(board3(x, y) ? "#" : " ");
            }
            printf("\n");
        }
        */

        for (int y = 0; y < 32; y++) {
            for (int x = 0; x < 32; x++) {
                if (board1(x, y) != board3(x, y)) {
                    printf("Boards one and three disagree at %d, %d: %d vs %d\n", 
                           x, y, board1(x, y), board3(x, y));
                    return -1;
                }
            }
        }        
    }

    printf("Success!\n");
    return 0;

}
    // Now a schedule that uses CUDA or OpenCL.
    // Applies a GPU schedule to lut, curved and padded, then JIT-compiles
    // the pipeline for the host target with the OpenCL feature enabled.
    void schedule_for_gpu() {
        // We make the decision about whether to use the GPU for each
        // Func independently. If you have one Func computed on the
        // CPU, and the next computed on the GPU, Halide will do the
        // copy-to-gpu under the hood. For this pipeline, there's no
        // reason to use the CPU for any of the stages. Halide will
        // copy the input image to the GPU the first time we run the
        // pipeline, and leave it there to reuse on subsequent runs.

        // As before, we'll compute the LUT once at the start of the
        // pipeline.
        lut.compute_root();

        // Let's compute the look-up-table using the GPU in 16-wide
        // one-dimensional thread blocks. First we split the index
        // into blocks of size 16:
        Var block, thread;
        lut.split(i, block, thread, 16);
        // Then we tell Halide that our Vars 'block' and 'thread'
        // correspond to CUDA's notions of blocks and threads, or
        // OpenCL's notions of thread groups and threads.
        lut.gpu_blocks(block)
           .gpu_threads(thread);

        // This is a very common scheduling pattern on the GPU, so
        // there's a shorthand for it:

        // lut.gpu_tile(i, 16);

        // Func::gpu_tile method is similar to Func::tile, except that
        // it also specifies that the tile coordinates correspond to
        // GPU blocks, and the coordinates within each tile correspond
        // to GPU threads.

        // Compute color channels innermost. Promise that there will
        // be three of them and unroll across them.
        curved.reorder(c, x, y)
              .bound(c, 0, 3)
              .unroll(c);

        // Compute curved in 2D 8x8 tiles using the GPU.
        curved.gpu_tile(x, y, 8, 8);

        // This is equivalent to:
        // curved.tile(x, y, xo, yo, xi, yi, 8, 8)
        //       .gpu_blocks(xo, yo)
        //       .gpu_threads(xi, yi);

        // We'll leave sharpen as inlined into curved.

        // Compute the padded input as needed per GPU block, storing the
        // intermediate result in shared memory. Var::gpu_blocks, and
        // Var::gpu_threads exist to help you schedule producers within
        // GPU threads and blocks.
        padded.compute_at(curved, Var::gpu_blocks());

        // Use the GPU threads for the x and y coordinates of the
        // padded input.
        padded.gpu_threads(x, y);

        // JIT-compile the pipeline for the GPU. CUDA or OpenCL are
        // not enabled by default. We have to construct a Target
        // object, enable one of them, and then pass that target
        // object to compile_jit. Otherwise your CPU will very slowly
        // pretend it's a GPU, and use one thread per output pixel.

        // Start with a target suitable for the machine you're running
        // this on.
        Target target = get_host_target();

        // Then enable OpenCL or CUDA.

        // We'll enable OpenCL here, because it tends to give better
        // performance than CUDA, even with NVidia's drivers, because
        // NVidia's open source LLVM backend doesn't seem to do all
        // the same optimizations their proprietary compiler does.
        target.features |= Target::OpenCL;

        // Uncomment the next line and comment out the line above to
        // try CUDA instead.
        // target.features |= Target::CUDA;

        // If you want to see all of the OpenCL or CUDA API calls done
        // by the pipeline, you can also enable the GPUDebug
        // flag. This is helpful for figuring out which stages are
        // slow, or when CPU -> GPU copies happen. It hurts
        // performance though, so we'll leave it commented out.
        //target.features |= Target::GPUDebug;

        curved.compile_jit(target);
    }
Ejemplo n.º 30
0
// Builds the "local_laplacian" Halide pipeline and writes it out with
// compile_to_file. The generated pipeline takes, in order, the number of
// intensity levels, the alpha and beta filter parameters, and a
// 3-dimensional 16-bit input image.
//
// The command-line arguments are not used. Always returns 0.
int main(int argc, char **argv) {

    /* THE ALGORITHM */

    // Number of pyramid levels. This is declared const so that it is a
    // constant expression: it sizes the arrays of Funcs below, and with a
    // plain (non-const) int those would be variable-length arrays of a
    // class type, which is not standard C++.
    const int J = 8;

    // number of intensity levels
    Param<int> levels;
    // Parameters controlling the filter
    Param<float> alpha, beta;
    // Takes a 16-bit input
    ImageParam input(UInt(16), 3);

    // loop variables
    Var c, k;

    // Make the remapping function as a lookup table.
    Func remap;
    Expr fx = cast<float>(x) / 256.0f;
    remap(x) = alpha*fx*exp(-fx*fx/2.0f);

    // Convert to floating point
    Func floating;
    floating(x, y, c) = cast<float>(input(x, y, c)) / 65535.0f;

    // Set a boundary condition: coordinates outside the input are clamped
    // to the nearest valid pixel.
    Func clamped;
    clamped(x, y, c) = floating(clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1), c);

    // Get the luminance channel
    Func gray;
    gray(x, y) = 0.299f * clamped(x, y, 0) + 0.587f * clamped(x, y, 1) + 0.114f * clamped(x, y, 2);

    // Make the processed Gaussian pyramid.
    Func gPyramid[J];
    // Do a lookup into a lut with 256 entries per intensity level
    Expr idx = gray(x, y)*cast<float>(levels-1)*256.0f;
    idx = clamp(cast<int>(idx), 0, (levels-1)*256);
    gPyramid[0](x, y, k) = beta*gray(x, y) + remap(idx - 256*k);
    for (int j = 1; j < J; j++) {
        gPyramid[j](x, y, k) = downsample(gPyramid[j-1])(x, y, k);
    }

    // Get its laplacian pyramid. The last level is the Gaussian level
    // itself; every other level is the difference between a Gaussian level
    // and the upsampled next level.
    Func lPyramid[J];
    lPyramid[J-1] = gPyramid[J-1];
    for (int j = J-2; j >= 0; j--) {
        lPyramid[j](x, y, k) = gPyramid[j](x, y, k) - upsample(gPyramid[j+1])(x, y, k);
    }

    // Make the Gaussian pyramid of the input
    Func inGPyramid[J];
    inGPyramid[0] = gray;
    for (int j = 1; j < J; j++) {
        inGPyramid[j](x, y) = downsample(inGPyramid[j-1])(x, y);
    }

    // Make the laplacian pyramid of the output
    Func outLPyramid[J];
    for (int j = 0; j < J; j++) {
        // Split input pyramid value into integer and floating parts
        Expr level = inGPyramid[j](x, y) * cast<float>(levels-1);
        // li is clamped to levels-2 so that li+1 below stays within
        // 0..levels-1.
        Expr li = clamp(cast<int>(level), 0, levels-2);
        Expr lf = level - cast<float>(li);
        // Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j](x, y) = (1.0f - lf) * lPyramid[j](x, y, li) + lf * lPyramid[j](x, y, li+1);
    }

    // Make the Gaussian pyramid of the output by collapsing the output
    // laplacian pyramid from the last level back to level 0.
    Func outGPyramid[J];
    outGPyramid[J-1] = outLPyramid[J-1];
    for (int j = J-2; j >= 0; j--) {
        outGPyramid[j](x, y) = upsample(outGPyramid[j+1])(x, y) + outLPyramid[j](x, y);
    }

    // Reintroduce color (Connelly: use eps to avoid scaling up noise w/ apollo3.png input)
    Func color;
    float eps = 0.01f;
    color(x, y, c) = outGPyramid[0](x, y) * (clamped(x, y, c)+eps) / (gray(x, y)+eps);

    Func output("local_laplacian");
    // Convert back to 16-bit
    output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f);



    /* THE SCHEDULE */

    remap.compute_root();

    Var yi;
    output.split(y, y, yi, 4).parallel(y).vectorize(x, 4);

    // The first (up to) four pyramid levels are split into strips of four
    // rows, parallelized over the strips and vectorized in x. The count is
    // bounded by J so the array accesses stay in range if J is ever
    // reduced below 4.
    const int splitLevels = (J < 4) ? J : 4;
    for (int j = 0; j < splitLevels; j++) {
        inGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        // gPyramid[0] is deliberately not given a schedule here.
        if (j > 0) gPyramid[j].compute_root().parallel(k).vectorize(x, 4);
        outGPyramid[j].compute_root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
    }
    // The remaining levels are only parallelized, with no split or
    // vectorization.
    for (int j = splitLevels; j < J; j++) {
        inGPyramid[j].compute_root().parallel(y);
        gPyramid[j].compute_root().parallel(k);
        outGPyramid[j].compute_root().parallel(y);
    }

    output.compile_to_file("local_laplacian", levels, alpha, beta, input);

    return 0;
}