Ejemplos de Func::arg en C++ (Cpp)

Lenguaje de programación: C++ (Cpp)

Clase / Tipo: Func

Método / Función: arg

Ejemplos en hotexamples.com: 1

C++ (Cpp) Func::arg - 1 ejemplo encontrado. Este es el ejemplo en C++ (Cpp) del mundo real mejor valorado de Func::arg, del paquete AlgoSolution, extraído de proyectos de código abierto. Puedes valorar los ejemplos para ayudarnos a mejorar su calidad.

Métodos usados con frecuencia

Mostrar / Ocultar

compute_root(30)

compute_at(24)

Call(18)

compile_jit(15)

compile_to_file(14)

bound(14)

gpu_tile(13)

add_custom_lowering_pass(8)

hexagon(6)

compile_to_assembly(6)

attrs(6)

maxModulo(5)

function(5)

define_extern(5)

compile_to_module(4)

glsl(4)

appendParam(4)

args(4)

free_ord(3)

getBody(3)

eval(3)

compileToFile(3)

gpu_threads(3)

infer_arguments(3)

dimensions(3)

accept(3)

initPrologues(3)

isAllowOverride(2)

getArguments(2)

gpu_blocks(2)

infer_input_bounds(2)

finiteHessian(2)

getName(2)

getResults(2)

getKind(2)

finiteGradient(2)

chunk(2)

compile_to_static_library(2)

codegen(2)

compile_to_lowered_stmt(2)

isStatic(1)

isPseudoMain(1)

isIgnoreRedefinition(1)

isAsync(1)

length(1)

hello(1)

GetLocalsPointer(1)

GetName(1)

inputs(1)

GetParentFunc(1)

Ejemplo n.º 1

Mostrar archivo

Archivo: bilateral_grid.cpp Proyecto: autumnm1981/Halide

// Builds the bilateral-grid filter pipeline and compiles it to "bilateral_grid".
//
// Usage: <program> <s_sigma>
//   s_sigma - spatial sigma; must be a positive integer because it sizes the
//             reduction domain and is used as a divisor/modulus below.
//
// The range sigma (r_sigma) and the input image are passed to compileToFile
// as the arguments of the generated pipeline.
//
// Returns 0 on success and 1 when the argument is missing or is not a
// positive integer.
int main(int argc, char **argv) {
    if (argc < 2) {
        printf("Spatial sigma is a compile-time parameter, please provide it as an argument.\n"
               "(llvm's ptx backend doesn't handle integer mods by non-consts yet)\n");
        // Nothing was generated, so report failure to the caller.
        return 1;
    }

    // atoi yields 0 for non-numeric text; zero or negative values would put a
    // zero divisor / empty reduction domain into the pipeline, so reject them.
    const int s_sigma = atoi(argv[1]);
    if (s_sigma < 1) {
        printf("Spatial sigma must be a positive integer, got \"%s\".\n", argv[1]);
        return 1;
    }

    UniformImage input(Float(32), 2);
    Uniform<float> r_sigma;
    Var x, y, z, c;

    // Add a boundary condition: clamp coordinates to the input extent.
    Func clamped;
    clamped(x, y) = input(clamp(x, 0, input.width()-1),
                          clamp(y, 0, input.height()-1));

    // Construct the bilateral grid: every grid cell reduces over an
    // s_sigma x s_sigma window of the clamped input.
    RDom r(0, s_sigma, 0, s_sigma);
    Expr val = clamped(x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2);
    val = clamp(val, 0.0f, 1.0f);
    // Nearest intensity bin for the sample (the +0.5f rounds before the cast).
    Expr zi = cast<int>(val * (1.0f/r_sigma) + 0.5f);
    Func grid;
    // Channel c == 0 accumulates the sample value, every other channel
    // accumulates 1.0f (used as the weight in the normalization step below).
    grid(x, y, zi, c) += select(c == 0, val, 1.0f);

    // Blur the grid using a five-tap filter (weights 1 4 6 4 1) along x, y, z.
    // NOTE(review): grid is defined with four arguments but called with three
    // here; presumably the library supplies the trailing argument implicitly
    // (the GPU schedule fetches it via arg(3)) - confirm.
    Func blurx, blury, blurz;
    blurx(x, y, z) = grid(x-2, y, z) + grid(x-1, y, z)*4 + grid(x, y, z)*6 + grid(x+1, y, z)*4 + grid(x+2, y, z);
    blury(x, y, z) = blurx(x, y-2, z) + blurx(x, y-1, z)*4 + blurx(x, y, z)*6 + blurx(x, y+1, z)*4 + blurx(x, y+2, z);
    blurz(x, y, z) = blury(x, y, z-2) + blury(x, y, z-1)*4 + blury(x, y, z)*6 + blury(x, y, z+1)*4 + blury(x, y, z+2);

    // Take trilinear samples to compute the output
    val = clamp(clamped(x, y), 0.0f, 1.0f);
    Expr zv = val * (1.0f/r_sigma);
    zi = cast<int>(zv);
    Expr zf = zv - zi;                              // fractional position along z
    Expr xf = cast<float>(x % s_sigma) / s_sigma;   // fractional position along x
    Expr yf = cast<float>(y % s_sigma) / s_sigma;   // fractional position along y
    Expr xi = x/s_sigma;                            // grid cell containing x
    Expr yi = y/s_sigma;                            // grid cell containing y
    Func interpolated;
    interpolated(x, y) =
        lerp(lerp(lerp(blurz(xi, yi, zi), blurz(xi+1, yi, zi), xf),
                  lerp(blurz(xi, yi+1, zi), blurz(xi+1, yi+1, zi), xf), yf),
             lerp(lerp(blurz(xi, yi, zi+1), blurz(xi+1, yi, zi+1), xf),
                  lerp(blurz(xi, yi+1, zi+1), blurz(xi+1, yi+1, zi+1), xf), yf), zf);

    // Normalize: divide the accumulated values (index 0) by the accumulated
    // weights (index 1).
    Func smoothed;
    smoothed(x, y) = interpolated(x, y, 0)/interpolated(x, y, 1);

    #ifndef USE_GPU
    // Best schedule for CPU
    printf("Compiling for CPU\n");
    grid.root().parallel(z);
    grid.update().transpose(y, c).transpose(x, c).parallel(y);
    blurx.root().parallel(z).vectorize(x, 4);
    blury.root().parallel(z).vectorize(x, 4);
    blurz.root().parallel(z).vectorize(x, 4);
    smoothed.root().parallel(y).vectorize(x, 4);
    #else

    printf("Compiling for GPU\n");
    Var gridz = grid.arg(2);
    grid.transpose(y, gridz).transpose(x, gridz).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 16, 16);
    // The update step is scheduled over the reduction variables r.x / r.y.
    // NOTE(review): this line previously named `i` and `j`, which are not
    // declared in this function; r.x / r.y are assumed to be what was meant -
    // confirm with a USE_GPU build.
    grid.update().transpose(y, c).transpose(x, c).transpose(r.x, c).transpose(r.y, c)
        .root().cudaTile(x, y, 16, 16);

    // Rebind c to each blur stage's own fourth argument before scheduling it.
    c = blurx.arg(3);
    blurx.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 8, 8);

    c = blury.arg(3);
    blury.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 8, 8);

    c = blurz.arg(3);
    blurz.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 8, 8);

    smoothed.root().cudaTile(x, y, s_sigma, s_sigma);
    #endif

    smoothed.compileToFile("bilateral_grid", {r_sigma, input});

    // Compared to Sylvain Paris' implementation from his webpage (on
    // which this is based), for filter params r_sigma 0.1, on a 4 megapixel
    // input, on a four core x86 (2 socket core2 mac pro)
    // (NOTE(review): this comment used to read "s_sigma 0.1"; s_sigma is the
    // integer swept in the table below, so r_sigma is assumed - confirm.)
    // Filter s_sigma: 2      4       8       16      32
    // Paris (ms):     5350   1345    472     245     184
    // Us (ms):        383    142     77      62      65
    // Speedup:        14     9.5     6.1     3.9     2.8

    // Our schedule and inlining are roughly the same as his, so the
    // gain is all down to vectorizing and parallelizing. In general
    // for larger blurs our win shrinks to roughly the number of
    // cores, as the stages we don't vectorize as well dominate (we
    // don't vectorize them well because they do gathers and scatters,
    // which don't work well on x86).  For smaller blurs, our win
    // grows, because the stages that we vectorize take up all the
    // time.

    return 0;
}