Ejemplo n.º 1
0
// Generator for the camera pipeline: builds the pipeline with process() and
// writes it out through compileToFile("curved", ...).
//
// Command line:
//   argv[1]  bit width of the unsigned result type (passed to UInt()).
//   argv[2]  schedule selector, stored in `schedule`.
//
// `x`, `y`, `schedule` and `process` are not declared in this function; they
// come from elsewhere in the file.
//
// Returns 0 on success, 1 if the two required arguments are missing.
int main(int argc, char **argv) {
    // Both arguments are handed to atoi() below, so bail out early rather
    // than dereferencing a null or out-of-range argv entry.
    if (argc < 3) {
        printf("Usage: <program> <result bit width> <schedule>\n");
        return 1;
    }

    // The camera pipe is specialized on the 2592x1968 images that
    // come in, so we'll just use an image instead of a uniform image.
    Image<int16_t> input(2592, 1968);
    UniformImage matrix_3200(Float(32), 2, "m3200"), matrix_7000(Float(32), 2, "m7000");
    Uniform<float> color_temp("color_temp", 3200.0f);
    Uniform<float> gamma("gamma", 1.8f);
    Uniform<float> contrast("contrast", 10.0f);

    // Shift things inwards to give us enough padding on the
    // boundaries so that we don't need to check bounds. We're going
    // to make a 2560x1920 output image, just like the FCam pipe, so
    // shift by 16, 12. The clamps keep every read inside the input.
    Func shifted;
    shifted(x, y) = input(clamp(x+16, 0, input.width()-1), clamp(y+12, 0, input.height()-1));

    // Parameterized output type, because LLVM PTX (GPU) backend does not
    // currently allow 8-bit computations
    int bit_width = atoi(argv[1]);
    Type result_type = UInt(bit_width);

    // Pick a schedule
    schedule = atoi(argv[2]);

    // Build the pipeline
    Func processed = process(shifted, result_type, matrix_3200, matrix_7000, color_temp, gamma, contrast);

    //string s = processed.serialize();
    //printf("%s\n", s.c_str());

    // In C++-11, this can be done as a simple initializer_list {color_temp,gamma,etc.} in place.
    // The element count is derived from the array so the list and the count
    // cannot drift apart when an argument is added or removed.
    Arg args[] = {color_temp, gamma, contrast, input, matrix_3200, matrix_7000};
    processed.compileToFile("curved", std::vector<Arg>(args, args + sizeof(args) / sizeof(args[0])));

    return 0;
}
Ejemplo n.º 2
0
// Generator for the bilateral-grid filter: defines the algorithm, applies a
// CPU or GPU schedule (chosen at build time via USE_GPU), and writes the
// pipeline out through compileToFile("bilateral_grid", ...).
//
// Command line:
//   argv[1]  spatial sigma (s_sigma), a positive integer that is baked into
//            the generated pipeline.
//
// The range sigma (r_sigma) and the input image stay runtime parameters of
// the generated pipeline.
//
// Returns 0 on success, 1 if the spatial sigma is missing or not positive.
int main(int argc, char **argv) {
    if (argc < 2) {
        printf("Spatial sigma is a compile-time parameter, please provide it as an argument.\n"
               "(llvm's ptx backend doesn't handle integer mods by non-consts yet)\n");
        // Report failure so that a calling build script stops here.
        return 1;
    }

    UniformImage input(Float(32), 2);
    Uniform<float> r_sigma;
    int s_sigma = atoi(argv[1]);
    // s_sigma is used below as a reduction extent, a divisor and a modulus.
    // atoi() yields 0 for non-numeric text, which would bake a division by
    // zero into the pipeline, so reject anything that is not positive.
    if (s_sigma <= 0) {
        printf("Spatial sigma must be a positive integer, got \"%s\".\n", argv[1]);
        return 1;
    }
    Var x, y, z, c;

    // Add a boundary condition: coordinates are clamped into the input.
    Func clamped;
    clamped(x, y) = input(clamp(x, 0, input.width()-1),
                          clamp(y, 0, input.height()-1));

    // Construct the bilateral grid. Each grid cell reduces over an
    // s_sigma x s_sigma window of input samples. A sample lands in the z
    // slot given by its clamped value scaled by 1/r_sigma and rounded.
    // Channel c == 0 accumulates the sample value, any other c accumulates
    // 1.0f per sample.
    RDom r(0, s_sigma, 0, s_sigma);
    Expr val = clamped(x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2);
    val = clamp(val, 0.0f, 1.0f);
    Expr zi = cast<int>(val * (1.0f/r_sigma) + 0.5f);
    Func grid;
    grid(x, y, zi, c) += select(c == 0, val, 1.0f);

    // Blur the grid using a five-tap filter (weights 1 4 6 4 1) along x,
    // then y, then z.
    // NOTE(review): grid is defined with four arguments but referenced with
    // three here; presumably the library supplies the trailing argument
    // implicitly (the GPU schedule below asks for blurx.arg(3)) - confirm.
    Func blurx, blury, blurz;
    blurx(x, y, z) = grid(x-2, y, z) + grid(x-1, y, z)*4 + grid(x, y, z)*6 + grid(x+1, y, z)*4 + grid(x+2, y, z);
    blury(x, y, z) = blurx(x, y-2, z) + blurx(x, y-1, z)*4 + blurx(x, y, z)*6 + blurx(x, y+1, z)*4 + blurx(x, y+2, z);
    blurz(x, y, z) = blury(x, y, z-2) + blury(x, y, z-1)*4 + blury(x, y, z)*6 + blury(x, y, z+1)*4 + blury(x, y, z+2);

    // Take trilinear samples to compute the output. (xi, yi, zi) is the
    // integer grid cell for the pixel, (xf, yf, zf) the fractional position
    // inside that cell.
    val = clamp(clamped(x, y), 0.0f, 1.0f);
    Expr zv = val * (1.0f/r_sigma);
    zi = cast<int>(zv);
    Expr zf = zv - zi;
    Expr xf = cast<float>(x % s_sigma) / s_sigma;
    Expr yf = cast<float>(y % s_sigma) / s_sigma;
    Expr xi = x/s_sigma;
    Expr yi = y/s_sigma;
    Func interpolated;
    interpolated(x, y) =
        lerp(lerp(lerp(blurz(xi, yi, zi), blurz(xi+1, yi, zi), xf),
                  lerp(blurz(xi, yi+1, zi), blurz(xi+1, yi+1, zi), xf), yf),
             lerp(lerp(blurz(xi, yi, zi+1), blurz(xi+1, yi, zi+1), xf),
                  lerp(blurz(xi, yi+1, zi+1), blurz(xi+1, yi+1, zi+1), xf), yf), zf);

    // Normalize: divide channel 0 of the interpolated result by channel 1.
    Func smoothed;
    smoothed(x, y) = interpolated(x, y, 0)/interpolated(x, y, 1);

    #ifndef USE_GPU
    // Best schedule for CPU
    printf("Compiling for CPU\n");
    grid.root().parallel(z);
    grid.update().transpose(y, c).transpose(x, c).parallel(y);
    blurx.root().parallel(z).vectorize(x, 4);
    blury.root().parallel(z).vectorize(x, 4);
    blurz.root().parallel(z).vectorize(x, 4);
    smoothed.root().parallel(y).vectorize(x, 4);
    #else

    printf("Compiling for GPU");
    Var gridz = grid.arg(2);
    grid.transpose(y, gridz).transpose(x, gridz).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 16, 16);
    // NOTE(review): i and j are not declared anywhere in this function;
    // confirm they are defined elsewhere before building with USE_GPU.
    grid.update().transpose(y, c).transpose(x, c).transpose(i, c).transpose(j, c)
        .root().cudaTile(x, y, 16, 16);
    // c is re-pointed at each blur stage's own fourth argument before that
    // stage is scheduled.
    c = blurx.arg(3);
    blurx.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 8, 8);

    c = blury.arg(3);
    blury.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 8, 8);

    c = blurz.arg(3);
    blurz.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c)
        .root().cudaTile(x, y, 8, 8);

    smoothed.root().cudaTile(x, y, s_sigma, s_sigma);
    #endif

    smoothed.compileToFile("bilateral_grid", {r_sigma, input});

    // Compared to Sylvain Paris' implementation from his webpage (on
    // which this is based), for filter params s_sigma 0.1, on a 4 megapixel
    // input, on a four core x86 (2 socket core2 mac pro)
    // NOTE(review): "s_sigma 0.1" above is presumably r_sigma 0.1, since the
    // table below varies s_sigma - confirm.
    // Filter s_sigma: 2      4       8       16      32
    // Paris (ms):     5350   1345    472     245     184
    // Us (ms):        383    142     77      62      65
    // Speedup:        14     9.5     6.1     3.9     2.8

    // Our schedule and inlining are roughly the same as his, so the
    // gain is all down to vectorizing and parallelizing. In general
    // for larger blurs our win shrinks to roughly the number of
    // cores, as the stages we don't vectorize as well dominate (we
    // don't vectorize them well because they do gathers and scatters,
    // which don't work well on x86).  For smaller blurs, our win
    // grows, because the stages that we vectorize take up all the
    // time.

    return 0;
}
Ejemplo n.º 3
0
// Generator for the local Laplacian filter: defines the algorithm, applies
// the schedule selected on the command line, and writes the pipeline out
// through compileToFile("local_laplacian", ...).
//
// Command line:
//   argv[1]  schedule number. 0-14 are the CPU experiments and 100-103 the
//            GPU mappings described in the switch below. Any other number
//            leaves every stage except the remap lookup table without an
//            explicit schedule. The value is parsed with atoi(), so
//            non-numeric text selects schedule 0.
//
// `upsample` and `downsample` are not declared in this function; they come
// from elsewhere in the file.
//
// Returns 0 on success, 1 if the schedule argument is missing.
int main(int argc, char **argv) {

    // The schedule number is read from argv[1] further down; refuse to run
    // without it instead of handing atoi() a null pointer.
    if (argc < 2) {
        printf("Usage: <program> <schedule>\n");
        return 1;
    }

    /* THE ALGORITHM */

    // Number of pyramid levels. Kept const so that the Func arrays below are
    // ordinary fixed-size arrays rather than variable-length arrays, which
    // standard C++ does not provide.
    const int J = 8;

    // number of intensity levels
    Uniform<int> levels;
    // Parameters controlling the filter
    Uniform<float> alpha, beta;
    // Takes a 16-bit input
    UniformImage input(UInt(16), 3);

    // loop variables
    Var x, y, c, k;

    // Make the remapping function as a lookup table.
    Func remap;
    Expr fx = cast<float>(x) / 256.0f;
    remap(x) = alpha*fx*exp(-fx*fx/2.0f);

    // Convert to floating point
    Func floating;
    floating(x, y, c) = cast<float>(input(x, y, c)) / 65535.0f;

    // Set a boundary condition
    Func clamped;
    clamped(x, y, c) = floating(clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1), c);

    // Get the luminance channel
    Func gray;
    gray(x, y) = 0.299f * clamped(x, y, 0) + 0.587f * clamped(x, y, 1) + 0.114f * clamped(x, y, 2);

    // Make the processed Gaussian pyramid.
    Func gPyramid[J];
    // Do a lookup into a lut with 256 entries per intensity level
    Expr idx = gray(x, y)*cast<float>(levels-1)*256.0f;
    idx = clamp(cast<int>(idx), 0, (levels-1)*256);
    gPyramid[0](x, y, k) = beta*gray(x, y) + remap(idx - 256*k);
    //gPyramid[0](x, y, k) = remap(gray(x, y), cast<float>(k) / (levels-1), alpha, beta, levels-1);
    for (int j = 1; j < J; j++)
        gPyramid[j](x, y, k) = downsample(gPyramid[j-1])(x, y, k);

    // Get its laplacian pyramid. The top level is the Gaussian top level
    // itself; every other level is the Gaussian level minus the upsampled
    // next-coarser level.
    Func lPyramid[J];
    lPyramid[J-1] = gPyramid[J-1];
    for (int j = J-2; j >= 0; j--)
        lPyramid[j](x, y, k) = gPyramid[j](x, y, k) - upsample(gPyramid[j+1])(x, y, k);

    // Make the Gaussian pyramid of the input
    Func inGPyramid[J];
    inGPyramid[0] = gray;
    for (int j = 1; j < J; j++)
        inGPyramid[j](x, y) = downsample(inGPyramid[j-1])(x, y);

    // Make the laplacian pyramid of the output
    Func outLPyramid[J];
    for (int j = 0; j < J; j++) {
        // Split input pyramid value into integer and floating parts
        Expr level = inGPyramid[j](x, y) * cast<float>(levels-1);
        // Clamped to levels-2 so that li+1 below is still a valid level.
        Expr li = clamp(cast<int>(level), 0, levels-2);
        Expr lf = level - cast<float>(li);
        // Linearly interpolate between the nearest processed pyramid levels
        outLPyramid[j](x, y) = (1.0f - lf) * lPyramid[j](x, y, li) + lf * lPyramid[j](x, y, li+1);
    }

    // Make the Gaussian pyramid of the output
    Func outGPyramid[J];
    outGPyramid[J-1] = outLPyramid[J-1];
    for (int j = J-2; j >= 0; j--)
        outGPyramid[j](x, y) = upsample(outGPyramid[j+1])(x, y) + outLPyramid[j](x, y);

    // Reintroduce color
    Func color;
    color(x, y, c) = outGPyramid[0](x, y) * clamped(x, y, c) / gray(x, y);

    Func output;
    // Convert back to 16-bit
    output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f);



    /* THE SCHEDULE */


    // While normally we'd leave in just the best schedule per
    // architecture, here we kept track of everything we tried inside
    // a giant switch statement, to demonstrate how we go about
    // optimizing the schedule. The reference implementation on the
    // quad-core machine took 627 ms.

    // In any case, the remapping function should be a lut evaluated
    // ahead of time. It's so small relative to everything else that
    // its schedule really doesn't matter (provided we don't inline
    // it).
    remap.root();

    Var yi;

    // Variables to control mapping to GPU
    Var bx("blockidx"), by("blockidy"), tx("threadidx"), ty("threadidy");

    // Times are for a quad-core core2, a 32-core nehalem, and a 2-core omap4 cortex-a9
    //
    // In the cases that schedule the Laplacian levels explicitly, the loop
    // breaks at j == J-1 before touching lPyramid/outLPyramid, because the
    // top Laplacian levels were assigned from the Gaussian ones above.
    switch (atoi(argv[1])) {
    case 0:
        // As a baseline, breadth-first scalar: 1572, 1791, 9690
        output.root();
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root();
            gPyramid[j].root();
            outGPyramid[j].root();
            if (j == J-1) break;
            lPyramid[j].root();
            outLPyramid[j].root();
        }
        break;
    case 1:
        // parallelize each stage across outermost dimension: 769, 321, 5622
        output.split(y, y, yi, 32).parallel(y);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y);
            gPyramid[j].root().parallel(k);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y);
            if (j == J-1) break;
            lPyramid[j].root().parallel(k);
            outLPyramid[j].root().split(y, y, yi, 4).parallel(y);
        }
        break;
    case 2:
        // Same as above, but also vectorize across x: 855, 288, 7004
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            if (j == J-1) break;
            lPyramid[j].root().parallel(k).vectorize(x, 4);
            outLPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 3:
        // parallelize across yi instead of y: Bad idea - 1136, 889, 7144
        output.split(y, y, yi, 8).parallel(yi);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 8).parallel(yi);
            gPyramid[j].root().parallel(k);
            outGPyramid[j].root().split(y, y, yi, 8).parallel(yi);
            if (j == J-1) break;
            lPyramid[j].root().parallel(k);
            outLPyramid[j].root().split(y, y, yi, 8).parallel(yi);
        }
        break;
    case 4:
        // Parallelize, inlining all the laplacian pyramid levels
        // (they can be computed from the gaussian pyramids on the
        // fly): 491, 244, 4297
        output.split(y, y, yi, 32).parallel(y);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y);
            gPyramid[j].root().parallel(k);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y);
        }
        break;
    case 5:
        // Same as above with vectorization (now that we're doing more
        // math and less memory, maybe it will matter): 585, 204, 5389
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 6:
        // Also inline every other pyramid level: Bad idea - 2118, 562, 16873
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j+=2) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 7:
        // Take care of the boundary condition earlier to avoid costly
        // branching: 648, 242, 6037
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        clamped.root().split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 8:
        // Unroll by a factor of two to try and simplify the
        // upsampling math: not worth it - 583, 297, 5716
        // NOTE(review): output unrolls yi, while the split pyramid levels
        // below call unroll(y, 2) after parallel(y) - confirm whether yi
        // was intended. Left as is because the recorded timings belong to
        // this exact schedule.
        output.split(y, y, yi, 32).parallel(y).unroll(x, 2).unroll(yi, 2);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).unroll(x, 2).unroll(y, 2);
            gPyramid[j].root().parallel(k).unroll(x, 2).unroll(y, 2);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).unroll(x, 2).unroll(y, 2);
        }
        break;
    case 9:
        // Same as case 5 but parallelize across y as well as k, in
        // case k is too small to saturate the machine: 693, 239, 5774
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 10:
        // Really-fine-grain parallelism. Don't bother splitting
        // y. Should incur too much overhead to be good: 1083, 256, 5338
        output.parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).parallel(y).vectorize(x, 4);
            outGPyramid[j].root().parallel(y).vectorize(x, 4);
        }
        break;
    case 11:
        // Same as case 5, but don't vectorize above a certain pyramid
        // level to prevent boundaries expanding too much (computing
        // an 8x8 top pyramid level instead of e.g. 5x5 requires much
        // much more input). 602, 194, 4836
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().parallel(y);
            gPyramid[j].root().parallel(k);
            outGPyramid[j].root().parallel(y);
            if (j < 5) {
                inGPyramid[j].vectorize(x, 4);
                gPyramid[j].vectorize(x, 4);
                outGPyramid[j].vectorize(x, 4);
            }
        }
        break;
    case 12:
        // The bottom pyramid level is gigantic. I wonder if we can
        // just compute those values on demand. Otherwise same as 5:
        // 293, 170, 5490
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            if (j > 0) gPyramid[j].root().parallel(k).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 13:
        // Should we inline the bottom pyramid level of everything?: 1044, 570, 17273
        output.split(y, y, yi, 32).parallel(y).vectorize(x, 4);
        for (int j = 1; j < J; j++) {
            inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
            gPyramid[j].root().parallel(k).vectorize(x, 4);
            outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4);
        }
        break;
    case 14:
        // 4 and 11 were pretty good for ARM. Can we do better by inlining
        // the root pyramid level like in 12? 427, 228, 4233
        output.split(y, y, yi, 32).parallel(y);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root().parallel(y);
            if (j > 0) gPyramid[j].root().parallel(k);
            outGPyramid[j].root().parallel(y);
        }
        break;

    case 100:
        // output stage only on GPU
        output.root().split(y, by, ty, 32).split(x, bx, tx, 32)
            .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root();
            gPyramid[j].root();
            outGPyramid[j].root();
            if (j == J-1) break;
            lPyramid[j].root();
            outLPyramid[j].root();
        }
        break;
    case 101:
        // all root on GPU, tiny blocks to prevent accidental bounds explosion
        output.root().split(y, by, ty, 2).split(x, bx, tx, 2)
            .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            inGPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            gPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outGPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            if (j == J-1) break;
            lPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outLPyramid[j].root()
                .split(y, by, ty, 2).split(x, bx, tx, 2)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        }
        break;
    case 102:
        // all root on GPU
        output.root().split(y, by, ty, 32).split(x, bx, tx, 32)
            .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            // 32x32 blocks for levels 0-3, 2x2 blocks for the levels above.
            int blockw = 32, blockh = 32;
            if (j > 3) {
                blockw = 2;
                blockh = 2;
            }
            inGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            gPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            if (j == J-1) break;
            lPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outLPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        }
        break;
    case 103:

        // most root, but inline laplacian pyramid levels - 49ms on Tesla
        output.root().split(y, by, ty, 32).split(x, bx, tx, 32)
            .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        for (int j = 0; j < J; j++) {
            // 32x32 blocks for levels 0-3, 2x2 blocks for the levels above.
            int blockw = 32, blockh = 32;
            if (j > 3) {
                blockw = 2;
                blockh = 2;
            }
            inGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            gPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
            outGPyramid[j].root()
                .split(y, by, ty, blockh).split(x, bx, tx, blockw)
                .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx);
        }
        break;
    default:
        // Unknown schedule number: only remap has been scheduled.
        break;
    }

    output.compileToFile("local_laplacian", {levels, alpha, beta, input});


    return 0;
}