int main(int argc, char **argv) { // The camera pipe is specialized on the 2592x1968 images that // come in, so we'll just use an image instead of a uniform image. Image<int16_t> input(2592, 1968); UniformImage matrix_3200(Float(32), 2, "m3200"), matrix_7000(Float(32), 2, "m7000"); Uniform<float> color_temp("color_temp", 3200.0f); Uniform<float> gamma("gamma", 1.8f); Uniform<float> contrast("contrast", 10.0f); // shift things inwards to give us enough padding on the // boundaries so that we don't need to check bounds. We're going // to make a 2560x1920 output image, just like the FCam pipe, so // shift by 16, 12 Func shifted; shifted(x, y) = input(clamp(x+16, 0, input.width()-1), clamp(y+12, 0, input.height()-1)); // Parameterized output type, because LLVM PTX (GPU) backend does not // currently allow 8-bit computations int bit_width = atoi(argv[1]); Type result_type = UInt(bit_width); // Pick a schedule schedule = atoi(argv[2]); // Build the pipeline Func processed = process(shifted, result_type, matrix_3200, matrix_7000, color_temp, gamma, contrast); //string s = processed.serialize(); //printf("%s\n", s.c_str()); // In C++-11, this can be done as a simple initializer_list {color_temp,gamma,etc.} in place. Arg args[] = {color_temp, gamma, contrast, input, matrix_3200, matrix_7000}; processed.compileToFile("curved", std::vector<Arg>(args, args+6)); return 0; }
int main(int argc, char **argv) { if (argc < 2) { printf("Spatial sigma is a compile-time parameter, please provide it as an argument.\n" "(llvm's ptx backend doesn't handle integer mods by non-consts yet)\n"); return 0; } UniformImage input(Float(32), 2); Uniform<float> r_sigma; int s_sigma = atoi(argv[1]); Var x, y, z, c; // Add a boundary condition Func clamped; clamped(x, y) = input(clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1)); // Construct the bilateral grid RDom r(0, s_sigma, 0, s_sigma); Expr val = clamped(x * s_sigma + r.x - s_sigma/2, y * s_sigma + r.y - s_sigma/2); val = clamp(val, 0.0f, 1.0f); Expr zi = cast<int>(val * (1.0f/r_sigma) + 0.5f); Func grid; grid(x, y, zi, c) += select(c == 0, val, 1.0f); // Blur the grid using a five-tap filter Func blurx, blury, blurz; blurx(x, y, z) = grid(x-2, y, z) + grid(x-1, y, z)*4 + grid(x, y, z)*6 + grid(x+1, y, z)*4 + grid(x+2, y, z); blury(x, y, z) = blurx(x, y-2, z) + blurx(x, y-1, z)*4 + blurx(x, y, z)*6 + blurx(x, y+1, z)*4 + blurx(x, y+2, z); blurz(x, y, z) = blury(x, y, z-2) + blury(x, y, z-1)*4 + blury(x, y, z)*6 + blury(x, y, z+1)*4 + blury(x, y, z+2); // Take trilinear samples to compute the output val = clamp(clamped(x, y), 0.0f, 1.0f); Expr zv = val * (1.0f/r_sigma); zi = cast<int>(zv); Expr zf = zv - zi; Expr xf = cast<float>(x % s_sigma) / s_sigma; Expr yf = cast<float>(y % s_sigma) / s_sigma; Expr xi = x/s_sigma; Expr yi = y/s_sigma; Func interpolated; interpolated(x, y) = lerp(lerp(lerp(blurz(xi, yi, zi), blurz(xi+1, yi, zi), xf), lerp(blurz(xi, yi+1, zi), blurz(xi+1, yi+1, zi), xf), yf), lerp(lerp(blurz(xi, yi, zi+1), blurz(xi+1, yi, zi+1), xf), lerp(blurz(xi, yi+1, zi+1), blurz(xi+1, yi+1, zi+1), xf), yf), zf); // Normalize Func smoothed; smoothed(x, y) = interpolated(x, y, 0)/interpolated(x, y, 1); #ifndef USE_GPU // Best schedule for CPU printf("Compiling for CPU\n"); grid.root().parallel(z); grid.update().transpose(y, c).transpose(x, c).parallel(y); blurx.root().parallel(z).vectorize(x, 4); blury.root().parallel(z).vectorize(x, 4); blurz.root().parallel(z).vectorize(x, 4); smoothed.root().parallel(y).vectorize(x, 4); #else printf("Compiling for GPU"); Var gridz = grid.arg(2); grid.transpose(y, gridz).transpose(x, gridz).transpose(y, c).transpose(x, c) .root().cudaTile(x, y, 16, 16); grid.update().transpose(y, c).transpose(x, c).transpose(i, c).transpose(j, c) .root().cudaTile(x, y, 16, 16); c = blurx.arg(3); blurx.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c) .root().cudaTile(x, y, 8, 8); c = blury.arg(3); blury.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c) .root().cudaTile(x, y, 8, 8); c = blurz.arg(3); blurz.transpose(y, z).transpose(x, z).transpose(y, c).transpose(x, c) .root().cudaTile(x, y, 8, 8); smoothed.root().cudaTile(x, y, s_sigma, s_sigma); #endif smoothed.compileToFile("bilateral_grid", {r_sigma, input}); // Compared to Sylvain Paris' implementation from his webpage (on // which this is based), for filter params s_sigma 0.1, on a 4 megapixel // input, on a four core x86 (2 socket core2 mac pro) // Filter s_sigma: 2 4 8 16 32 // Paris (ms): 5350 1345 472 245 184 // Us (ms): 383 142 77 62 65 // Speedup: 14 9.5 6.1 3.9 2.8 // Our schedule and inlining are roughly the same as his, so the // gain is all down to vectorizing and parallelizing. In general // for larger blurs our win shrinks to roughly the number of // cores, as the stages we don't vectorize as well dominate (we // don't vectorize them well because they do gathers and scatters, // which don't work well on x86). For smaller blurs, our win // grows, because the stages that we vectorize take up all the // time. return 0; }
int main(int argc, char **argv) { /* THE ALGORITHM */ // Number of pyramid levels int J = 8; // number of intensity levels Uniform<int> levels; // Parameters controlling the filter Uniform<float> alpha, beta; // Takes a 16-bit input UniformImage input(UInt(16), 3); // loop variables Var x, y, c, k; // Make the remapping function as a lookup table. Func remap; Expr fx = cast<float>(x) / 256.0f; remap(x) = alpha*fx*exp(-fx*fx/2.0f); // Convert to floating point Func floating; floating(x, y, c) = cast<float>(input(x, y, c)) / 65535.0f; // Set a boundary condition Func clamped; clamped(x, y, c) = floating(clamp(x, 0, input.width()-1), clamp(y, 0, input.height()-1), c); // Get the luminance channel Func gray; gray(x, y) = 0.299f * clamped(x, y, 0) + 0.587f * clamped(x, y, 1) + 0.114f * clamped(x, y, 2); // Make the processed Gaussian pyramid. Func gPyramid[J]; // Do a lookup into a lut with 256 entires per intensity level Expr idx = gray(x, y)*cast<float>(levels-1)*256.0f; idx = clamp(cast<int>(idx), 0, (levels-1)*256); gPyramid[0](x, y, k) = beta*gray(x, y) + remap(idx - 256*k); //gPyramid[0](x, y, k) = remap(gray(x, y), cast<float>(k) / (levels-1), alpha, beta, levels-1); for (int j = 1; j < J; j++) gPyramid[j](x, y, k) = downsample(gPyramid[j-1])(x, y, k); // Get its laplacian pyramid Func lPyramid[J]; lPyramid[J-1] = gPyramid[J-1]; for (int j = J-2; j >= 0; j--) lPyramid[j](x, y, k) = gPyramid[j](x, y, k) - upsample(gPyramid[j+1])(x, y, k); // Make the Gaussian pyramid of the input Func inGPyramid[J]; inGPyramid[0] = gray; for (int j = 1; j < J; j++) inGPyramid[j](x, y) = downsample(inGPyramid[j-1])(x, y); // Make the laplacian pyramid of the output Func outLPyramid[J]; for (int j = 0; j < J; j++) { // Split input pyramid value into integer and floating parts Expr level = inGPyramid[j](x, y) * cast<float>(levels-1); Expr li = clamp(cast<int>(level), 0, levels-2); Expr lf = level - cast<float>(li); // Linearly interpolate between the nearest processed pyramid levels outLPyramid[j](x, y) = (1.0f - lf) * lPyramid[j](x, y, li) + lf * lPyramid[j](x, y, li+1); } // Make the Gaussian pyramid of the output Func outGPyramid[J]; outGPyramid[J-1] = outLPyramid[J-1]; for (int j = J-2; j >= 0; j--) outGPyramid[j](x, y) = upsample(outGPyramid[j+1])(x, y) + outLPyramid[j](x, y); // Reintroduce color Func color; color(x, y, c) = outGPyramid[0](x, y) * clamped(x, y, c) / gray(x, y); Func output; // Convert back to 16-bit output(x, y, c) = cast<uint16_t>(clamp(color(x, y, c), 0.0f, 1.0f) * 65535.0f); /* THE SCHEDULE */ // While normally we'd leave in just the best schedule per // architecture, here we kept track of everything we tried inside // a giant switch statement, to demonstrate how we go about // optimizing the schedule. The reference implementation on the // quad-core machine took 627 ms. // In any case, the remapping function should be a lut evaluated // ahead of time. It's so small relative to everything else that // its schedule really doesn't matter (provided we don't inline // it). remap.root(); Var yi; // Variables to control mapping to GPU Var bx("blockidx"), by("blockidy"), tx("threadidx"), ty("threadidy"); // Times are for a quad-core core2, a 32-core nehalem, and a 2-core omap4 cortex-a9 switch (atoi(argv[1])) { case 0: // As a baseline, breadth-first scalar: 1572, 1791, 9690 output.root(); for (int j = 0; j < J; j++) { inGPyramid[j].root(); gPyramid[j].root(); outGPyramid[j].root(); if (j == J-1) break; lPyramid[j].root(); outLPyramid[j].root(); } break; case 1: // parallelize each stage across outermost dimension: 769, 321, 5622 output.split(y, y, yi, 32).parallel(y); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y); gPyramid[j].root().parallel(k); outGPyramid[j].root().split(y, y, yi, 4).parallel(y); if (j == J-1) break; lPyramid[j].root().parallel(k); outLPyramid[j].root().split(y, y, yi, 4).parallel(y); } break; case 2: // Same as above, but also vectorize across x: 855, 288, 7004 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); if (j == J-1) break; lPyramid[j].root().parallel(k).vectorize(x, 4); outLPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 3: // parallelize across yi instead of y: Bad idea - 1136, 889, 7144 output.split(y, y, yi, 8).parallel(yi); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 8).parallel(yi); gPyramid[j].root().parallel(k); outGPyramid[j].root().split(y, y, yi, 8).parallel(yi); if (j == J-1) break; lPyramid[j].root().parallel(k); outLPyramid[j].root().split(y, y, yi, 8).parallel(yi); } break; case 4: // Parallelize, inlining all the laplacian pyramid levels // (they can be computed from the gaussian pyramids on the // fly): 491, 244, 4297 output.split(y, y, yi, 32).parallel(y); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y); gPyramid[j].root().parallel(k); outGPyramid[j].root().split(y, y, yi, 4).parallel(y); } break; case 5: // Same as above with vectorization (now that we're doing more // math and less memory, maybe it will matter): 585, 204, 5389 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 6: // Also inline every other pyramid level: Bad idea - 2118, 562, 16873 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j+=2) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 7: // Take care of the boundary condition earlier to avoid costly // branching: 648, 242, 6037 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); clamped.root().split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 8: // Unroll by a factor of two to try and simplify the // upsampling math: not worth it - 583, 297, 5716 output.split(y, y, yi, 32).parallel(y).unroll(x, 2).unroll(yi, 2); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).unroll(x, 2).unroll(y, 2); gPyramid[j].root().parallel(k).unroll(x, 2).unroll(y, 2); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).unroll(x, 2).unroll(y, 2); } break; case 9: // Same as case 5 but parallelize across y as well as k, in // case k is too small to saturate the machine: 693, 239, 5774 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).split(y, y, yi, 4).parallel(y).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 10: // Really-fine-grain parallelism. Don't both splitting // y. Should incur too much overhead to be good: 1083, 256, 5338 output.parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).parallel(y).vectorize(x, 4); outGPyramid[j].root().parallel(y).vectorize(x, 4); } break; case 11: // Same as case 5, but don't vectorize above a certain pyramid // level to prevent boundaries expanding too much (computing // an 8x8 top pyramid level instead of e.g. 5x5 requires much // much more input). 602, 194, 4836 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().parallel(y); gPyramid[j].root().parallel(k); outGPyramid[j].root().parallel(y); if (j < 5) { inGPyramid[j].vectorize(x, 4); gPyramid[j].vectorize(x, 4); outGPyramid[j].vectorize(x, 4); } } break; case 12: // The bottom pyramid level is gigantic. I wonder if we can // just compute those values on demand. Otherwise same as 5: // 293, 170, 5490 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 0; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); if (j > 0) gPyramid[j].root().parallel(k).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 13: // Should we inline the bottom pyramid level of everything?: 1044, 570, 17273 output.split(y, y, yi, 32).parallel(y).vectorize(x, 4); for (int j = 1; j < J; j++) { inGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); gPyramid[j].root().parallel(k).vectorize(x, 4); outGPyramid[j].root().split(y, y, yi, 4).parallel(y).vectorize(x, 4); } break; case 14: // 4 and 11 were pretty good for ARM. Can we do better by inlining // the root pyramid level like in 12? 427, 228, 4233 output.split(y, y, yi, 32).parallel(y); for (int j = 0; j < J; j++) { inGPyramid[j].root().parallel(y); if (j > 0) gPyramid[j].root().parallel(k); outGPyramid[j].root().parallel(y); } break; case 100: // output stage only on GPU output.root().split(y, by, ty, 32).split(x, bx, tx, 32) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); for (int j = 0; j < J; j++) { inGPyramid[j].root(); gPyramid[j].root(); outGPyramid[j].root(); if (j == J-1) break; lPyramid[j].root(); outLPyramid[j].root(); } break; case 101: // all root on GPU, tiny blocks to prevent accidental bounds explosion output.root().split(y, by, ty, 2).split(x, bx, tx, 2) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); for (int j = 0; j < J; j++) { inGPyramid[j].root() .split(y, by, ty, 2).split(x, bx, tx, 2) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); gPyramid[j].root() .split(y, by, ty, 2).split(x, bx, tx, 2) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); outGPyramid[j].root() .split(y, by, ty, 2).split(x, bx, tx, 2) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); if (j == J-1) break; lPyramid[j].root() .split(y, by, ty, 2).split(x, bx, tx, 2) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); outLPyramid[j].root() .split(y, by, ty, 2).split(x, bx, tx, 2) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); } break; case 102: // all root on GPU output.root().split(y, by, ty, 32).split(x, bx, tx, 32) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); for (int j = 0; j < J; j++) { int blockw = 32, blockh = 32; if (j > 3) { blockw = 2; blockh = 2; } inGPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); gPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); outGPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); if (j == J-1) break; lPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); outLPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); } break; case 103: // most root, but inline laplacian pyramid levels - 49ms on Tesla output.root().split(y, by, ty, 32).split(x, bx, tx, 32) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); for (int j = 0; j < J; j++) { int blockw = 32, blockh = 32; if (j > 3) { blockw = 2; blockh = 2; } inGPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); gPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); outGPyramid[j].root() .split(y, by, ty, blockh).split(x, bx, tx, blockw) .transpose(bx, ty).parallel(by).parallel(ty).parallel(bx).parallel(tx); } break; default: break; } output.compileToFile("local_laplacian", {levels, alpha, beta, input}); return 0; }