// Assembles the full pipeline from a raw input Func and applies a schedule.
//
// Stages, in order: hot_pixel_suppression -> deinterleave -> demosaic ->
// color_correct -> apply_curve. The returned Func `processed` reads the
// curved result at (tx, ty, c), with its colour dimension limited to
// three channels.
//
// Parameters:
//   raw                       - input Func, handed to hot_pixel_suppression
//   result_type               - forwarded to apply_curve
//   matrix_3200, matrix_7000,
//   color_temp                - forwarded to color_correct
//   gamma, contrast           - forwarded to apply_curve
//
// Relies on x, c, tx, ty and `schedule`, which are declared outside this
// function.
Func process(Func raw, Type result_type,
             UniformImage matrix_3200, UniformImage matrix_7000,
             Uniform<float> color_temp,
             Uniform<float> gamma, Uniform<float> contrast) {

    Func processed("processed");
    Var tile_x, tile_y;  // intra-tile coordinates produced by tile() below

    // Build the stage chain.
    Func denoised      = hot_pixel_suppression(raw);
    Func deinterleaved = deinterleave(denoised);
    Func demosaiced    = demosaic(deinterleaved);
    Func corrected     = color_correct(demosaiced, matrix_3200, matrix_7000, color_temp);
    Func curved        = apply_curve(corrected, result_type, gamma, contrast);

    // Schedule
    Var c_outer, c_inner;
//#define USE_CI_HACK
#ifndef USE_CI_HACK
    // Without the hack the "inner" colour variable is simply c itself.
    c_inner = c;
#endif

    processed(tx, ty, c) = curved(tx, ty, c_inner);

#ifdef USE_CI_HACK
    // Bound the colour loop to 0-3 by splitting it with a factor of 3.
    processed.split(c, c_outer, c_inner, 3);
#else
    // Bound the colour loop to 0-3, properly.
    processed.bound(c, 0, 3);
#endif

    // The three intermediate stages that receive an explicit schedule,
    // listed in the order they are scheduled.
    Func scheduled[] = { denoised, deinterleaved, corrected };
    const int n_scheduled = sizeof(scheduled) / sizeof(scheduled[0]);

    if (schedule == 0) {
        // Compute in chunks over tiles; denoised and deinterleaved are
        // vectorized by 8, corrected by 4.
        const int vector_width[] = { 8, 8, 4 };
        for (int i = 0; i < n_scheduled; i++) {
            scheduled[i].chunk(tx).vectorize(x, vector_width[i]);
        }
        processed.tile(tx, ty, tile_x, tile_y, 32, 32)
                 .reorder(tile_x, tile_y, c_inner, tx, ty);
        processed.parallel(ty);
    } else if (schedule == 1) {
        // Same structure with larger tiles, but no vectorization
        // (sse is bad at interleaved 16-bit ops).
        for (int i = 0; i < n_scheduled; i++) {
            scheduled[i].chunk(tx);
        }
        processed.tile(tx, ty, tile_x, tile_y, 128, 128)
                 .reorder(tile_x, tile_y, c_inner, tx, ty);
        processed.parallel(ty);
    } else {
        // Fallback: every scheduled stage, and the output, is a root.
        for (int i = 0; i < n_scheduled; i++) {
            scheduled[i].root();
        }
        processed.root();
    }

    return processed;
}
// Demosaics a four-channel deinterleaved input into a three-channel Func.
//
// Input:  deinterleaved(x, y, k), where k = 0..3 selects g_gr, r_r, b_b and
//         g_gb respectively.
// Output: output(x, y, c), where c == 0 is red, c == 1 is green and any other
//         c is blue.
//
// Naming convention: a_b is the value of channel a at a site whose native
// channel is b. "gr" means green sites in the red rows and "gb" means green
// sites in the blue rows.
//
// Relies on x, y, c, tx and `schedule`, which are declared outside this
// function, and on the helpers interleave_x / interleave_y.
Func demosaic(Func deinterleaved) {
    // The four channels that are already known from the input.
    Func r_r, g_gr, g_gb, b_b;
    g_gr(x, y) = deinterleaved(x, y, 0);
    r_r(x, y)  = deinterleaved(x, y, 1);
    b_b(x, y)  = deinterleaved(x, y, 2);
    g_gb(x, y) = deinterleaved(x, y, 3);

    // The eight channels that have to be interpolated.
    Func b_r, g_r, b_gr, r_gr, b_gb, r_gb, r_b, g_b;

    // ---- Green at red and blue sites --------------------------------------
    // Interpolate both vertically and horizontally, measure the absolute
    // difference along each direction, and keep the interpolation from the
    // direction with the smaller difference.
    Expr g_r_vert       = (g_gb(x, y-1) + g_gb(x, y))/2;
    Expr g_r_vert_diff  = abs(g_gb(x, y-1) - g_gb(x, y));
    Expr g_r_horiz      = (g_gr(x+1, y) + g_gr(x, y))/2;
    Expr g_r_horiz_diff = abs(g_gr(x+1, y) - g_gr(x, y));
    g_r(x, y) = select(g_r_horiz_diff < g_r_vert_diff, g_r_horiz, g_r_vert);

    Expr g_b_vert       = (g_gr(x, y+1) + g_gr(x, y))/2;
    Expr g_b_vert_diff  = abs(g_gr(x, y+1) - g_gr(x, y));
    Expr g_b_horiz      = (g_gb(x-1, y) + g_gb(x, y))/2;
    Expr g_b_horiz_diff = abs(g_gb(x-1, y) - g_gb(x, y));
    g_b(x, y) = select(g_b_horiz_diff < g_b_vert_diff, g_b_horiz, g_b_vert);

    // ---- Red and blue at green sites --------------------------------------
    // Interpolate the colour linearly, then add the error that green would
    // have had under the same interpolation (i.e. the second derivative of
    // the green channel at that site).
    Expr fix_r_gr = g_gr(x, y) - (g_r(x, y) + g_r(x-1, y))/2;
    r_gr(x, y) = fix_r_gr + (r_r(x-1, y) + r_r(x, y))/2;

    Expr fix_b_gr = g_gr(x, y) - (g_b(x, y) + g_b(x, y-1))/2;
    b_gr(x, y) = fix_b_gr + (b_b(x, y) + b_b(x, y-1))/2;

    Expr fix_r_gb = g_gb(x, y) - (g_r(x, y) + g_r(x, y+1))/2;
    r_gb(x, y) = fix_r_gb + (r_r(x, y) + r_r(x, y+1))/2;

    Expr fix_b_gb = g_gb(x, y) - (g_b(x, y) + g_b(x+1, y))/2;
    b_gb(x, y) = fix_b_gb + (b_b(x, y) + b_b(x+1, y))/2;

    // ---- Red at blue, blue at red (diagonal) ------------------------------
    // Combines both tricks: try the positive and the negative diagonal, apply
    // the green second-derivative correction to each candidate, and keep the
    // candidate whose diagonal has the lower absolute difference.
    Expr fix_r_b_pos = g_b(x, y) - (g_r(x, y) + g_r(x-1, y+1))/2;
    Expr r_b_pos      = fix_r_b_pos + (r_r(x, y) + r_r(x-1, y+1))/2;
    Expr r_b_pos_diff = abs(r_r(x, y) - r_r(x-1, y+1));

    Expr fix_r_b_neg = g_b(x, y) - (g_r(x-1, y) + g_r(x, y+1))/2;
    Expr r_b_neg      = fix_r_b_neg + (r_r(x-1, y) + r_r(x, y+1))/2;
    Expr r_b_neg_diff = abs(r_r(x-1, y) - r_r(x, y+1));

    r_b(x, y) = select(r_b_pos_diff < r_b_neg_diff, r_b_pos, r_b_neg);

    Expr fix_b_r_pos = g_r(x, y) - (g_b(x, y) + g_b(x+1, y-1))/2;
    Expr b_r_pos      = fix_b_r_pos + (b_b(x, y) + b_b(x+1, y-1))/2;
    Expr b_r_pos_diff = abs(b_b(x, y) - b_b(x+1, y-1));

    Expr fix_b_r_neg = g_r(x, y) - (g_b(x+1, y) + g_b(x, y-1))/2;
    Expr b_r_neg      = fix_b_r_neg + (b_b(x+1, y) + b_b(x, y-1))/2;
    Expr b_r_neg_diff = abs(b_b(x+1, y) - b_b(x, y-1));

    b_r(x, y) = select(b_r_pos_diff < b_r_neg_diff, b_r_pos, b_r_neg);

    // ---- Interleave the per-site results into full colour planes ----------
    Func r = interleave_y(interleave_x(r_gr, r_r),
                          interleave_x(r_b, r_gb));
    Func g = interleave_y(interleave_x(g_gr, g_r),
                          interleave_x(g_b, g_gb));
    Func b = interleave_y(interleave_x(b_gr, b_r),
                          interleave_x(b_b, b_gb));

    // Pack the three planes into one Func indexed by colour channel.
    Func output;
    Expr green_or_blue = select(c == 1, g(x, y), b(x, y));
    output(x, y, c) = select(c == 0, r(x, y), green_or_blue);

    /* THE SCHEDULE */

    // Interpolated stages and colour planes, in the order they are scheduled.
    Func interpolated[] = { g_r, g_b, r_gr, b_gr, r_gb, b_gb, r_b, b_r };
    Func planes[]       = { r, g, b };
    const int n_interpolated = sizeof(interpolated) / sizeof(interpolated[0]);
    const int n_planes       = sizeof(planes) / sizeof(planes[0]);

    if (schedule == 0) {
        // Optimized for ARM: compute in chunks over tiles, vectorized by 8.
        for (int i = 0; i < n_interpolated; i++) {
            interpolated[i].chunk(tx).vectorize(x, 8);
        }
        // The planes interleave in y, so unrolling them in y helps.
        for (int i = 0; i < n_planes; i++) {
            planes[i].chunk(tx).vectorize(x, 8).unroll(y, 2);
        }
    } else if (schedule == 1) {
        // Optimized for X86: don't vectorize, because sse is bad at 16-bit
        // interleaving.
        for (int i = 0; i < n_interpolated; i++) {
            interpolated[i].chunk(tx);
        }
        // The planes interleave in x and y, so unrolling in both helps.
        for (int i = 0; i < n_planes; i++) {
            planes[i].chunk(tx).unroll(x, 2).unroll(y, 2);
        }
    } else {
        // Basic naive schedule: everything is a root.
        for (int i = 0; i < n_interpolated; i++) {
            interpolated[i].root();
        }
        for (int i = 0; i < n_planes; i++) {
            planes[i].root();
        }
    }

    return output;
}