void generate() {
    // The algorithm.

    // Some free variables, where x and y represent the spatial dimensions.
    Var x("x"), y("y"), depth("depth"), batch("batch");

    // Pad x and y with the value that produces zero after the input offset is
    // added. The input offset is bounded by the range of a uint8, so this is
    // safe.
    Func input_bounded = constant_exterior(
        input_, cast<uint8_t>(-input_offset_),
        {{Expr(), Expr()},
         {0, input_.dim(1).extent()},
         {0, input_.dim(2).extent()},
         {Expr(), Expr()}});

    // For the filter, add the offset and upcast to 16-bit.
    Func filter_with_offset("filter_with_offset");
    filter_with_offset(depth, x, y) =
        cast<int16_t>(filter_(depth, x, y)) + filter_offset_;

    // Shift the input spatially in [x, y] by -[pad_width, pad_height].
    Func shifted_input_with_offset("shifted_input_with_offset");
    shifted_input_with_offset(depth, x, y, batch) =
        input_bounded(depth, x - pad_width_, y - pad_height_, batch);

    // Apply the depth multiplier.
    Func resampled_input("resampled_input");
    resampled_input(depth, x, y, batch) =
        shifted_input_with_offset(depth / depth_multiplier_, x, y, batch);

    // For the input, add the offset and upcast to 16-bit. This happens after
    // resampling so we don't need to store/load as much data in the inner loop
    // (at the cost of an extra add in the inner loop).
    Func resampled_input_with_offset("resampled_input_with_offset");
    resampled_input_with_offset(depth, x, y, batch) =
        cast<int16_t>(resampled_input(depth, x, y, batch)) + input_offset_;

    // Do the convolution in 32-bit. Apply the input stride. As before, the
    // case stride == 1 is written separately for performance reasons.
    Func convolved("convolved");
    RDom filter_dom(0, filter_.dim(1).extent(), 0, filter_.dim(2).extent());
    convolved(depth, x, y, batch) +=
        cast<int32_t>(filter_with_offset(depth, filter_dom.x, filter_dom.y)) *
        cast<int32_t>(resampled_input_with_offset(depth,
                                                  x * stride_ + filter_dom.x,
                                                  y * stride_ + filter_dom.y,
                                                  batch));

    Func scaled_plus_offset("scaled_plus_offset");
    scaled_plus_offset(depth, x, y, batch) =
        multiply_quantized_multiplier(convolved(depth, x, y, batch) + bias_(depth),
                                      output_multiplier_, output_shift_) +
        output_offset_;

    // Saturate and narrow the output.
    output_(depth, x, y, batch) =
        clamp(u8_sat(scaled_plus_offset(depth, x, y, batch)),
              output_min_, output_max_);

    // The schedule.
    int vector_size_u8 = get_target().natural_vector_size<uint8_t>();
    if (get_target().has_feature(Target::HVX_64)) {
        vector_size_u8 = 64;
    } else if (get_target().has_feature(Target::HVX_128)) {
        vector_size_u8 = 128;
    }
    const bool use_hexagon =
        get_target().features_any_of({Target::HVX_64, Target::HVX_128});

    // Specifying .hexagon() on a Func will generate an RPC to run this stage
    // on Hexagon. If Hexagon is the host (that is, the architecture is
    // Hexagon), we have to omit the .hexagon() directive as we are already
    // running on Hexagon.
    if (use_hexagon && get_target().arch != Target::Hexagon) {
        output_.hexagon();
    }
    output_.compute_root();

    // We can't parallelize over batches, as we often have just a single batch
    // to process. Also, the x and y dimensions are often fairly small (8x8,
    // 16x16). For now, we parallelize along y, but may need to adapt when
    // benchmarking real models.
    Var yi("yi");
    // For small tensors, make sure the split factor is not larger than the
    // output y extent.
    Expr y_split_factor = min(input_.dim(2).extent() / stride_, 4);
    output_.split(y, y, yi, y_split_factor).parallel(y);
    output_.vectorize(depth, vector_size_u8, TailStrategy::RoundUp);

    if (use_hexagon) {
        // Scheduling specifics for Hexagon.
        if (depth_multiplier_ > 1) {
            ScheduleResampledInput(output_, depth, y, depth_multiplier_,
                                   vector_size_u8, &resampled_input);
        }
        output_.prefetch(input_, yi);
    } else {
        // Scheduling specifics for CPU.
        // Special care has to be taken when the input depth is a multiple of 3,
        // because Halide specializes for this case (i.e., RGB color channels);
        // otherwise the following Halide deinterleave compiler error is
        // encountered:
        //   Internal error at third_party/halide/halide/src/Deinterleave.cpp:356
        //   Condition failed: e.type().lanes() % 3 == 0
        if (depth_multiplier_ == 3) {
            ScheduleResampledInput(output_, depth, yi, depth_multiplier_,
                                   vector_size_u8, &resampled_input);
        }
    }

    SpecializeForFilterSizeAndInputStride(filter_dom, stride_, &filter_,
                                          &output_, &convolved);
}
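The pipeline above relies on a multiply_quantized_multiplier helper whose definition is not shown. As a point of reference only, here is a minimal sketch of what such a helper could look like if it follows the usual gemmlowp/TFLite fixed-point rescaling (a rounding-doubling high multiply followed by a rounding right shift); the parameter name right_shift and the omission of the INT32_MIN saturation corner case are assumptions, not details taken from the original code.

#include "Halide.h"

using namespace Halide;

// Hypothetical sketch of a gemmlowp-style requantization helper; the real
// helper used by the generator above may differ in its shift conventions and
// in how it handles the INT32_MIN saturation corner case (omitted here).
Expr multiply_quantized_multiplier(Expr x, Expr quantized_multiplier,
                                   Expr right_shift) {
    // Rounding-doubling high multiply: round(2 * x * m / 2^32), computed in
    // 64 bits to avoid overflow, then narrowed back to 32 bits.
    Expr product = cast<int64_t>(x) * cast<int64_t>(quantized_multiplier);
    Expr one = cast<int64_t>(1);
    Expr nudge = select(product >= 0, one << 30, one - (one << 30));
    Expr high = cast<int32_t>((product + nudge) >> 31);

    // Rounding arithmetic right shift by right_shift.
    Expr mask = (cast<int32_t>(1) << right_shift) - 1;
    Expr remainder = high & mask;
    Expr threshold = (mask >> 1) + select(high < 0, 1, 0);
    return (high >> right_shift) + select(remainder > threshold, 1, 0);
}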
void generate() {
    Var x{"x"}, y{"y"}, c{"c"};

    // We need a wrapper for the output so we can schedule the
    // multiply update in tiles.
    Func copy("copy");
    copy(x, y, c) = input(x, y, c);
    output(x, y, c) = copy(x, y, c) * 2;

    // Require the x stride to be 4, i.e. interleaved 4-channel data.
    input.dim(0).set_stride(4);
    output.dim(0).set_stride(4);

    Var tx("tx"), ty("ty");
    Var ta("ta"), tb("tb");

    // Break the output into tiles.
    const int tile_width = 128;
    const int tile_height = 32;

    switch ((Schedule)schedule) {
    case Schedule::Basic:
    default:
        // Plain tiled copy through the host, with no folding or async.
        output.compute_root()
            .reorder(c, x, y)
            .bound(c, 0, 4)
            .tile(x, y, tx, ty, ta, tb, tile_width, tile_height,
                  TailStrategy::RoundUp);

        copy.compute_at(output, tx)
            .store_at(output, tx)
            .bound(c, 0, 4)
            .copy_to_host()
            .reorder_storage(c, x, y);
        break;
    case Schedule::Fold:
        // As Basic, but circularly fold the copy's storage along x so only
        // two tiles' worth of it are live at a time.
        output.compute_root()
            .reorder(c, x, y)
            .bound(c, 0, 4)
            .tile(x, y, tx, ty, ta, tb, tile_width, tile_height,
                  TailStrategy::RoundUp);

        copy.compute_at(output, tx)
            .store_at(output, tx)
            .bound(c, 0, 4)
            .copy_to_host()
            .reorder_storage(c, x, y)
            .fold_storage(x, tile_width * 2);
        break;
    case Schedule::Async:
        // As Fold, and additionally run the copy stage asynchronously so it
        // can overlap with the consuming multiply.
        output.compute_root()
            .reorder(c, x, y)
            .bound(c, 0, 4)
            .tile(x, y, tx, ty, ta, tb, tile_width, tile_height,
                  TailStrategy::RoundUp);

        copy.compute_at(output, tx)
            .store_at(output, tx)
            .bound(c, 0, 4)
            .copy_to_host()
            .async()
            .reorder_storage(c, x, y)
            .fold_storage(x, tile_width * 2);
        break;
    case Schedule::Split: {
        // Split y into two halves and process them in parallel.
        Expr fac = output.dim(1).extent() / 2;
        Var yo, yi;
        output.split(y, yo, yi, fac);

        output.compute_root()
            .reorder(c, x, yo)
            .bound(c, 0, 4)
            .tile(x, yi, tx, ty, ta, tb, tile_width, tile_height,
                  TailStrategy::RoundUp)
            .parallel(yo);

        copy.compute_at(output, tx)
            .store_at(output, ty)
            .bound(c, 0, 4)
            .copy_to_host()
            .reorder_storage(c, x, y);
    } break;
    case Schedule::Split_Fold: {
        // Combine the parallel split with the async, folded copy.
        Expr fac = output.dim(1).extent() / 2;
        Var yo, yi;
        output.split(y, yo, yi, fac);

        output.compute_root()
            .reorder(c, x, yo)
            .bound(c, 0, 4)
            .tile(x, yi, tx, ty, ta, tb, tile_width, tile_height,
                  TailStrategy::RoundUp)
            .parallel(yo);

        copy.compute_at(output, tx)
            .store_at(output, ty)
            .bound(c, 0, 4)
            .copy_to_host()
            .async()
            .reorder_storage(c, x, y)
            .fold_storage(x, tile_width * 2);
    } break;
    }
}
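For context, a generate() body like the one above normally lives inside a Halide::Generator subclass and is registered for ahead-of-time compilation. The sketch below shows how such a wrapper could look; the class name, I/O element type, and parameter declarations are illustrative assumptions rather than the original declarations.

#include "Halide.h"

using namespace Halide;

// Hypothetical wrapper; names and types are assumptions chosen to match how
// the generate() body above uses input, output, and schedule.
class TiledCopy : public Generator<TiledCopy> {
public:
    enum class Schedule { Basic, Fold, Async, Split, Split_Fold };

    GeneratorParam<Schedule> schedule{"schedule",
                                      Schedule::Basic,
                                      {{"basic", Schedule::Basic},
                                       {"fold", Schedule::Fold},
                                       {"async", Schedule::Async},
                                       {"split", Schedule::Split},
                                       {"split_fold", Schedule::Split_Fold}}};

    Input<Buffer<uint8_t>> input{"input", 3};
    Output<Buffer<uint8_t>> output{"output", 3};

    void generate() {
        // ... body as in the listing above ...
    }
};

HALIDE_REGISTER_GENERATOR(TiledCopy, tiled_copy)

The registered generator would then be built with Halide's generator driver, selecting a variant on the command line with, e.g., schedule=async.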