Code example #1
    void generate() {
        // The algorithm: a quantized (uint8) depthwise convolution.
        // Pipeline: pad the input -> shift by the padding offsets -> apply the
        // depth multiplier -> add the quantization offsets (upcast to 16-bit)
        // -> accumulate in 32-bit -> rescale, offset, saturate, and narrow
        // back to uint8.

        // Some free variables, where x and y represent the spatial dimensions.
        Var x("x"), y("y"), depth("depth"), batch("batch");

        // Pad x and y with the value that produces zero after the input offset is
        // added. The input offset is bounded to the range of a uint8, so this is
        // safe. Depth (dim 0) and batch (dim 3) are left unbounded ({Expr(), Expr()}).
        Func input_bounded =
            constant_exterior(input_, cast<uint8_t>(-input_offset_),
                              { { Expr(), Expr() },
                                { 0, input_.dim(1).extent() },
                                { 0, input_.dim(2).extent() },
                                { Expr(), Expr() } });

        // For the filter, add the offset and upcast to 16-bit.
        Func filter_with_offset("filter_with_offset");
        filter_with_offset(depth, x, y) =
            cast<int16_t>(filter_(depth, x, y)) + filter_offset_;

        // Shift the input spatially in [x, y] by -[pad_width, pad_height].
        Func shifted_input_with_offset("shifted_input_with_offset");
        shifted_input_with_offset(depth, x, y, batch) = input_bounded(
            depth, x - pad_width_, y - pad_height_, batch);

        // Apply the depth multiplier: every run of depth_multiplier_
        // consecutive output channels reads the same input channel.
        Func resampled_input("resampled_input");
        resampled_input(depth, x, y, batch) =
            shifted_input_with_offset(depth / depth_multiplier_, x, y, batch);

        // For the input, add the offset and upcast to 16-bit. This happens after
        // resampling so we don't need to store/load as much data in the inner loop
        // (at the cost of one add in the inner loop instead).
        Func resampled_input_with_offset("resampled_input_with_offset");
        resampled_input_with_offset(depth, x, y, batch) =
            cast<int16_t>(resampled_input(depth, x, y, batch)) + input_offset_;

        // Do the convolution in 32-bit, reducing over the filter's spatial
        // footprint. Apply the input stride. As before, the case stride == 1
        // is written separately for performance reasons (see the call to
        // SpecializeForFilterSizeAndInputStride at the end of this function).
        Func convolved("convolved");
        RDom filter_dom(0, filter_.dim(1).extent(), 0, filter_.dim(2).extent());
        convolved(depth, x, y, batch) +=
            (cast<int32_t>(filter_with_offset(depth, filter_dom.x, filter_dom.y)) *
             cast<int32_t>(
                 resampled_input_with_offset(depth, x * stride_ + filter_dom.x,
                                             y * stride_ + filter_dom.y, batch)));

        // Rescale the 32-bit accumulator with the fixed-point output
        // multiplier/shift (adding the bias first), then add the output offset.
        Func scaled_plus_offset("scaled_plus_offset");
        scaled_plus_offset(depth, x, y, batch) =
            multiply_quantized_multiplier(
                convolved(depth, x, y, batch) + bias_(depth), output_multiplier_,
                output_shift_) +
            output_offset_;

        // Saturate and narrow the output, then clamp to the activation range.
        output_(depth, x, y, batch) =
            clamp(u8_sat(scaled_plus_offset(depth, x, y, batch)),
                  output_min_, output_max_);

        // The schedule.
        // Vector width in uint8 lanes; HVX uses its fixed vector sizes.
        int vector_size_u8 = get_target().natural_vector_size<uint8_t>();
        if (get_target().has_feature(Target::HVX_64)) {
            vector_size_u8 = 64;
        } else if (get_target().has_feature(Target::HVX_128)) {
            vector_size_u8 = 128;
        }
        const bool use_hexagon =
            get_target().features_any_of({ Target::HVX_64, Target::HVX_128 });

        // Specifying .hexagon() on a Func will generate an RPC to run this stage
        // on Hexagon. If Hexagon is the host (that is, the architecture is
        // Hexagon), we have to omit the .hexagon() directive as we are already
        // running on Hexagon.
        if (use_hexagon && get_target().arch != Target::Hexagon) {
            output_.hexagon();
        }

        output_.compute_root();

        // We can't parallelize batches, as we often have just a single batch to
        // process. Also, x and y dimensions are often fairly small (8x8, 16x16).
        // For now, we parallelize along y, but may need to adapt when benchmarking
        // real models.
        Var yi("yi");
        // For small tensors, make sure the split factor is not larger than the
        // output y extent (the input y extent divided by the stride).
        Expr y_split_factor = min(input_.dim(2).extent() / stride_, 4);

        output_.split(y, y, yi, y_split_factor).parallel(y);
        output_.vectorize(depth, vector_size_u8, TailStrategy::RoundUp);

        if (use_hexagon) {
            // Scheduling specifics for Hexagon.

            if (depth_multiplier_ > 1) {
                ScheduleResampledInput(output_, depth, y, depth_multiplier_,
                                       vector_size_u8, &resampled_input);
            }
            output_.prefetch(input_, yi);
        } else {
            // Scheduling specifics for CPU.

            // Special care has to be taken when the input depth is a multiple of 3,
            // because Halide specializes for this case (i.e., RGB color channels), or
            // the following Halide deinterleave compiler error will be encountered:
            // Internal error at third_party/halide/halide/src/Deinterleave.cpp:356
            // Condition failed: e.type().lanes() % 3 == 0
            if (depth_multiplier_ == 3) {
                ScheduleResampledInput(output_, depth, yi, depth_multiplier_,
                                       vector_size_u8, &resampled_input);
            }
        }
        // Add specializations for common filter sizes and for stride == 1.
        SpecializeForFilterSizeAndInputStride(filter_dom, stride_, &filter_,
                                              &output_, &convolved);
    }
Code example #2
    void generate() {
        Var x{"x"}, y{"y"}, c{"c"};

        // We need a wrapper for the output so we can schedule the
        // multiply update in tiles.
        Func copy("copy");

        copy(x, y, c) = input(x, y, c);

        output(x, y, c) = copy(x, y, c) * 2;

        input.dim(0).set_stride(4);
        output.dim(0).set_stride(4);  

        Var tx("tx"), ty("ty");
        Var ta("ta"), tb("tb");

        // Break the output into tiles.
        const int tile_width = 128;
        const int tile_height = 32;

        switch ((Schedule)schedule) {
            case Schedule::Basic:
            default:
                output.compute_root()
                      .reorder(c, x, y)
                      .bound(c, 0, 4)
                      .tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);

                copy.compute_at(output, tx)
                    .store_at(output, tx)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .reorder_storage(c, x, y);
            break;
            case Schedule::Fold:
                output.compute_root()
                      .reorder(c, x, y)
                      .bound(c, 0, 4)
                      .tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);

                copy.compute_at(output, tx)
                    .store_at(output, tx)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .reorder_storage(c, x, y)
                    .fold_storage(x, tile_width * 2);
            break;
            case Schedule::Async:
                output.compute_root()
                      .reorder(c, x, y)
                      .bound(c, 0, 4)
                      .tile(x, y, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp);

                copy.compute_at(output, tx)
                    .store_at(output, tx)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .async()
                    .reorder_storage(c, x, y)
                    .fold_storage(x, tile_width * 2);
            break;
            case Schedule::Split: {
                Expr fac = output.dim(1).extent()/2;
                Var yo, yi;
                output.split(y, yo, yi, fac);
                output.compute_root()
                      .reorder(c, x, yo)
                      .bound(c, 0, 4)
                      .tile(x, yi, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp)
                      .parallel(yo);

                copy.compute_at(output, tx)
                    .store_at(output, ty)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .reorder_storage(c, x, y);
            }
            break;
            case Schedule::Split_Fold: {
                Expr fac = output.dim(1).extent()/2;
                Var yo, yi;
                output.split(y, yo, yi, fac);
                output.compute_root()
                      .reorder(c, x, yo)
                      .bound(c, 0, 4)
                      .tile(x, yi, tx, ty, ta, tb, tile_width, tile_height, TailStrategy::RoundUp)
                      .parallel(yo);

                copy.compute_at(output, tx)
                    .store_at(output, ty)
                    .bound(c, 0, 4)
                    .copy_to_host()
                    .async()
                    .reorder_storage(c, x, y)
                    .fold_storage(x, tile_width * 2);
            }
            break;
        }
    }