Example 1
static void
sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    __m256i argb;
    const float scale = 255.0f;
    struct reg src[4];

    memcpy(src, &t->grf[args->src], sizeof(src));

    /* 32 bpp render target; the pixel block's x/y come from the thread
     * payload in g1, offset to the selected array slice. */
    const int cpp = 4;
    const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;
    const int x = t->grf[1].uw[4];
    const int y = t->grf[1].uw[5] + slice_y;
    void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

    if (gt.blend.enable) {
        /* Load the destination unorm8 pixels: one oword from each of the
         * two tile rows covered by the pixel block (rows within an X-major
         * tile are 512 bytes apart), then swizzle back into the subspan
         * order of src. */
        __m128i lo = _mm_load_si128(base);
        __m128i hi = _mm_load_si128(base + 512);
        __m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1);
        dst_argb = _mm256_permute4x64_epi64(dst_argb, SWIZZLE(0, 2, 1, 3));

        blend_unorm8_argb(src, dst_argb);
    }

    gamma_correct(args->rt.format, src);

    const __m256i r = to_unorm(src[0].reg, scale);
    const __m256i g = to_unorm(src[1].reg, scale);
    const __m256i b = to_unorm(src[2].reg, scale);
    const __m256i a = to_unorm(src[3].reg, scale);

    /* Pack each pixel as a << 24 | r << 16 | g << 8 | b, which is BGRA
     * byte order in memory (little endian). */
    argb = _mm256_slli_epi32(a, 8);
    argb = _mm256_or_si256(argb, r);
    argb = _mm256_slli_epi32(argb, 8);
    argb = _mm256_or_si256(argb, g);
    argb = _mm256_slli_epi32(argb, 8);
    argb = _mm256_or_si256(argb, b);

    /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
     * form linear owords of pixels. */
    argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3));
    __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

    /* Store one oword to each of the two tile rows, honoring the
     * per-pixel execution mask. */
    _mm_maskstore_epi32(base,
                        _mm256_extractf128_si256(mask, 0),
                        _mm256_extractf128_si256(argb, 0));
    _mm_maskstore_epi32(base + 512,
                        _mm256_extractf128_si256(mask, 1),
                        _mm256_extractf128_si256(argb, 1));
}
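
The xmajor_offset() helper is not shown in this example. As a rough sketch (an assumption about its behavior, not the actual code), it would map an (x, y) pixel coordinate into an X-major tiled surface, assuming the usual X-major tile geometry of 512 bytes by 8 rows (4 KB per tile); that layout is what the base + 512 offsets above rely on to step to the next row of the stored pixel block.

#include <stdint.h>

/* Hypothetical sketch of an xmajor_offset()-style lookup, assuming X-major
 * tiles of 8 rows x 512 bytes (4096 bytes per tile) and a stride given in
 * bytes per surface row. */
static void *
xmajor_offset_sketch(void *pixels, int x, int y, int stride, int cpp)
{
    uint8_t *base = (uint8_t *) pixels;

    /* Locate the tile containing the pixel. */
    const int tile_x = (x * cpp) / 512;
    const int tile_y = y / 8;
    const int tiles_per_row = stride / 512;
    uint8_t *tile = base + (tile_y * tiles_per_row + tile_x) * 4096;

    /* Byte offset within the tile: rows are 512 bytes apart. */
    const int intra_x = (x * cpp) & 511;
    const int intra_y = y & 7;

    return tile + intra_y * 512 + intra_x;
}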
Example 2
static void
sfid_render_cache_rt_write_simd8_rgba_unorm8_linear(struct thread *t,
        const struct sfid_render_cache_args *args)
{
    const float scale = 255.0f;
    const struct reg *src = &t->grf[args->src];

    /* Convert each float channel to an 8-bit unorm value. */
    const __m256i r = to_unorm(src[0].reg, scale);
    const __m256i g = to_unorm(src[1].reg, scale);
    const __m256i b = to_unorm(src[2].reg, scale);
    const __m256i a = to_unorm(src[3].reg, scale);

    write_uint8_linear(t, args, r, g, b, a);
}
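
Both examples convert the float channels with the same to_unorm() helper, whose body is not included here. A minimal sketch of what such a helper might look like, assuming it clamps to [0, 1] before scaling and converting to integers:

#include <immintrin.h>

/* Hypothetical sketch: clamp 8 float lanes to [0, 1], scale them (e.g. by
 * 255.0f for unorm8), and convert to 32-bit integers. */
static inline __m256i
to_unorm_sketch(__m256 v, float scale)
{
    const __m256 zero = _mm256_setzero_ps();
    const __m256 one = _mm256_set1_ps(1.0f);
    const __m256 clamped = _mm256_min_ps(_mm256_max_ps(v, zero), one);

    /* _mm256_cvtps_epi32 rounds to nearest under the default MXCSR mode. */
    return _mm256_cvtps_epi32(_mm256_mul_ps(clamped, _mm256_set1_ps(scale)));
}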
Example 3
static void encode_channel(uint8_t *base_addr, const channel_block &block, const surface<float> &surf,
                           const std::vector<plane> &planes, const std::pair<float, float> range, bool negative_line_stride) {
    block_iterator it(block.w, block.h, &surf);

    uint32_t n_blocks_in_line = (surf.width() / block.w);
    uint32_t n_block_lines = (surf.height() / block.h);

    // Precompute (line stride, base offset) per plane so that negative line
    // strides are supported: with a negative stride, line 0 starts at the
    // plane's last line.
    std::vector<std::pair<std::ptrdiff_t, std::ptrdiff_t>> line_offsets;
    for (const auto &plane : planes) {
        if (negative_line_stride) {
            std::ptrdiff_t line_stride = -static_cast<std::ptrdiff_t>(plane.line_stride);
            // Each line is still written left to right.
            line_offsets.emplace_back(line_stride, plane.size + line_stride);
        } else {
            line_offsets.emplace_back(plane.line_stride, 0);
        }
    }

    // We need to preprocess the sample array to support continuation samples.
    std::vector<sample> samples;
    for (std::size_t i = 0; i < block.samples.size();) {
        const xyuv::sample &sample = block.samples[i];
        if (!sample.has_continuation) {
            samples.push_back(sample);
            ++i;
        }
        else {
            // Create a descriptor block containing all the bits of the samples.
            xyuv::sample sample_descriptor;
            sample_descriptor.integer_bits = 0;
            sample_descriptor.fractional_bits = 0;
            sample_descriptor.has_continuation = true;
            samples.push_back(sample_descriptor);
            size_t descriptor_pos = samples.size() - 1;

            // Collect all parts of the split sample on a stack;
            // they must be re-emitted in reverse order to encode correctly.
            std::vector<const xyuv::sample*> bit_stack;
            do {
                // Update descriptor
                samples[descriptor_pos].integer_bits += block.samples[i].integer_bits;
                samples[descriptor_pos].fractional_bits += block.samples[i].fractional_bits;
                bit_stack.push_back(&(block.samples[i]));
            } while(block.samples[i++].has_continuation);

            for (auto rit = bit_stack.rbegin(); rit != bit_stack.rend(); ++rit) {
                samples.push_back(*(*rit));
                samples.back().has_continuation = true;
            }

            samples.back().has_continuation = false;
        }
    }

    // Finally iterate over the image
    for (uint32_t line = 0; line < n_block_lines; line++) {
        // Precompute interleaved lines
        uint32_t interleaved_line[3] = {
                get_line(line, static_cast<interleave_pattern>(0), n_block_lines),
                get_line(line, static_cast<interleave_pattern>(1), n_block_lines),
                get_line(line, static_cast<interleave_pattern>(2), n_block_lines),
        };
        for (uint32_t b = 0; b < n_blocks_in_line; b++) {
            for (std::size_t s = 0; s < samples.size(); ) {
                uint8_t integer_bits = samples[s].integer_bits;
                uint8_t fractional_bits = samples[s].fractional_bits;

                float value = *it.advance();
                unorm_t unorm = to_unorm(value, integer_bits, fractional_bits, range);

                // If we hit a continuation sample here, it is the total-bits
                // descriptor and should be skipped when actually storing the bits.
                if (samples[s].has_continuation) {
                    s++;
                }

                do {
                    const xyuv::sample &sample = samples[s];

                    uint8_t * ptr_to_line =
                            // Start with offset to frame
                            base_addr +
                            // Add offset to lowest byte in plane.
                            planes[sample.plane].base_offset +
                            // For a negative line stride, start from the plane's last line.
                            line_offsets[sample.plane].second +
                            // Add offset to current line.
                            interleaved_line[static_cast<uint32_t>(planes[sample.plane].interleave_mode)] * line_offsets[sample.plane].first;

                    // Write the bits, from the LSb of the fractional part to
                    // the MSb of the integer part.
                    write_bits(ptr_to_line,
                               b * planes[sample.plane].block_stride + sample.offset,
                               sample.integer_bits + sample.fractional_bits, unorm);

                } while (samples[s++].has_continuation);
            }
        }
    }
}
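
The write_bits() helper is the one piece of the bit-packing path not shown above; it is called with a line pointer, a bit offset within that line, a bit count, and the unorm value. A minimal, byte-at-a-time sketch of bit-granular writes with that call shape (an illustration only; the real helper must also decide which bit range of the value each part of a split sample stores, which this sketch does not model):

#include <cstdint>

// Assumption: unorm_t is an unsigned integer type wide enough for any sample.
using unorm_t = std::uint64_t;

// Hypothetical sketch: write the `bits` least significant bits of `value`
// into `line`, starting at bit position `offset`, LSb-first within each byte.
static void write_bits_sketch(std::uint8_t *line, std::uint64_t offset,
                              std::uint8_t bits, unorm_t value) {
    for (std::uint8_t i = 0; i < bits; ++i) {
        const std::uint64_t pos = offset + i;
        const std::uint8_t bit = static_cast<std::uint8_t>((value >> i) & 1u);
        const unsigned shift = static_cast<unsigned>(pos % 8);
        std::uint8_t &byte = line[pos / 8];
        byte = static_cast<std::uint8_t>((byte & ~(1u << shift)) | (bit << shift));
    }
}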