/* Render-target write message: SIMD8, BGRA unorm8, X-major tiled surface.
 *
 * Converts the four float color channels held in the thread's GRF to 8-bit
 * unorm, packs them into 32-bit pixels (memory byte order B, G, R, A) and
 * stores the eight pixels into the tiled render target under the execution
 * mask, optionally blending against the destination first.
 *
 * t:    executing thread; grf[args->src .. args->src+3] hold the R, G, B
 *       and A channels as 8 floats each.
 * args: render-cache message arguments describing the render target.
 */
static void
sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t,
						    const struct sfid_render_cache_args *args)
{
	__m256i argb;
	const float scale = 255.0f;
	struct reg src[4];

	/* Copy the four source channel registers out of the GRF so the
	 * blend below can modify them without touching thread state. */
	memcpy(src, &t->grf[args->src], sizeof(src));

	const int cpp = 4;	/* bytes per pixel: B, G, R, A */

	/* Offset to the selected array slice: slices are stacked
	 * qpitch rows apart in the surface. */
	const int slice_y = args->rt.minimum_array_element * args->rt.qpitch;

	/* Pixel origin of this SIMD8 group, taken from R1.
	 * NOTE(review): assumes uw[4]/uw[5] carry the X/Y coordinates per
	 * the pixel-shader payload layout -- confirm against the dispatch
	 * code that fills grf[1]. */
	const int x = t->grf[1].uw[4];
	const int y = t->grf[1].uw[5] + slice_y;

	void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp);

	if (gt.blend.enable) {
		/* Load the destination pixels (unorm8) for blending.  An
		 * X-major tile row is 512 bytes wide, so base + 512 is the
		 * oword directly below base within the tile. */
		__m128i lo = _mm_load_si128(base);
		__m128i hi = _mm_load_si128(base + 512);
		__m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo),
							   hi, 1);
		/* Swap the two middle pixel pairs to convert the linear
		 * oword layout to the SIMD8 lane order of src (the same
		 * swizzle used in reverse before the store below). */
		dst_argb = _mm256_permute4x64_epi64(dst_argb, SWIZZLE(0, 2, 1, 3));

		blend_unorm8_argb(src, dst_argb);
	}

	gamma_correct(args->rt.format, src);

	/* Convert each channel from float to 8-bit unorm. */
	const __m256i r = to_unorm(src[0].reg, scale);
	const __m256i g = to_unorm(src[1].reg, scale);
	const __m256i b = to_unorm(src[2].reg, scale);
	const __m256i a = to_unorm(src[3].reg, scale);

	/* Pack a/r/g/b into each dword as a<<24 | r<<16 | g<<8 | b, which
	 * little-endian memory sees as bytes B, G, R, A. */
	argb = _mm256_slli_epi32(a, 8);
	argb = _mm256_or_si256(argb, r);
	argb = _mm256_slli_epi32(argb, 8);
	argb = _mm256_or_si256(argb, g);
	argb = _mm256_slli_epi32(argb, 8);
	argb = _mm256_or_si256(argb, b);

	/* Swizzle two middle pixel pairs so that dword 0-3 and 4-7
	 * form linear owords of pixels. */
	argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3));
	/* Apply the same swizzle to the execution mask so each mask lane
	 * still covers its pixel after the reorder. */
	__m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3));

	/* Masked store of the two owords: one at base, one a tile row
	 * (512 bytes) below, matching the loads above. */
	_mm_maskstore_epi32(base,
			    _mm256_extractf128_si256(mask, 0),
			    _mm256_extractf128_si256(argb, 0));
	_mm_maskstore_epi32(base + 512,
			    _mm256_extractf128_si256(mask, 1),
			    _mm256_extractf128_si256(argb, 1));
}
/* Render-target write message: SIMD8, RGBA unorm8, linear (untiled) surface.
 *
 * Converts the four float color channels in the thread's GRF to 8-bit
 * unorm and hands them to the linear uint8 store helper.
 *
 * t:    executing thread; grf[args->src .. args->src+3] hold the R, G, B
 *       and A channels as 8 floats each.
 * args: render-cache message arguments describing the render target.
 */
static void
sfid_render_cache_rt_write_simd8_rgba_unorm8_linear(struct thread *t,
						    const struct sfid_render_cache_args *args)
{
	const struct reg *channels = &t->grf[args->src];
	__m256i unorm[4];
	int i;

	/* Scale each channel from [0.0, 1.0] floats to 8-bit unorm. */
	for (i = 0; i < 4; i++)
		unorm[i] = to_unorm(channels[i].reg, 255.0f);

	write_uint8_linear(t, args, unorm[0], unorm[1], unorm[2], unorm[3]);
}
// Encode one channel of a float surface into the packed pixel buffer at
// base_addr, block by block.
//
// base_addr:            start of the destination frame buffer.
// block:                per-channel block descriptor (block size and the
//                       list of bit-field samples to emit per block).
// surf:                 source float surface for this channel.
// planes:               destination plane descriptors (offset, strides,
//                       interleave mode) indexed by sample.plane.
// range:                (min, max) float range used by to_unorm conversion.
// negative_line_stride: if true, lines are written bottom-up (negative
//                       line stride, each line still left-to-right).
static void encode_channel(uint8_t *base_addr, const channel_block &block, const surface<float> &surf,
                           const std::vector<plane> &planes, const std::pair<float, float> range,
                           bool negative_line_stride) {
    block_iterator it(block.w, block.h, &surf);

    // NOTE(review): assumes surface dimensions are exact multiples of the
    // block size -- any remainder pixels would be silently dropped here.
    uint32_t n_blocks_in_line = (surf.width() / block.w);
    uint32_t n_block_lines = (surf.height() / block.h);

    // Per-plane (line_stride, base_adjustment) pairs.  For negative line
    // stride the stride is negated and the base is moved to the start of
    // the plane's last line (plane.size + negative stride); each line is
    // still written left to right.
    std::vector<std::pair<std::ptrdiff_t , std::ptrdiff_t >> line_offsets;
    for (const auto & plane : planes ) {
        if (negative_line_stride) {
            std::ptrdiff_t line_stride = -static_cast<std::ptrdiff_t>(plane.line_stride);
            // Each line is still left to right.
            line_offsets.emplace_back(line_stride, plane.size + line_stride);
        } else {
            line_offsets.emplace_back(plane.line_stride, 0);
        }
    }

    // Preprocess the sample array to support continuation samples: each
    // continuation chain is rewritten as one synthetic "descriptor" sample
    // holding the chain's total bit counts (has_continuation == true,
    // used only to size the unorm conversion), followed by the chain's
    // real samples in reverse order.
    std::vector<sample> samples;
    for (std::size_t i = 0; i < block.samples.size();) {
        const xyuv::sample &sample = block.samples[i];
        if (!sample.has_continuation) {
            // Plain sample: copy through unchanged.
            samples.push_back(sample);
            ++i;
        } else {
            // Create a descriptor block containing all the bits of the samples.
            xyuv::sample sample_descriptor;
            sample_descriptor.integer_bits = 0;
            sample_descriptor.fractional_bits = 0;
            sample_descriptor.has_continuation = true;
            samples.push_back(sample_descriptor);
            size_t descriptor_pos = samples.size() -1;

            // Collect the chain on a stack: the samples must be emitted in
            // reverse order to encode the value correctly.
            std::vector<const xyuv::sample*> bit_stack;
            do {
                // Accumulate the chain's total bit width into the descriptor.
                samples[descriptor_pos].integer_bits += block.samples[i].integer_bits;
                samples[descriptor_pos].fractional_bits += block.samples[i].fractional_bits;
                bit_stack.push_back(&(block.samples[i]));
            } while(block.samples[i++].has_continuation);

            // Re-emit the chain reversed; keep has_continuation set on all
            // but the last copy so the store loop below knows where the
            // chain ends.
            for (auto rit = bit_stack.rbegin(); rit != bit_stack.rend(); ++rit) {
                samples.push_back(*(*rit));
                samples.back().has_continuation = true;
            }
            samples.back().has_continuation = false;
        }
    }

    // Finally iterate over the image, one block line at a time.
    for (uint32_t line = 0; line < n_block_lines; line++) {
        // Precompute the physical line index for each interleave pattern.
        uint32_t interleaved_line[3] = {
                get_line(line, static_cast<interleave_pattern>(0), n_block_lines),
                get_line(line, static_cast<interleave_pattern>(1), n_block_lines),
                get_line(line, static_cast<interleave_pattern>(2), n_block_lines),
        };
        for (uint32_t b = 0; b < n_blocks_in_line; b++) {
            for (std::size_t s = 0; s < samples.size(); ) {
                // Convert the next source pixel using the total bit width
                // (for a chain, these come from the descriptor entry).
                uint8_t integer_bits = samples[s].integer_bits;
                uint8_t fractional_bits = samples[s].fractional_bits;

                float value = *it.advance();
                unorm_t unorm = to_unorm(value, integer_bits, fractional_bits, range);

                // If we hit a continuation sample here, it is the total-bits
                // descriptor: skip it for the purpose of actual storing.
                if (samples[s].has_continuation) {
                    s++;
                }

                // Store the value's bit fields; a chain spans several
                // samples, terminated by has_continuation == false.
                do {
                    const xyuv::sample &sample = samples[s];
                    uint8_t * ptr_to_line =
                            // Start with offset to frame
                            base_addr +
                            // Add offset to lowest byte in plane.
                            planes[sample.plane].base_offset +
                            // Add the size of the plane if applicable (negative stride).
                            line_offsets[sample.plane].second +
                            // Add offset to current line.
                            interleaved_line[static_cast<uint32_t>(planes[sample.plane].interleave_mode)]
                            * line_offsets[sample.plane].first;

                    // Write the bits from LSb (fractional) to MSb (integer).
                    write_bits( ptr_to_line,
                                b * planes[sample.plane].block_stride + sample.offset,
                                sample.integer_bits + sample.fractional_bits,
                                unorm);
                } while (samples[s++].has_continuation);
            }
        }
    }
}