static void write_uint8_linear(struct thread *t, const struct sfid_render_cache_args *args, __m256i r, __m256i g, __m256i b, __m256i a) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; __m256i rgba; rgba = _mm256_slli_epi32(a, 8); rgba = _mm256_or_si256(rgba, b); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, g); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, r); #define SWIZZLE(x, y, z, w) \ ( ((x) << 0) | ((y) << 2) | ((z) << 4) | ((w) << 6) ) /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(rgba, 0)); _mm_maskstore_epi32(base + args->rt.stride, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(rgba, 1)); }
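/* For reference: the shift/or chain above packs each pixel's four 32-bit
 * channel values (assumed to already be 8-bit quantities) into one
 * little-endian RGBA dword, r in bits 0-7, g in 8-15, b in 16-23, a in 24-31.
 * A scalar sketch of the same packing (this helper is ours, not part of the
 * original code):
 */
#include <stdint.h>

static inline uint32_t pack_rgba8(uint32_t r, uint32_t g, uint32_t b, uint32_t a)
{
	return r | (g << 8) | (b << 16) | (a << 24);
}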
int32_t avx2_sumsignedbytes_variant2(int8_t* array, size_t size) { __m256i accumulator = _mm256_setzero_si256(); for (size_t i=0; i < size; i += 32) { const __m256i v = _mm256_loadu_si256((__m256i*)(array + i)); const __m256i v0 = _mm256_srai_epi32(v, 3*8); const __m256i v1 = _mm256_srai_epi32(_mm256_slli_epi32(v, 1*8), 3*8); const __m256i v2 = _mm256_srai_epi32(_mm256_slli_epi32(v, 2*8), 3*8); const __m256i v3 = _mm256_srai_epi32(_mm256_slli_epi32(v, 3*8), 3*8); accumulator = _mm256_add_epi32(accumulator, v0); accumulator = _mm256_add_epi32(accumulator, v1); accumulator = _mm256_add_epi32(accumulator, v2); accumulator = _mm256_add_epi32(accumulator, v3); } return int32_t(_mm256_extract_epi32(accumulator, 0)) + int32_t(_mm256_extract_epi32(accumulator, 1)) + int32_t(_mm256_extract_epi32(accumulator, 2)) + int32_t(_mm256_extract_epi32(accumulator, 3)) + int32_t(_mm256_extract_epi32(accumulator, 4)) + int32_t(_mm256_extract_epi32(accumulator, 5)) + int32_t(_mm256_extract_epi32(accumulator, 6)) + int32_t(_mm256_extract_epi32(accumulator, 7)); }
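/* A scalar reference for the routine above, useful as a sanity check. The
 * vector loop assumes size is a multiple of 32; the per-lane trick is a
 * _mm256_slli_epi32 by a multiple of 8 followed by _mm256_srai_epi32 by 24,
 * which sign-extends one byte of every 32-bit lane. (This reference is ours,
 * not from the original source.)
 */
#include <stddef.h>
#include <stdint.h>

int32_t scalar_sumsignedbytes(const int8_t* array, size_t size)
{
	int32_t sum = 0;
	for (size_t i = 0; i < size; i++)
		sum += array[i];
	return sum;
}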
static void sfid_render_cache_rt_write_simd8_unorm8_ymajor(struct thread *t, const struct sfid_render_cache_args *args) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; const int cpp = 4; struct reg *src = &t->grf[args->src]; const __m256 scale = _mm256_set1_ps(255.0f); const __m256 half = _mm256_set1_ps(0.5f); __m256i r, g, b, a; __m256i rgba; switch (args->rt.format) { case SF_R8G8B8A8_UNORM: r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); break; case SF_B8G8R8A8_UNORM: b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); break; default: stub("unorm8 ymajor format"); return; } rgba = _mm256_slli_epi32(a, 8); rgba = _mm256_or_si256(rgba, b); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, g); rgba = _mm256_slli_epi32(rgba, 8); rgba = _mm256_or_si256(rgba, r); /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ rgba = _mm256_permute4x64_epi64(rgba, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); void *base = ymajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp); _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(rgba, 0)); _mm_maskstore_epi32(base + 16, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(rgba, 1)); }
static void sfid_render_cache_rt_write_simd8_bgra_unorm8_xmajor(struct thread *t, const struct sfid_render_cache_args *args) { __m256i argb; const float scale = 255.0f; struct reg src[4]; memcpy(src, &t->grf[args->src], sizeof(src)); const int cpp = 4; const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; void *base = xmajor_offset(args->rt.pixels, x, y, args->rt.stride, cpp); if (gt.blend.enable) { /* Load unorm8 */ __m128i lo = _mm_load_si128(base); __m128i hi = _mm_load_si128(base + 512); __m256i dst_argb = _mm256_inserti128_si256(_mm256_castsi128_si256(lo), hi, 1); dst_argb = _mm256_permute4x64_epi64(dst_argb, SWIZZLE(0, 2, 1, 3)); blend_unorm8_argb(src, dst_argb); } gamma_correct(args->rt.format, src); const __m256i r = to_unorm(src[0].reg, scale); const __m256i g = to_unorm(src[1].reg, scale); const __m256i b = to_unorm(src[2].reg, scale); const __m256i a = to_unorm(src[3].reg, scale); argb = _mm256_slli_epi32(a, 8); argb = _mm256_or_si256(argb, r); argb = _mm256_slli_epi32(argb, 8); argb = _mm256_or_si256(argb, g); argb = _mm256_slli_epi32(argb, 8); argb = _mm256_or_si256(argb, b); /* Swizzle two middle pixel pairs so that dword 0-3 and 4-7 * form linear owords of pixels. */ argb = _mm256_permute4x64_epi64(argb, SWIZZLE(0, 2, 1, 3)); __m256i mask = _mm256_permute4x64_epi64(t->mask_q1, SWIZZLE(0, 2, 1, 3)); _mm_maskstore_epi32(base, _mm256_extractf128_si256(mask, 0), _mm256_extractf128_si256(argb, 0)); _mm_maskstore_epi32(base + 512, _mm256_extractf128_si256(mask, 1), _mm256_extractf128_si256(argb, 1)); }
static inline __m256i enc_reshuffle (__m256i in) { // Spread out 32-bit words over both halves of the input register: in = _mm256_permutevar8x32_epi32(in, _mm256_setr_epi32( 0, 1, 2, -1, 3, 4, 5, -1)); // Slice into 32-bit chunks and operate on all chunks in parallel. // All processing is done within the 32-bit chunk. First, shuffle: // before: [eeeeeeff|ccdddddd|bbbbcccc|aaaaaabb] // after: [00000000|aaaaaabb|bbbbcccc|ccdddddd] in = _mm256_shuffle_epi8(in, _mm256_set_epi8( -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2, -1, 9, 10, 11, -1, 6, 7, 8, -1, 3, 4, 5, -1, 0, 1, 2)); // cd = [00000000|00000000|0000cccc|ccdddddd] const __m256i cd = _mm256_and_si256(in, _mm256_set1_epi32(0x00000FFF)); // ab = [0000aaaa|aabbbbbb|00000000|00000000] const __m256i ab = _mm256_and_si256(_mm256_slli_epi32(in, 4), _mm256_set1_epi32(0x0FFF0000)); // merged = [0000aaaa|aabbbbbb|0000cccc|ccdddddd] const __m256i merged = _mm256_or_si256(ab, cd); // bd = [00000000|00bbbbbb|00000000|00dddddd] const __m256i bd = _mm256_and_si256(merged, _mm256_set1_epi32(0x003F003F)); // ac = [00aaaaaa|00000000|00cccccc|00000000] const __m256i ac = _mm256_and_si256(_mm256_slli_epi32(merged, 2), _mm256_set1_epi32(0x3F003F00)); // indices = [00aaaaaa|00bbbbbb|00cccccc|00dddddd] const __m256i indices = _mm256_or_si256(ac, bd); // return = [00dddddd|00cccccc|00bbbbbb|00aaaaaa] return _mm256_bswap_epi32(indices); }
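/* The snippet above relies on _mm256_bswap_epi32, which is not a standard
 * AVX2 intrinsic. A plausible definition (a sketch, assuming the helper simply
 * byte-swaps every 32-bit word) uses _mm256_shuffle_epi8, which permutes
 * bytes independently within each 128-bit lane:
 */
static inline __m256i _mm256_bswap_epi32(const __m256i in)
{
	return _mm256_shuffle_epi8(in, _mm256_setr_epi8(
		 3,  2,  1,  0,  7,  6,  5,  4, 11, 10,  9,  8, 15, 14, 13, 12,
		 3,  2,  1,  0,  7,  6,  5,  4, 11, 10,  9,  8, 15, 14, 13, 12));
}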
inline __m256i avx2_ringid_to_nsites_contained(const __m256i ringid) { // return 3*ringid*(ringid+1)+1; const __m256i one = _mm256_set1_epi32(1); __m256i nsites = _mm256_add_epi32(ringid, one); nsites = _mm256_mullo_epi32(ringid, nsites); nsites = _mm256_sub_epi32(_mm256_slli_epi32(nsites, 2), nsites); nsites = _mm256_add_epi32(nsites, one); return nsites; }
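/* Scalar equivalent, handy when checking the vector version: the
 * _mm256_slli_epi32 by 2 followed by the subtract computes 3*n as 4*n - n.
 * (This scalar helper is ours.)
 */
static inline int scalar_ringid_to_nsites_contained(int ringid)
{
	return 3 * ringid * (ringid + 1) + 1;
}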
__m256 mm256_exp_ps(__m256 x) { __m256 tmp = _mm256_setzero_ps(), fx; __m256i emm0; __m256 one = *(__m256*)m256_ps_1; x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi); x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF); fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5); /* how to perform a floorf with SSE: just below */ /* step 1 : cast to int */ emm0 = _mm256_cvttps_epi32(fx); /* step 2 : cast back to float */ tmp = _mm256_cvtepi32_ps(emm0); /* if greater, substract 1 */ __m256 mask = _mm256_cmp_ps( tmp, fx, _CMP_GT_OS ); mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1); __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2); x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); z = _mm256_mul_ps(x,x); __m256 y = *(__m256*)m256_ps_cephes_exp_p0; y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); /* build 2^n */ emm0 = _mm256_cvttps_epi32(fx); emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f); emm0 = _mm256_slli_epi32(emm0, 23); __m256 pow2n = _mm256_castsi256_ps(emm0); y = _mm256_mul_ps(y, pow2n); _mm256_zeroupper(); return y; }
v8sf exp256_ps(v8sf x) { v8sf tmp = _mm256_setzero_ps(), fx; v8si imm0; v8sf one = *(v8sf*)_ps256_1; x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi); x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF); fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5); /* how to perform a floorf with SSE: just below */ //imm0 = _mm256_cvttps_epi32(fx); //tmp = _mm256_cvtepi32_ps(imm0); tmp = _mm256_floor_ps(fx); /* if greater, substract 1 */ //v8sf mask = _mm256_cmpgt_ps(tmp, fx); v8sf mask = _mm256_cmp_ps(tmp, fx, _CMP_GT_OS); mask = _mm256_and_ps(mask, one); fx = _mm256_sub_ps(tmp, mask); tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1); v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2); x = _mm256_sub_ps(x, tmp); x = _mm256_sub_ps(x, z); z = _mm256_mul_ps(x,x); v8sf y = *(v8sf*)_ps256_cephes_exp_p0; y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4); y = _mm256_mul_ps(y, x); y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, x); y = _mm256_add_ps(y, one); /* build 2^n */ imm0 = _mm256_cvttps_epi32(fx); // another two AVX2 instructions imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f); imm0 = _mm256_slli_epi32(imm0, 23); v8sf pow2n = _mm256_castsi256_ps(imm0); y = _mm256_mul_ps(y, pow2n); return y; }
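/* A minimal sketch of how exp256_ps might be exercised against libm expf
 * (this driver is ours; it assumes the v8sf typedef and exp256_ps above are
 * in scope and that v8sf is __m256, as in the usual avx_mathfun setup):
 */
#include <math.h>
#include <stdio.h>

void check_exp256_ps(void)
{
	float in[8] = { -10.f, -1.f, -0.5f, 0.f, 0.5f, 1.f, 2.f, 10.f };
	float out[8];
	_mm256_storeu_ps(out, exp256_ps(_mm256_loadu_ps(in)));
	for (int i = 0; i < 8; i++)
		printf("exp(%g) = %g (libm: %g)\n", in[i], out[i], expf(in[i]));
}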
inline avx_m256_t newsin_ps(avx_m256_t x) { avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask); x = _mm256_and_ps(x, _ps_inv_sign_mask); avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI); avx_m256i_t emm2 = _mm256_cvttps_epi32(y); emm2 = _mm256_add_epi32(emm2, _pi32_1); emm2 = _mm256_and_si256(emm2, _pi32_inv1); y = _mm256_cvtepi32_ps(emm2); avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4); emm0 = _mm256_slli_epi32(emm0, 29); emm2 = _mm256_and_si256(emm2, _pi32_2); emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); avx_m256_t swap_sign_bit = _mm256_castsi256_ps(emm0); avx_m256_t poly_mask = _mm256_castsi256_ps(emm2); sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit); avx_m256_t temp = _ps_minus_cephes_DP123; temp = _mm256_mul_ps(y, temp); x = _mm256_add_ps(x, temp); avx_m256_t x2 = _mm256_mul_ps(x, x); avx_m256_t x3 = _mm256_mul_ps(x2, x); avx_m256_t x4 = _mm256_mul_ps(x2, x2); y = _ps_coscof_p0; avx_m256_t y2 = _ps_sincof_p0; y = _mm256_mul_ps(y, x2); y2 = _mm256_mul_ps(y2, x2); y = _mm256_add_ps(y, _ps_coscof_p1); y2 = _mm256_add_ps(y2, _ps_sincof_p1); y = _mm256_mul_ps(y, x2); y2 = _mm256_mul_ps(y2, x2); y = _mm256_add_ps(y, _ps_coscof_p2); y2 = _mm256_add_ps(y2, _ps_sincof_p2); y = _mm256_mul_ps(y, x4); y2 = _mm256_mul_ps(y2, x3); temp = _mm256_mul_ps(x2, _ps_0p5); temp = _mm256_sub_ps(temp, _ps_1); y = _mm256_sub_ps(y, temp); y2 = _mm256_add_ps(y2, x); y = _mm256_andnot_ps(poly_mask, y); y2 = _mm256_and_ps(poly_mask, y2); y = _mm256_add_ps(y, y2); y = _mm256_xor_ps(y, sign_bit); return y; } // newsin_ps()
static inline __m256i dec_reshuffle (__m256i in) { // Shuffle bytes to 32-bit bigendian: in = _mm256_bswap_epi32(in); // Mask in a single byte per shift: __m256i mask = _mm256_set1_epi32(0x3F000000); // Pack bytes together: __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6)); mask = _mm256_srli_epi32(mask, 8); out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8)); // Pack bytes together within 32-bit words, discarding words 3 and 7: out = _mm256_shuffle_epi8(out, _mm256_setr_epi8( 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1, 3, 2, 1, 7, 6, 5, 11, 10, 9, 15, 14, 13, -1, -1, -1, -1)); // Pack 32-bit words together, squashing empty words 3 and 7: return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32( 0, 1, 2, 4, 5, 6, -1, -1)); }
static FORCE_INLINE void FlowInterSimple_double_8px_AVX2( int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF, const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF, const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) { __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); /// maybe do it another way __m256i dstF = lookup_double_AVX2(VXFullF, VYFullF, prefF, w, dwords_ref_pitch, dwords_w); __m256i dstB = lookup_double_AVX2(VXFullB, VYFullB, prefB, w, dwords_ref_pitch, dwords_w); __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w])); __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w])); __m256i dstF_dstB = _mm256_add_epi32(dstF, dstB); dstF_dstB = _mm256_slli_epi32(dstF_dstB, 8); __m256i dst; if (sizeof(PixelType) == 1) { __m256i dstB_dstF = _mm256_sub_epi16(dstB, dstF); __m256i maskf_maskb = _mm256_sub_epi16(maskf, maskb); dst = _mm256_madd_epi16(dstB_dstF, maskf_maskb); } else { __m256i dstB_dstF = _mm256_sub_epi32(dstB, dstF); __m256i maskf_maskb = _mm256_sub_epi32(maskf, maskb); dst = _mm256_mullo_epi32(dstB_dstF, maskf_maskb); } dst = _mm256_add_epi32(dst, dstF_dstB); dst = _mm256_srai_epi32(dst, 9); dst = _mm256_packus_epi32(dst, dst); dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword __m128i dst128 = _mm256_castsi256_si128(dst); if (sizeof(PixelType) == 1) { dst128 = _mm_packus_epi16(dst128, dst128); _mm_storel_epi64((__m128i *)&pdst[w], dst128); } else { _mm_storeu_si128((__m128i *)&pdst[w], dst128); } }
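/* Our reading of the arithmetic above, per pixel, as a scalar expression
 * (shown for clarity only; the helper name is ours): the forward and
 * backward predictions are averaged with weights steered by the two masks.
 */
static inline int flow_inter_simple_double_scalar(int dstF, int dstB,
                                                  int maskF, int maskB)
{
	/* equals (dstF*(256 - maskF + maskB) + dstB*(256 + maskF - maskB)) >> 9 */
	return ((dstB - dstF) * (maskF - maskB) + ((dstF + dstB) << 8)) >> 9;
}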
static INLINE void quantize(const __m256i *qp, __m256i *c, const int16_t *iscan_ptr, int log_scale, tran_low_t *qcoeff, tran_low_t *dqcoeff, __m256i *eob) { const __m256i abs_coeff = _mm256_abs_epi32(*c); __m256i q = _mm256_add_epi32(abs_coeff, qp[0]); __m256i q_lo = _mm256_mul_epi32(q, qp[1]); __m256i q_hi = _mm256_srli_epi64(q, 32); const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32); q_hi = _mm256_mul_epi32(q_hi, qp_hi); q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale); q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale); q_hi = _mm256_slli_epi64(q_hi, 32); q = _mm256_or_si256(q_lo, q_hi); const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale); const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s); q = _mm256_andnot_si256(mask, q); __m256i dq = _mm256_mullo_epi32(q, qp[2]); dq = _mm256_srai_epi32(dq, log_scale); q = _mm256_sign_epi32(q, *c); dq = _mm256_sign_epi32(dq, *c); _mm256_storeu_si256((__m256i *)qcoeff, q); _mm256_storeu_si256((__m256i *)dqcoeff, dq); const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr); const __m128i zr = _mm_setzero_si128(); const __m128i lo = _mm_unpacklo_epi16(isc, zr); const __m128i hi = _mm_unpackhi_epi16(isc, zr); const __m256i iscan = _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1); const __m256i zero = _mm256_setzero_si256(); const __m256i zc = _mm256_cmpeq_epi32(dq, zero); const __m256i nz = _mm256_cmpeq_epi32(zc, zero); __m256i cur_eob = _mm256_sub_epi32(iscan, nz); cur_eob = _mm256_and_si256(cur_eob, nz); *eob = _mm256_max_epi32(cur_eob, *eob); }
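/* Scalar sketch of the even/odd _mm256_mul_epi32 pairing above: every 32-bit
 * lane ends up holding a full 32x32->64-bit product shifted right by
 * (16 - log_scale), which fits back into 32 bits for the coefficient ranges
 * involved. (Helper is ours, for illustration only.)
 */
#include <stdint.h>

static inline int32_t quantize_round_scalar(int32_t abs_coeff, int32_t round,
                                            int32_t quant, int log_scale)
{
	int64_t prod = (int64_t)(abs_coeff + round) * quant;
	return (int32_t)(prod >> (16 - log_scale));
}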
__m256 _inner_mm256_exp_ps1(__m256 arg) { arg = _mm256_mul_ps(arg, _mm256_set1_ps(1.4426950408889634073599246810018921374266459541529859f)); __m256i e = _mm256_add_epi32( _mm256_castps_si256(_mm256_cmp_ps(arg, _mm256_set1_ps(0.0f), _CMP_LT_OQ)), _mm256_cvttps_epi32(arg)); arg = _mm256_sub_ps(arg, _mm256_cvtepi32_ps(e)); __m256 intermediate_result; intermediate_result = _mm256_fmadd_ps(_mm256_set1_ps(0.0136779459179717f), arg, _mm256_set1_ps(0.0517692205767896f)); intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.241554388295527f)); intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.692998430056128f)); intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.999999804292074f)); arg = intermediate_result; __m256 res = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(e, _mm256_set1_epi32(127)), 23)); res = _mm256_mul_ps(res, arg); return res; }
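/* The final scaling above constructs 2^e directly in IEEE-754 bits: the
 * biased exponent (e + 127) is shifted into bits 23..30 with
 * _mm256_slli_epi32(..., 23) and the result is reinterpreted as float.
 * Scalar equivalent (ours, for illustration; valid for -126 <= e <= 127):
 */
#include <stdint.h>
#include <string.h>

static inline float make_pow2f(int32_t e)
{
	uint32_t bits = (uint32_t)(e + 127) << 23;
	float f;
	memcpy(&f, &bits, sizeof f);  /* bit-cast without aliasing issues */
	return f;
}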
__m256i test_mm256_slli_epi32(__m256i a) { // CHECK: @llvm.x86.avx2.pslli.d return _mm256_slli_epi32(a, 3); }
static void sfid_render_cache_rt_write_simd8_rgba_uint32_linear(struct thread *t, const struct sfid_render_cache_args *args) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; const struct reg *src = &t->grf[args->src]; __m128i *base0 = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; __m128i *base1 = (void *) base0 + args->rt.stride; __m256i rg0145 = _mm256_unpacklo_epi32(src[0].ireg, src[1].ireg); __m256i rg2367 = _mm256_unpackhi_epi32(src[0].ireg, src[1].ireg); __m256i ba0145 = _mm256_unpacklo_epi32(src[2].ireg, src[3].ireg); __m256i ba2367 = _mm256_unpackhi_epi32(src[2].ireg, src[3].ireg); __m256i rgba04 = _mm256_unpacklo_epi64(rg0145, ba0145); __m256i rgba15 = _mm256_unpackhi_epi64(rg0145, ba0145); __m256i rgba26 = _mm256_unpacklo_epi64(rg2367, ba2367); __m256i rgba37 = _mm256_unpackhi_epi64(rg2367, ba2367); struct reg mask = { .ireg = t->mask_q1 }; if (mask.d[0] < 0) base0[0] = _mm256_extractf128_si256(rgba04, 0); if (mask.d[1] < 0) base0[1] = _mm256_extractf128_si256(rgba15, 0); if (mask.d[2] < 0) base1[0] = _mm256_extractf128_si256(rgba26, 0); if (mask.d[3] < 0) base1[1] = _mm256_extractf128_si256(rgba37, 0); if (mask.d[4] < 0) base0[2] = _mm256_extractf128_si256(rgba04, 1); if (mask.d[5] < 0) base0[3] = _mm256_extractf128_si256(rgba15, 1); if (mask.d[6] < 0) base1[2] = _mm256_extractf128_si256(rgba26, 1); if (mask.d[7] < 0) base1[3] = _mm256_extractf128_si256(rgba37, 1); } static void write_uint16_linear(struct thread *t, const struct sfid_render_cache_args *args, __m256i r, __m256i g, __m256i b, __m256i a) { const int slice_y = args->rt.minimum_array_element * args->rt.qpitch; const int x = t->grf[1].uw[4]; const int y = t->grf[1].uw[5] + slice_y; __m256i rg, ba; rg = _mm256_slli_epi32(g, 16); rg = _mm256_or_si256(rg, r); ba = _mm256_slli_epi32(a, 16); ba = _mm256_or_si256(ba, b); __m256i p0 = _mm256_unpacklo_epi32(rg, ba); __m256i m0 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 0)); __m256i p1 = _mm256_unpackhi_epi32(rg, ba); __m256i m1 = _mm256_cvtepi32_epi64(_mm256_extractf128_si256(t->mask_q1, 1)); void *base = args->rt.pixels + x * args->rt.cpp + y * args->rt.stride; _mm_maskstore_epi64(base, _mm256_extractf128_si256(m0, 0), _mm256_extractf128_si256(p0, 0)); _mm_maskstore_epi64((base + 16), _mm256_extractf128_si256(m1, 0), _mm256_extractf128_si256(p0, 1)); _mm_maskstore_epi64((base + args->rt.stride), _mm256_extractf128_si256(m0, 1), _mm256_extractf128_si256(p1, 0)); _mm_maskstore_epi64((base + args->rt.stride + 16), _mm256_extractf128_si256(m1, 1), _mm256_extractf128_si256(p1, 1)); } static void sfid_render_cache_rt_write_simd8_rgba_unorm16_linear(struct thread *t, const struct sfid_render_cache_args *args) { __m256i r, g, b, a; const __m256 scale = _mm256_set1_ps(65535.0f); const __m256 half = _mm256_set1_ps(0.5f); struct reg *src = &t->grf[args->src]; r = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[0].reg, scale), half)); g = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[1].reg, scale), half)); b = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[2].reg, scale), half)); a = _mm256_cvtps_epi32(_mm256_add_ps(_mm256_mul_ps(src[3].reg, scale), half)); write_uint16_linear(t, args, r, g, b, a); }
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them.. it is almost as fast, and gives you a free cosine with your sine */ void sincos256_ps(v8sf x, v8sf *s, v8sf *c) { v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y; v8si imm0, imm2, imm4; #ifndef __AVX2__ v4si imm0_1, imm0_2; v4si imm2_1, imm2_2; v4si imm4_1, imm4_2; #endif sign_bit_sin = x; /* take the absolute value */ x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); #ifdef __AVX2__ /* store the integer part of y in imm2 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1); y = _mm256_cvtepi32_ps(imm2); imm4 = imm2; /* get the swap sign flag for the sine */ imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4); imm0 = _mm256_slli_epi32(imm0, 29); //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); /* get the polynom selection mask for the sine*/ imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2); imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0); //v8sf poly_mask = _mm256_castsi256_ps(imm2); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1); imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1); COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); y = _mm256_cvtepi32_ps(imm2); imm4_1 = imm2_1; imm4_2 = imm2_2; imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4); imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4); imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2); imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); #endif v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0); v8sf poly_mask = _mm256_castsi256_ps(imm2); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(v8sf*)_ps256_minus_cephes_DP1; xmm2 = *(v8sf*)_ps256_minus_cephes_DP2; xmm3 = *(v8sf*)_ps256_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); #ifdef __AVX2__ imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2); imm4 = _mm256_andnot_si128(imm4, *(v8si*)_pi32_256_4); imm4 = _mm256_slli_epi32(imm4, 29); #else imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2); imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2); imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4); imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4); imm4_1 = _mm_slli_epi32(imm4_1, 29); imm4_2 = _mm_slli_epi32(imm4_2, 29); COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4); #endif v8sf sign_bit_cos = _mm256_castsi256_ps(imm4); sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin); /* Evaluate the first polynom (0 <= x <= Pi/4) */ v8sf z = _mm256_mul_ps(x,x); y = *(v8sf*)_ps256_coscof_p0; y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, 
*(v8sf*)_ps256_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(v8sf*)_ps256_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ v8sf y2 = *(v8sf*)_ps256_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; v8sf ysin2 = _mm256_and_ps(xmm3, y2); v8sf ysin1 = _mm256_andnot_ps(xmm3, y); y2 = _mm256_sub_ps(y2,ysin2); y = _mm256_sub_ps(y, ysin1); xmm1 = _mm256_add_ps(ysin1,ysin2); xmm2 = _mm256_add_ps(y,y2); /* update the sign */ *s = _mm256_xor_ps(xmm1, sign_bit_sin); *c = _mm256_xor_ps(xmm2, sign_bit_cos); }
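/* Usage sketch for sincos256_ps (the wrapper and names are ours; it assumes
 * the v8sf typedef above): both results come back from the one call, which
 * is the point of the routine.
 */
void rotate8(const float *angles, float *out_x, float *out_y)
{
	v8sf s, c;
	sincos256_ps(_mm256_loadu_ps(angles), &s, &c);
	_mm256_storeu_ps(out_x, c);
	_mm256_storeu_ps(out_y, s);
}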
__m256i test_mm256_slli_epi32(__m256i a) { // CHECK-LABEL: test_mm256_slli_epi32 // CHECK: call <8 x i32> @llvm.x86.avx2.pslli.d(<8 x i32> %{{.*}}, i32 %{{.*}}) return _mm256_slli_epi32(a, 3); }
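/* The two tests above cover the immediate-count form of the shift. For
 * comparison, the other AVX2 32-bit left-shift flavors look like this
 * (illustrative companions written by us, not part of the original test
 * file): _mm256_sll_epi32 shifts every lane by a count held in an __m128i,
 * and _mm256_sllv_epi32 shifts each lane by its own count.
 */
__m256i test_mm256_sll_epi32(__m256i a, __m128i count) { return _mm256_sll_epi32(a, count); }
__m256i test_mm256_sllv_epi32(__m256i a, __m256i counts) { return _mm256_sllv_epi32(a, counts); }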
static FORCE_INLINE void FlowInterSimple_generic_8px_AVX2( int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF, const int16_t *VXFullB, const int16_t *VXFullF, const int16_t *VYFullB, const int16_t *VYFullF, const uint8_t *MaskB, const uint8_t *MaskF, int nPelLog, const __m256i &dwords_time256, const __m256i &dwords_256_time256, const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) { __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w); __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w); __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w])); __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w])); const __m256i dwords_255 = _mm256_set1_epi32(255); __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf); __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb); __m256i f, b; if (sizeof(PixelType) == 1) { __m256i dstF_dstB = _mm256_or_si256(dstF, _mm256_slli_epi32(dstB, 16)); maskf = _mm256_or_si256(_mm256_slli_epi32(maskf, 16), maskf_inv); maskb = _mm256_or_si256(maskb, _mm256_slli_epi32(maskb_inv, 16)); f = _mm256_madd_epi16(dstF_dstB, maskf); b = _mm256_madd_epi16(dstF_dstB, maskb); } else { __m256i dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv); __m256i dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv); __m256i dstB_maskf = _mm256_mullo_epi32(dstB, maskf); __m256i dstF_maskb = _mm256_mullo_epi32(dstF, maskb); f = _mm256_add_epi32(dstF_maskf_inv, dstB_maskf); b = _mm256_add_epi32(dstB_maskb_inv, dstF_maskb); } f = _mm256_add_epi32(f, dwords_255); b = _mm256_add_epi32(b, dwords_255); f = _mm256_srai_epi32(f, 8); b = _mm256_srai_epi32(b, 8); if (sizeof(PixelType) == 1) { f = _mm256_madd_epi16(f, dwords_256_time256); b = _mm256_madd_epi16(b, dwords_time256); } else { f = _mm256_mullo_epi32(f, dwords_256_time256); b = _mm256_mullo_epi32(b, dwords_time256); } __m256i dst = _mm256_add_epi32(f, b); dst = _mm256_srai_epi32(dst, 8); dst = _mm256_packus_epi32(dst, dst); dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword __m128i dst128 = _mm256_castsi256_si128(dst); if (sizeof(PixelType) == 1) { dst128 = _mm_packus_epi16(dst128, dst128); _mm_storel_epi64((__m128i *)&pdst[w], dst128); } else { _mm_storeu_si128((__m128i *)&pdst[w], dst128); } }
static void mshabal256_compress(mshabal256_context *sc, const unsigned char *buf0, const unsigned char *buf1, const unsigned char *buf2, const unsigned char *buf3, const unsigned char *buf4, const unsigned char *buf5, const unsigned char *buf6, const unsigned char *buf7, size_t num) { union { u32 words[64 * MSHABAL256_FACTOR]; __m256i data[16]; } u; size_t j; __m256i A[12], B[16], C[16]; __m256i one; for (j = 0; j < 12; j++) A[j] = _mm256_loadu_si256((__m256i *)sc->state + j); for (j = 0; j < 16; j++) { B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12); C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28); } one = _mm256_set1_epi32(C32(0xFFFFFFFF)); #define M(i) _mm256_load_si256(u.data + (i)) while (num-- > 0) { for (j = 0; j < 64 * MSHABAL256_FACTOR; j += 4 * MSHABAL256_FACTOR) { size_t o = j / MSHABAL256_FACTOR; u.words[j + 0] = *(u32 *)(buf0 + o); u.words[j + 1] = *(u32 *)(buf1 + o); u.words[j + 2] = *(u32 *)(buf2 + o); u.words[j + 3] = *(u32 *)(buf3 + o); u.words[j + 4] = *(u32 *)(buf4 + o); u.words[j + 5] = *(u32 *)(buf5 + o); u.words[j + 6] = *(u32 *)(buf6 + o); u.words[j + 7] = *(u32 *)(buf7 + o); } for (j = 0; j < 16; j++) B[j] = _mm256_add_epi32(B[j], M(j)); A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow)); A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh)); for (j = 0; j < 16; j++) B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17), _mm256_srli_epi32(B[j], 15)); #define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ __m256i tt; \ tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), \ _mm256_srli_epi32(xa1, 17)); \ tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \ tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \ tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \ tt = _mm256_xor_si256(\ _mm256_xor_si256(tt, xb1), \ _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \ xa0 = tt; \ tt = xb0; \ tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), \ _mm256_srli_epi32(tt, 31)); \ xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \ } while (0) PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0xA], A[0x9], B[0x6], 
B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]); A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]); A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]); A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]); A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]); A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]); A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]); A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]); A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]); A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]); A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]); A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]); A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]); A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]); A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]); A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]); A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]); A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]); A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]); A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]); A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]); A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]); A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]); A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]); A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]); A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]); A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]); A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]); A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]); A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]); A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]); A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]); A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]); A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]); A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]); A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]); #define SWAP_AND_SUB(xb, xc, xm) do { \ __m256i tmp; \ tmp = xb; \ xb = _mm256_sub_epi32(xc, xm); \ xc = tmp; \ } while (0) SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); SWAP_AND_SUB(B[0x3], 
C[0x3], M(0x3)); SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); buf0 += 64; buf1 += 64; buf2 += 64; buf3 += 64; buf4 += 64; buf5 += 64; buf6 += 64; buf7 += 64; if (++sc->Wlow == 0) sc->Whigh++; } for (j = 0; j < 12; j++) _mm256_storeu_si256((__m256i *)sc->state + j, A[j]); for (j = 0; j < 16; j++) { _mm256_storeu_si256((__m256i *)sc->state + j + 12, B[j]); _mm256_storeu_si256((__m256i *)sc->state + j + 28, C[j]); } #undef M }
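/* The _mm256_slli_epi32/_mm256_srli_epi32 pairs above (17/15, 15/17 and
 * 1/31) implement 32-bit rotations. Scalar form of the identity, for
 * reference (helper is ours; valid for 0 < n < 32):
 */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
	return (x << n) | (x >> (32 - n));
}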
//----------------------------------------------------------------------------------------- // Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer // If any of the rasterized AABB pixels passes the depth test exit early and mark the occludee // as visible. If all rasterized AABB pixels are occluded then the occludee is culled //----------------------------------------------------------------------------------------- bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx) { // Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster) // Denormal are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15. // so to enable the two to have to set bits 6 and 15 which 1000 0000 0100 0000 = 0x8040 _mm_setcsr( _mm_getcsr() | 0x8040 ); __m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3); __m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1); float* pDepthBuffer = (float*)pRenderTargetPixels; // Rasterize the AABB triangles 4 at a time for(UINT i = 0; i < AABB_TRIANGLES; i += SSE) { vFloat4 xformedPos[3]; Gather(xformedPos, i, pXformedPos, idx); // use fixed-point only for X and Y. Avoid work for Z and W. __m128i fxPtX[3], fxPtY[3]; for(int m = 0; m < 3; m++) { fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X); fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y); } // Fab(x, y) = Ax + By + C = 0 // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0 // Compute A = (ya - yb) for the 3 line segments that make up each triangle __m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]); __m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]); __m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]); // Compute B = (xb - xa) for the 3 line segments that make up each triangle __m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]); __m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]); __m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]); // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle __m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1])); __m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2])); __m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0])); // Compute triangle area __m128i triArea = _mm_mullo_epi32(B2, A1); triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2)); __m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea)); __m128 Z[3]; Z[0] = xformedPos[0].Z; Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea); Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea); // Use bounding box traversal strategy to determine which pixels to rasterize //__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3)); __m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1)); __m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1)); __m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1)); // Now we have 4 triangles set up. Rasterize them each individually. 
for(int lane=0; lane < SSE; lane++) { // Skip triangle if area is zero if(triArea.m128i_i32[lane] <= 0) { continue; } // Extract this triangle's properties from the SIMD versions __m256 zz[3]; for (int vv = 0; vv < 3; vv++) { zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]); } int startXx = startX.m128i_i32[lane]; int endXx = endX.m128i_i32[lane]; int startYy = startY.m128i_i32[lane]; int endYy = endY.m128i_i32[lane]; __m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]); __m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]); __m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]); __m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]); __m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]); __m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]); __m256i aa0Inc = _mm256_slli_epi32(aa0, 2); __m256i aa1Inc = _mm256_slli_epi32(aa1, 2); __m256i aa2Inc = _mm256_slli_epi32(aa2, 2); __m256i bb0Inc = _mm256_slli_epi32(bb0, 1); __m256i bb1Inc = _mm256_slli_epi32(bb1, 1); __m256i bb2Inc = _mm256_slli_epi32(bb2, 1); __m256i row, col; // Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X // This method provides better performance int rowIdx = (startYy * SCREENW + 2 * startXx); col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx)); __m256i aa0Col = _mm256_mullo_epi32(aa0, col); __m256i aa1Col = _mm256_mullo_epi32(aa1, col); __m256i aa2Col = _mm256_mullo_epi32(aa2, col); row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy)); __m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane])); __m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane])); __m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane])); __m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row); __m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row); __m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row); __m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]); zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2])); // Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY) for (int r = startYy; r < endYy; r += 2, rowIdx += 2 * SCREENW, sum0Row = _mm256_add_epi32(sum0Row, bb0Inc), sum1Row = _mm256_add_epi32(sum1Row, bb1Inc), sum2Row = _mm256_add_epi32(sum2Row, bb2Inc)) { // Compute barycentric coordinates int index = rowIdx; __m256i alpha = sum0Row; __m256i beta = sum1Row; __m256i gama = sum2Row; //Compute barycentric-interpolated depth __m256 depth = zz[0]; depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1])); depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2])); __m256i anyOut = _mm256_setzero_si256(); for (int c = startXx; c < endXx; c += 4, index += 8, alpha = _mm256_add_epi32(alpha, aa0Inc), beta = _mm256_add_epi32(beta, aa1Inc), gama = _mm256_add_epi32(gama, aa2Inc), depth = _mm256_add_ps(depth, zx)) { //Test Pixel inside triangle __m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama); __m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]); __m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D); __m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask)); anyOut = _mm256_or_si256(anyOut, finalMask); }//for each column if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000))) { return true; //early exit } }// for each row }// for each triangle }// for each set of SIMD# triangles return false; }
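/* The literal comparison predicate 0x1D used in the depth test above is the
 * encoding of _CMP_GE_OQ (ordered, non-signalling greater-or-equal), so the
 * same test can be written with the named constant (snippet is ours):
 */
__m256 depth_test_ge(__m256 depth, __m256 previousDepthValue)
{
	return _mm256_cmp_ps(depth, previousDepthValue, _CMP_GE_OQ);
}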
inline __m256i ShL(__m256i x, int n) { return _mm256_slli_epi32(x, n); }
__m256 mm256_cos_ps(__m256 x) { __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y; __m256i emm0, emm2; /* take the absolute value */ x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI); /* store the integer part of y in mm0 */ emm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1); emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1); y = _mm256_cvtepi32_ps(emm2); emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2); /* get the swap sign flag */ emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4); emm0 = _mm256_slli_epi32(emm0, 29); /* get the polynom selection mask */ emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2); emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256()); __m256 sign_bit = _mm256_castsi256_ps(emm0); __m256 poly_mask = _mm256_castsi256_ps(emm2); /* The magic pass: "Extended precision modular arithmetic" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(__m256*)m256_ps_minus_cephes_DP1; xmm2 = *(__m256*)m256_ps_minus_cephes_DP2; xmm3 = *(__m256*)m256_ps_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(__m256*)m256_ps_coscof_p0; __m256 z = _mm256_mul_ps(x,x); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(__m256*)m256_ps_1); /* Evaluate the second polynom (Pi/4 <= x <= 0) */ __m256 y2 = *(__m256*)m256_ps_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm256_and_ps(xmm3, y2); y = _mm256_andnot_ps(xmm3, y); y = _mm256_add_ps(y,y2); /* update the sign */ y = _mm256_xor_ps(y, sign_bit); _mm256_zeroupper(); return y; }
/** * \brief quantize transformed coefficents * */ void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width, int32_t height, int8_t type, int8_t scan_idx, int8_t block_type) { const encoder_control_t * const encoder = state->encoder_control; const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2; const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1]; int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6); const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2; const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]); const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6]; const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift; const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9); const int32_t q_bits8 = q_bits - 8; assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); //Assuming flat values to fit int16_t uint32_t ac_sum = 0; __m256i v_ac_sum = _mm256_setzero_si256(); __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]); for (int32_t n = 0; n < width * height; n += 16) { __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level); v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1)); v_level = _mm256_abs_epi16(v_level); __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits); v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits); v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); v_level = _mm256_sign_epi16(v_level, v_sign); _mm256_storeu_si256((__m256i*)&(q_coef[n]), v_level); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_a); v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_b); } __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1)); temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1))); temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0))); ac_sum += _mm_cvtsi128_si32(temp); if (!(encoder->sign_hiding && ac_sum >= 2)) return; int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2]; for (int32_t n = 0; n < width * height; n += 16) { __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n])); v_level = _mm256_abs_epi16(v_level); __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)); __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)); __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b); __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b); v_level32_a = _mm256_add_epi32(v_level32_a, 
_mm256_set1_epi32(add)); v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add)); v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits); v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits); v_level = _mm256_packs_epi32(v_level32_a, v_level32_b); __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n])); __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0)); __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0)); __m256i v_quant_coeff_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0)); v_coef_a = _mm256_madd_epi16(v_coef_a, v_quant_coeff_a); v_coef_b = _mm256_madd_epi16(v_coef_b, v_quant_coeff_b); v_coef_a = _mm256_sub_epi32(v_coef_a, _mm256_slli_epi32(_mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_b = _mm256_sub_epi32(v_coef_b, _mm256_slli_epi32(_mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)), q_bits) ); v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8); v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8); _mm_storeu_si128((__m128i*)&(delta_u[n+0*4]), _mm256_castsi256_si128(v_coef_a)); _mm_storeu_si128((__m128i*)&(delta_u[n+2*4]), _mm256_extracti128_si256(v_coef_a, 1)); _mm_storeu_si128((__m128i*)&(delta_u[n+1*4]), _mm256_castsi256_si128(v_coef_b)); _mm_storeu_si128((__m128i*)&(delta_u[n+3*4]), _mm256_extracti128_si256(v_coef_b, 1)); } if (ac_sum >= 2) { #define SCAN_SET_SIZE 16 #define LOG2_SCAN_SET_SIZE 4 int32_t n, last_cg = -1, abssum = 0, subset, subpos; for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) { int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1; subpos = subset << LOG2_SCAN_SET_SIZE; abssum = 0; // Find last coeff pos for (n = SCAN_SET_SIZE - 1; n >= 0; n--) { if (q_coef[scan[n + subpos]]) { last_nz_pos_in_cg = n; break; } } // First coeff pos for (n = 0; n <SCAN_SET_SIZE; n++) { if (q_coef[scan[n + subpos]]) { first_nz_pos_in_cg = n; break; } } // Sum all kvz_quant coeffs between first and last for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) { abssum += q_coef[scan[n + subpos]]; } if (last_nz_pos_in_cg >= 0 && last_cg == -1) { last_cg = 1; } if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) { int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1); if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff; int16_t final_change = 0, cur_change = 0; for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) { uint32_t blkPos = scan[n + subpos]; if (q_coef[blkPos] != 0) { if (delta_u[blkPos] > 0) { cur_cost = -delta_u[blkPos]; cur_change = 1; } else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) { cur_cost = 0x7fffffff; } else { cur_cost = delta_u[blkPos]; cur_change = -1; } } else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 0 : 1) != signbit) { cur_cost = 0x7fffffff; } else { cur_cost = -delta_u[blkPos]; cur_change = 1; } if (cur_cost < min_cost_inc) { min_cost_inc = cur_cost; final_change = cur_change; min_pos = blkPos; } } // CG loop if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) { final_change = -1; } if (coef[min_pos] >= 0) q_coef[min_pos] += final_change; else q_coef[min_pos] -= final_change; } // Hide } if (last_cg == 1) last_cg = 0; } #undef SCAN_SET_SIZE #undef LOG2_SCAN_SET_SIZE }
/* evaluation of 8 sines at onces using AVX intrisics The code is the exact rewriting of the cephes sinf function. Precision is excellent as long as x < 8192 (I did not bother to take into account the special handling they have for greater values -- it does not return garbage for arguments over 8192, though, but the extra precision is missing). Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the surprising but correct result. */ v8sf sin256_ps(v8sf x) { // any x v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y; v8si imm0, imm2; #ifndef __AVX2__ v4si imm0_1, imm0_2; v4si imm2_1, imm2_2; #endif sign_bit = x; /* take the absolute value */ x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask); /* extract the sign bit (upper one) */ sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask); /* scale by 4/Pi */ y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI); /* Here we start a series of integer operations, which are in the realm of AVX2. If we don't have AVX, let's perform them using SSE2 directives */ #ifdef __AVX2__ /* store the integer part of y in mm0 */ imm2 = _mm256_cvttps_epi32(y); /* j=(j+1) & (~1) (see the cephes sources) */ // another two AVX2 instruction imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1); imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_inv1); y = _mm256_cvtepi32_ps(imm2); /* get the swap sign flag */ imm0 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_4); imm0 = _mm256_slli_epi32(imm0, 29); /* get the polynom selection mask there is one polynom for 0 <= x <= Pi/4 and another one for Pi/4<x<=Pi/2 Both branches will be computed. */ imm2 = _mm256_and_si128(imm2, *(v8si*)_pi32_256_2); imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0); #else /* we use SSE2 routines to perform the integer ops */ COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2); imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1); imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1); COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2); y = _mm256_cvtepi32_ps(imm2); imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4); imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4); imm0_1 = _mm_slli_epi32(imm0_1, 29); imm0_2 = _mm_slli_epi32(imm0_2, 29); COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0); imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2); imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2); imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128()); imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128()); COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2); #endif v8sf swap_sign_bit = _mm256_castsi256_ps(imm0); v8sf poly_mask = _mm256_castsi256_ps(imm2); sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit); /* The magic pass: "******" x = ((x - y * DP1) - y * DP2) - y * DP3; */ xmm1 = *(v8sf*)_ps256_minus_cephes_DP1; xmm2 = *(v8sf*)_ps256_minus_cephes_DP2; xmm3 = *(v8sf*)_ps256_minus_cephes_DP3; xmm1 = _mm256_mul_ps(y, xmm1); xmm2 = _mm256_mul_ps(y, xmm2); xmm3 = _mm256_mul_ps(y, xmm3); x = _mm256_add_ps(x, xmm1); x = _mm256_add_ps(x, xmm2); x = _mm256_add_ps(x, xmm3); /* Evaluate the first polynom (0 <= x <= Pi/4) */ y = *(v8sf*)_ps256_coscof_p0; v8sf z = _mm256_mul_ps(x,x); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1); y = _mm256_mul_ps(y, z); y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2); y = _mm256_mul_ps(y, z); y = _mm256_mul_ps(y, z); v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5); y = _mm256_sub_ps(y, tmp); y = _mm256_add_ps(y, *(v8sf*)_ps256_1); /* Evaluate the second polynom (Pi/4 <= x 
<= 0) */ v8sf y2 = *(v8sf*)_ps256_sincof_p0; y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2); y2 = _mm256_mul_ps(y2, z); y2 = _mm256_mul_ps(y2, x); y2 = _mm256_add_ps(y2, x); /* select the correct result from the two polynoms */ xmm3 = poly_mask; y2 = _mm256_and_ps(xmm3, y2); y = _mm256_andnot_ps(xmm3, y); y = _mm256_add_ps(y,y2); /* update the sign */ y = _mm256_xor_ps(y, sign_bit); return y; }