inline void avx2_positive_hexid_to_ringid_segid_runid(
  const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // ringid = positive_hexid_to_ringid(hexid);
  // unsigned iring = hexid - ringid_to_nsites_contained(ringid-1);
  // segid = int(iring/ringid);
  // runid = iring - segid*ringid;
  const __m256i one = _mm256_set1_epi32(1);
  ringid = avx2_positive_hexid_to_ringid(hexid);
  runid = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));
  segid = _mm256_setzero_si256();

  const __m256i ringid_minus_one = _mm256_sub_epi32(ringid, one);

  __m256i mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));

  mask = _mm256_cmpgt_epi32(runid, ringid_minus_one);
  runid = _mm256_sub_epi32(runid, _mm256_and_si256(mask, ringid));
  segid = _mm256_add_epi32(segid, _mm256_and_si256(mask, one));
}
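For reference, this is the scalar decomposition that the unrolled vector code above implements; it follows the comments at the top of the function, with nsites = 3*r*(r+1)+1 inlined from avx2_ringid_to_nsites_contained. A minimal sketch, assuming a scalar positive_hexid_to_ringid() helper exists:

// Hypothetical scalar equivalent, for illustration only.
static inline void positive_hexid_to_ringid_segid_runid_scalar(
    unsigned hexid, unsigned& ringid, unsigned& segid, unsigned& runid)
{
  ringid = positive_hexid_to_ringid(hexid);              // assumed scalar helper
  unsigned iring = hexid - (3*(ringid-1)*ringid + 1);    // sites within ring-1
  segid = iring / ringid;                                // 0..5
  runid = iring - segid*ringid;                          // 0..ringid-1
}

The SIMD version avoids the division by subtracting ringid from the running index at most five times, bumping segid each time the index is still >= ringid.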
inline void avx2_hexid_to_uv_cw(const __m256i hexid, __m256i& u, __m256i& v)
{
#if 0 // This code is correct but it's not worth maintaining two versions
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  u = _mm256_add_epi32(u, irun);
  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
#else
  // hexid_to_uv_ccw(hexid, u, v);
  // u += v;
  // v = -v;
  avx2_hexid_to_uv_ccw(hexid, u, v);
  u = _mm256_add_epi32(u, v);
  v = _mm256_sign_epi32(v, _mm256_cmpeq_epi32(v, v));
#endif
}
static FORCE_INLINE void FlowInterSimple_double_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); /// maybe do it another way

    __m256i dstF = lookup_double_AVX2(VXFullF, VYFullF, prefF, w, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_double_AVX2(VXFullB, VYFullB, prefB, w, dwords_ref_pitch, dwords_w);

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    __m256i dstF_dstB = _mm256_add_epi32(dstF, dstB);
    dstF_dstB = _mm256_slli_epi32(dstF_dstB, 8);

    __m256i dst;

    if (sizeof(PixelType) == 1) {
        __m256i dstB_dstF = _mm256_sub_epi16(dstB, dstF);
        __m256i maskf_maskb = _mm256_sub_epi16(maskf, maskb);
        dst = _mm256_madd_epi16(dstB_dstF, maskf_maskb);
    } else {
        __m256i dstB_dstF = _mm256_sub_epi32(dstB, dstF);
        __m256i maskf_maskb = _mm256_sub_epi32(maskf, maskb);
        dst = _mm256_mullo_epi32(dstB_dstF, maskf_maskb);
    }

    dst = _mm256_add_epi32(dst, dstF_dstB);
    dst = _mm256_srai_epi32(dst, 9);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
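A scalar reading of the per-pixel arithmetic performed by the vector code above (this is my sketch of the expression it computes for the temporal-midpoint case, not the project's scalar path):

// dstF/dstB are the motion-compensated samples fetched by lookup_double_AVX2,
// maskF/maskB are 0..255 occlusion masks; the result is shifted back to pixel range.
static inline int flow_inter_simple_double_px(int dstF, int dstB, int maskF, int maskB)
{
    return ((dstF + dstB) * 256 + (dstB - dstF) * (maskF - maskB)) >> 9;
}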
inline __m256i avx2_ringid_to_nsites_contained(const __m256i ringid)
{
  // return 3*ringid*(ringid+1)+1;
  const __m256i one = _mm256_set1_epi32(1);
  __m256i nsites = _mm256_add_epi32(ringid, one);
  nsites = _mm256_mullo_epi32(ringid, nsites);
  nsites = _mm256_sub_epi32(_mm256_slli_epi32(nsites, 2), nsites);
  nsites = _mm256_add_epi32(nsites, one);
  return nsites;
}
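The commented formula is 3*ringid*(ringid+1)+1; the vector body forms t = r*(r+1) once and computes 3*t as (t << 2) - t to save a multiply. A scalar sketch of the same computation:

static inline int ringid_to_nsites_contained_scalar(int r)
{
  int t = r * (r + 1);
  return ((t << 2) - t) + 1;   // 3*r*(r+1) + 1
}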
inline __m256i avx2_positive_ringid_segid_runid_to_hexid(
  const __m256i ringid, const __m256i segid, const __m256i runid)
{
  // return ringid_to_nsites_contained(ringid-1)+segid*ringid+runid;
  const __m256i one = _mm256_set1_epi32(1);
  __m256i nsites = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));
  nsites = _mm256_add_epi32(nsites, _mm256_mullo_epi32(segid, ringid));
  nsites = _mm256_add_epi32(nsites, runid);
  return nsites;
}
inline avx_m256_t newcos_ps(avx_m256_t x) {
  x = _mm256_and_ps(x, _ps_inv_sign_mask);
  avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

  avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
  emm2 = _mm256_add_epi32(emm2, _pi32_1);
  emm2 = _mm256_and_si256(emm2, _pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, _pi32_2);

  avx_m256i_t emm0 = _mm256_andnot_si256(emm2, _pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);

  emm2 = _mm256_and_si256(emm2, _pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

  avx_m256_t sign_bit = _mm256_castsi256_ps(emm0);
  avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);

  avx_m256_t temp = _ps_minus_cephes_DP123;
  temp = _mm256_mul_ps(y, temp);
  x = _mm256_add_ps(x, temp);

  avx_m256_t x2 = _mm256_mul_ps(x, x);
  avx_m256_t x3 = _mm256_mul_ps(x2, x);
  avx_m256_t x4 = _mm256_mul_ps(x2, x2);

  y = _ps_coscof_p0;
  avx_m256_t y2 = _ps_sincof_p0;

  y = _mm256_mul_ps(y, x2);
  y2 = _mm256_mul_ps(y2, x2);
  y = _mm256_add_ps(y, _ps_coscof_p1);
  y2 = _mm256_add_ps(y2, _ps_sincof_p1);
  y = _mm256_mul_ps(y, x2);
  y2 = _mm256_mul_ps(y2, x2);
  y = _mm256_add_ps(y, _ps_coscof_p2);
  y2 = _mm256_add_ps(y2, _ps_sincof_p2);
  y = _mm256_mul_ps(y, x4);
  y2 = _mm256_mul_ps(y2, x3);

  temp = _mm256_mul_ps(x2, _ps_0p5);
  temp = _mm256_sub_ps(temp, _ps_1);
  y = _mm256_sub_ps(y, temp);
  y2 = _mm256_add_ps(y2, x);

  y = _mm256_andnot_ps(poly_mask, y);
  y2 = _mm256_and_ps(poly_mask, y2);
  y = _mm256_add_ps(y, y2);

  y = _mm256_xor_ps(y, sign_bit);

  return y;
} // newcos_ps()
void vpx_highbd_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                                    tran_low_t *coeff) {
  int idx;
  tran_low_t *t_coeff = coeff;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    vpx_highbd_hadamard_16x16_avx2(src_ptr, src_stride, t_coeff + idx * 256);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi32(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi32(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi32(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi32(coeff2, coeff3);

    b0 = _mm256_srai_epi32(b0, 2);
    b1 = _mm256_srai_epi32(b1, 2);
    b2 = _mm256_srai_epi32(b2, 2);
    b3 = _mm256_srai_epi32(b3, 2);

    coeff0 = _mm256_add_epi32(b0, b2);
    coeff1 = _mm256_add_epi32(b1, b3);
    coeff2 = _mm256_sub_epi32(b0, b2);
    coeff3 = _mm256_sub_epi32(b1, b3);

    _mm256_storeu_si256((__m256i *)coeff, coeff0);
    _mm256_storeu_si256((__m256i *)(coeff + 256), coeff1);
    _mm256_storeu_si256((__m256i *)(coeff + 512), coeff2);
    _mm256_storeu_si256((__m256i *)(coeff + 768), coeff3);

    coeff += 8;
    t_coeff += 8;
  }
}
EvalSum& operator -= (const EvalSum& rhs) {
#if defined USE_AVX2_EVAL
    mm = _mm256_sub_epi32(mm, rhs.mm);
#elif defined USE_SSE_EVAL
    m[0] = _mm_sub_epi32(m[0], rhs.m[0]);
    m[1] = _mm_sub_epi32(m[1], rhs.m[1]);
#else
    m_p[0][0] -= rhs.m_p[0][0];
    m_p[0][1] -= rhs.m_p[0][1];
    m_p[1][0] -= rhs.m_p[1][0];
    m_p[1][1] -= rhs.m_p[1][1];
    m_p[2][0] -= rhs.m_p[2][0];
    m_p[2][1] -= rhs.m_p[2][1];
#endif
    return *this;
}
__m256i branchfree_search8_avx(int* source, size_t n, __m256i target) {
    __m256i offsets = _mm256_setzero_si256();
    if(n == 0) return offsets;

    __m256i ha = _mm256_set1_epi32(n>>1);
    while(n > 1) {
        n -= n>>1;
        __m256i offsetsplushalf = _mm256_add_epi32(offsets, ha);
        ha = _mm256_sub_epi32(ha, _mm256_srli_epi32(ha, 1));
        __m256i keys = _mm256_i32gather_epi32(source, offsetsplushalf, 4);
        __m256i lt = _mm256_cmpgt_epi32(target, keys);
        offsets = _mm256_blendv_epi8(offsets, offsetsplushalf, lt);
    }
    __m256i lastkeys = _mm256_i32gather_epi32(source, offsets, 4);
    __m256i lastlt = _mm256_cmpgt_epi32(target, lastkeys);
    __m256i oneswhereneeded = _mm256_srli_epi32(lastlt, 31);
    __m256i answer = _mm256_add_epi32(offsets, oneswhereneeded);
    return answer;
}
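Hypothetical usage of the branch-free vectorized binary search above: eight independent keys are looked up in one sorted array, and the per-lane positions are read back with a store. The array contents and key values here are made up for the example, and it assumes the function above is compiled into the same translation unit.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
    int sorted[16] = {1, 3, 4, 7, 9, 12, 15, 20, 21, 25, 30, 31, 40, 41, 50, 60};
    __m256i targets = _mm256_setr_epi32(3, 8, 20, 22, 0, 60, 33, 100);
    __m256i pos = branchfree_search8_avx(sorted, 16, targets);

    int out[8];
    _mm256_storeu_si256((__m256i *)out, pos);
    for (int i = 0; i < 8; ++i)
        printf("lane %d -> index %d\n", i, out[i]);
    return 0;
}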
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);

  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);
  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);

  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan =
      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

  const __m256i zero = _mm256_setzero_si256();
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
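A scalar reading of the per-coefficient math above, one lane at a time. The roles of qp[0] (rounding offset), qp[1] (quantizer multiplier) and qp[2] (dequant step) are inferred from how the vector code uses them; the sketch ignores the even/odd 64-bit multiply split, which only exists to get a full 32x32-bit product.

static inline void quantize_one(int32_t c, int32_t round, int32_t quant,
                                int32_t dequant, int log_scale,
                                int32_t *qcoeff, int32_t *dqcoeff) {
  int64_t abs_c = c < 0 ? -(int64_t)c : c;
  int64_t q = ((abs_c + round) * (int64_t)quant) >> (16 - log_scale);
  if (dequant > (abs_c << (1 + log_scale)))  // coefficient too small: quantize to 0
    q = 0;
  int64_t dq = (q * dequant) >> log_scale;
  *qcoeff = (int32_t)(c < 0 ? -q : q);
  *dqcoeff = (int32_t)(c < 0 ? -dq : dq);
}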
void calculate_fma_float (unsigned char * out, double X0, double Y0, double scale, unsigned YSTART, unsigned SX, unsigned SY)
{
    __m256 dd = _mm256_set1_ps ((float) scale);
    __m256 XX0 = _mm256_set1_ps ((float) X0);

    for (unsigned j = YSTART; j < SY; j++) {
        __m256 y0 = _mm256_set1_ps (j*(float) scale + (float) Y0);
        for (unsigned i = 0; i < SX; i += 8) {

            __m256i ind = _mm256_setr_epi32 (i, i + 1, i + 2, i + 3, i + 4, i + 5, i + 6, i + 7);
            __m256 x0 = _mm256_fmadd_ps (dd, _mm256_cvtepi32_ps (ind), XX0);
            __m256 x = x0;
            __m256 y = y0;
            __m256i counts = _mm256_setzero_si256 ();
            __m256i cmp_mask = _mm256_set1_epi32 (0xFFFFFFFFu);

            for (unsigned n = 0; n < 255; n++) {
                __m256 x2 = _mm256_mul_ps (x, x);
                __m256 y2 = _mm256_mul_ps (y, y);
                __m256 abs = _mm256_add_ps (x2, y2);
                __m256i cmp = _mm256_castps_si256 (_mm256_cmp_ps (abs, _mm256_set1_ps (4), 1));
                cmp_mask = _mm256_and_si256 (cmp_mask, cmp);
                if (_mm256_testz_si256 (cmp_mask, cmp_mask)) {
                    break;
                }
                counts = _mm256_sub_epi32 (counts, cmp_mask);
                __m256 t = _mm256_add_ps (x, x);
                y = _mm256_fmadd_ps (t, y, y0);
                x = _mm256_add_ps (_mm256_sub_ps (x2, y2), x0);
            }
            __m256i result = _mm256_shuffle_epi8 (counts,
                _mm256_setr_epi8 (0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12,
                                  0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12));
            __m128i result128 = _128i_shuffle (_mm256_extractf128_si256 (result, 0),
                                               _mm256_extractf128_si256 (result, 1), 0, 0, 0, 0);
            result128 = _mm_shuffle_epi32 (result128, combine_4_2bits (0, 2, 0, 2));
            _mm_storel_epi64 ((__m128i*) out, result128);
            out += 8;
        }
    }
}
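The inner loop above is the classic Mandelbrot escape-time iteration, eight pixels per vector, with lanes dropping out via cmp_mask once |z|^2 >= 4. A scalar sketch of what one lane computes for a pixel at (cx, cy), with the same cap of 255 iterations:

static inline unsigned char mandel_point(float cx, float cy)
{
    float x = cx, y = cy;          // the vector code also starts from z = c
    unsigned char count = 0;
    for (unsigned n = 0; n < 255; n++) {
        float x2 = x * x, y2 = y * y;
        if (x2 + y2 >= 4.0f)       // escaped the radius-2 circle
            break;
        y = 2.0f * x * y + cy;
        x = x2 - y2 + cx;
        count++;
    }
    return count;
}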
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;

  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  _mm256_zeroupper();
  return y;
}
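A scalar sketch of the same Cephes-style scheme: fold |x| into [-pi/4, pi/4] by subtracting multiples of pi/4 in three extended-precision steps, then evaluate either the sine or the cosine minimax polynomial and fix the sign from the octant. The coefficients below are the standard cephes/sse_mathfun values; this is an illustration of the algorithm, not a drop-in replacement.

#include <math.h>

static inline float cephes_cos_sketch(float x)
{
    const float DP1 = -0.78515625f;                 /* -pi/4 split into 3 parts */
    const float DP2 = -2.4187564849853515625e-4f;
    const float DP3 = -3.77489497744594108e-8f;

    x = fabsf(x);
    int j = (int)(x * 1.27323954473516f);           /* x * 4/pi, truncated */
    j = (j + 1) & ~1;                               /* round up to even */
    float y = (float)j;
    x = ((x + y * DP1) + y * DP2) + y * DP3;        /* DP constants are negative */
    j -= 2;

    int flip = (~j & 4) != 0;                       /* swap-sign flag */
    float z = x * x, r;
    if ((j & 2) == 0)                               /* sine polynomial branch */
        r = ((-1.9515295891e-4f * z + 8.3321608736e-3f) * z - 1.6666654611e-1f)
            * z * x + x;
    else                                            /* cosine polynomial branch */
        r = ((2.443315711809948e-5f * z - 1.388731625493765e-3f) * z
             + 4.166664568298827e-2f) * z * z - 0.5f * z + 1.0f;
    return flip ? -r : r;
}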
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int32_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi32(NEG_INF);
    __m256i vOpen = _mm256_set1_epi32(open);
    __m256i vGap = _mm256_set1_epi32(gap);
    __m256i vOne = _mm256_set1_epi32(1);
    __m256i vN = _mm256_set1_epi32(N);
    __m256i vGapN = _mm256_set1_epi32(gap*N);
    __m256i vNegOne = _mm256_set1_epi32(-1);
    __m256i vI = _mm256_set_epi32(0,1,2,3,4,5,6,7);
    __m256i vJreset = _mm256_set_epi32(0,-1,-2,-3,-4,-5,-6,-7);
    __m256i vMax = vNegInf;
    __m256i vILimit = _mm256_set1_epi32(s1Len);
    __m256i vILimit1 = _mm256_sub_epi32(vILimit, vOne);
    __m256i vJLimit = _mm256_set1_epi32(s2Len);
    __m256i vJLimit1 = _mm256_sub_epi32(vJLimit, vOne);
    __m256i vIBoundary = _mm256_set_epi32(
            -open-0*gap,
            -open-1*gap,
            -open-2*gap,
            -open-3*gap,
            -open-4*gap,
            -open-5*gap,
            -open-6*gap,
            -open-7*gap);

    /* convert _s1 from char to int in range 0-23 */
/*!
 * \brief Subtract the two given values and return the result.
 */
ETL_STATIC_INLINE(avx_simd_int) sub(avx_simd_int lhs, avx_simd_int rhs) {
    return _mm256_sub_epi32(lhs.value, rhs.value);
}
inline void avx2_hexid_to_uv_ccw(const __m256i hexid, __m256i& u, __m256i& v)
{
  // if(hexid==0) { u = v = 0; return; }
  // unsigned ringid;
  // unsigned segid;
  // unsigned runid;
  // positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  // switch(segid)
  // {
  //   case 0: u = ringid-runid; v = runid; break;
  //   case 1: u = -runid; v = ringid; break;
  //   case 2: u = -ringid; v = ringid-runid; break;
  //   case 3: u = runid-ringid; v = -runid; break;
  //   case 4: u = runid; v = -ringid; break;
  //   case 5: u = ringid; v = runid-ringid; break;
  //   default: assert(0);
  // }
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
}
static void highbd_hadamard_col8_avx2(__m256i *in, int iter) {
  __m256i a0 = in[0];
  __m256i a1 = in[1];
  __m256i a2 = in[2];
  __m256i a3 = in[3];
  __m256i a4 = in[4];
  __m256i a5 = in[5];
  __m256i a6 = in[6];
  __m256i a7 = in[7];

  __m256i b0 = _mm256_add_epi32(a0, a1);
  __m256i b1 = _mm256_sub_epi32(a0, a1);
  __m256i b2 = _mm256_add_epi32(a2, a3);
  __m256i b3 = _mm256_sub_epi32(a2, a3);
  __m256i b4 = _mm256_add_epi32(a4, a5);
  __m256i b5 = _mm256_sub_epi32(a4, a5);
  __m256i b6 = _mm256_add_epi32(a6, a7);
  __m256i b7 = _mm256_sub_epi32(a6, a7);

  a0 = _mm256_add_epi32(b0, b2);
  a1 = _mm256_add_epi32(b1, b3);
  a2 = _mm256_sub_epi32(b0, b2);
  a3 = _mm256_sub_epi32(b1, b3);
  a4 = _mm256_add_epi32(b4, b6);
  a5 = _mm256_add_epi32(b5, b7);
  a6 = _mm256_sub_epi32(b4, b6);
  a7 = _mm256_sub_epi32(b5, b7);

  if (iter == 0) {
    b0 = _mm256_add_epi32(a0, a4);
    b7 = _mm256_add_epi32(a1, a5);
    b3 = _mm256_add_epi32(a2, a6);
    b4 = _mm256_add_epi32(a3, a7);
    b2 = _mm256_sub_epi32(a0, a4);
    b6 = _mm256_sub_epi32(a1, a5);
    b1 = _mm256_sub_epi32(a2, a6);
    b5 = _mm256_sub_epi32(a3, a7);

    a0 = _mm256_unpacklo_epi32(b0, b1);
    a1 = _mm256_unpacklo_epi32(b2, b3);
    a2 = _mm256_unpackhi_epi32(b0, b1);
    a3 = _mm256_unpackhi_epi32(b2, b3);
    a4 = _mm256_unpacklo_epi32(b4, b5);
    a5 = _mm256_unpacklo_epi32(b6, b7);
    a6 = _mm256_unpackhi_epi32(b4, b5);
    a7 = _mm256_unpackhi_epi32(b6, b7);

    b0 = _mm256_unpacklo_epi64(a0, a1);
    b1 = _mm256_unpacklo_epi64(a4, a5);
    b2 = _mm256_unpackhi_epi64(a0, a1);
    b3 = _mm256_unpackhi_epi64(a4, a5);
    b4 = _mm256_unpacklo_epi64(a2, a3);
    b5 = _mm256_unpacklo_epi64(a6, a7);
    b6 = _mm256_unpackhi_epi64(a2, a3);
    b7 = _mm256_unpackhi_epi64(a6, a7);

    in[0] = _mm256_permute2x128_si256(b0, b1, 0x20);
    in[1] = _mm256_permute2x128_si256(b0, b1, 0x31);
    in[2] = _mm256_permute2x128_si256(b2, b3, 0x20);
    in[3] = _mm256_permute2x128_si256(b2, b3, 0x31);
    in[4] = _mm256_permute2x128_si256(b4, b5, 0x20);
    in[5] = _mm256_permute2x128_si256(b4, b5, 0x31);
    in[6] = _mm256_permute2x128_si256(b6, b7, 0x20);
    in[7] = _mm256_permute2x128_si256(b6, b7, 0x31);
  } else {
    in[0] = _mm256_add_epi32(a0, a4);
    in[7] = _mm256_add_epi32(a1, a5);
    in[3] = _mm256_add_epi32(a2, a6);
    in[4] = _mm256_add_epi32(a3, a7);
    in[2] = _mm256_sub_epi32(a0, a4);
    in[6] = _mm256_sub_epi32(a1, a5);
    in[1] = _mm256_sub_epi32(a2, a6);
    in[5] = _mm256_sub_epi32(a3, a7);
  }
}
static FORCE_INLINE void FlowInterExtra_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const int16_t *VXFullBB, const int16_t *VXFullFF,
        const int16_t *VYFullBB, const int16_t *VYFullFF,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);
    __m256i dstFF = lookup_AVX2(VXFullFF, VYFullFF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstBB = lookup_AVX2(VXFullBB, VYFullBB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i minfb = mm256_min_epu<PixelType>(dstF, dstB);
    __m256i maxfb = mm256_max_epu<PixelType>(dstF, dstB);

    __m256i medianBB = mm256_max_epu<PixelType>(minfb, mm256_min_epu<PixelType>(maxfb, dstBB));
    __m256i medianFF = mm256_max_epu<PixelType>(minfb, mm256_min_epu<PixelType>(maxfb, dstFF));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    if (sizeof(PixelType) == 1) {
        dstF = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB = _mm256_mullo_epi16(dstB, maskb_inv);
        medianBB = _mm256_mullo_epi16(medianBB, maskf);
        medianFF = _mm256_mullo_epi16(medianFF, maskb);
    } else {
        dstF = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB = _mm256_mullo_epi32(dstB, maskb_inv);
        medianBB = _mm256_mullo_epi32(medianBB, maskf);
        medianFF = _mm256_mullo_epi32(medianFF, maskb);
    }

    dstF = _mm256_add_epi32(dstF, dwords_255);
    dstB = _mm256_add_epi32(dstB, dwords_255);

    dstF = _mm256_add_epi32(dstF, medianBB);
    dstB = _mm256_add_epi32(dstB, medianFF);

    dstF = _mm256_srai_epi32(dstF, 8);
    dstB = _mm256_srai_epi32(dstB, 8);

    if (sizeof(PixelType) == 2) {
        dstF = _mm256_sub_epi16(dstF, _mm256_set1_epi32(32768));
        dstB = _mm256_sub_epi16(dstB, _mm256_set1_epi32(32768));
    }

    dstF = _mm256_madd_epi16(dstF, dwords_256_time256);
    dstB = _mm256_madd_epi16(dstB, dwords_time256);

    if (sizeof(PixelType) == 2) {
        // dstF = _mm256_add_epi32(dstF, _mm256_slli_epi32(dwords_256_time256, 15));
        // dstB = _mm256_add_epi32(dstB, _mm256_slli_epi32(dwords_time256, 15));
        // Knowing that they add up to 256, the two additions can be combined.
        dstF = _mm256_add_epi32(dstF, _mm256_set1_epi32(256 << 15));
    }

    __m256i dst = _mm256_add_epi32(dstF, dstB);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
inline __m256i avx2_uv_to_hexid_ccw(const __m256i u, const __m256i v)
{
  // if(u==0 and v==0)return 0;
  // int ringid = uv_to_ringid(u,v);
  // unsigned segid;
  // int runid;
  // int upv = u+v;
  // if(upv==ringid and v!=ringid)           { segid=0; runid=v; }
  // else if(v==ringid and u!=-ringid)       { segid=1; runid=-u; }
  // else if(u==-ringid and upv!=-ringid)    { segid=2; runid=ringid-v; }
  // else if(u+v==-ringid and v!=-ringid)    { segid=3; runid=-v; }
  // else if(v==-ringid and u!=ringid)       { segid=4; runid=u; }
  // else /*if(u==ringid and upv!=ringid)*/  { segid=5; runid=ringid+v; }
  // return positive_ringid_segid_runid_to_hexid(ringid, segid, runid);
  const __m256i one = _mm256_set1_epi32(1);
  const __m256i minus_one = _mm256_set1_epi32(-1);
  const __m256i ringid = avx2_uv_to_ringid(u,v);
  const __m256i minus_ringid = _mm256_sign_epi32(ringid, minus_one);
  const __m256i upv = _mm256_add_epi32(u, v);

  __m256i not_found_mask = minus_one;
  __m256i hexid = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));

  // Seg ID = 0
  // if(upv==ringid and v!=ringid) { segid=0; runid=v; }
  __m256i here_mask = _mm256_cmpeq_epi32(upv, ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 1
  // else if(v==ringid and u!=-ringid) { segid=1; runid=-u; }
  here_mask = _mm256_cmpeq_epi32(v, ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 2
  // else if(u==-ringid and upv!=-ringid) { segid=2; runid=ringid-v; }
  here_mask = _mm256_cmpeq_epi32(u, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, upv, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, upv),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 3
  // else if(u+v==-ringid and v!=-ringid) { segid=3; runid=-v; }
  here_mask = _mm256_cmpeq_epi32(upv, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 4
  // else if(v==-ringid and u!=ringid) { segid=4; runid=u; }
  here_mask = _mm256_cmpeq_epi32(v, minus_ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 5
  // else /*if(u==ringid and upv!=ringid)*/{ segid=5; runid=ringid+v; }
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask, upv));

  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  hexid = _mm256_andnot_si256(mask, hexid);
  return hexid;
}
static FORCE_INLINE void FlowInter_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType));
    __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType));
    dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf;

    if (sizeof(PixelType) == 1) {
        dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf);
    } else {
        dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);
        dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf);
    }

    __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv);
    __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv);

    if (sizeof(PixelType) == 1) {
        f = _mm256_mullo_epi32(f, maskf);
        b = _mm256_mullo_epi32(b, maskb);

        f = _mm256_add_epi32(f, dwords_255);
        b = _mm256_add_epi32(b, dwords_255);

        f = _mm256_srai_epi32(f, 8);
        b = _mm256_srai_epi32(b, 8);
    } else {
        const __m256i qwords_255 = _mm256_set1_epi64x(255);

        __m256i tempf = _mm256_mul_epu32(f, maskf);
        __m256i tempb = _mm256_mul_epu32(b, maskb);
        tempf = _mm256_add_epi64(tempf, qwords_255);
        tempb = _mm256_add_epi64(tempb, qwords_255);
        tempf = _mm256_srli_epi64(tempf, 8);
        tempb = _mm256_srli_epi64(tempb, 8);

        f = _mm256_srli_epi64(f, 32);
        b = _mm256_srli_epi64(b, 32);
        f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32));
        b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32));
        f = _mm256_add_epi64(f, qwords_255);
        b = _mm256_add_epi64(b, qwords_255);
        f = _mm256_srli_epi64(f, 8);
        b = _mm256_srli_epi64(b, 8);

        f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32));
        b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32));
    }

    f = _mm256_add_epi32(f, dstF_maskf_inv);
    b = _mm256_add_epi32(b, dstB_maskb_inv);

    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);

    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }

    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
static FORCE_INLINE void FlowInterSimple_generic_8px_AVX2(
        int w, PixelType *pdst, const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i f, b;

    if (sizeof(PixelType) == 1) {
        __m256i dstF_dstB = _mm256_or_si256(dstF, _mm256_slli_epi32(dstB, 16));

        maskf = _mm256_or_si256(_mm256_slli_epi32(maskf, 16), maskf_inv);
        maskb = _mm256_or_si256(maskb, _mm256_slli_epi32(maskb_inv, 16));

        f = _mm256_madd_epi16(dstF_dstB, maskf);
        b = _mm256_madd_epi16(dstF_dstB, maskb);
    } else {
        __m256i dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        __m256i dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);

        __m256i dstB_maskf = _mm256_mullo_epi32(dstB, maskf);
        __m256i dstF_maskb = _mm256_mullo_epi32(dstF, maskb);

        f = _mm256_add_epi32(dstF_maskf_inv, dstB_maskf);
        b = _mm256_add_epi32(dstB_maskb_inv, dstF_maskb);
    }

    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);

    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }

    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
/**
 * \brief quantize transformed coefficients
 *
 */
void kvz_quant_flat_avx2(const encoder_state_t * const state, coeff_t *coef, coeff_t *q_coef, int32_t width,
  int32_t height, int8_t type, int8_t scan_idx, int8_t block_type)
{
  const encoder_control_t * const encoder = state->encoder_control;
  const uint32_t log2_block_size = kvz_g_convert_to_bit[width] + 2;
  const uint32_t * const scan = kvz_g_sig_last_scan[scan_idx][log2_block_size - 1];

  int32_t qp_scaled = kvz_get_scaled_qp(type, state->global->QP, (encoder->bitdepth - 8) * 6);
  const uint32_t log2_tr_size = kvz_g_convert_to_bit[width] + 2;
  const int32_t scalinglist_type = (block_type == CU_INTRA ? 0 : 3) + (int8_t)("\0\3\1\2"[type]);
  const int32_t *quant_coeff = encoder->scaling_list.quant_coeff[log2_tr_size - 2][scalinglist_type][qp_scaled % 6];
  const int32_t transform_shift = MAX_TR_DYNAMIC_RANGE - encoder->bitdepth - log2_tr_size; //!< Represents scaling through forward transform
  const int32_t q_bits = QUANT_SHIFT + qp_scaled / 6 + transform_shift;
  const int32_t add = ((state->global->slicetype == KVZ_SLICE_I) ? 171 : 85) << (q_bits - 9);
  const int32_t q_bits8 = q_bits - 8;

  assert(quant_coeff[0] <= (1 << 15) - 1 && quant_coeff[0] >= -(1 << 15)); // Assuming flat values to fit int16_t

  uint32_t ac_sum = 0;

  __m256i v_ac_sum = _mm256_setzero_si256();
  __m256i v_quant_coeff = _mm256_set1_epi16(quant_coeff[0]);

  for (int32_t n = 0; n < width * height; n += 16) {
    __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
    __m256i v_sign = _mm256_cmpgt_epi16(_mm256_setzero_si256(), v_level);
    v_sign = _mm256_or_si256(v_sign, _mm256_set1_epi16(1));
    v_level = _mm256_abs_epi16(v_level);
    __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
    __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));
    __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
    __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0));
    __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b);
    __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b);
    v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add));
    v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add));
    v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits);
    v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits);
    v_level = _mm256_packs_epi32(v_level32_a, v_level32_b);
    v_level = _mm256_sign_epi16(v_level, v_sign);
    _mm256_storeu_si256((__m256i*)&(q_coef[n]), v_level);
    v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_a);
    v_ac_sum = _mm256_add_epi32(v_ac_sum, v_level32_b);
  }

  __m128i temp = _mm_add_epi32(_mm256_castsi256_si128(v_ac_sum), _mm256_extracti128_si256(v_ac_sum, 1));
  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(2, 3, 0, 1)));
  temp = _mm_add_epi32(temp, _mm_shuffle_epi32(temp, KVZ_PERMUTE(1, 0, 1, 0)));
  ac_sum += _mm_cvtsi128_si32(temp);

  if (!(encoder->sign_hiding && ac_sum >= 2)) return;

  int32_t delta_u[LCU_WIDTH*LCU_WIDTH >> 2];

  for (int32_t n = 0; n < width * height; n += 16) {
    __m256i v_level = _mm256_loadu_si256((__m256i*)&(coef[n]));
    v_level = _mm256_abs_epi16(v_level);
    __m256i low_a = _mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0));
    __m256i high_a = _mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0));
    __m256i low_b = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
    __m256i high_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0));
    __m256i v_level32_a = _mm256_madd_epi16(low_a, low_b);
    __m256i v_level32_b = _mm256_madd_epi16(high_a, high_b);
    v_level32_a = _mm256_add_epi32(v_level32_a, _mm256_set1_epi32(add));
    v_level32_b = _mm256_add_epi32(v_level32_b, _mm256_set1_epi32(add));
    v_level32_a = _mm256_srai_epi32(v_level32_a, q_bits);
    v_level32_b = _mm256_srai_epi32(v_level32_b, q_bits);
    v_level = _mm256_packs_epi32(v_level32_a, v_level32_b);

    __m256i v_coef = _mm256_loadu_si256((__m256i*)&(coef[n]));
    __m256i v_coef_a = _mm256_unpacklo_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
    __m256i v_coef_b = _mm256_unpackhi_epi16(_mm256_abs_epi16(v_coef), _mm256_set1_epi16(0));
    __m256i v_quant_coeff_a = _mm256_unpacklo_epi16(v_quant_coeff, _mm256_set1_epi16(0));
    __m256i v_quant_coeff_b = _mm256_unpackhi_epi16(v_quant_coeff, _mm256_set1_epi16(0));
    v_coef_a = _mm256_madd_epi16(v_coef_a, v_quant_coeff_a);
    v_coef_b = _mm256_madd_epi16(v_coef_b, v_quant_coeff_b);
    v_coef_a = _mm256_sub_epi32(v_coef_a, _mm256_slli_epi32(_mm256_unpacklo_epi16(v_level, _mm256_set1_epi16(0)), q_bits));
    v_coef_b = _mm256_sub_epi32(v_coef_b, _mm256_slli_epi32(_mm256_unpackhi_epi16(v_level, _mm256_set1_epi16(0)), q_bits));
    v_coef_a = _mm256_srai_epi32(v_coef_a, q_bits8);
    v_coef_b = _mm256_srai_epi32(v_coef_b, q_bits8);
    _mm_storeu_si128((__m128i*)&(delta_u[n + 0 * 4]), _mm256_castsi256_si128(v_coef_a));
    _mm_storeu_si128((__m128i*)&(delta_u[n + 2 * 4]), _mm256_extracti128_si256(v_coef_a, 1));
    _mm_storeu_si128((__m128i*)&(delta_u[n + 1 * 4]), _mm256_castsi256_si128(v_coef_b));
    _mm_storeu_si128((__m128i*)&(delta_u[n + 3 * 4]), _mm256_extracti128_si256(v_coef_b, 1));
  }

  if (ac_sum >= 2) {
#define SCAN_SET_SIZE 16
#define LOG2_SCAN_SET_SIZE 4
    int32_t n, last_cg = -1, abssum = 0, subset, subpos;
    for (subset = (width*height - 1) >> LOG2_SCAN_SET_SIZE; subset >= 0; subset--) {
      int32_t first_nz_pos_in_cg = SCAN_SET_SIZE, last_nz_pos_in_cg = -1;
      subpos = subset << LOG2_SCAN_SET_SIZE;
      abssum = 0;

      // Find last coeff pos
      for (n = SCAN_SET_SIZE - 1; n >= 0; n--) {
        if (q_coef[scan[n + subpos]]) {
          last_nz_pos_in_cg = n;
          break;
        }
      }

      // First coeff pos
      for (n = 0; n < SCAN_SET_SIZE; n++) {
        if (q_coef[scan[n + subpos]]) {
          first_nz_pos_in_cg = n;
          break;
        }
      }

      // Sum all kvz_quant coeffs between first and last
      for (n = first_nz_pos_in_cg; n <= last_nz_pos_in_cg; n++) {
        abssum += q_coef[scan[n + subpos]];
      }

      if (last_nz_pos_in_cg >= 0 && last_cg == -1) {
        last_cg = 1;
      }

      if (last_nz_pos_in_cg - first_nz_pos_in_cg >= 4) {
        int32_t signbit = (q_coef[scan[subpos + first_nz_pos_in_cg]] > 0 ? 0 : 1);
        if (signbit != (abssum & 0x1)) { // compare signbit with sum_parity
          int32_t min_cost_inc = 0x7fffffff, min_pos = -1, cur_cost = 0x7fffffff;
          int16_t final_change = 0, cur_change = 0;

          for (n = (last_cg == 1 ? last_nz_pos_in_cg : SCAN_SET_SIZE - 1); n >= 0; n--) {
            uint32_t blkPos = scan[n + subpos];
            if (q_coef[blkPos] != 0) {
              if (delta_u[blkPos] > 0) {
                cur_cost = -delta_u[blkPos];
                cur_change = 1;
              } else if (n == first_nz_pos_in_cg && abs(q_coef[blkPos]) == 1) {
                cur_cost = 0x7fffffff;
              } else {
                cur_cost = delta_u[blkPos];
                cur_change = -1;
              }
            } else if (n < first_nz_pos_in_cg && ((coef[blkPos] >= 0) ? 0 : 1) != signbit) {
              cur_cost = 0x7fffffff;
            } else {
              cur_cost = -delta_u[blkPos];
              cur_change = 1;
            }

            if (cur_cost < min_cost_inc) {
              min_cost_inc = cur_cost;
              final_change = cur_change;
              min_pos = blkPos;
            }
          } // CG loop

          if (q_coef[min_pos] == 32767 || q_coef[min_pos] == -32768) {
            final_change = -1;
          }

          if (coef[min_pos] >= 0) q_coef[min_pos] += final_change;
          else q_coef[min_pos] -= final_change;
        } // Hide
      }
      if (last_cg == 1) last_cg = 0;
    }

#undef SCAN_SET_SIZE
#undef LOG2_SCAN_SET_SIZE
  }
}
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  //v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);

  /* get the polynom selection mask for the sine*/
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
  //v8sf poly_mask = _mm256_castsi256_ps(imm2);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);

  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);

  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
void TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	// Constants
	static const int      mant_size = 23;
	static const int      exp_bias  = 127;
	static const uint32_t base      = (exp_bias + LOGLUT_MIN_L2) << mant_size;
	static const float    val_min   = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
//	static const float    val_max   = float (int64_t (1) << LOGLUT_MAX_L2);
	static const int      frac_size = mant_size - LOGLUT_RES_L2;
	static const uint32_t frac_mask = (1 << frac_size) - 1;

	const __m256   zero_f     = _mm256_setzero_ps ();
	const __m256   one_f      = _mm256_set1_ps (1);
	const __m256   frac_mul   = _mm256_set1_ps (1.0f / (1 << frac_size));
	const __m256   mul_eps    = _mm256_set1_ps (1.0f / val_min);
	const __m256   mask_abs_f = _mm256_load_ps (
		reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs)
	);

	const __m256i  zero_i          = _mm256_setzero_si256 ();
	const __m256i  mask_abs_epi32  = _mm256_set1_epi32 (0x7FFFFFFF);
	const __m256i  one_epi32       = _mm256_set1_epi32 (1);
	const __m256i  base_epi32      = _mm256_set1_epi32 (int (base));
	const __m256i  frac_mask_epi32 = _mm256_set1_epi32 (frac_mask);
	const __m256i  val_min_epi32   = _mm256_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size);
	const __m256i  val_max_epi32   = _mm256_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size);
	const __m256i  index_max_epi32 = _mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
	const __m256i  hsize_epi32     = _mm256_set1_epi32 (LOGLUT_HSIZE);
	const __m256i  mirror_epi32    = _mm256_set1_epi32 (LOGLUT_HSIZE - 1);

	// It really starts here
	const __m256   val_f = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_a = _mm256_and_ps (val_f, mask_abs_f);
	const __m256i  val_i = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr));
	const __m256i  val_u = _mm256_and_si256 (val_i, mask_abs_epi32);

	// Standard path
	__m256i        index_std = _mm256_sub_epi32 (val_u, base_epi32);
	index_std = _mm256_srli_epi32 (index_std, frac_size);
	index_std = _mm256_add_epi32 (index_std, one_epi32);
	__m256i        frac_stdi = _mm256_and_si256 (val_u, frac_mask_epi32);
	__m256         frac_std  = _mm256_cvtepi32_ps (frac_stdi);
	frac_std = _mm256_mul_ps (frac_std, frac_mul);

	// Epsilon path
	__m256         frac_eps = _mm256_max_ps (val_a, zero_f);
	frac_eps = _mm256_mul_ps (frac_eps, mul_eps);

	// Range cases
	const __m256i  eps_flag_i = _mm256_cmpgt_epi32 (val_min_epi32, val_u);
	const __m256i  std_flag_i = _mm256_cmpgt_epi32 (val_max_epi32, val_u);
	const __m256   eps_flag_f = _mm256_castsi256_ps (eps_flag_i);
	const __m256   std_flag_f = _mm256_castsi256_ps (std_flag_i);
	__m256i        index_tmp  = fstb::ToolsAvx2::select (std_flag_i, index_std, index_max_epi32);
	__m256         frac_tmp   = fstb::ToolsAvx2::select (std_flag_f, frac_std, one_f);
	index_tmp = fstb::ToolsAvx2::select (eps_flag_i, zero_i, index_tmp);
	frac_tmp  = fstb::ToolsAvx2::select (eps_flag_f, frac_eps, frac_tmp);

	// Sign cases
	const __m256i  neg_flag_i = _mm256_srai_epi32 (val_i, 31);
	const __m256   neg_flag_f = _mm256_castsi256_ps (neg_flag_i);
	const __m256i  index_neg  = _mm256_sub_epi32 (mirror_epi32, index_tmp);
	const __m256i  index_pos  = _mm256_add_epi32 (hsize_epi32, index_tmp);
	const __m256   frac_neg   = _mm256_sub_ps (one_f, frac_tmp);
	index = fstb::ToolsAvx2::select (neg_flag_i, index_neg, index_pos);
	frac  = fstb::ToolsAvx2::select (neg_flag_f, frac_neg, frac_tmp);
}
__m256i test_mm256_sub_epi32(__m256i a, __m256i b) {
  // CHECK: sub <8 x i32>
  return _mm256_sub_epi32(a, b);
}
/* natural logarithm computed for 8 simultaneous float
   return NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
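The structure above is the usual split of log(x): clamp away non-positive and denormal inputs, pull the exponent out of the float bits, renormalize the mantissa into [sqrt(0.5), sqrt(2)), then log(x) = log(1 + m) + e*log(2) with log(1 + m) from a polynomial. A scalar sketch of that decomposition, using frexpf and (purely as a stand-in for the cephes polynomial) logf:

#include <math.h>

static inline float log_sketch(float x)
{
    if (x <= 0.0f) return NAN;          /* the vector code forces NaN via a mask */
    int e;
    float m = frexpf(x, &e);            /* x = m * 2^e, m in [0.5, 1) */
    if (m < 0.707106781f) {             /* below sqrt(1/2): double m, drop e */
        m += m;
        e -= 1;
    }
    m -= 1.0f;                          /* reduced argument around 0 */
    float logm = logf(1.0f + m);        /* stand-in for the minimax polynomial */
    return logm + (float)e * 0.693147181f;
}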