void
test1bit (void)
{
  d1 = _mm256_extractf128_pd (e2, k4);	   /* { dg-error "the last argument must be a 1-bit immediate" } */
  a1 = _mm256_extractf128_ps (b2, k4);	   /* { dg-error "the last argument must be a 1-bit immediate" } */
  i1 = _mm256_extractf128_si256 (l2, k4);  /* { dg-error "the last argument must be a 1-bit immediate" } */
  e1 = _mm256_insertf128_pd (e2, d1, k4);  /* { dg-error "the last argument must be a 1-bit immediate" } */
  b1 = _mm256_insertf128_ps (b2, a1, k4);  /* { dg-error "the last argument must be a 1-bit immediate" } */
  l1 = _mm256_insertf128_si256 (l2, i1, k4); /* { dg-error "the last argument must be a 1-bit immediate" } */
}
//! \brief
//! Performs a logical right shift by the specified count.
inline __m256i srli(__m256i arg, int count)
{
	// AVX1 has no 256-bit integer shifts: split into two 128-bit lanes,
	// shift each with SSE2, and recombine.
	__m128i arg_low = _mm256_castsi256_si128(arg);
	__m128i arg_hi = _mm256_extractf128_si256(arg, 1);
	__m128i newlow = _mm_srli_epi32(arg_low, count);
	__m128i newhi = _mm_srli_epi32(arg_hi, count);
	__m256i result = _mm256_castsi128_si256(newlow);
	result = _mm256_insertf128_si256(result, newhi, 1);
	return result;
}
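// For reference, a minimal sketch of the same operation on an AVX2 target,
// where 256-bit integer shifts exist and the lane split above collapses to a
// single intrinsic. (Hypothetical helper name; the version above exists
// precisely for AVX1-only builds.)
inline __m256i srli_avx2(__m256i arg, int count)
{
	return _mm256_srli_epi32(arg, count);
}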
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);

  // 32-bit multiply-high: even lanes via mul_epi32, odd lanes shifted down
  // first, then recombined.
  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);

  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);

  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan =
      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

  const __m256i zero = _mm256_setzero_si256();
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
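// A hedged scalar sketch of what the q_lo/q_hi sequence above computes per
// 32-bit lane, assuming the libaom convention that quant values fit in 16
// bits and the inputs are non-negative. Hypothetical helper for illustration.
#include <stdint.h>
static inline uint32_t mulhi_scale_sketch(uint32_t q, uint32_t quant,
                                          int log_scale) {
  return (uint32_t)(((uint64_t)q * quant) >> (16 - log_scale));
}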
static INLINE __m256i mm256_loadu2_16(const uint16_t *p0, const uint16_t *p1) {
  const __m256i d =
      _mm256_castsi128_si256(_mm_loadu_si128((const __m128i *)p1));
  return _mm256_insertf128_si256(d, _mm_loadu_si128((const __m128i *)p0), 1);
}
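// Usage sketch: p1 fills the low 128-bit lane and p0 the high lane, so two
// consecutive rows of eight 16-bit pixels pack into one register, as
// aom_highbd_comp_mask_pred_avx2 below does. (Hypothetical helper name.)
static INLINE __m256i load_two_rows(const uint16_t *src, int stride) {
  return mm256_loadu2_16(src + stride, src);  // row 0 -> low, row 1 -> high
}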
#include <nt2/sdk/meta/strip.hpp>
#include <nt2/include/functions/details/simd/sse/sse4_1/shli.hpp>

namespace nt2 { namespace functors
{
  // no special validate for shli
  template<class Extension,class Info>
  struct call<shli_,tag::simd_(tag::arithmetic_,Extension),Info>
  {
    template<class Sig> struct result;
    template<class This,class A0,class A1>
    struct result<This(A0,A1)> : meta::strip<A0>{};

    NT2_FUNCTOR_CALL(2)
    {
      typedef typename meta::scalar_of<A0>::type      sctype;
      typedef typename meta::as_integer<sctype>::type sitype;
      typedef typename simd::native<sitype, tag::sse_ > isvtype;
      typedef typename meta::as_integer<A0>::type     itype;

      isvtype a00 = { _mm256_extractf128_si256(simd::native_cast<itype>(a0), 0) };
      isvtype a01 = { _mm256_extractf128_si256(simd::native_cast<itype>(a0), 1) };
      // Build the result from the low half first; the original initialized
      // `that` from itself, which is undefined behavior.
      itype that = { _mm256_castsi128_si256(nt2::shli(a00, a1)) };
      return simd::native_cast<A0>(_mm256_insertf128_si256(that, nt2::shli(a01, a1), 1));
    }
  };
} }

#endif
{
  template<class Dummy>
  struct call<tag::abs_(tag::simd_<tag::signed_, tag::avx_>), tag::cpu_, Dummy> : callable
  {
    template<class Sig> struct result;
    template<class This,class A0>
    struct result<This(A0)> : meta::strip<A0>{};

    NT2_FUNCTOR_CALL(1)
    {
      typedef typename meta::scalar_of<A0>::type        sctype;
      typedef typename simd::native<sctype, tag::sse_ > svtype;

      svtype a00 = { _mm256_extractf128_si256(a0, 0) };
      svtype a01 = { _mm256_extractf128_si256(a0, 1) };
      // Build from the low half first; the original initialized `that`
      // from itself, which is undefined behavior.
      A0 that = { _mm256_castsi128_si256(nt2::abs(a00)) };
      that = _mm256_insertf128_si256(that, nt2::abs(a01), 1);
      // NT2_AVX_JOIN128INT1(that, nt2::abs);
      return that;
    }
  };
} }

/////////////////////////////////////////////////////////////////////////////
// Implementation when type A0 is unsigned_
/////////////////////////////////////////////////////////////////////////////
NT2_REGISTER_DISPATCH(tag::abs_, tag::cpu_, (A0),
                      ((simd_<unsigned_<A0>,tag::avx_>))
                     );
inline avx_m256_t newexp_ps(avx_m256_t x) {
	avx_m256_t one = _ps_1;
	avx_m256_t zero = _ps_0;

	x = _mm256_min_ps(x, _ps_exp_hi);
	x = _mm256_max_ps(x, _ps_exp_lo);

	avx_m256_t temp_2 = _mm256_mul_ps(x, _ps_cephes_LOG2EF);
	temp_2 = _mm256_add_ps(temp_2, _ps_0p5);

	avx_m256i_t emm0 = _mm256_cvttps_epi32(temp_2);
	avx_m256_t temp_1 = _mm256_cvtepi32_ps(emm0);
	avx_m256_t temp_3 = _mm256_sub_ps(temp_1, temp_2);
	avx_m256_t mask = _mm256_cmp_ps(temp_3, zero, _CMP_GT_OQ);

	mask = _mm256_and_ps(mask, one);
	temp_2 = _mm256_sub_ps(temp_1, mask);
	emm0 = _mm256_cvttps_epi32(temp_2);

	temp_1 = _mm256_mul_ps(temp_2, _ps_cephes_exp_C12);
	x = _mm256_sub_ps(x, temp_1);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	temp_1 = _mm256_add_ps(x, one);
	temp_2 = _mm256_mul_ps(x2, _ps_cephes_exp_p5);
	temp_3 = _mm256_mul_ps(x3, _ps_cephes_exp_p4);
	temp_1 = _mm256_add_ps(temp_1, temp_2);
	temp_2 = _mm256_mul_ps(x3, _ps_cephes_exp_p0);
	temp_1 = _mm256_add_ps(temp_1, temp_3);

	avx_m256_t temp_4 = _mm256_mul_ps(x, _ps_cephes_exp_p2);
	temp_3 = _mm256_mul_ps(x2, _ps_cephes_exp_p1);

	// Integer add emulated with a float add on the raw bit patterns
	// (AVX1 has no _mm256_add_epi32).
	emm0 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm0), _mm256_castsi256_ps(_pi32_0x7f)));

	temp_2 = _mm256_add_ps(temp_2, temp_3);
	temp_3 = _mm256_add_ps(temp_3, temp_4);

	//emm0 = _mm256_slli_epi32(emm0, 23);
	// AVX1 also lacks 256-bit integer shifts: convert emm0 into two 128-bit
	// integer vectors, shift each, and combine back into 256-bit emm0.
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 1);
	emm0lo = _mm_slli_epi32(emm0lo, 23);
	emm0hi = _mm_slli_epi32(emm0hi, 23);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 1);

	avx_m256_t pow2n = _mm256_castsi256_ps(emm0);

	temp_2 = _mm256_add_ps(temp_2, temp_3);
	temp_2 = _mm256_mul_ps(temp_2, x4);

	avx_m256_t y = _mm256_add_ps(temp_1, temp_2);
	y = _mm256_mul_ps(y, pow2n);

	return y;
} // newexp_ps()
__m256i test_mm256_insertf128_si256_1(__m256i a, __m128i b) {
  // CHECK-LABEL: @test_mm256_insertf128_si256_1
  // CHECK: shufflevector{{.*}}<i32 0, i32 1, i32 4, i32 5>
  return _mm256_insertf128_si256(a, b, 1);
}
__m256i str, mask, res, blockmask;
__m256i s2mask, s3mask, s4mask, s5mask;
__m128i l0, l1;

/* _mm256_shuffle_epi8 works on 128-bit lanes, so we need to get
 * the two 128-bit lanes into big-endian order separately: */
l0 = _mm_loadu_si128((__m128i *)c);
l0 = _mm_shuffle_epi8(l0, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3,
                                        8, 8, 7, 6, 11, 11, 10, 9));

l1 = _mm_loadu_si128((__m128i *)&c[12]);
l1 = _mm_shuffle_epi8(l1, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3,
                                        8, 8, 7, 6, 11, 11, 10, 9));

/* Combine into a single 256-bit register: */
str = _mm256_castsi128_si256(l0);
str = _mm256_insertf128_si256(str, l1, 1);

/* Mask to pass through only the lower 6 bits of one byte: */
mask = _mm256_set1_epi32(0x3F000000);

/* Shift bits by 2, mask in only the first byte: */
res = _mm256_and_si256(_mm256_srli_epi32(str, 2), mask);
mask = _mm256_srli_epi32(mask, 8);

/* Shift bits by 4, mask in only the second byte: */
res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 4), mask));
mask = _mm256_srli_epi32(mask, 8);

/* Shift bits by 6, mask in only the third byte: */
res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 6), mask));
mask = _mm256_srli_epi32(mask, 8);
INLINE avxi( const ssei& a, const ssei& b ) : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a),b,1)) {}
INLINE explicit avxi( const ssei& a ) : m256(_mm256_insertf128_si256(_mm256_castsi128_si256(a),a,1)) {}
template<index_t index> INLINE const avxi insert (const avxi& a, const ssei& b) { return _mm256_insertf128_si256 (a,b,index); }
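// A raw-intrinsic equivalent of insert<1> above, for readers without the
// avxi/ssei wrapper types (minimal sketch; assumes <immintrin.h> and a
// hypothetical helper name):
static inline __m256i insert_hi(__m256i a, __m128i b)
{
  return _mm256_insertf128_si256(a, b, 1);  // replace the upper 128-bit lane
}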
static inline __m256d gmx_mm256_exp2_pd(__m256d x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp
     * representation exponent smaller than -1022.
     */
    const __m256d arglimit = _mm256_set1_pd(1022.0);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m256d P2 = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d P1 = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d P0 = _mm256_set1_pd(1.51390680115615096133e3);
    /* Q2 == 1.0 */
    const __m256d Q1 = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d Q0 = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d one = _mm256_set1_pd(1.0);
    const __m256d two = _mm256_set1_pd(2.0);

    __m256d       valuemask;
    __m256i       iexppart;
    __m128i       iexppart128a, iexppart128b;
    __m256d       fexppart;
    __m256d       intpart;
    __m256d       z, z2;
    __m256d       PolyP, PolyQ;

    iexppart128a = _mm256_cvtpd_epi32(x);
    intpart      = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* Add exponent bias */
    iexppart128a = _mm_add_epi32(iexppart128a, expbase);

    /* We now want to shift the exponent 52 positions left, but to achieve
     * this we need to separate the 128-bit register data into two registers
     * (two 64-bit lanes each), shift them, and then merge into a single
     * __m256d. Elements 0/1 should end up in iexppart128a, and 2/3 in
     * iexppart128b. It doesn't matter what we put in the 2nd/4th position,
     * since that data will be shifted out and replaced with zeros.
     */
    iexppart128b = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
    iexppart128a = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));
    iexppart128b = _mm_slli_epi64(iexppart128b, 52);
    iexppart128a = _mm_slli_epi64(iexppart128a, 52);

    iexppart = _mm256_castsi128_si256(iexppart128a);
    iexppart = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);

    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));

    z  = _mm256_sub_pd(x, intpart);
    z2 = _mm256_mul_pd(z, z);

    PolyP = _mm256_mul_pd(P2, z2);
    PolyP = _mm256_add_pd(PolyP, P1);
    PolyQ = _mm256_add_pd(z2, Q1);
    PolyP = _mm256_mul_pd(PolyP, z2);
    PolyQ = _mm256_mul_pd(PolyQ, z2);
    PolyP = _mm256_add_pd(PolyP, P0);
    PolyQ = _mm256_add_pd(PolyQ, Q0);
    PolyP = _mm256_mul_pd(PolyP, z);

    z = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
    z = _mm256_add_pd(one, _mm256_mul_pd(two, z));

    z = _mm256_mul_pd(z, fexppart);

    return z;
}
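/* A scalar sketch of the bias-and-shift trick above (assumption: IEEE-754
 * binary64 layout): placing n + 1023 in the exponent field yields exactly
 * 2^n for integer n in [-1022, 1023]. Hypothetical helper for illustration. */
#include <stdint.h>
#include <string.h>
static inline double exp2_int_sketch(int n)
{
    uint64_t bits = ((uint64_t)(n + 1023)) << 52; /* biased exponent field */
    double   d;
    memcpy(&d, &bits, sizeof d);                  /* reinterpret the bits */
    return d;
}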
void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
                                    int width, int height, const uint8_t *ref8,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask) {
  int i = 0;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *src0 = invert_mask ? pred : ref;
  const uint16_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  const __m256i zero = _mm256_setzero_si256();

  if (width == 8) {
    do {
      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);

      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
      const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));

      __m256i m = _mm256_castsi128_si256(m_l);
      m = _mm256_insertf128_si256(m, m_h, 1);
      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);

      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));
      _mm_storeu_si128((__m128i *)(comp_pred + width),
                       _mm256_extractf128_si256(comp, 1));

      src0 += (stride0 << 1);
      src1 += (stride1 << 1);
      mask += (mask_stride << 1);
      comp_pred += (width << 1);
      i += 2;
    } while (i < height);
  } else if (width == 16) {
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
      const __m256i m_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);

      _mm256_storeu_si256((__m256i *)comp_pred, comp);

      src0 += stride0;
      src1 += stride1;
      mask += mask_stride;
      comp_pred += width;
      i += 1;
    } while (i < height);
  } else if (width == 32) {
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));

      const __m256i m01_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
      const __m256i m23_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);

      _mm256_storeu_si256((__m256i *)comp_pred, comp);
      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);

      src0 += stride0;
      src1 += stride1;
      mask += mask_stride;
      comp_pred += width;
      i += 1;
    } while (i < height);
  }
}
inline void newexp_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t* exp1, avx_m256_t* exp2) {
	avx_m256_t one = _ps_1;
	avx_m256_t zero = _ps_0;

	x1 = _mm256_min_ps(x1, _ps_exp_hi);
	x2 = _mm256_min_ps(x2, _ps_exp_hi);
	x1 = _mm256_max_ps(x1, _ps_exp_lo);
	x2 = _mm256_max_ps(x2, _ps_exp_lo);

	avx_m256_t temp_21 = _mm256_mul_ps(x1, _ps_cephes_LOG2EF);
	avx_m256_t temp_22 = _mm256_mul_ps(x2, _ps_cephes_LOG2EF);
	temp_21 = _mm256_add_ps(temp_21, _ps_0p5);
	temp_22 = _mm256_add_ps(temp_22, _ps_0p5);

	avx_m256i_t emm01 = _mm256_cvttps_epi32(temp_21);
	avx_m256i_t emm02 = _mm256_cvttps_epi32(temp_22);
	avx_m256_t temp_11 = _mm256_cvtepi32_ps(emm01);
	avx_m256_t temp_12 = _mm256_cvtepi32_ps(emm02);
	avx_m256_t temp_31 = _mm256_sub_ps(temp_11, temp_21);
	avx_m256_t temp_32 = _mm256_sub_ps(temp_12, temp_22);

	avx_m256_t mask1 = _mm256_cmp_ps(temp_31, zero, _CMP_GT_OQ);
	avx_m256_t mask2 = _mm256_cmp_ps(temp_32, zero, _CMP_GT_OQ);
	mask1 = _mm256_and_ps(mask1, one);
	mask2 = _mm256_and_ps(mask2, one);

	temp_21 = _mm256_sub_ps(temp_11, mask1);
	temp_22 = _mm256_sub_ps(temp_12, mask2);
	emm01 = _mm256_cvttps_epi32(temp_21);
	emm02 = _mm256_cvttps_epi32(temp_22);

	temp_11 = _mm256_mul_ps(temp_21, _ps_cephes_exp_C12);
	temp_12 = _mm256_mul_ps(temp_22, _ps_cephes_exp_C12);
	x1 = _mm256_sub_ps(x1, temp_11);
	x2 = _mm256_sub_ps(x2, temp_12);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	temp_11 = _mm256_add_ps(x1, one);
	temp_12 = _mm256_add_ps(x2, one);
	temp_21 = _mm256_mul_ps(x21, _ps_cephes_exp_p5);
	temp_22 = _mm256_mul_ps(x22, _ps_cephes_exp_p5);
	temp_31 = _mm256_mul_ps(x31, _ps_cephes_exp_p4);
	temp_32 = _mm256_mul_ps(x32, _ps_cephes_exp_p4);
	temp_11 = _mm256_add_ps(temp_11, temp_21);
	temp_12 = _mm256_add_ps(temp_12, temp_22);
	temp_21 = _mm256_mul_ps(x31, _ps_cephes_exp_p0);
	temp_22 = _mm256_mul_ps(x32, _ps_cephes_exp_p0);
	temp_11 = _mm256_add_ps(temp_11, temp_31);
	temp_12 = _mm256_add_ps(temp_12, temp_32);

	avx_m256_t temp_41 = _mm256_mul_ps(x1, _ps_cephes_exp_p2);
	avx_m256_t temp_42 = _mm256_mul_ps(x2, _ps_cephes_exp_p2);
	temp_31 = _mm256_mul_ps(x21, _ps_cephes_exp_p1);
	temp_32 = _mm256_mul_ps(x22, _ps_cephes_exp_p1);

	//emm01 = _mm256_add_epi32(emm01, _pi32_0x7f);
	//emm02 = _mm256_add_epi32(emm02, _pi32_0x7f);
	emm01 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm01), _mm256_castsi256_ps(_pi32_0x7f)));
	emm02 = _mm256_castps_si256(_mm256_add_ps(_mm256_castsi256_ps(emm02), _mm256_castsi256_ps(_pi32_0x7f)));

	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_31 = _mm256_add_ps(temp_31, temp_41);
	temp_32 = _mm256_add_ps(temp_32, temp_42);

	//emm01 = _mm256_slli_epi32(emm01, 23);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 1);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 23);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 23);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 23);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 1);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 23);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 23);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 1);

	avx_m256_t pow2n1 = _mm256_castsi256_ps(emm01);
	avx_m256_t pow2n2 = _mm256_castsi256_ps(emm02);

	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_21 = _mm256_mul_ps(temp_21, x41);
	temp_22 = _mm256_mul_ps(temp_22, x42);

	avx_m256_t y1 = _mm256_add_ps(temp_11, temp_21);
	avx_m256_t y2 = _mm256_add_ps(temp_12, temp_22);
	*exp1 = _mm256_mul_ps(y1, pow2n1);
	*exp2 = _mm256_mul_ps(y2, pow2n2);
} // newexp_ps_dual()
namespace simd { namespace ext
{
  template<class Dummy>
  struct call<boost::simd::tag::shrai_( tag::simd_<tag::arithmetic_, tag::avx_>
                                      , tag::simd_<tag::arithmetic_, tag::avx_>
                                      )
             , boost::simd::tag::avx_
             , Dummy
             > : callable
  {
    template<class Sig> struct result;
    template<class This,class A0,class A1>
    struct result<This(A0,A1)> : meta::strip<A0> {};

    BOOST_SIMD_FUNCTOR_CALL(2)
    {
      typedef typename meta::scalar_of<A0>::type                     sctype;
      typedef typename simd::native<sctype, boost::simd::tag::sse_ > svtype;

      svtype a00 = { _mm256_extractf128_si256(a0, 0) };
      svtype a01 = { _mm256_extractf128_si256(a0, 1) };
      // Build from the low half first; the original initialized `that`
      // from itself, which is undefined behavior.
      A0 that = { _mm256_castsi128_si256(boost::simd::shrai(a00, a1)) };
      that = _mm256_insertf128_si256(that, boost::simd::shrai(a01, a1), 1);
      return that;
    }
  };
} } }

#endif
#endif
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	avx_m256_t sign_bit = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	//avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	//emm2 = _mm256_add_epi32(emm2, _pi32_1);
	avx_m256i_t emm2 = _mm256_cvttps_epi32(_mm256_add_ps(y, _ps_1));

	//emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_inv1)));

	y = _mm256_cvtepi32_ps(emm2);

	//avx_m256i_t cos_emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t cos_emm2 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm2), _ps_2));

	//avx_m256i_t emm0 = _mm256_and_si256(emm2, _pi32_4);
	avx_m256i_t emm0 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm0 = _mm256_andnot_si256(cos_emm2, _pi32_4);
	avx_m256i_t cos_emm0 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_4)));

	//emm0 = _mm256_slli_epi32(emm0, 29);
	__m128i emm0lo = _mm256_extractf128_si256(emm0, 0);
	__m128i emm0hi = _mm256_extractf128_si256(emm0, 1);
	emm0lo = _mm_slli_epi32(emm0lo, 29);
	emm0hi = _mm_slli_epi32(emm0hi, 29);
	emm0 = _mm256_insertf128_si256(emm0, emm0lo, 0);
	emm0 = _mm256_insertf128_si256(emm0, emm0hi, 1);

	//cos_emm0 = _mm256_slli_epi32(cos_emm0, 29);
	__m128i cos_emm0lo = _mm256_extractf128_si256(cos_emm0, 0);
	__m128i cos_emm0hi = _mm256_extractf128_si256(cos_emm0, 1);
	cos_emm0lo = _mm_slli_epi32(cos_emm0lo, 29);
	cos_emm0hi = _mm_slli_epi32(cos_emm0hi, 29);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0lo, 0);
	cos_emm0 = _mm256_insertf128_si256(cos_emm0, cos_emm0hi, 1);

	//emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm2), _mm256_castsi256_ps(_pi32_2)));

	//cos_emm2 = _mm256_and_si256(cos_emm2, _pi32_2);
	cos_emm2 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm2), _mm256_castsi256_ps(_pi32_2)));

	//emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm2 = _mm256_cmpeq_epi32(cos_emm2, _mm256_setzero_si256());
	cos_emm2 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm2), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f = _mm256_castsi256_ps(emm0);
	avx_m256_t emm2f = _mm256_castsi256_ps(emm2);
	avx_m256_t cos_emm0f = _mm256_castsi256_ps(cos_emm0);
	avx_m256_t cos_emm2f = _mm256_castsi256_ps(cos_emm2);

	sign_bit = _mm256_xor_ps(sign_bit, emm0f);

	avx_m256_t temp_2 = _ps_minus_cephes_DP123;
	temp_2 = _mm256_mul_ps(y, temp_2);
	x = _mm256_add_ps(x, temp_2);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;

	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);

	temp_2 = _mm256_mul_ps(x2, _ps_0p5);
	y2 = _mm256_add_ps(y2, x);
	temp_2 = _mm256_sub_ps(temp_2, _ps_1);
	y = _mm256_sub_ps(y, temp_2);

	avx_m256_t cos_y = y;
	avx_m256_t cos_y2 = y2;
	y = _mm256_andnot_ps(emm2f, y);
	cos_y = _mm256_andnot_ps(cos_emm2f, cos_y);
	y2 = _mm256_and_ps(emm2f, y2);
	cos_y2 = _mm256_and_ps(cos_emm2f, cos_y2);

	y = _mm256_add_ps(y, y2);
	cos_y = _mm256_add_ps(cos_y, cos_y2);

	*s = _mm256_xor_ps(y, sign_bit);
	*c = _mm256_xor_ps(cos_y, cos_emm0f);
} // newsincos_ps()
static INLINE void init_one_qp(const __m128i *p, __m256i *qp) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i dc = _mm_unpacklo_epi16(*p, zero);
  const __m128i ac = _mm_unpackhi_epi16(*p, zero);
  *qp = _mm256_insertf128_si256(_mm256_castsi128_si256(dc), ac, 1);
}
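// Usage sketch (hypothetical values): eight non-negative 16-bit quantizer
// entries widen to eight 32-bit lanes, the dc half in the low 128-bit lane
// and the ac half in the high lane.
static INLINE void example_init_one_qp(void) {
  const __m128i p = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
  __m256i qp;
  init_one_qp(&p, &qp);  // qp now holds 1..8 as 32-bit integers
}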
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
			avx_m256_t *c1, avx_m256_t *c2) {
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	//avx_m256i_t emm21 = _mm256_cvttps_epi32(y1);
	//avx_m256i_t emm22 = _mm256_cvttps_epi32(y2);
	//emm21 = _mm256_add_epi32(emm21, _pi32_1);
	//emm22 = _mm256_add_epi32(emm22, _pi32_1);
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	//emm21 = _mm256_and_si256(emm21, _pi32_inv1);
	//emm22 = _mm256_and_si256(emm22, _pi32_inv1);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	//avx_m256i_t tempia = _pi32_2;
	//avx_m256i_t cos_emm21 = _mm256_sub_epi32(emm21, tempia);
	//avx_m256i_t cos_emm22 = _mm256_sub_epi32(emm22, tempia);
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	//avx_m256i_t tempib = _pi32_4;
	//avx_m256i_t emm01 = _mm256_and_si256(emm21, tempib);
	//avx_m256i_t emm02 = _mm256_and_si256(emm22, tempib);
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_4)));

	//avx_m256i_t cos_emm01 = _mm256_andnot_si256(cos_emm21, tempib);
	//avx_m256i_t cos_emm02 = _mm256_andnot_si256(cos_emm22, tempib);
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_4)));

	//emm01 = _mm256_slli_epi32(emm01, 29);
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 1);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 1);

	//emm02 = _mm256_slli_epi32(emm02, 29);
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 1);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	// Bug fix: the original re-inserted the shifted halves of emm01 here
	// instead of the freshly shifted halves of emm02.
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 1);

	//cos_emm01 = _mm256_slli_epi32(cos_emm01, 29);
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 1);

	//cos_emm02 = _mm256_slli_epi32(cos_emm02, 29);
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 1);

	//tempia = _pi32_2;
	//tempib = _mm256_setzero_si256();
	//emm21 = _mm256_and_si256(emm21, tempia);
	//emm22 = _mm256_and_si256(emm22, tempia);
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_2)));

	//cos_emm21 = _mm256_and_si256(cos_emm21, tempia);
	//cos_emm22 = _mm256_and_si256(cos_emm22, tempia);
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21), _mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22), _mm256_castsi256_ps(_pi32_2)));

	//emm21 = _mm256_cmpeq_epi32(emm21, tempib);
	//emm22 = _mm256_cmpeq_epi32(emm22, tempib);
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	//cos_emm21 = _mm256_cmpeq_epi32(cos_emm21, tempib);
	//cos_emm22 = _mm256_cmpeq_epi32(cos_emm22, tempib);
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;
	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);

	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);

	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);

	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);

	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
                  ((simd_<arithmetic_<A0>,tag::avx_>))
                  ((simd_<arithmetic_<A0>,tag::avx_>))
                 );

namespace nt2 { namespace ext
{
  template<class Dummy>
  struct call<tag::shrai_( tag::simd_<tag::arithmetic_, tag::avx_>
                         , tag::simd_<tag::arithmetic_, tag::avx_>
                         )
             , tag::cpu_
             , Dummy
             > : callable
  {
    template<class Sig> struct result;
    template<class This,class A0,class A1>
    struct result<This(A0,A1)> : meta::strip<A0>{};

    NT2_FUNCTOR_CALL(2)
    {
      typedef typename meta::scalar_of<A0>::type        sctype;
      typedef typename simd::native<sctype, tag::sse_ > svtype;

      svtype a00 = { _mm256_extractf128_si256(a0, 0) };
      svtype a01 = { _mm256_extractf128_si256(a0, 1) };
      // Build from the low half first; the original initialized `that`
      // from itself, which is undefined behavior.
      A0 that = { _mm256_castsi128_si256(nt2::shrai(a00, a1)) };
      that = _mm256_insertf128_si256(that, nt2::shrai(a01, a1), 1);
      return that;
    }
  };
} }

#endif

// modified by jt the 04/01/2011
__m256i test_mm256_insertf128_si256_0(__m256i a, __m128i b) {
  // CHECK-LABEL: @test_mm256_insertf128_si256_0
  // CHECK: shufflevector{{.*}}<i32 4, i32 5, i32 2, i32 3>
  return _mm256_insertf128_si256(a, b, 0);
}
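// Why these CHECK patterns: clang models the 256-bit value as <4 x i64> and
// lowers _mm256_insertf128_si256 to a shufflevector whose elements 0-3 come
// from `a` and elements 4-5 from `b` (widened to 256 bits). Inserting at
// index 1 keeps a's low lane and takes b for the high lane (<0, 1, 4, 5>);
// inserting at index 0 takes b first (<4, 5, 2, 3>).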