static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
  __m128i ptype0, ptype1, vtag0, vtag1;
  union {
    uint16_t e[4];
    uint64_t dword;
  } vol;

  ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
  ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
  vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
  vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

  ptype1 = _mm_unpacklo_epi32(ptype0, ptype1);
  vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);

  ptype1 = _mm_slli_epi16(ptype1, PTYPE_SHIFT);
  vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);

  ptype1 = _mm_or_si128(ptype1, vtag1);
  vol.dword = _mm_cvtsi128_si64(ptype1) & OLFLAGS_MASK_V;

  rx_pkts[0]->ol_flags = vol.e[0];
  rx_pkts[1]->ol_flags = vol.e[1];
  rx_pkts[2]->ol_flags = vol.e[2];
  rx_pkts[3]->ol_flags = vol.e[3];
}
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
{
    __m128i T0 = _mm_unpacklo_epi64(B0, B1);
    __m128i T1 = _mm_unpacklo_epi64(B2, B3);
    __m128i T2 = _mm_unpackhi_epi64(B0, B1);
    __m128i T3 = _mm_unpackhi_epi64(B2, B3);

    T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
    T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
    T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
    T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

    T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
    T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
    T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
    T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

    T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
    T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
    T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
    T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

    B0 = _mm_unpacklo_epi32(T0, T1);
    B1 = _mm_unpackhi_epi32(T0, T1);
    B2 = _mm_unpacklo_epi32(T2, T3);
    B3 = _mm_unpackhi_epi32(T2, T3);
}
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);   // pred = (pb > pa) ? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here
      // we use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
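/* The DO_PRED11 / DO_PRED11_SHIFT macros used above are not defined in this
 * snippet. Below is a plausible reconstruction, derived from the expanded
 * inner loop of the earlier PredictorAdd11_SSE2 variant; the exact upstream
 * definitions may differ, and they would normally appear before the
 * function. */
#define DO_PRED11(OUT) do {                                             \
  /* pb = sum |L-TL| */                                                 \
  const __m128i L_lo = _mm_unpacklo_epi32(L, L);                        \
  const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);                      \
  const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);                         \
  const __m128i mask = _mm_cmpgt_epi32(pb, pa);                         \
  const __m128i A = _mm_and_si128(mask, L);                             \
  const __m128i B = _mm_andnot_si128(mask, T);                          \
  const __m128i pred = _mm_or_si128(A, B);  /* (pb > pa) ? L : T */     \
  L = _mm_add_epi8(src, pred);                                          \
  out[i + (OUT)] = (uint32_t)_mm_cvtsi128_si32(L);                      \
} while (0)

#define DO_PRED11_SHIFT do {                                            \
  /* Shift the pre-computed values for the next iteration. */           \
  T = _mm_srli_si128(T, 4);                                             \
  TL = _mm_srli_si128(TL, 4);                                           \
  src = _mm_srli_si128(src, 4);                                         \
  pa = _mm_srli_si128(pa, 4);                                           \
} while (0)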
inline FORCE_INLINE void transpose8_epi16(
    __m128i& x0, __m128i& x1, __m128i& x2, __m128i& x3,
    __m128i& x4, __m128i& x5, __m128i& x6, __m128i& x7)
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7;
    __m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

    t0 = _mm_unpacklo_epi16(x0, x1);
    t1 = _mm_unpacklo_epi16(x2, x3);
    t2 = _mm_unpacklo_epi16(x4, x5);
    t3 = _mm_unpacklo_epi16(x6, x7);
    t4 = _mm_unpackhi_epi16(x0, x1);
    t5 = _mm_unpackhi_epi16(x2, x3);
    t6 = _mm_unpackhi_epi16(x4, x5);
    t7 = _mm_unpackhi_epi16(x6, x7);

    tt0 = _mm_unpacklo_epi32(t0, t1);
    tt1 = _mm_unpackhi_epi32(t0, t1);
    tt2 = _mm_unpacklo_epi32(t2, t3);
    tt3 = _mm_unpackhi_epi32(t2, t3);
    tt4 = _mm_unpacklo_epi32(t4, t5);
    tt5 = _mm_unpackhi_epi32(t4, t5);
    tt6 = _mm_unpacklo_epi32(t6, t7);
    tt7 = _mm_unpackhi_epi32(t6, t7);

    x0 = _mm_unpacklo_epi64(tt0, tt2);
    x1 = _mm_unpackhi_epi64(tt0, tt2);
    x2 = _mm_unpacklo_epi64(tt1, tt3);
    x3 = _mm_unpackhi_epi64(tt1, tt3);
    x4 = _mm_unpacklo_epi64(tt4, tt6);
    x5 = _mm_unpackhi_epi64(tt4, tt6);
    x6 = _mm_unpacklo_epi64(tt5, tt7);
    x7 = _mm_unpackhi_epi64(tt5, tt7);
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
    const uint16_t* const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
  // aarrggbb as 16-bit.
  const __m128i shuff0 =
      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
  const __m128i shuff1 =
      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
  // R0R1G0G1
  // B0B1****
  // R2R3G2G3
  // B2B3****
  // (OR is used to free port 5 for the unpack)
  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
  const __m128i B1 = _mm_or_si128(A0, A1);
  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
  const __m128i B3 = _mm_or_si128(A2, A3);
  // Gather the channels.
  *r = _mm_unpacklo_epi64(B0, B2);
  *g = _mm_unpackhi_epi64(B0, B2);
  *b = _mm_unpackhi_epi64(B1, B3);
}
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
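/* Hedged reference sketch (not from the original source): the scalar
 * equivalent of SSE4x4 above, i.e. the sum of squared differences over a
 * 4x4 block whose rows are BPS bytes apart. Useful for validating the SIMD
 * path against a plain implementation. */
static int SSE4x4_scalar(const uint8_t* a, const uint8_t* b) {
  int y, x, sum = 0;
  for (y = 0; y < 4; ++y) {
    for (x = 0; x < 4; ++x) {
      const int d = a[y * BPS + x] - b[y * BPS + x];
      sum += d * d;
    }
  }
  return sum;
}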
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]),
      _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]),
      _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
// NOTE: STACK16 is used as a compile-time switch; the template parameter
// declaration below is reconstructed, assuming the original declared it so.
template <bool STACK16>
static inline void yuv_to_packed_shader_3(
    uint8_t** dstp, const uint8_t** srcp, const int dpitch, const int spitch,
    const int width, const int height, void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];
    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const uint8_t *rlsb, *glsb, *blsb;
    if (STACK16) {
        rlsb = sr + height * spitch;
        glsb = sg + height * spitch;
        blsb = sb + height * spitch;
    }

    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 4) {
            __m128i r, g, b;
            if (!STACK16) {
                r = _mm_cvtepu8_epi32(loadl(sr + x));
                g = _mm_cvtepu8_epi32(loadl(sg + x));
                b = _mm_cvtepu8_epi32(loadl(sb + x));
            } else {
                r = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(rlsb + x), loadl(sr + x)), zero);
                g = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(glsb + x), loadl(sg + x)), zero);
                b = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(blsb + x), loadl(sb + x)), zero);
            }
            __m128i rg = _mm_unpacklo_epi32(r, g);
            __m128i ba = _mm_unpacklo_epi32(b, zero);
            __m128 rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x + 0, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 4, _mm_mul_ps(rgba1, rcp));

            rg = _mm_unpackhi_epi32(r, g);
            ba = _mm_unpackhi_epi32(b, zero);
            rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x + 8, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 12, _mm_mul_ps(rgba1, rcp));
        }
        convert_float_to_half(d, buff, width * 4);
        d += dpitch;
        sr += spitch;
        sg += spitch;
        sb += spitch;
        if (STACK16) {
            rlsb += spitch;
            glsb += spitch;
            blsb += spitch;
        }
    }
}
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
                                    const uint8_t *ref8,
                                    const int ref_stride) {
  const int stride = ref_stride << 3;
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int i, j;

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        _mm_storeu_si128((__m128i *)(pred), t0);
        pred += 8;
        ref += 64;  // 8 * 8;
      }
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        _mm_storel_epi64((__m128i *)(pred), t0);
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int32_t(void *_idata,
                                                const int istride,
                                                const char *odata,
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight) {
  int32_t *idata = (int32_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi32(1);
  const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y += skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

      __m128i A0 = _mm_unpacklo_epi32(D0, D4);
      __m128i A2 = _mm_unpackhi_epi32(D0, D4);

      __m128i E0 = _mm_unpacklo_epi32(A0, A2);
      __m128i O1 = _mm_unpackhi_epi32(A0, A2);

      __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
      __m128i X1 = _mm_add_epi32(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
      __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi32(Z0, ONE);
        Z4 = _mm_add_epi32(Z4, ONE);
        Z0 = _mm_srai_epi32(Z0, shift);
        Z4 = _mm_srai_epi32(Z4, shift);
      }

      Z0 = _mm_add_epi32(Z0, OFFSET);
      Z4 = _mm_add_epi32(Z4, OFFSET);

      Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
      Z4 = _mm_slli_epi32(Z4, (16 - active_bits));

      __m128i R = _mm_packus_epi32(Z0, Z4);

      R = _mm_srli_epi16(R, (16 - active_bits));

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
    }
  }
}
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
                  struct rte_mbuf **rx_pkts)
{
  __m128i ptype0, ptype1, vtag0, vtag1;
  union {
    uint16_t e[4];
    uint64_t dword;
  } vol;

  /* mask everything except rss type */
  const __m128i rsstype_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      0x000F, 0x000F, 0x000F, 0x000F);

  /* map rss type to rss hash flag */
  const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
      0, 0, 0, PKT_RX_RSS_HASH,
      PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
      PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

  /* mask everything except vlan present bit */
  const __m128i vlan_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
      IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

  /* map vlan present (0x8) to ol_flags */
  const __m128i vlan_map = _mm_set_epi8(
      0, 0, 0, 0,
      0, 0, 0, vlan_flags,
      0, 0, 0, 0,
      0, 0, 0, 0);

  ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
  ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
  vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
  vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

  ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
  ptype0 = _mm_and_si128(ptype0, rsstype_msk);
  ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

  vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
  vtag1 = _mm_and_si128(vtag1, vlan_msk);
  vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

  vtag1 = _mm_or_si128(ptype0, vtag1);

  vol.dword = _mm_cvtsi128_si64(vtag1);

  rx_pkts[0]->ol_flags = vol.e[0];
  rx_pkts[1]->ol_flags = vol.e[1];
  rx_pkts[2]->ol_flags = vol.e[2];
  rx_pkts[3]->ol_flags = vol.e[3];
}
OD_SIMD_INLINE void od_transpose4(__m128i *t0, __m128i *t1,
                                  __m128i *t2, __m128i *t3) {
  __m128i a = _mm_unpacklo_epi32(*t0, *t1);
  __m128i b = _mm_unpacklo_epi32(*t2, *t3);
  __m128i c = _mm_unpackhi_epi32(*t0, *t1);
  __m128i d = _mm_unpackhi_epi32(*t2, *t3);
  *t0 = _mm_unpacklo_epi64(a, b);
  *t1 = _mm_unpackhi_epi64(a, b);
  *t2 = _mm_unpacklo_epi64(c, d);
  *t3 = _mm_unpackhi_epi64(c, d);
}
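/* Hedged usage sketch (not part of the original source): transposing a 4x4
 * block of 32-bit coefficients held in four row vectors. After the call, t0
 * holds the first input column {0, 4, 8, 12}, t1 the second, and so on. */
#include <emmintrin.h>
#include <stdint.h>

static void od_transpose4_example(void) {
  int32_t rows[4][4] = {
    { 0,  1,  2,  3},
    { 4,  5,  6,  7},
    { 8,  9, 10, 11},
    {12, 13, 14, 15}
  };
  __m128i t0 = _mm_loadu_si128((const __m128i *)rows[0]);
  __m128i t1 = _mm_loadu_si128((const __m128i *)rows[1]);
  __m128i t2 = _mm_loadu_si128((const __m128i *)rows[2]);
  __m128i t3 = _mm_loadu_si128((const __m128i *)rows[3]);
  od_transpose4(&t0, &t1, &t2, &t3);
  _mm_storeu_si128((__m128i *)rows[0], t0);  /* rows[0] = {0, 4, 8, 12} */
}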
// Predictor11: select.
static void GetSumAbsDiff32_SSE2(const __m128i* const A,
                                 const __m128i* const B,
                                 __m128i* const out) {
  // We can unpack with any value on the upper 32 bits, provided it's the same
  // on both operands (so that their sum of abs diff is zero). Here we use *A.
  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
  *out = _mm_packs_epi32(s_lo, s_hi);
}
/*Transpose 8 vectors with 8 16-bit values.*/
OD_SIMD_INLINE void od_transpose16x8(__m128i *t0, __m128i *t1,
                                     __m128i *t2, __m128i *t3,
                                     __m128i *t4, __m128i *t5,
                                     __m128i *t6, __m128i *t7) {
  __m128i a0, b0, c0, d0, e0, f0, g0, h0;
  __m128i a1, b1, c1, d1, e1, f1, g1, h1;
  /*00112233*/
  a0 = _mm_unpacklo_epi16(*t0, *t1);
  b0 = _mm_unpacklo_epi16(*t2, *t3);
  c0 = _mm_unpacklo_epi16(*t4, *t5);
  d0 = _mm_unpacklo_epi16(*t6, *t7);
  /*44556677*/
  e0 = _mm_unpackhi_epi16(*t0, *t1);
  f0 = _mm_unpackhi_epi16(*t2, *t3);
  g0 = _mm_unpackhi_epi16(*t4, *t5);
  h0 = _mm_unpackhi_epi16(*t6, *t7);
  /*00001111*/
  a1 = _mm_unpacklo_epi32(a0, b0);
  b1 = _mm_unpacklo_epi32(c0, d0);
  /*22223333*/
  c1 = _mm_unpackhi_epi32(a0, b0);
  d1 = _mm_unpackhi_epi32(c0, d0);
  /*44445555*/
  e1 = _mm_unpacklo_epi32(e0, f0);
  f1 = _mm_unpacklo_epi32(g0, h0);
  /*66667777*/
  g1 = _mm_unpackhi_epi32(e0, f0);
  h1 = _mm_unpackhi_epi32(g0, h0);
  *t0 = _mm_unpacklo_epi64(a1, b1);
  *t1 = _mm_unpackhi_epi64(a1, b1);
  *t2 = _mm_unpacklo_epi64(c1, d1);
  *t3 = _mm_unpackhi_epi64(c1, d1);
  *t4 = _mm_unpacklo_epi64(e1, f1);
  *t5 = _mm_unpackhi_epi64(e1, f1);
  *t6 = _mm_unpacklo_epi64(g1, h1);
  *t7 = _mm_unpackhi_epi64(g1, h1);
}
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}
// NOTE: as above, the STACK16 template parameter declaration is
// reconstructed; the snippet uses it as a compile-time switch.
template <bool STACK16>
static inline void packed_shader_to_yuv_3(
    uint8_t** dstp, const uint8_t** srcp, const int dpitch, const int spitch,
    const int width, const int height, void* _buff) noexcept
{
    const uint8_t* s = srcp[0];
    uint8_t* dr = dstp[0];
    uint8_t* dg = dstp[1];
    uint8_t* db = dstp[2];
    float* buff = reinterpret_cast<float*>(_buff);

    uint8_t *lr, *lg, *lb;
    const __m128 coef = _mm_set1_ps(STACK16 ? 65535.0f : 255.0f);
    const __m128i mask = _mm_set1_epi16(0x00FF);
    if (STACK16) {
        lr = dr + height * dpitch;
        lg = dg + height * dpitch;
        lb = db + height * dpitch;
    }

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(buff, s, width * 4);
        for (int x = 0; x < width; x += 4) {
            __m128i s0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 0)));   // R0,G0,B0,A0
            __m128i s1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 4)));   // R1,G1,B1,A1
            __m128i s2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 8)));   // R2,G2,B2,A2
            __m128i s3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 12)));  // R3,G3,B3,A3

            s0 = _mm_or_si128(s0, _mm_slli_epi32(s1, 16));  // R0,R1,G0,G1,B0,B1,A0,A1
            s1 = _mm_or_si128(s2, _mm_slli_epi32(s3, 16));  // R2,R3,G2,G3,B2,B3,A2,A3
            s2 = _mm_unpacklo_epi32(s0, s1);                // R0,R1,R2,R3,G0,G1,G2,G3
            s3 = _mm_unpackhi_epi32(s0, s1);                // B0,B1,B2,B3,A0,A1,A2,A3

            if (!STACK16) {
                s0 = _mm_packus_epi16(s2, s3);
                *(reinterpret_cast<int32_t*>(dr + x)) = _mm_cvtsi128_si32(s0);
                *(reinterpret_cast<int32_t*>(dg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(s0, 4));
                *(reinterpret_cast<int32_t*>(db + x)) = _mm_cvtsi128_si32(_mm_srli_si128(s0, 8));
            } else {
                __m128i rgbamsb = _mm_packus_epi16(_mm_srli_epi16(s2, 8), _mm_srli_epi16(s3, 8));
                __m128i rgbalsb = _mm_packus_epi16(_mm_and_si128(s2, mask), _mm_and_si128(s3, mask));
                *(reinterpret_cast<int32_t*>(dr + x)) = _mm_cvtsi128_si32(rgbamsb);
                *(reinterpret_cast<int32_t*>(lr + x)) = _mm_cvtsi128_si32(rgbalsb);
                *(reinterpret_cast<int32_t*>(dg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbamsb, 4));
                *(reinterpret_cast<int32_t*>(lg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbalsb, 4));
                *(reinterpret_cast<int32_t*>(db + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbamsb, 8));
                *(reinterpret_cast<int32_t*>(lb + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbalsb, 8));
            }
        }
        s += spitch;
        dr += dpitch;
        dg += dpitch;
        db += dpitch;
        if (STACK16) {
            lr += dpitch;
            lg += dpitch;
            lb += dpitch;
        }
    }
}
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void shuffle4(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[4], xmm1[4];

  numof16belem = size / (16*4);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*4) {
    /* Fetch and transpose bytes and words in groups of 64 bytes */
    for (k = 0; k < 4; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0x8d);
      xmm0[k] = _mm_unpacklo_epi8(xmm1[k], xmm0[k]);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x04e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
    }
    /* Transpose double words */
    for (k = 0; k < 2; k++) {
      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[k*2], xmm0[k*2+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[k*2], xmm0[k*2+1]);
    }
    /* Transpose quad words */
    for (k = 0; k < 2; k++) {
      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+2]);
      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+2]);
    }
    /* Store the result vectors */
    for (k = 0; k < 4; k++) {
      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
    }
  }
}
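/* Hedged reference sketch (not from the original source): the scalar
 * equivalent of shuffle4, assuming the usual byte-plane ("shuffle") layout
 * where byte k of every 4-byte element is gathered into plane k of the
 * output. Handy as a correctness check for the SIMD path. */
static void shuffle4_scalar(uint8_t* dest, const uint8_t* src, size_t size)
{
  const size_t nelem = size / 4;
  size_t i, b;
  for (i = 0; i < nelem; i++) {
    for (b = 0; b < 4; b++) {
      dest[b * nelem + i] = src[i * 4 + b];
    }
  }
}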
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));
    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);
    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);
    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
template<class R0, class R1>
inline void eval(A0 const& a0, R0& r0, R1& r1,
                 const simd::native<typename boost::simd::meta::uint64_t_<A0>::type,
                                    boost::simd::tag::sse_>&) const
{
    typedef simd::native<typename boost::simd::meta::uint64_t_<A0>::type,
                         boost::simd::tag::sse_> rtype;
    r1 = bitwise_cast<rtype>(_mm_unpackhi_epi32(a0, Zero<A0>()));
    r0 = bitwise_cast<rtype>(_mm_unpacklo_epi32(a0, Zero<A0>()));
}
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
  __m128i b = _mm_set1_epi32(b1);
  __m128i lo = _mm_mul_epu32(a, b);
  __m128i hi = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)),
                            _mm_shuffle_epi32(hi, _MM_SHUFFLE(0, 0, 2, 0)));
}
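/* Hedged sanity sketch (not from the original source): od_mullo_epi32_sse2
 * emulates _mm_mullo_epi32 on SSE2, so each lane should match plain 32-bit
 * multiplication modulo 2^32. */
#include <assert.h>
#include <stdint.h>

static void od_mullo_epi32_check(void) {
  int32_t in[4] = { 3, -7, 100000, 42 };
  int32_t out[4];
  int i;
  __m128i v = od_mullo_epi32_sse2(_mm_loadu_si128((const __m128i *)in), -5);
  _mm_storeu_si128((__m128i *)out, v);
  for (i = 0; i < 4; i++) assert(out[i] == in[i] * -5);
}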
inline __m128i Convert8DigitsSSE2(uint32_t value) {
    assert(value <= 99999999);

    // abcd, efgh = abcdefgh divmod 10000
    const __m128i abcdefgh = _mm_cvtsi32_si128(value);
    const __m128i abcd = _mm_srli_epi64(
        _mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
    const __m128i efgh = _mm_sub_epi32(
        abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

    // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

    // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1a = _mm_slli_epi64(v1, 2);

    // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4,
    //        efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
    const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
    const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

    // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
    const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
    const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

    // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
    const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

    // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
    const __m128i v6 = _mm_slli_epi64(v5, 16);

    // v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
    const __m128i v7 = _mm_sub_epi16(v4, v6);

    return v7;
}
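/* Hedged usage sketch (not from the original source): turning the eight
 * 16-bit digits produced by Convert8DigitsSSE2 into ASCII by packing to
 * bytes and adding '0'. The helper name is an illustrative assumption. */
static inline void Write8DigitsSSE2(uint32_t value, char* buffer) {
    const __m128i digits = Convert8DigitsSSE2(value);
    /* Pack the 16-bit digits down to bytes, then bias each into '0'..'9'. */
    const __m128i ascii =
        _mm_add_epi8(_mm_packus_epi16(digits, digits), _mm_set1_epi8('0'));
    _mm_storel_epi64((__m128i*)buffer, ascii);  /* writes exactly 8 chars */
}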
inline __m128i func_mul_epu32(__m128i a, __m128i b) {
#if 1
    // Multiply elements 0 and 2, and put the 64-bit results into a vector.
    __m128i tmp02 = _mm_mul_epu32(a, b);
    // Shift the vectors by one dword to the right (moving 3->2 and 1->0),
    // then multiply elements 1 and 3 into a second double-word vector.
    __m128i tmp13 = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
    // Shuffle the vectors to place the lower 32 bits of each result in the
    // lower two words. I have some concerns about endianness and portability
    // related to this function.
    __m128i tmpres02 = _mm_shuffle_epi32(tmp02, _MM_SHUFFLE(0, 0, 2, 0));
    __m128i tmpres13 = _mm_shuffle_epi32(tmp13, _MM_SHUFFLE(0, 0, 2, 0));
    // Unpack the shuffled vectors into the return value.
    return _mm_unpacklo_epi32(tmpres02, tmpres13);
#else
    pvInt ret;
    int* p_a;
    int* p_b;
    p_a = (int*)&a;
    p_b = (int*)&b;
    for (int m = 0; m < VEC_SIZE; m++) {
        ret.v[m] = p_a[m] * p_b[m];
    }
    return ret.r;
#endif
}
/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
  __m128i l3l4type0, l3l4type1, l3type, l4type;
  union {
    uint16_t e[4];
    uint64_t dword;
  } vol;

  /* L3 pkt type mask  Bit4 to Bit6 */
  const __m128i l3type_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      0x0070, 0x0070, 0x0070, 0x0070);

  /* L4 pkt type mask  Bit7 to Bit9 */
  const __m128i l4type_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      0x0380, 0x0380, 0x0380, 0x0380);

  /* convert RRC l3 type to mbuf format */
  const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
      0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
      RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
      RTE_PTYPE_L3_IPV4, 0);

  /* Convert RRC l4 type to mbuf format. The table entries are stored
   * right-shifted by 8 bits so that each value fits into 8 bits.
   */
  const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
      RTE_PTYPE_TUNNEL_GENEVE >> 8,
      RTE_PTYPE_TUNNEL_NVGRE >> 8,
      RTE_PTYPE_TUNNEL_VXLAN >> 8,
      RTE_PTYPE_TUNNEL_GRE >> 8,
      RTE_PTYPE_L4_UDP >> 8,
      RTE_PTYPE_L4_TCP >> 8,
      0);

  l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
  l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
  l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

  l3type = _mm_and_si128(l3l4type0, l3type_msk);
  l4type = _mm_and_si128(l3l4type0, l4type_msk);

  l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
  l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

  l3type = _mm_shuffle_epi8(l3type_flags, l3type);
  /* The table entries were right-shifted by 8 bits; shift left to restore. */
  l4type = _mm_shuffle_epi8(l4type_flags, l4type);
  l4type = _mm_slli_epi16(l4type, 8);
  l3l4type0 = _mm_or_si128(l3type, l4type);

  vol.dword = _mm_cvtsi128_si64(l3l4type0);

  rx_pkts[0]->packet_type = vol.e[0];
  rx_pkts[1]->packet_type = vol.e[1];
  rx_pkts[2]->packet_type = vol.e[2];
  rx_pkts[3]->packet_type = vol.e[3];
}
SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
{
    SIMDValue result;

    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    if (AutoSystemInfo::Data.SSE4_1Available())
    {
        // a * b, only available in SSE4
        x86Result.m128i_value = _mm_mullo_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value);
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else if (AutoSystemInfo::Data.SSE2Available())
    {
        // mul 2,0: r0 = a0*b0; r1 = a2*b2
        __m128i tmp1 = _mm_mul_epu32(tmpaValue.m128i_value, tmpbValue.m128i_value);
        // mul 3,1: r0 = a1*b1; r1 = a3*b3
        __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(tmpaValue.m128i_value, 4),
                                     _mm_srli_si128(tmpbValue.m128i_value, 4));
        // shuffle x86Results to [63..0] and pack
        x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)),
                                                   _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else
    {
        result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
        result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
        result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
        result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
    }

    return result;
}
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together to get the even output

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);

    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
  __m128i ptype0, ptype1, vtag0, vtag1;
  union {
    uint16_t e[4];
    uint64_t dword;
  } vol;

  /* pkt type + vlan olflags mask */
  const __m128i pkttype_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
      PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);

  /* mask everything except rss type */
  const __m128i rsstype_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      0x000F, 0x000F, 0x000F, 0x000F);

  /* map rss type to rss hash flag */
  const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
      0, 0, 0, PKT_RX_RSS_HASH,
      PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
      PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

  ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
  ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
  vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
  vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

  ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
  ptype0 = _mm_and_si128(ptype0, rsstype_msk);
  ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

  vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
  vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
  vtag1 = _mm_and_si128(vtag1, pkttype_msk);

  vtag1 = _mm_or_si128(ptype0, vtag1);

  vol.dword = _mm_cvtsi128_si64(vtag1);

  rx_pkts[0]->ol_flags = vol.e[0];
  rx_pkts[1]->ol_flags = vol.e[1];
  rx_pkts[2]->ol_flags = vol.e[2];
  rx_pkts[3]->ol_flags = vol.e[3];
}
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);

  // Combine pair of lines and convert to 16b.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);

  // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
  // TODO(cduvivier): Disassemble and figure out why this is fastest. We don't
  //                  need absolute values, there is no need to do calculation
  //                  in 8bit as we are already in 16bit, ... Yet this is what
  //                  benchmarks the fastest!
  const __m128i d0 = _mm_subs_epu8(a01s, b01s);
  const __m128i d1 = _mm_subs_epu8(b01s, a01s);
  const __m128i d2 = _mm_subs_epu8(a23s, b23s);
  const __m128i d3 = _mm_subs_epu8(b23s, a23s);

  // Square and add them all together.
  const __m128i madd0 = _mm_madd_epi16(d0, d0);
  const __m128i madd1 = _mm_madd_epi16(d1, d1);
  const __m128i madd2 = _mm_madd_epi16(d2, d2);
  const __m128i madd3 = _mm_madd_epi16(d3, d3);
  const __m128i sum0 = _mm_add_epi32(madd0, madd1);
  const __m128i sum1 = _mm_add_epi32(madd2, madd3);
  const __m128i sum2 = _mm_add_epi32(sum0, sum1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum2);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}