コード例 #1
1
ファイル: ixgbe_rxtx_vec.c プロジェクト: RIFTIO/RIFT.ware
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype1 = _mm_unpacklo_epi32(ptype0, ptype1);
	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);

	ptype1 = _mm_slli_epi16(ptype1, PTYPE_SHIFT);
	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);

	ptype1 = _mm_or_si128(ptype1, vtag1);
	vol.dword = _mm_cvtsi128_si64(ptype1) & OLFLAGS_MASK_V;

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
コード例 #2
0
ファイル: idea_sse2.cpp プロジェクト: louiz/botan
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   __m128i T0 = _mm_unpacklo_epi64(B0, B1);
   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
   __m128i T3 = _mm_unpackhi_epi64(B2, B3);

   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   B0 = _mm_unpacklo_epi32(T0, T1);
   B1 = _mm_unpackhi_epi32(T0, T1);
   B2 = _mm_unpacklo_epi32(T2, T3);
   B3 = _mm_unpackhi_epi32(T2, T3);
   }
コード例 #3
0
ファイル: lossless_sse2.c プロジェクト: 2025Labs/iOS
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
コード例 #4
0
ファイル: lossless_sse2.c プロジェクト: bovender/XLToolbox
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
コード例 #5
0
ファイル: resize_impl_sse2.cpp プロジェクト: dubhater/zimg
inline FORCE_INLINE void transpose8_epi16(__m128i &x0, __m128i &x1, __m128i &x2, __m128i &x3, __m128i &x4, __m128i &x5, __m128i &x6, __m128i &x7)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7;
	__m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

	t0 = _mm_unpacklo_epi16(x0, x1);
	t1 = _mm_unpacklo_epi16(x2, x3);
	t2 = _mm_unpacklo_epi16(x4, x5);
	t3 = _mm_unpacklo_epi16(x6, x7);
	t4 = _mm_unpackhi_epi16(x0, x1);
	t5 = _mm_unpackhi_epi16(x2, x3);
	t6 = _mm_unpackhi_epi16(x4, x5);
	t7 = _mm_unpackhi_epi16(x6, x7);

	tt0 = _mm_unpacklo_epi32(t0, t1);
	tt1 = _mm_unpackhi_epi32(t0, t1);
	tt2 = _mm_unpacklo_epi32(t2, t3);
	tt3 = _mm_unpackhi_epi32(t2, t3);
	tt4 = _mm_unpacklo_epi32(t4, t5);
	tt5 = _mm_unpackhi_epi32(t4, t5);
	tt6 = _mm_unpacklo_epi32(t6, t7);
	tt7 = _mm_unpackhi_epi32(t6, t7);

	x0 = _mm_unpacklo_epi64(tt0, tt2);
	x1 = _mm_unpackhi_epi64(tt0, tt2);
	x2 = _mm_unpacklo_epi64(tt1, tt3);
	x3 = _mm_unpackhi_epi64(tt1, tt3);
	x4 = _mm_unpacklo_epi64(tt4, tt6);
	x5 = _mm_unpackhi_epi64(tt4, tt6);
	x6 = _mm_unpacklo_epi64(tt5, tt7);
	x7 = _mm_unpackhi_epi64(tt5, tt7);
}
コード例 #6
0
ファイル: yuv_sse41.c プロジェクト: bovender/XLToolbox
// Convert 16 packed ARGB 16b-values to r[], g[], b[]
static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41(
    const uint16_t* const rgbx,
    __m128i* const r, __m128i* const g, __m128i* const b) {
  const __m128i in0 = LOAD_16(rgbx +  0);  // r0 | g0 | b0 |x| r1 | g1 | b1 |x
  const __m128i in1 = LOAD_16(rgbx +  8);  // r2 | g2 | b2 |x| r3 | g3 | b3 |x
  const __m128i in2 = LOAD_16(rgbx + 16);  // r4 | ...
  const __m128i in3 = LOAD_16(rgbx + 24);  // r6 | ...
  // aarrggbb as 16-bit.
  const __m128i shuff0 =
      _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0);
  const __m128i shuff1 =
      _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0);
  const __m128i A0 = _mm_shuffle_epi8(in0, shuff0);
  const __m128i A1 = _mm_shuffle_epi8(in1, shuff1);
  const __m128i A2 = _mm_shuffle_epi8(in2, shuff0);
  const __m128i A3 = _mm_shuffle_epi8(in3, shuff1);
  // R0R1G0G1
  // B0B1****
  // R2R3G2G3
  // B2B3****
  // (OR is used to free port 5 for the unpack)
  const __m128i B0 = _mm_unpacklo_epi32(A0, A1);
  const __m128i B1 = _mm_or_si128(A0, A1);
  const __m128i B2 = _mm_unpacklo_epi32(A2, A3);
  const __m128i B3 = _mm_or_si128(A2, A3);
  // Gather the channels.
  *r = _mm_unpacklo_epi64(B0, B2);
  *g = _mm_unpackhi_epi64(B0, B2);
  *b = _mm_unpackhi_epi64(B1, B3);
}
コード例 #7
0
ファイル: dsp.enc_sse2.c プロジェクト: Antranilan/Sparky
static int SSE4x4(const uint8_t* a, const uint8_t* b) {
  const __m128i zero = _mm_setzero_si128();

  // Load values. Note that we read 8 pixels instead of 4,
  // but the a/b buffers are over-allocated to that effect.
  const __m128i a0 = _mm_loadl_epi64((const __m128i*)&a[BPS * 0]);
  const __m128i a1 = _mm_loadl_epi64((const __m128i*)&a[BPS * 1]);
  const __m128i a2 = _mm_loadl_epi64((const __m128i*)&a[BPS * 2]);
  const __m128i a3 = _mm_loadl_epi64((const __m128i*)&a[BPS * 3]);
  const __m128i b0 = _mm_loadl_epi64((const __m128i*)&b[BPS * 0]);
  const __m128i b1 = _mm_loadl_epi64((const __m128i*)&b[BPS * 1]);
  const __m128i b2 = _mm_loadl_epi64((const __m128i*)&b[BPS * 2]);
  const __m128i b3 = _mm_loadl_epi64((const __m128i*)&b[BPS * 3]);
  // Combine pair of lines.
  const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
  const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
  const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
  const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
  // Convert to 16b.
  const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
  const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
  const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
  const __m128i b23s = _mm_unpacklo_epi8(b23, zero);
  // subtract, square and accumulate
  const __m128i d0 = _mm_subs_epi16(a01s, b01s);
  const __m128i d1 = _mm_subs_epi16(a23s, b23s);
  const __m128i e0 = _mm_madd_epi16(d0, d0);
  const __m128i e1 = _mm_madd_epi16(d1, d1);
  const __m128i sum = _mm_add_epi32(e0, e1);

  int32_t tmp[4];
  _mm_storeu_si128((__m128i*)tmp, sum);
  return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}
コード例 #8
0
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] =
        _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] =
        _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] =
        _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] =
        _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]), _mm_unpackhi_epi64 (t[0],
          t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]), _mm_unpackhi_epi64 (t[2],
          t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2,
              3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1,
              1)));

  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
コード例 #9
0
static inline void
yuv_to_packed_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];

    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const uint8_t *rlsb, *glsb, *blsb;
    if (STACK16) {
        rlsb = sr + height * spitch;
        glsb = sg + height * spitch;
        blsb = sb + height * spitch;
    }

    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 4) {
            __m128i r, g, b;
            if (!STACK16) {
                r = _mm_cvtepu8_epi32(loadl(sr + x));
                g = _mm_cvtepu8_epi32(loadl(sg + x));
                b = _mm_cvtepu8_epi32(loadl(sb + x));
            } else {
                r = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(rlsb + x), loadl(sr + x)), zero);
                g = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(glsb + x), loadl(sg + x)), zero);
                b = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(blsb + x), loadl(sb + x)), zero);
            }
            __m128i rg = _mm_unpacklo_epi32(r, g);
            __m128i ba = _mm_unpacklo_epi32(b, zero);
            __m128 rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x + 0, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 4, _mm_mul_ps(rgba1, rcp));

            rg = _mm_unpackhi_epi32(r, g);
            ba = _mm_unpackhi_epi32(b, zero);
            rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x +  8, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 12, _mm_mul_ps(rgba1, rcp));
        }
        convert_float_to_half(d, buff, width * 4);
        d += dpitch;
        sr += spitch;
        sg += spitch;
        sb += spitch;
        if (STACK16) {
            rlsb += spitch;
            glsb += spitch;
            blsb += spitch;
        }
    }
}
コード例 #10
0
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
コード例 #11
0
ファイル: highbd_variance_sse2.c プロジェクト: negge/aom
void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
                                    const uint8_t *ref8, const int ref_stride) {
  const int stride = ref_stride << 3;
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int i, j;

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        _mm_storeu_si128((__m128i *)(pred), t0);
        pred += 8;
        ref += 64;  // 8 * 8;
      }
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        _mm_storel_epi64((__m128i *)(pred), t0);
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
コード例 #12
0
template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int32_t(void *_idata,
																			   const int istride,
																			   const char *odata,
																			   const int ostride,
																			   const int iwidth,
																			   const int iheight,
																			   const int ooffset_x,
																			   const int ooffset_y,
																			   const int owidth,
																			   const int oheight) {
  int32_t *idata = (int32_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi32(1);
  const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

      __m128i A0 = _mm_unpacklo_epi32(D0, D4);
      __m128i A2 = _mm_unpackhi_epi32(D0, D4);

      __m128i E0 = _mm_unpacklo_epi32(A0, A2);
      __m128i O1 = _mm_unpackhi_epi32(A0, A2);

      __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
      __m128i X1 = _mm_add_epi32(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
      __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi32(Z0, ONE);
        Z4 = _mm_add_epi32(Z4, ONE);
        Z0 = _mm_srai_epi32(Z0, shift);
        Z4 = _mm_srai_epi32(Z4, shift);
      }

      Z0 = _mm_add_epi32(Z0, OFFSET);
      Z4 = _mm_add_epi32(Z4, OFFSET);

      Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
      Z4 = _mm_slli_epi32(Z4, (16 - active_bits));

      __m128i R = _mm_packus_epi32(Z0, Z4);

      R = _mm_srli_epi16(R, (16 - active_bits));
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
    }
  }
}
コード例 #13
0
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
	struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	/* mask everything except vlan present bit */
	const __m128i vlan_msk = _mm_set_epi16(
			0x0000, 0x0000,
			0x0000, 0x0000,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
	/* map vlan present (0x8) to ol_flags */
	const __m128i vlan_map = _mm_set_epi8(
		0, 0, 0, 0,
		0, 0, 0, vlan_flags,
		0, 0, 0, 0,
		0, 0, 0, 0);

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	vtag1 = _mm_and_si128(vtag1, vlan_msk);
	vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

	vtag1 = _mm_or_si128(ptype0, vtag1);
	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
コード例 #14
0
ファイル: sse2dct.c プロジェクト: AlecGamble/daala
OD_SIMD_INLINE void od_transpose4(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3) {
  __m128i a = _mm_unpacklo_epi32(*t0, *t1);
  __m128i b = _mm_unpacklo_epi32(*t2, *t3);
  __m128i c = _mm_unpackhi_epi32(*t0, *t1);
  __m128i d = _mm_unpackhi_epi32(*t2, *t3);
  *t0 = _mm_unpacklo_epi64(a, b);
  *t1 = _mm_unpackhi_epi64(a, b);
  *t2 = _mm_unpacklo_epi64(c, d);
  *t3 = _mm_unpackhi_epi64(c, d);
}
コード例 #15
0
ファイル: lossless_enc_sse2.c プロジェクト: Achraf33/opencv
// Predictor11: select.
static void GetSumAbsDiff32_SSE2(const __m128i* const A, const __m128i* const B,
                                 __m128i* const out) {
  // We can unpack with any value on the upper 32 bits, provided it's the same
  // on both operands (to that their sum of abs diff is zero). Here we use *A.
  const __m128i A_lo = _mm_unpacklo_epi32(*A, *A);
  const __m128i B_lo = _mm_unpacklo_epi32(*B, *A);
  const __m128i A_hi = _mm_unpackhi_epi32(*A, *A);
  const __m128i B_hi = _mm_unpackhi_epi32(*B, *A);
  const __m128i s_lo = _mm_sad_epu8(A_lo, B_lo);
  const __m128i s_hi = _mm_sad_epu8(A_hi, B_hi);
  *out = _mm_packs_epi32(s_lo, s_hi);
}
コード例 #16
0
ファイル: sse2mcenc.c プロジェクト: KyleSiefring/daala
/*Transpose 8 vectors with 8 16-bit values.*/
OD_SIMD_INLINE void od_transpose16x8(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3,  __m128i *t4, __m128i *t5,
 __m128i *t6, __m128i *t7) {
  __m128i a0;
  __m128i b0;
  __m128i c0;
  __m128i d0;
  __m128i e0;
  __m128i f0;
  __m128i g0;
  __m128i h0;
  __m128i a1;
  __m128i b1;
  __m128i c1;
  __m128i d1;
  __m128i e1;
  __m128i f1;
  __m128i g1;
  __m128i h1;
  /*00112233*/
  a0 = _mm_unpacklo_epi16(*t0, *t1);
  b0 = _mm_unpacklo_epi16(*t2, *t3);
  c0 = _mm_unpacklo_epi16(*t4, *t5);
  d0 = _mm_unpacklo_epi16(*t6, *t7);
  /*44556677*/
  e0 = _mm_unpackhi_epi16(*t0, *t1);
  f0 = _mm_unpackhi_epi16(*t2, *t3);
  g0 = _mm_unpackhi_epi16(*t4, *t5);
  h0 = _mm_unpackhi_epi16(*t6, *t7);
  /*00001111*/
  a1 = _mm_unpacklo_epi32(a0, b0);
  b1 = _mm_unpacklo_epi32(c0, d0);
  /*22223333*/
  c1 = _mm_unpackhi_epi32(a0, b0);
  d1 = _mm_unpackhi_epi32(c0, d0);
  /*44445555*/
  e1 = _mm_unpacklo_epi32(e0, f0);
  f1 = _mm_unpacklo_epi32(g0, h0);
  /*66667777*/
  g1 = _mm_unpackhi_epi32(e0, f0);
  h1 = _mm_unpackhi_epi32(g0, h0);
  *t0 = _mm_unpacklo_epi64(a1, b1);
  *t1 = _mm_unpackhi_epi64(a1, b1);
  *t2 = _mm_unpacklo_epi64(c1, d1);
  *t3 = _mm_unpackhi_epi64(c1, d1);
  *t4 = _mm_unpacklo_epi64(e1, f1);
  *t5 = _mm_unpackhi_epi64(e1, f1);
  *t6 = _mm_unpacklo_epi64(g1, h1);
  *t7 = _mm_unpackhi_epi64(g1, h1);
}
コード例 #17
0
ファイル: av1_fwd_txfm_sse2.c プロジェクト: shacklettbp/aom
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}
コード例 #18
0
static inline void
packed_shader_to_yuv_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch,
    const int spitch, const int width, const int height, void* _buff) noexcept
{
    const uint8_t* s = srcp[0];

    uint8_t* dr = dstp[0];
    uint8_t* dg = dstp[1];
    uint8_t* db = dstp[2];

    float * buff = reinterpret_cast<float*>(_buff);

    uint8_t *lr, *lg, *lb;
    const __m128 coef = _mm_set1_ps(STACK16 ? 65535.0f : 255.0f);
    const __m128i mask = _mm_set1_epi16(0x00FF);

    if (STACK16) {
        lr = dr + height * dpitch;
        lg = dg + height * dpitch;
        lb = db + height * dpitch;
    }

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(buff, s, width * 4);
        for (int x = 0; x < width; x += 4) {
            __m128i s0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x +  0))); // R0,G0,B0,A0
            __m128i s1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x +  4))); // R1,G1,B1,A1
            __m128i s2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x +  8))); // R2,G2,B2,A2
            __m128i s3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 12))); // R3,G3,B3,A3
            s0 = _mm_or_si128(s0, _mm_slli_epi32(s1, 16)); //R0,R1,G0,G1,B0,B1,A0,A1
            s1 = _mm_or_si128(s2, _mm_slli_epi32(s3, 16)); //R2,R3,G2,G3,B2,B3,A2,A3
            s2 = _mm_unpacklo_epi32(s0, s1); // R0,R1,R2,R3,G0,G1,G2,G3
            s3 = _mm_unpackhi_epi32(s0, s1); // B0,B1,B2,B3,A0,A1,A2,A3
            if (!STACK16) {
                s0 = _mm_packus_epi16(s2, s3);
                *(reinterpret_cast<int32_t*>(dr + x)) = _mm_cvtsi128_si32(s0);
                *(reinterpret_cast<int32_t*>(dg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(s0, 4));
                *(reinterpret_cast<int32_t*>(db + x)) = _mm_cvtsi128_si32(_mm_srli_si128(s0, 8));
            } else {
                __m128i rgbamsb = _mm_packus_epi16(_mm_srli_epi16(s2, 8), _mm_srli_epi16(s3, 8));
                __m128i rgbalsb = _mm_packus_epi16(_mm_and_si128(s2, mask), _mm_and_si128(s3, mask));
                *(reinterpret_cast<int32_t*>(dr + x)) = _mm_cvtsi128_si32(rgbamsb);
                *(reinterpret_cast<int32_t*>(lr + x)) = _mm_cvtsi128_si32(rgbalsb);
                *(reinterpret_cast<int32_t*>(dg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbamsb, 4));
                *(reinterpret_cast<int32_t*>(lg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbalsb, 4));
                *(reinterpret_cast<int32_t*>(db + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbamsb, 8));
                *(reinterpret_cast<int32_t*>(lb + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbalsb, 8));
            }
        }
        s += spitch;
        dr += dpitch;
        dg += dpitch;
        db += dpitch;
        if (STACK16) {
            lr += dpitch;
            lg += dpitch;
            lb += dpitch;
        }
    }
}
コード例 #19
0
ファイル: shuffle.c プロジェクト: B-Rich/PyTables
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[4], xmm1[4];

  numof16belem = size / (16*4);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*4) {
    /* Fetch and transpose bytes and words in groups of 64 bytes */
    for (k = 0; k < 4; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0x8d);
      xmm0[k] = _mm_unpacklo_epi8(xmm1[k], xmm0[k]);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x04e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
    }
    /* Transpose double words */
    for (k = 0; k < 2; k++) {
      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[k*2], xmm0[k*2+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[k*2], xmm0[k*2+1]);
    }
    /* Transpose quad words */
    for (k = 0; k < 2; k++) {
      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+2]);
      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+2]);
    }
    /* Store the result vectors */
    for (k = 0; k < 4; k++) {
      ((__m128i *)dest)[k*numof16belem+i] = xmm0[k];
    }
  }
}
コード例 #20
0
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
コード例 #21
0
ファイル: split.hpp プロジェクト: Quanteek/nt2
 template<class R0,class R1> inline void
 eval(A0 const& a0, R0& r0, R1& r1, const simd::native<typename  boost::simd::meta::uint64_t_<A0>::type,boost::simd::tag::sse_ > &)const
 {
   typedef simd::native<typename  boost::simd::meta::uint64_t_<A0>::type,boost::simd::tag::sse_> rtype;
   r1 = bitwise_cast<rtype>(_mm_unpackhi_epi32(a0, Zero<A0>()));
   r0 = bitwise_cast<rtype>(_mm_unpacklo_epi32(a0, Zero<A0>()));
 }
コード例 #22
0
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
コード例 #23
0
ファイル: sse2dct.c プロジェクト: AlecGamble/daala
OD_SIMD_INLINE __m128i od_mullo_epi32_sse2(__m128i a, int b1) {
  __m128i b = _mm_set1_epi32(b1);
  __m128i lo = _mm_mul_epu32(a, b);
  __m128i hi = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4));
  return _mm_unpacklo_epi32(_mm_shuffle_epi32(lo, _MM_SHUFFLE(0, 0, 2, 0)),
   _mm_shuffle_epi32(hi, _MM_SHUFFLE(0, 0, 2, 0)));
}
コード例 #24
0
ファイル: sse2.cpp プロジェクト: qw2552479/itoa-benchmark
inline __m128i Convert8DigitsSSE2(uint32_t value) {
	assert(value <= 99999999);

	// abcd, efgh = abcdefgh divmod 10000 
	const __m128i abcdefgh = _mm_cvtsi32_si128(value);
	const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
	const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

	// v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
	const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

	// v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
	const __m128i v1a = _mm_slli_epi64(v1, 2);

	// v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
	const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
	const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

	// v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
	const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
	const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

	// v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
	const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

	// v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
	const __m128i v6 = _mm_slli_epi64(v5, 16);

	// v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
	const __m128i v7 = _mm_sub_epi16(v4, v6);

	return v7;
}
コード例 #25
0
ファイル: psc_sse2.c プロジェクト: ALaDyn/psc
inline __m128i 
func_mul_epu32(__m128i a, __m128i b){
#if 1
  // Multiply elements 0 and 2, and bput the 64 bit results into a vector.
  __m128i tmp02 = _mm_mul_epu32(a, b);
  // Shift the vectors by one word to the right, making 3->2, and 1->0, and 
  // then multiply into double word vector.
  __m128i tmp13 = _mm_mul_epu32( _mm_srli_si128(a, 4), _mm_srli_si128(b,4));
  // Shuffle the vectors to place the lower 32 bits of each results in the
  // lower two words. I have some concerns about endianness and portability
  // related to this function. 
  __m128i tmpres02 = _mm_shuffle_epi32(tmp02, _MM_SHUFFLE(0,0,2,0));
  __m128i tmpres13 = _mm_shuffle_epi32(tmp13, _MM_SHUFFLE(0,0,2,0));
  // upcack the shuffled vectors into a return value
  return _mm_unpacklo_epi32(tmpres02, tmpres13);
#else
  pvInt ret;
  int * p_a;
  int * p_b;
  p_a = (int *)&a;
  p_b = (int *)&b;
  for(int m=0; m < VEC_SIZE; m++){
    ret.v[m] = p_a[m] * p_b[m];
  }
  return ret.r;
#endif
    
}
コード例 #26
0
ファイル: fm10k_rxtx_vec.c プロジェクト: ljvblfz/DPDK
/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i l3l4type0, l3l4type1, l3type, l4type;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* L3 pkt type mask  Bit4 to Bit6 */
	const __m128i l3type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0070, 0x0070, 0x0070, 0x0070);

	/* L4 pkt type mask  Bit7 to Bit9 */
	const __m128i l4type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0380, 0x0380, 0x0380, 0x0380);

	/* convert RRC l3 type to mbuf format */
	const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
			RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
			RTE_PTYPE_L3_IPV4, 0);

	/* Convert RRC l4 type to mbuf format l4type_flags shift-left 8 bits
	 * to fill into8 bits length.
	 */
	const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			RTE_PTYPE_TUNNEL_GENEVE >> 8,
			RTE_PTYPE_TUNNEL_NVGRE >> 8,
			RTE_PTYPE_TUNNEL_VXLAN >> 8,
			RTE_PTYPE_TUNNEL_GRE >> 8,
			RTE_PTYPE_L4_UDP >> 8,
			RTE_PTYPE_L4_TCP >> 8,
			0);

	l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

	l3type = _mm_and_si128(l3l4type0, l3type_msk);
	l4type = _mm_and_si128(l3l4type0, l4type_msk);

	l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
	l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

	l3type = _mm_shuffle_epi8(l3type_flags, l3type);
	/* l4type_flags shift-left for 8 bits, need shift-right back */
	l4type = _mm_shuffle_epi8(l4type_flags, l4type);

	l4type = _mm_slli_epi16(l4type, 8);
	l3l4type0 = _mm_or_si128(l3type, l4type);
	vol.dword = _mm_cvtsi128_si64(l3l4type0);

	rx_pkts[0]->packet_type = vol.e[0];
	rx_pkts[1]->packet_type = vol.e[1];
	rx_pkts[2]->packet_type = vol.e[2];
	rx_pkts[3]->packet_type = vol.e[3];
}
コード例 #27
0
    SIMDValue SIMDInt32x4Operation::OpMul(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        SIMDValue result;
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

        if (AutoSystemInfo::Data.SSE4_1Available())
        {   // a * b, only available in SSE4
            x86Result.m128i_value = _mm_mullo_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value);
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else if (AutoSystemInfo::Data.SSE2Available())
        {
            // mul 2,0: r0 = a0*b0; r1 = a2*b2
            __m128i tmp1 = _mm_mul_epu32(tmpaValue.m128i_value, tmpbValue.m128i_value);
            // mul 3,1: r0=a1*b1; r1=a3*b3
            __m128i tmp2 = _mm_mul_epu32(_mm_srli_si128(tmpaValue.m128i_value, 4), _mm_srli_si128(tmpbValue.m128i_value, 4));
            // shuffle x86Results to [63..0] and pack
            x86Result.m128i_value = _mm_unpacklo_epi32(_mm_shuffle_epi32(tmp1, _MM_SHUFFLE(0, 0, 2, 0)), _mm_shuffle_epi32(tmp2, _MM_SHUFFLE(0, 0, 2, 0)));
            result = X86SIMDValue::ToSIMDValue(x86Result);
        }
        else
        {
            result.i32[SIMD_X] = aValue.i32[SIMD_X] * bValue.i32[SIMD_X];
            result.i32[SIMD_Y] = aValue.i32[SIMD_Y] * bValue.i32[SIMD_Y];
            result.i32[SIMD_Z] = aValue.i32[SIMD_Z] * bValue.i32[SIMD_Z];
            result.i32[SIMD_W] = aValue.i32[SIMD_W] * bValue.i32[SIMD_W];
        }

        return result;
    }
コード例 #28
0
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]s[1]k[4]
  // The two results are then added together to get the even output

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);
    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
コード例 #29
0
ファイル: ixgbe_rxtx_vec.c プロジェクト: ljvblfz/DPDK
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* pkt type + vlan olflags mask */
	const __m128i pkttype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
	vtag1 = _mm_and_si128(vtag1, pkttype_msk);

	vtag1 = _mm_or_si128(ptype0, vtag1);
	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
コード例 #30
0
ファイル: enc_sse2.c プロジェクト: haroldma/Universal.WebP
static int SSE4x4SSE2(const uint8_t* a, const uint8_t* b) {
    const __m128i zero = _mm_setzero_si128();

    // Load values. Note that we read 8 pixels instead of 4,
    // but the a/b buffers are over-allocated to that effect.
    const __m128i a0 = _mm_loadl_epi64((__m128i*)&a[BPS * 0]);
    const __m128i a1 = _mm_loadl_epi64((__m128i*)&a[BPS * 1]);
    const __m128i a2 = _mm_loadl_epi64((__m128i*)&a[BPS * 2]);
    const __m128i a3 = _mm_loadl_epi64((__m128i*)&a[BPS * 3]);
    const __m128i b0 = _mm_loadl_epi64((__m128i*)&b[BPS * 0]);
    const __m128i b1 = _mm_loadl_epi64((__m128i*)&b[BPS * 1]);
    const __m128i b2 = _mm_loadl_epi64((__m128i*)&b[BPS * 2]);
    const __m128i b3 = _mm_loadl_epi64((__m128i*)&b[BPS * 3]);

    // Combine pair of lines and convert to 16b.
    const __m128i a01 = _mm_unpacklo_epi32(a0, a1);
    const __m128i a23 = _mm_unpacklo_epi32(a2, a3);
    const __m128i b01 = _mm_unpacklo_epi32(b0, b1);
    const __m128i b23 = _mm_unpacklo_epi32(b2, b3);
    const __m128i a01s = _mm_unpacklo_epi8(a01, zero);
    const __m128i a23s = _mm_unpacklo_epi8(a23, zero);
    const __m128i b01s = _mm_unpacklo_epi8(b01, zero);
    const __m128i b23s = _mm_unpacklo_epi8(b23, zero);

    // Compute differences; (a-b)^2 = (abs(a-b))^2 = (sat8(a-b) + sat8(b-a))^2
    // TODO(cduvivier): Dissassemble and figure out why this is fastest. We don't
    //                  need absolute values, there is no need to do calculation
    //                  in 8bit as we are already in 16bit, ... Yet this is what
    //                  benchmarks the fastest!
    const __m128i d0 = _mm_subs_epu8(a01s, b01s);
    const __m128i d1 = _mm_subs_epu8(b01s, a01s);
    const __m128i d2 = _mm_subs_epu8(a23s, b23s);
    const __m128i d3 = _mm_subs_epu8(b23s, a23s);

    // Square and add them all together.
    const __m128i madd0 = _mm_madd_epi16(d0, d0);
    const __m128i madd1 = _mm_madd_epi16(d1, d1);
    const __m128i madd2 = _mm_madd_epi16(d2, d2);
    const __m128i madd3 = _mm_madd_epi16(d3, d3);
    const __m128i sum0 = _mm_add_epi32(madd0, madd1);
    const __m128i sum1 = _mm_add_epi32(madd2, madd3);
    const __m128i sum2 = _mm_add_epi32(sum0, sum1);

    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, sum2);
    return (tmp[3] + tmp[2] + tmp[1] + tmp[0]);
}