Beispiel #1
0
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
Beispiel #2
0
static void
TEST (void)
{
  union128i_w s1, s2;
  union128i_b u;
  char e[16];
  int i;
   
  s1.x = _mm_set_epi16 (2134, -128, 1234, 6354, 1002, 3004, 4050, 9999);
  s2.x = _mm_set_epi16 (41124, 234, 2344, 2354, 607, 1, 2, -8009);
  u.x = test (s1.x, s2.x); 

  for (i = 0; i < 8; i++)
    {
      if (s1.a[i] > 127)
        e[i] = 127;
      else if (s1.a[i] < -128)
        e[i] = -128;
      else
        e[i] = s1.a[i];
    }
  
  for (i = 0; i < 8; i++)
   {
      if (s2.a[i] > 127)
        e[i+8] = 127;
      else if (s2.a[i] < -128)
        e[i+8] = -128;
      else
        e[i+8] = s2.a[i];
    }

  if (check_union128i_b (u, e))
    abort ();
}
Beispiel #3
0
/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i l3l4type0, l3l4type1, l3type, l4type;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* L3 pkt type mask  Bit4 to Bit6 */
	const __m128i l3type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0070, 0x0070, 0x0070, 0x0070);

	/* L4 pkt type mask  Bit7 to Bit9 */
	const __m128i l4type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0380, 0x0380, 0x0380, 0x0380);

	/* convert RRC l3 type to mbuf format */
	const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, RTE_PTYPE_L3_IPV6_EXT,
			RTE_PTYPE_L3_IPV6, RTE_PTYPE_L3_IPV4_EXT,
			RTE_PTYPE_L3_IPV4, 0);

	/* Convert RRC l4 type to mbuf format l4type_flags shift-left 8 bits
	 * to fill into8 bits length.
	 */
	const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			RTE_PTYPE_TUNNEL_GENEVE >> 8,
			RTE_PTYPE_TUNNEL_NVGRE >> 8,
			RTE_PTYPE_TUNNEL_VXLAN >> 8,
			RTE_PTYPE_TUNNEL_GRE >> 8,
			RTE_PTYPE_L4_UDP >> 8,
			RTE_PTYPE_L4_TCP >> 8,
			0);

	l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

	l3type = _mm_and_si128(l3l4type0, l3type_msk);
	l4type = _mm_and_si128(l3l4type0, l4type_msk);

	l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
	l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

	l3type = _mm_shuffle_epi8(l3type_flags, l3type);
	/* l4type_flags shift-left for 8 bits, need shift-right back */
	l4type = _mm_shuffle_epi8(l4type_flags, l4type);

	l4type = _mm_slli_epi16(l4type, 8);
	l3l4type0 = _mm_or_si128(l3type, l4type);
	vol.dword = _mm_cvtsi128_si64(l3l4type0);

	rx_pkts[0]->packet_type = vol.e[0];
	rx_pkts[1]->packet_type = vol.e[1];
	rx_pkts[2]->packet_type = vol.e[2];
	rx_pkts[3]->packet_type = vol.e[3];
}
Beispiel #4
0
static void TransformColor(const VP8LMultipliers* const m,
                           uint32_t* argb_data, int num_pixels) {
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b   0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0   0
    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);              // x dr  x  db
    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0  db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
	struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	/* mask everything except vlan present bit */
	const __m128i vlan_msk = _mm_set_epi16(
			0x0000, 0x0000,
			0x0000, 0x0000,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
	/* map vlan present (0x8) to ol_flags */
	const __m128i vlan_map = _mm_set_epi8(
		0, 0, 0, 0,
		0, 0, 0, vlan_flags,
		0, 0, 0, 0,
		0, 0, 0, 0);

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	vtag1 = _mm_and_si128(vtag1, vlan_msk);
	vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

	vtag1 = _mm_or_si128(ptype0, vtag1);
	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
Beispiel #6
0
rfx_dwt_2d_encode_block_horiz_sse2(INT16* src, INT16* l, INT16* h, int subband_width)
{
	int y;
	int n;
	int first;
	__m128i src_2n;
	__m128i src_2n_1;
	__m128i src_2n_2;
	__m128i h_n;
	__m128i h_n_m;
	__m128i l_n;

	for (y = 0; y < subband_width; y++)
	{
		for (n = 0; n < subband_width; n += 8)
		{
			/* The following 3 Set operations consumes more than half of the total DWT processing time! */
			src_2n = _mm_set_epi16(src[14], src[12], src[10], src[8], src[6], src[4], src[2], src[0]);
			src_2n_1 = _mm_set_epi16(src[15], src[13], src[11], src[9], src[7], src[5], src[3], src[1]);
			src_2n_2 = _mm_set_epi16(n == subband_width - 8 ? src[14] : src[16],
				src[14], src[12], src[10], src[8], src[6], src[4], src[2]);

			/* h[n] = (src[2n + 1] - ((src[2n] + src[2n + 2]) >> 1)) >> 1 */

			h_n = _mm_add_epi16(src_2n, src_2n_2);
			h_n = _mm_srai_epi16(h_n, 1);
			h_n = _mm_sub_epi16(src_2n_1, h_n);
			h_n = _mm_srai_epi16(h_n, 1);

			_mm_store_si128((__m128i*) h, h_n);

			h_n_m = _mm_loadu_si128((__m128i*) (h - 1));
			if (n == 0)
			{
				first = _mm_extract_epi16(h_n_m, 1);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}

			/* l[n] = src[2n] + ((h[n - 1] + h[n]) >> 1) */

			l_n = _mm_add_epi16(h_n_m, h_n);
			l_n = _mm_srai_epi16(l_n, 1);
			l_n = _mm_add_epi16(l_n, src_2n);

			_mm_store_si128((__m128i*) l, l_n);

			src += 16;
			l += 8;
			h += 8;
		}
	}
}
Beispiel #7
0
static void CollectColorBlueTransforms(const uint32_t* argb, int stride,
                                       int tile_width, int tile_height,
                                       int green_to_blue, int red_to_blue,
                                       int histo[]) {
  const __m128i mults_r = _mm_set_epi16(
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0,
      CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0);
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue),
      0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask_b = _mm_set1_epi32(0x0000ff);  // blue mask
  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_slli_epi16(in0, 8);        // r 0  | b 0
      const __m128i A1 = _mm_slli_epi16(in1, 8);
      const __m128i B0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i B1 = _mm_and_si128(in1, mask_g);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_r);  // x db | 0 0
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_r);
      const __m128i D0 = _mm_mulhi_epi16(B0, mults_g);  // 0 0  | x db
      const __m128i D1 = _mm_mulhi_epi16(B1, mults_g);
      const __m128i E0 = _mm_sub_epi8(in0, D0);         // x x  | x b'
      const __m128i E1 = _mm_sub_epi8(in1, D1);
      const __m128i F0 = _mm_srli_epi32(C0, 16);        // 0 0  | x db
      const __m128i F1 = _mm_srli_epi32(C1, 16);
      const __m128i G0 = _mm_sub_epi8(E0, F0);          // 0 0  | x b'
      const __m128i G1 = _mm_sub_epi8(E1, F1);
      const __m128i H0 = _mm_and_si128(G0, mask_b);     // 0 0  | 0 b
      const __m128i H1 = _mm_and_si128(G1, mask_b);
      const __m128i I = _mm_packs_epi32(H0, H1);        // 0 b' | 0 b'
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride,
                                       left_over, tile_height,
                                       green_to_blue, red_to_blue, histo);
    }
  }
}
Beispiel #8
0
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);

		t0 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
		t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
		t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
		t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));
		t2 = _mm_unpackhi_epi16(t0, t0);
		t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
		t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
		t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
		t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));
		t1 = _mm_packus_epi16(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Beispiel #9
0
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* pkt type + vlan olflags mask */
	const __m128i pkttype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT,
			PKT_RX_VLAN_PKT, PKT_RX_VLAN_PKT);

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
	ptype0 = _mm_and_si128(ptype0, rsstype_msk);
	ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);
	vtag1 = _mm_and_si128(vtag1, pkttype_msk);

	vtag1 = _mm_or_si128(ptype0, vtag1);
	vol.dword = _mm_cvtsi128_si64(vtag1);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
Beispiel #10
0
static void
TEST (void)
{
  union128i_w u, s1, s2;
  short e[8];
  int i;
   
  s1.x = _mm_set_epi16 (10,20,30,90,-80,-40,-100,-15);
  s2.x = _mm_set_epi16 (11, 98, 76, -100, -34, -78, -39, 14);
  u.x = test (s1.x, s2.x); 
   
  for (i = 0; i < 8; i++)
     e[i] = s1.a[i] + s2.a[i];

  if (check_union128i_w (u, e))
    abort ();
}
Beispiel #11
0
static void
TEST (void)
{
  union128i_w u, s1, s2;
  short e[8];
  int i;
   
  s1.x = _mm_set_epi16 (1,2,3,4,5,6,7,8);
  s2.x = _mm_set_epi16 (8,7,6,5,4,3,2,1);
  u.x = test (s1.x, s2.x);

  for (i=0; i<8; i++)
    e[i] = s1.a[i]>s2.a[i]?s1.a[i]:s2.a[i]; 

  if (check_union128i_w (u, e))
    abort ();
}
Beispiel #12
0
static void
TEST (void)
{
  union128i_uw u, s1, s2;
  unsigned short e[8];
  int i;
   
  s1.x = _mm_set_epi16 (10,20,30,90,80,40,100,15);
  s2.x = _mm_set_epi16 (11, 98, 76, 100, 34, 78, 39, 14);
  u.x = test (s1.x, s2.x); 
   
  for (i = 0; i < 8; i++)
     e[i] = (s1.a[i] + s2.a[i]+1)>>1;

  if (check_union128i_uw (u, e))
    abort ();
}
    // SIMD.Int16x8 operation wrappers that cover instrinsics for x86/x64 system
    SIMDValue SIMDInt16x8Operation::OpInt16x8(int16 values[])
    {
        X86SIMDValue x86Result;
        // Sets the 8 signed 16-bit integer values, note in revised order: starts with x7
        x86Result.m128i_value = _mm_set_epi16(values[7], values[6], values[5], values[4], values[3], values[2], values[1], values[0]);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride),
    (gint16 *) ((gint8 *) a + 2 * astride),
    (gint16 *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}
static void
TEST (void)
{
  union128i_w u, s1, s2;
  short e[8];
  int i, tmp;
   
  s1.x = _mm_set_epi16 (10,2067,-3033,90,80,40,-1000,15);
  s2.x = _mm_set_epi16 (11, 9834, 7444, -10222, 34, -7833, 39, 14);
  u.x = test (s1.x, s2.x); 
   
  for (i = 0; i < 8; i++)
    {
      tmp = s1.a[i] * s2.a[i];
    
      e[i] = tmp;
    }

  if (check_union128i_w (u, e))
    abort ();
}
static void
TEST (void)
{
  union128i_uw u, s1, s2;
  unsigned short e[8];
  int i, tmp;
   
  s1.x = _mm_set_epi16 (10,2067,3033,90,80,40,1000,15);
  s2.x = _mm_set_epi16 (11, 9834, 7444, 10222, 34, 7833, 39, 14);
  u.x = test (s1.x, s2.x); 
   
  for (i = 0; i < 8; i++)
    {
      tmp = s1.a[i] * s2.a[i];
    
      e[i] = (tmp & 0xffff0000)>>16;
    }

  if (check_union128i_uw (u, e))
    abort ();
}
Beispiel #17
0
/* -----------------------------------
 *       weighted_merge_planar
 * -----------------------------------
 */
void weighted_merge_planar_sse2(BYTE *p1, const BYTE *p2, int p1_pitch, int p2_pitch, int width, int height, int weight, int invweight) {
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i zero = _mm_setzero_si128();
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(p1+x)); //y7y6 y5y4 y3y2 y1y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(p2+x)); //Y7Y6 Y5Y4 Y3Y2 Y1Y0

      __m128i p0123 = _mm_unpacklo_epi8(px1, px2); //Y3y3 Y2y2 Y1y1 Y0y0
      __m128i p4567 = _mm_unpackhi_epi8(px1, px2); //Y7y7 Y6y6 Y5y5 Y4y4

      __m128i p01 = _mm_unpacklo_epi8(p0123, zero); //00Y1 00y1 00Y0 00y0
      __m128i p23 = _mm_unpackhi_epi8(p0123, zero); //00Y3 00y3 00Y2 00y2
      __m128i p45 = _mm_unpacklo_epi8(p4567, zero); //00Y5 00y5 00Y4 00y4
      __m128i p67 = _mm_unpackhi_epi8(p4567, zero); //00Y7 00y7 00Y6 00y6

      p01 = _mm_madd_epi16(p01, mask);
      p23 = _mm_madd_epi16(p23, mask);
      p45 = _mm_madd_epi16(p45, mask);
      p67 = _mm_madd_epi16(p67, mask);

      p01 = _mm_add_epi32(p01, round_mask);
      p23 = _mm_add_epi32(p23, round_mask);
      p45 = _mm_add_epi32(p45, round_mask);
      p67 = _mm_add_epi32(p67, round_mask);

      p01 = _mm_srli_epi32(p01, 15);
      p23 = _mm_srli_epi32(p23, 15);
      p45 = _mm_srli_epi32(p45, 15);
      p67 = _mm_srli_epi32(p67, 15);

      p0123 = _mm_packs_epi32(p01, p23);
      p4567 = _mm_packs_epi32(p45, p67);

      __m128i result = _mm_packus_epi16(p0123, p4567);

      _mm_store_si128(reinterpret_cast<__m128i*>(p1+x), result);
    }

    for (int x = wMod16; x < width; x++) {
      p1[x] = (p1[x]*invweight + p2[x]*weight + 16384) >> 15;
    }

    p1 += p1_pitch;
    p2 += p2_pitch;
  }
}
Beispiel #18
0
static void ApplyAlphaMultiply(uint8_t* rgba, int alpha_first,
                               int w, int h, int stride) {
  const __m128i zero = _mm_setzero_si128();
  const int kSpan = 2;
  const int w2 = w & ~(kSpan - 1);
  while (h-- > 0) {
    uint32_t* const rgbx = (uint32_t*)rgba;
    int i;
    if (!alpha_first) {
      const __m128i kMask = _mm_set_epi16(0xff, 0, 0, 0, 0xff, 0, 0, 0);
      const __m128i kMult =
          _mm_set_epi16(0, 0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081);
      for (i = 0; i < w2; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 3, 3, 3), kMask, kMult);
      }
    } else {
      const __m128i kMask = _mm_set_epi16(0, 0, 0, 0xff, 0, 0, 0, 0xff);
      const __m128i kMult =
          _mm_set_epi16(0x8081, 0x8081, 0x8081, 0, 0x8081, 0x8081, 0x8081, 0);
      for (i = 0; i < w2; i += kSpan) {
        APPLY_ALPHA(rgbx[i], _MM_SHUFFLE(0, 0, 0, 3), kMask, kMult);
      }
    }
    // Finish with left-overs.
    for (; i < w; ++i) {
      uint8_t* const rgb = rgba + (alpha_first ? 1 : 0);
      const uint8_t* const alpha = rgba + (alpha_first ? 0 : 3);
      const uint32_t a = alpha[4 * i];
      if (a != 0xff) {
        const uint32_t mult = MULTIPLIER(a);
        rgb[4 * i + 0] = PREMULTIPLY(rgb[4 * i + 0], mult);
        rgb[4 * i + 1] = PREMULTIPLY(rgb[4 * i + 1], mult);
        rgb[4 * i + 2] = PREMULTIPLY(rgb[4 * i + 2], mult);
      }
    }
    rgba += stride;
  }
}
Beispiel #19
0
void unpack_rgba4_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
		// converts 4 bit values to 8 bit values (multiply with 17)
		t0 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_and_si128(t1, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
		t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
		t1 = _mm_mulhi_epu16(t1, _mm_set1_epi16(0x0110));
		t2 = _mm_unpackhi_epi16(t0, t0);
		t2 = _mm_and_si128(t2, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
		t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
		t2 = _mm_mulhi_epu16(t2, _mm_set1_epi16(0x0110));
		t1 = _mm_packus_epi16(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Beispiel #20
0
static void TransformColorInverse(const VP8LMultipliers* const m,
                                  const uint32_t* const src, int num_pixels,
                                  uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
  const __m128i mults_rb = _mm_set_epi16(
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_),
      CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST(red_to_blue_), 0, CST(red_to_blue_), 0,
      CST(red_to_blue_), 0, CST(red_to_blue_), 0);
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
Beispiel #21
0
/* -----------------------------------
 *      weighted_merge_luma_yuy2
 * -----------------------------------
 */
static void weighted_merge_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight)
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m128i chroma_mask = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x)); //V1 Y3 U1 Y2 V0 Y1 U0 Y0
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(luma+x)); //v1 y3 u1 y2 v0 y1 u0 y0

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2); 

      src_lo = _mm_and_si128(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0
      src_hi = _mm_and_si128(src_hi, luma_mask); 

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_luma = _mm_packs_epi32(src_lo, src_hi);

      __m128i result_chroma = _mm_and_si128(px1, chroma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result);
    }

    for (int x = wMod16; x < width; x+=2) {
      src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15;
    }

    src += pitch;
    luma += luma_pitch;
  }
}
Beispiel #22
0
static void
avx2_test (void)
{
  union128i_w s;
  union256i_q res;
  long long int res_ref[4];

  s.x = _mm_set_epi16 (1, 2, 3, 4, -200, 50, 6, 8);

  res.x = _mm256_cvtepi16_epi64 (s.x);

  compute_movsxwq (s.a, res_ref);

  if (check_union256i_q (res, res_ref))
    abort ();
}
Beispiel #23
0
void main() {
    int i;
    __m128i v1, v2;
    char a1[16], a2[16];

    // packing data
    v1 = _mm_set_epi16(0, 1, 2, 3, 4, 5, 6, 7);
    v2 = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
    
    // load data
    _mm_storeu_si128((__m128i*)a1, v1);
    _mm_storeu_si128((__m128i*)a2, v2);

    printarr(a1, "a1");
    printarr(a2, "a2");

}
Beispiel #24
0
static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i vlan0, vlan1, rss;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* mask everything except rss and vlan flags
	*bit2 is for vlan tag, bits 13:12 for rss
	*/
	const __m128i rss_vlan_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x3004, 0x3004, 0x3004, 0x3004);

	/* map rss and vlan type to rss hash and vlan flag */
	const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, PKT_RX_VLAN_PKT,
			0, 0, 0, 0);

	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0,
			PKT_RX_FDIR, 0, PKT_RX_RSS_HASH, 0);

	vlan0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vlan1 = _mm_unpackhi_epi16(descs[2], descs[3]);
	vlan0 = _mm_unpacklo_epi32(vlan0, vlan1);

	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

	rss = _mm_srli_epi16(vlan1, 12);
	rss = _mm_shuffle_epi8(rss_flags, rss);

	vlan0 = _mm_or_si128(vlan0, rss);
	vol.dword = _mm_cvtsi128_si64(vlan0);

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
Beispiel #25
0
/* -----------------------------------
 *     weighted_merge_chroma_yuy2
 * -----------------------------------
 */
static void weighted_merge_chroma_yuy2_sse2(BYTE *src, const BYTE *chroma, int pitch, int chroma_pitch,int width, int height, int weight, int invweight )
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);

  int wMod16 = (width/16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x)); 
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(chroma+x)); 

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2); 
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2); 

      src_lo = _mm_srli_epi16(src_lo, 8);
      src_hi = _mm_srli_epi16(src_hi, 8);

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_chroma = _mm_packs_epi32(src_lo, src_hi);
      result_chroma = _mm_slli_epi16(result_chroma, 8);

      __m128i result_luma = _mm_and_si128(px1, luma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result);
    }

    for (int x = wMod16; x < width; x+=2) {
      src[x+1] = (chroma[x+1] * weight + src[x+1] * invweight + 16384) >> 15;
    }

    src += pitch;
    chroma += chroma_pitch;
  }
}
test (unsigned short *v)
{
  __m128i x;

  x = _mm_set_epi16 (0, 0, 0, 0, 0, 0, 0, v[0]);
  check (x, v, 0);
  x = _mm_set_epi16 (0, 0, 0, 0, 0, 0, v[1], 0);
  check (x, v, 1);
  x = _mm_set_epi16 (0, 0, 0, 0, 0, v[2], 0, 0);
  check (x, v, 2);
  x = _mm_set_epi16 (0, 0, 0, 0, v[3], 0, 0, 0);
  check (x, v, 3);
  x = _mm_set_epi16 (0, 0, 0, v[4], 0, 0, 0, 0);
  check (x, v, 4);
  x = _mm_set_epi16 (0, 0, v[5], 0, 0, 0, 0, 0);
  check (x, v, 5);
  x = _mm_set_epi16 (0, v[6], 0, 0, 0, 0, 0, 0);
  check (x, v, 6);
  x = _mm_set_epi16 (v[7], 0, 0, 0, 0, 0, 0, 0);
  check (x, v, 7);
}
Beispiel #27
0
static void
TEST (void)
{
  union128i_w u, s;
  union128i_q c;
  short e[8] = {0};
  int i;
 
  s.x = _mm_set_epi16 (1, -2, 3, 4, 5, 6, -0x7000, 0x9000);
  c.x = _mm_set_epi64x (12, 13);

  u.x = test (s.x, c.x);

  if (c.a[0] < 16)
    for (i = 0; i < 8; i++)
      e[i] = s.a[i] >> c.a[0]; 

  if (check_union128i_w (u, e))
    abort (); 
}
Beispiel #28
0
static void CollectColorRedTransforms(const uint32_t* argb, int stride,
                                      int tile_width, int tile_height,
                                      int green_to_red, int histo[]) {
  const __m128i mults_g = _mm_set_epi16(
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red),
      0, CST_5b(green_to_red), 0, CST_5b(green_to_red));
  const __m128i mask_g = _mm_set1_epi32(0x00ff00);  // green mask
  const __m128i mask = _mm_set1_epi32(0xff);

  int y;
  for (y = 0; y < tile_height; ++y) {
    const uint32_t* const src = argb + y * stride;
    int i, x;
    for (x = 0; x + SPAN <= tile_width; x += SPAN) {
      uint16_t values[SPAN];
      const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x +        0]);
      const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]);
      const __m128i A0 = _mm_and_si128(in0, mask_g);    // 0 0  | g 0
      const __m128i A1 = _mm_and_si128(in1, mask_g);
      const __m128i B0 = _mm_srli_epi32(in0, 16);       // 0 0  | x r
      const __m128i B1 = _mm_srli_epi32(in1, 16);
      const __m128i C0 = _mm_mulhi_epi16(A0, mults_g);  // 0 0  | x dr
      const __m128i C1 = _mm_mulhi_epi16(A1, mults_g);
      const __m128i E0 = _mm_sub_epi8(B0, C0);          // x x  | x r'
      const __m128i E1 = _mm_sub_epi8(B1, C1);
      const __m128i F0 = _mm_and_si128(E0, mask);       // 0 0  | 0 r'
      const __m128i F1 = _mm_and_si128(E1, mask);
      const __m128i I = _mm_packs_epi32(F0, F1);
      _mm_storeu_si128((__m128i*)values, I);
      for (i = 0; i < SPAN; ++i) ++histo[values[i]];
    }
  }
  {
    const int left_over = tile_width & (SPAN - 1);
    if (left_over > 0) {
      VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride,
                                      left_over, tile_height,
                                      green_to_red, histo);
    }
  }
}
Beispiel #29
0
static void TransformAC3(const int16_t* in, uint8_t* dst) {
    static const int kC1 = 20091 + (1 << 16);
    static const int kC2 = 35468;
    const __m128i A = _mm_set1_epi16(in[0] + 4);
    const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
    const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
    const int c1 = MUL(in[1], kC2);
    const int d1 = MUL(in[1], kC1);
    const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
    const __m128i B = _mm_adds_epi16(A, CD);
    const __m128i m0 = _mm_adds_epi16(B, d4);
    const __m128i m1 = _mm_adds_epi16(B, c4);
    const __m128i m2 = _mm_subs_epi16(B, c4);
    const __m128i m3 = _mm_subs_epi16(B, d4);
    const __m128i zero = _mm_setzero_si128();
    // Load the source pixels.
    __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
    __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
    __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
    __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform.
    dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
    dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
    dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
    dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
    *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
    *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
    *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
Beispiel #30
0
/*
 * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bit
 * - floor align nb_pkts to a RTE_IXGBE_DESC_PER_LOOP power-of-two
 * - don't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,    /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,          /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0            /* ignore pkt_type field */
			);
	__m128i dd_check, eop_check;

	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
		ixgbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.upper.status_error &
				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
		return 0;

	/* 4 packets DD mask */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip 32 bit pkt_type */
		0xFF, 0xFF
		);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_IXGBE_DESCS_PER_LOOP,
			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

		/* B.1 load 1 mbuf point */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* set ol_flags with vlan packet type */
		desc_to_olflags_v(descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* C.2 get 4 pkts staterr value  */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the count
			 * count of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_IXGBE_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		/* C.4 calc avaialbe number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}