void test_vreinterpretQu32_u8 (void)
{
  uint32x4_t out_uint32x4_t;
  uint8x16_t arg0_uint8x16_t;

  out_uint32x4_t = vreinterpretq_u32_u8 (arg0_uint8x16_t);
}
Example #2
0
// about twice as fast as generic
void MipMap_32_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    if(width < 8) {
        MipMap_32_generic(width, height, source, target);
        return;
    }

    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 4;
    
    unsigned char *s = target;
    unsigned char *t = source;
    unsigned char *u = t+stride;

    int y, x;
    for( y = 0; y < newheight; y++ ) {
        for( x = 0; x < newwidth; x+=4 ) {
            uint8x16_t a0, a1, a2, a3;

            memcpy(&a0, t,    16);
            memcpy(&a1, t+16, 16);
            memcpy(&a2, u,    16);
            memcpy(&a3, u+16, 16);

            // average first and second scan lines
            a0 = vhaddq_u8(a0, a2);
            a1 = vhaddq_u8(a1, a3);

            // repack
            uint32x4x2_t z = vuzpq_u32(vreinterpretq_u32_u8(a0), vreinterpretq_u32_u8(a1));
            uint8x16_t d0, d1;
            memcpy(&d0, &z.val[0], 16), memcpy(&d1, &z.val[1], 16);

            // average even and odd x pixels
            a0 = vhaddq_u8(vreinterpretq_u8_u32(z.val[0]), vreinterpretq_u8_u32(z.val[1]));

            memcpy(s, &a0, 16);

            s+=16;
            t+=32;
            u+=32;
        }
        t += stride;
        u += stride;
    }
}
static void MixColumns(void)
{
	uint32x4_t a = vreinterpretq_u32_u8(*state);
	uint32x4_t b = vreinterpretq_u32_u8(xtime(*state));
	
	uint32x4_t a3 = veorq_u32(a,b);
	uint32x4_t a3r = vshlq_n_u32(a3,8);
	a3r = vsraq_n_u32(a3r,a3,24);
	
	uint32x4_t a2 = vshlq_n_u32(a,16);
	a2 = vsraq_n_u32(a2,a,16);
	
	uint32x4_t a1 = vshlq_n_u32(a,24);
	a1 = vsraq_n_u32(a1,a,8);
	
	uint32x4_t out = veorq_u32(b,a1);
	out = veorq_u32(out,a2);
	out = veorq_u32(out,a3r);
	*state = vreinterpretq_u8_u32(out);
}
Example #4
0
 /*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,   /* pkt_type set as unknown */
		0xFF, 0xFF,   /* pkt_type set as unknown */
		14, 15,       /* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
		14, 15,       /* octet 15~14, 16 bits data_len */
		2, 3,         /* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7    /* octet 4~7, 32bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,         /* ignore pkt_type field */
		rxq->crc_len, /* sub crc on pkt_len */
		0,            /* ignore high-16bits of pkt_len */
		rxq->crc_len, /* sub crc on data_len */
		0, 0, 0       /* ignore non-length fields */
		};

	/* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */

	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 1 mbuf point */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] =  vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 1 mbuf point */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		descs[2] =  vld1q_u64((uint64_t *)(rxdp + 2));
		/* B.1 load 2 mbuf point */
		descs[1] =  vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] =  vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts  */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value  */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned*/
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
				 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
				 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * count of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);
		/* C.4 calc avaialbe number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
Example #5
0
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4],
		  struct rte_mbuf **rx_pkts)
{
	uint32x4_t vlan0, vlan1, rss, l3_l4e;
	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
	uint64x2_t rearm0, rearm1, rearm2, rearm3;

	/* mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication.
	 */
	const uint32x4_t rss_vlan_msk = {
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

	const uint32x4_t cksum_mask = {
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD};

	/* map rss and vlan type to rss hash and vlan flag */
	const uint8x16_t vlan_flags = {
			0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t rss_flags = {
			0, PKT_RX_FDIR, 0, 0,
			0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t l3_l4e_flags = {
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			0, 0, 0, 0, 0, 0, 0, 0};

	vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
			  vreinterpretq_u32_u64(descs[2])).val[1];
	vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
			  vreinterpretq_u32_u64(descs[3])).val[1];
	vlan0 = vzipq_u32(vlan0, vlan1).val[0];

	vlan1 = vandq_u32(vlan0, rss_vlan_msk);
	vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
						vreinterpretq_u8_u32(vlan1)));

	rss = vshrq_n_u32(vlan1, 11);
	rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
					      vreinterpretq_u8_u32(rss)));

	l3_l4e = vshrq_n_u32(vlan1, 22);
	l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
					      vreinterpretq_u8_u32(l3_l4e)));
	/* then we shift left 1 bit */
	l3_l4e = vshlq_n_u32(l3_l4e, 1);
	/* we need to mask out the reduntant bits */
	l3_l4e = vandq_u32(l3_l4e, cksum_mask);

	vlan0 = vorrq_u32(vlan0, rss);
	vlan0 = vorrq_u32(vlan0, l3_l4e);

	rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
	rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
	rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
	rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}
Example #6
0
static
void
neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
                                    uint32_t *d_end, uint8_t btable[8][4][16],
                                    uint32_t val, int xor, int altmap)
{
  int i, j;
#ifdef ARCH_AARCH64
  uint8x16_t tables[8][4];
#else
  uint8x8x2_t tables[8][4];
#endif
  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  uint8x16_t p0, p1, p2, p3, si, mask1;
  uint16x8x2_t r0, r1;
  uint8x16x2_t q0, q1;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable[i][j]);
#else
      tables[i][j].val[0] = vld1_u8(btable[i][j]);
      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {

      v0 = vld1q_u32(src); src += 4;
      v1 = vld1q_u32(src); src += 4;
      v2 = vld1q_u32(src); src += 4;
      v3 = vld1q_u32(src); src += 4;

      if (altmap) {
          q0.val[0] = vreinterpretq_u8_u32(v0);
          q0.val[1] = vreinterpretq_u8_u32(v1);
          q1.val[0] = vreinterpretq_u8_u32(v2);
          q1.val[1] = vreinterpretq_u8_u32(v3);
      } else {
          r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
          r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));

          q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
                        vreinterpretq_u8_u16(r1.val[0]));
          q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
                        vreinterpretq_u8_u16(r1.val[1]));
      }

      si = vandq_u8(q0.val[0], mask1);
      p0 = vqtbl1q_u8(tables[0][0], si);
      p1 = vqtbl1q_u8(tables[0][1], si);
      p2 = vqtbl1q_u8(tables[0][2], si);
      p3 = vqtbl1q_u8(tables[0][3], si);

      si = vshrq_n_u8(q0.val[0], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));

      si = vandq_u8(q0.val[1], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));

      si = vshrq_n_u8(q0.val[1], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));

      si = vandq_u8(q1.val[0], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));

      si = vshrq_n_u8(q1.val[0], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));

      si = vandq_u8(q1.val[1], mask1);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));

      si = vshrq_n_u8(q1.val[1], 4);
      p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
      p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
      p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
      p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));

      if (altmap) {
          s0 = vreinterpretq_u32_u8(p0);
          s1 = vreinterpretq_u32_u8(p1);
          s2 = vreinterpretq_u32_u8(p2);
          s3 = vreinterpretq_u32_u8(p3);
      } else {
          q0 = vtrnq_u8(p0, p1);
          q1 = vtrnq_u8(p2, p3);

          r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
                         vreinterpretq_u16_u8(q1.val[0]));
          r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
                         vreinterpretq_u16_u8(q1.val[1]));

          s0 = vreinterpretq_u32_u16(r0.val[0]);
          s1 = vreinterpretq_u32_u16(r1.val[0]);
          s2 = vreinterpretq_u32_u16(r0.val[1]);
          s3 = vreinterpretq_u32_u16(r1.val[1]);
      }

      if (xor) {
          v0 = vld1q_u32(dst);
          v1 = vld1q_u32(dst + 4);
          v2 = vld1q_u32(dst + 8);
          v3 = vld1q_u32(dst + 12);
          s0 = veorq_u32(s0, v0);
          s1 = veorq_u32(s1, v1);
          s2 = veorq_u32(s2, v2);
          s3 = veorq_u32(s3, v3);
      }

      vst1q_u32(dst,      s0);
      vst1q_u32(dst + 4,  s1);
      vst1q_u32(dst + 8,  s2);
      vst1q_u32(dst + 12, s3);

      dst += 16;
  }
}
Example #7
0
   uint8x8_t ad1_8x8;
   uint8x8_t alpha0_8x8;
   uint8x8_t alpha1_8x8;
   uint8x8_t c_8x8;
   uint8x8_t d0_8x8;
   uint8x8_t d1_8x8;
   uint8x8_t s0_8x8;
   uint8x8_t s1_8x8;
   uint8x8_t sc0_8x8;
   uint8x8_t sc1_8x8;

   c_32x2 = vdup_n_u32(c);
   c_8x8 = vreinterpret_u8_u32(c_32x2);
   x255_16x8 = vdupq_n_u16(0xff);
   x0_8x16 = vdupq_n_u8(0x0);
   x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
   x1_8x16 = vdupq_n_u8(0x1);
   x1_32x4 = vreinterpretq_u32_u8(x1_8x16);
   DATA32 *start = d;
   int size = l;
   DATA32 *end = start + (size & ~3);
   while (start < end)
   {

      s_32x4 = vld1q_u32(s);
      s_8x16 = vreinterpretq_u8_u32(s_32x4);

      d_32x4 = vld1q_u32(start);
      d_8x16 = vreinterpretq_u8_u32(d_32x4);
      d0_8x8 = vget_low_u8(d_8x16);
      d1_8x8 = vget_high_u8(d_8x16);
Example #8
0
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4, src += src_stride * 4,
                dst += dst_stride * 4) {  // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);

    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21
    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}
Example #9
0
	operator  uint32x4_t () const { return vreinterpretq_u32_u8(val); }
Example #10
0
void vp8_mbloop_filter_vertical_edge_uv_neon(
        unsigned char *u,
        int pitch,
        unsigned char blimit,
        unsigned char limit,
        unsigned char thresh,
        unsigned char *v) {
    unsigned char *us, *ud;
    unsigned char *vs, *vd;
    uint8x16_t qblimit, qlimit, qthresh, q3, q4;
    uint8x16_t q5, q6, q7, q8, q9, q10;
    uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
    uint8x8_t d15, d16, d17, d18, d19, d20, d21;
    uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
    uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
    uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

    qblimit = vdupq_n_u8(blimit);
    qlimit = vdupq_n_u8(limit);
    qthresh = vdupq_n_u8(thresh);

    us = u - 4;
    vs = v - 4;
    d6 = vld1_u8(us);
    us += pitch;
    d7 = vld1_u8(vs);
    vs += pitch;
    d8 = vld1_u8(us);
    us += pitch;
    d9 = vld1_u8(vs);
    vs += pitch;
    d10 = vld1_u8(us);
    us += pitch;
    d11 = vld1_u8(vs);
    vs += pitch;
    d12 = vld1_u8(us);
    us += pitch;
    d13 = vld1_u8(vs);
    vs += pitch;
    d14 = vld1_u8(us);
    us += pitch;
    d15 = vld1_u8(vs);
    vs += pitch;
    d16 = vld1_u8(us);
    us += pitch;
    d17 = vld1_u8(vs);
    vs += pitch;
    d18 = vld1_u8(us);
    us += pitch;
    d19 = vld1_u8(vs);
    vs += pitch;
    d20 = vld1_u8(us);
    d21 = vld1_u8(vs);

    q3 = vcombine_u8(d6, d7);
    q4 = vcombine_u8(d8, d9);
    q5 = vcombine_u8(d10, d11);
    q6 = vcombine_u8(d12, d13);
    q7 = vcombine_u8(d14, d15);
    q8 = vcombine_u8(d16, d17);
    q9 = vcombine_u8(d18, d19);
    q10 = vcombine_u8(d20, d21);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4,
                         q5, q6, q7, q8, q9, q10,
                         &q4, &q5, &q6, &q7, &q8, &q9);

    q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
    q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
    q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
    q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

    q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                       vreinterpretq_u16_u32(q2tmp2.val[0]));
    q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                       vreinterpretq_u16_u32(q2tmp3.val[0]));
    q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                       vreinterpretq_u16_u32(q2tmp2.val[1]));
    q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                       vreinterpretq_u16_u32(q2tmp3.val[1]));

    q2tmp8  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                       vreinterpretq_u8_u16(q2tmp5.val[0]));
    q2tmp9  = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                       vreinterpretq_u8_u16(q2tmp5.val[1]));
    q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                       vreinterpretq_u8_u16(q2tmp7.val[0]));
    q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                       vreinterpretq_u8_u16(q2tmp7.val[1]));

    q3 = q2tmp8.val[0];
    q4 = q2tmp8.val[1];
    q5 = q2tmp9.val[0];
    q6 = q2tmp9.val[1];
    q7 = q2tmp10.val[0];
    q8 = q2tmp10.val[1];
    q9 = q2tmp11.val[0];
    q10 = q2tmp11.val[1];

    ud = u - 4;
    vst1_u8(ud, vget_low_u8(q3));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q4));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q5));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q6));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q7));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q8));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q9));
    ud += pitch;
    vst1_u8(ud, vget_low_u8(q10));

    vd = v - 4;
    vst1_u8(vd, vget_high_u8(q3));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q4));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q5));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q6));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q7));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q8));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q9));
    vd += pitch;
    vst1_u8(vd, vget_high_u8(q10));
    return;
}