void
test_vreinterpretQu32_u8 (void)
{
  uint32x4_t out_uint32x4_t;
  uint8x16_t arg0_uint8x16_t;

  out_uint32x4_t = vreinterpretq_u32_u8 (arg0_uint8x16_t);
}
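The test above only has to compile; the argument's value is irrelevant because vreinterpretq_u32_u8 is a pure bit-level reinterpretation that emits no instruction. A minimal stand-alone sketch of that behaviour (hypothetical demo program, not part of the test suite; assumes a little-endian AArch64 target):

#include <arm_neon.h>
#include <stdio.h>

/* Demo: load 16 bytes, view them as four 32-bit lanes, print lane 0.
 * On a little-endian target lane 0 is simply bytes 0..3 read as a
 * uint32_t; the reinterpret itself moves no data. */
int main(void)
{
    const uint8_t bytes[16] = {
        0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08,
        0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10
    };
    uint8x16_t v8  = vld1q_u8(bytes);
    uint32x4_t v32 = vreinterpretq_u32_u8(v8);  /* type change only */

    printf("lane 0 = 0x%08x\n", (unsigned)vgetq_lane_u32(v32, 0)); /* 0x04030201 */
    return 0;
}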
// about twice as fast as generic
void MipMap_32_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    if(width < 8) {
        MipMap_32_generic(width, height, source, target);
        return;
    }

    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 4;
    unsigned char *s = target;
    unsigned char *t = source;
    unsigned char *u = t+stride;
    int y, x;

    for( y = 0; y < newheight; y++ ) {
        for( x = 0; x < newwidth; x+=4 ) {
            uint8x16_t a0, a1, a2, a3;
            memcpy(&a0, t, 16);
            memcpy(&a1, t+16, 16);
            memcpy(&a2, u, 16);
            memcpy(&a3, u+16, 16);

            // average first and second scan lines
            a0 = vhaddq_u8(a0, a2);
            a1 = vhaddq_u8(a1, a3);

            // repack
            uint32x4x2_t z = vuzpq_u32(vreinterpretq_u32_u8(a0),
                                       vreinterpretq_u32_u8(a1));
            uint8x16_t d0, d1;
            memcpy(&d0, &z.val[0], 16), memcpy(&d1, &z.val[1], 16);

            // average even and odd x pixels
            a0 = vhaddq_u8(vreinterpretq_u8_u32(z.val[0]),
                           vreinterpretq_u8_u32(z.val[1]));
            memcpy(s, &a0, 16);

            s+=16;
            t+=32;
            u+=32;
        }
        t += stride;
        u += stride;
    }
}
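The whole 2x2 box filter above rests on vhaddq_u8, which computes the per-byte halving add (a + b) >> 1 without intermediate overflow; applying it once per axis gives the four-pixel average, at the cost of a small downward bias from truncating twice. A self-contained check of that identity (illustrative program, not from the same project):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Sketch: confirm that vhaddq_u8 is the truncating per-byte average
 * (a + b) >> 1 that the mipmap code relies on. */
int main(void)
{
    uint8_t top[16], bottom[16], out[16];
    for (int i = 0; i < 16; i++) {
        top[i]    = (uint8_t)(i * 17);
        bottom[i] = (uint8_t)(255 - i);
    }

    uint8x16_t avg = vhaddq_u8(vld1q_u8(top), vld1q_u8(bottom));
    vst1q_u8(out, avg);

    for (int i = 0; i < 16; i++)
        assert(out[i] == (uint8_t)((top[i] + bottom[i]) >> 1));
    return 0;
}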
static void MixColumns(void)
{
    uint32x4_t a  = vreinterpretq_u32_u8(*state);
    uint32x4_t b  = vreinterpretq_u32_u8(xtime(*state));
    uint32x4_t a3 = veorq_u32(a, b);

    /* rotate each 32-bit lane of (a ^ b) left by 8:
     * (x << 8) + (x >> 24), built from a shift plus a shift-right-accumulate */
    uint32x4_t a3r = vshlq_n_u32(a3, 8);
    a3r = vsraq_n_u32(a3r, a3, 24);

    /* rotate a left by 16 within each lane */
    uint32x4_t a2 = vshlq_n_u32(a, 16);
    a2 = vsraq_n_u32(a2, a, 16);

    /* rotate a left by 24 within each lane */
    uint32x4_t a1 = vshlq_n_u32(a, 24);
    a1 = vsraq_n_u32(a1, a, 8);

    uint32x4_t out = veorq_u32(b, a1);
    out = veorq_u32(out, a2);
    out = veorq_u32(out, a3r);
    *state = vreinterpretq_u8_u32(out);
}
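The vshlq_n_u32 / vsraq_n_u32 pairs above are per-lane 32-bit rotates: the shift-right-accumulate adds the wrapped-around bits into the zeroed low bits, so nothing collides. A sketch that verifies the idiom against a scalar rotate (rotl8 is a hypothetical helper, not taken from the AES code):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* rotl8: rotate every 32-bit lane left by 8 using the same
 * shift + shift-right-accumulate pair as MixColumns. */
static uint32x4_t rotl8(uint32x4_t x)
{
    uint32x4_t r = vshlq_n_u32(x, 8);   /* x << 8                     */
    return vsraq_n_u32(r, x, 24);       /* += x >> 24 (no bit overlap) */
}

int main(void)
{
    uint32_t in[4]  = { 0x01020304u, 0xdeadbeefu, 0x00000001u, 0xffffffffu };
    uint32_t out[4];

    vst1q_u32(out, rotl8(vld1q_u32(in)));
    for (int i = 0; i < 4; i++)
        assert(out[i] == ((in[i] << 8) | (in[i] >> 24)));
    return 0;
}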
/*
 * Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
                   uint16_t nb_pkts, uint8_t *split_packet)
{
    volatile union i40e_rx_desc *rxdp;
    struct i40e_rx_entry *sw_ring;
    uint16_t nb_pkts_recd;
    int pos;
    uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

    /* mask to shuffle from desc. to mbuf */
    uint8x16_t shuf_msk = {
        0xFF, 0xFF,   /* pkt_type set as unknown */
        0xFF, 0xFF,   /* pkt_type set as unknown */
        14, 15,       /* octet 15~14, low 16 bits pkt_len */
        0xFF, 0xFF,   /* skip high 16 bits pkt_len, zero out */
        14, 15,       /* octet 15~14, 16 bits data_len */
        2, 3,         /* octet 2~3, low 16 bits vlan_macip */
        4, 5, 6, 7    /* octet 4~7, 32bits rss */
    };

    uint8x16_t eop_check = {
        0x02, 0x00, 0x02, 0x00,
        0x02, 0x00, 0x02, 0x00,
        0x00, 0x00, 0x00, 0x00,
        0x00, 0x00, 0x00, 0x00
    };

    uint16x8_t crc_adjust = {
        0, 0,           /* ignore pkt_type field */
        rxq->crc_len,   /* sub crc on pkt_len */
        0,              /* ignore high-16bits of pkt_len */
        rxq->crc_len,   /* sub crc on data_len */
        0, 0, 0         /* ignore non-length fields */
    };

    /* nb_pkts has to be less than or equal to RTE_I40E_MAX_RX_BURST */
    nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

    /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
    nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

    /* Just the act of getting into the function from the application is
     * going to cost about 7 cycles
     */
    rxdp = rxq->rx_ring + rxq->rx_tail;
    rte_prefetch_non_temporal(rxdp);

    /* See if we need to rearm the RX queue - gives the prefetch a bit
     * of time to act
     */
    if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
        i40e_rxq_rearm(rxq);

    /* Before we start moving massive data around, check to see if
     * there is actually a packet available
     */
    if (!(rxdp->wb.qword1.status_error_len &
          rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
        return 0;

    /* Cache is empty -> need to scan the buffer rings, but first move
     * the next 'n' mbufs into the cache
     */
    sw_ring = &rxq->sw_ring[rxq->rx_tail];

    /* A. load 4 packets in one loop
     * [A*. mask out 4 unused dirty fields in desc]
     * B. copy 4 mbuf pointers from swring to rx_pkts
     * C. calc the number of DD bits among the 4 packets
     * [C*. extract the end-of-packet bit, if requested]
     * D. fill info. from desc to mbuf
     */
    for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
         pos += RTE_I40E_DESCS_PER_LOOP,
         rxdp += RTE_I40E_DESCS_PER_LOOP) {
        uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
        uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
        uint16x8x2_t sterr_tmp1, sterr_tmp2;
        uint64x2_t mbp1, mbp2;
        uint16x8_t staterr;
        uint16x8_t tmp;
        uint64_t stat;
        int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

        /* B.1 load 1 mbuf pointer */
        mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);
        /* Read desc statuses backwards to avoid race condition */
        /* A.1 load 4 pkts desc */
        descs[3] = vld1q_u64((uint64_t *)(rxdp + 3));
        rte_rmb();

        /* B.2 copy 2 mbuf pointers into rx_pkts */
        vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

        /* B.1 load 1 mbuf pointer */
        mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

        descs[2] = vld1q_u64((uint64_t *)(rxdp + 2));
        /* B.1 load 2 mbuf pointers */
        descs[1] = vld1q_u64((uint64_t *)(rxdp + 1));
        descs[0] = vld1q_u64((uint64_t *)(rxdp));

        /* B.2 copy 2 mbuf pointers into rx_pkts */
        vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

        if (split_packet) {
            rte_mbuf_prefetch_part2(rx_pkts[pos]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
            rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
        }

        /* avoid compiler reorder optimization */
        rte_compiler_barrier();

        /* pkt 3,4 shift the pktlen field to be 16-bit aligned */
        uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
                                    len_shl);
        descs[3] = vreinterpretq_u64_u32(len3);
        uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
                                    len_shl);
        descs[2] = vreinterpretq_u64_u32(len2);

        /* D.1 pkt 3,4 convert format from desc to pktmbuf */
        pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
        pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

        /* C.1 4=>2 filter staterr info only */
        sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
                               vreinterpretq_u16_u64(descs[3]));
        /* C.1 4=>2 filter staterr info only */
        sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
                               vreinterpretq_u16_u64(descs[2]));

        /* C.2 get 4 pkts staterr value */
        staterr = vzipq_u16(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0];

        desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

        /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
        tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
        pkt_mb4 = vreinterpretq_u8_u16(tmp);
        tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
        pkt_mb3 = vreinterpretq_u8_u16(tmp);

        /* pkt 1,2 shift the pktlen field to be 16-bit aligned */
        uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
                                    len_shl);
        descs[1] = vreinterpretq_u64_u32(len1);
        uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
                                    len_shl);
        descs[0] = vreinterpretq_u64_u32(len0);

        /* D.1 pkt 1,2 convert format from desc to pktmbuf */
        pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
        pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

        /* D.3 copy final 3,4 data to rx_pkts */
        vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
                 pkt_mb4);
        vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
                 pkt_mb3);

        /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
        tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
        pkt_mb2 = vreinterpretq_u8_u16(tmp);
        tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
        pkt_mb1 = vreinterpretq_u8_u16(tmp);

        /* C* extract and record EOP bit */
        if (split_packet) {
            uint8x16_t eop_shuf_mask = {
                0x00, 0x02, 0x04, 0x06,
                0xFF, 0xFF, 0xFF, 0xFF,
                0xFF, 0xFF, 0xFF, 0xFF,
                0xFF, 0xFF, 0xFF, 0xFF
            };
            uint8x16_t eop_bits;

            /* and with mask to extract bits, flipping 1-0 */
            eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
            eop_bits = vandq_u8(eop_bits, eop_check);
            /* the staterr values are not in order, as the count
             * of dd bits doesn't care. However, for end of
             * packet tracking, we do care, so shuffle. This also
             * compresses the 32-bit values to 8-bit
             */
            eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

            /* store the resulting 32-bit value */
            vst1q_lane_u32((uint32_t *)split_packet,
                           vreinterpretq_u32_u8(eop_bits), 0);
            split_packet += RTE_I40E_DESCS_PER_LOOP;

            /* zero-out next pointers */
            rx_pkts[pos]->next = NULL;
            rx_pkts[pos + 1]->next = NULL;
            rx_pkts[pos + 2]->next = NULL;
            rx_pkts[pos + 3]->next = NULL;
        }

        staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
        staterr = vreinterpretq_u16_s16(
            vshrq_n_s16(vreinterpretq_s16_u16(staterr),
                        I40E_UINT16_BIT - 1));
        stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

        rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

        /* D.3 copy final 1,2 data to rx_pkts */
        vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
                 pkt_mb2);
        vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
                 pkt_mb1);
        desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);

        /* C.4 calc available number of desc */
        if (unlikely(stat == 0)) {
            nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
        } else {
            nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
            break;
        }
    }

    /* Update our internal tail pointer */
    rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
    rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
    rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

    return nb_pkts_recd;
}
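The tail of the loop turns the per-descriptor DD bits into a count of contiguous completed descriptors: after the shift-left/arithmetic-shift-right sign extension, each descriptor owns a 16-bit field of stat that is zero when done, so __builtin_ctzl(stat) / 16 stops at the first descriptor still owned by hardware. A scalar sketch of just that counting step (count_ready is a hypothetical helper, not DPDK code):

#include <assert.h>
#include <stdint.h>

/* Each of four descriptors contributes one 16-bit field of 'stat'
 * that is 0x0000 when its DD (descriptor done) bit is set and 0xFFFF
 * otherwise; the packets received this iteration are the trailing
 * zero bits divided by 16. */
static unsigned count_ready(const int dd[4])
{
    uint64_t stat = 0;
    for (int i = 0; i < 4; i++)
        if (!dd[i])
            stat |= 0xFFFFull << (16 * i);

    if (stat == 0)
        return 4;                             /* all four descriptors done */
    return (unsigned)(__builtin_ctzll(stat) / 16);   /* GCC/Clang builtin */
}

int main(void)
{
    assert(count_ready((const int[4]){1, 1, 1, 1}) == 4);
    assert(count_ready((const int[4]){1, 1, 0, 1}) == 2); /* stop at first not-done */
    assert(count_ready((const int[4]){0, 1, 1, 1}) == 0);
    return 0;
}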
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4],
                  struct rte_mbuf **rx_pkts)
{
    uint32x4_t vlan0, vlan1, rss, l3_l4e;
    const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
    uint64x2_t rearm0, rearm1, rearm2, rearm3;

    /* mask everything except RSS, flow director and VLAN flags
     * bit2 is for VLAN tag, bit11 for flow director indication
     * bit13:12 for RSS indication.
     */
    const uint32x4_t rss_vlan_msk = {
        0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

    const uint32x4_t cksum_mask = {
        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
        PKT_RX_EIP_CKSUM_BAD,
        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
        PKT_RX_EIP_CKSUM_BAD,
        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
        PKT_RX_EIP_CKSUM_BAD,
        PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
        PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
        PKT_RX_EIP_CKSUM_BAD};

    /* map rss and vlan type to rss hash and vlan flag */
    const uint8x16_t vlan_flags = {
        0, 0, 0, 0,
        PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
        0, 0, 0, 0,
        0, 0, 0, 0};

    const uint8x16_t rss_flags = {
        0, PKT_RX_FDIR, 0, 0,
        0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
        0, 0, 0, 0,
        0, 0, 0, 0};

    const uint8x16_t l3_l4e_flags = {
        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
        PKT_RX_IP_CKSUM_BAD >> 1,
        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
        (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
        (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
         PKT_RX_L4_CKSUM_BAD) >> 1,
        (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
         PKT_RX_IP_CKSUM_BAD) >> 1,
        0, 0, 0, 0, 0, 0, 0, 0};

    vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
                      vreinterpretq_u32_u64(descs[2])).val[1];
    vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
                      vreinterpretq_u32_u64(descs[3])).val[1];
    vlan0 = vzipq_u32(vlan0, vlan1).val[0];

    vlan1 = vandq_u32(vlan0, rss_vlan_msk);
    vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
                                            vreinterpretq_u8_u32(vlan1)));

    rss = vshrq_n_u32(vlan1, 11);
    rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
                                          vreinterpretq_u8_u32(rss)));

    l3_l4e = vshrq_n_u32(vlan1, 22);
    l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
                                             vreinterpretq_u8_u32(l3_l4e)));
    /* then we shift left 1 bit */
    l3_l4e = vshlq_n_u32(l3_l4e, 1);
    /* we need to mask out the redundant bits */
    l3_l4e = vandq_u32(l3_l4e, cksum_mask);

    vlan0 = vorrq_u32(vlan0, rss);
    vlan0 = vorrq_u32(vlan0, l3_l4e);

    rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
    rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
    rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
    rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

    vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
    vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
    vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
    vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}
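Each vqtbl1q_u8 above is a 16-entry lookup indexed by a small bit-field shifted into the low bits of every lane; indices of 16 or more return zero, which is what makes the unmasked high bits harmless. A minimal illustration with made-up table values (sketch only; requires AArch64 for vqtbl1q_u8):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Hypothetical 2-bit code -> flag byte mapping, looked up in parallel
 * for 16 lanes with a single table instruction. */
int main(void)
{
    const uint8x16_t flag_tbl = { 0x00, 0x10, 0x20, 0x30,
                                  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 };

    const uint8_t codes[16] = { 0, 1, 2, 3, 3, 2, 1, 0,
                                0, 0, 1, 1, 2, 2, 3, 3 };
    uint8_t flags[16];

    vst1q_u8(flags, vqtbl1q_u8(flag_tbl, vld1q_u8(codes)));

    for (int i = 0; i < 16; i++)
        assert(flags[i] == codes[i] * 0x10);
    return 0;
}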
static void
neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst,
                                    uint32_t *d_end, uint8_t btable[8][4][16],
                                    uint32_t val, int xor, int altmap)
{
  int i, j;
#ifdef ARCH_AARCH64
  uint8x16_t tables[8][4];
#else
  uint8x8x2_t tables[8][4];
#endif
  uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3;
  uint8x16_t p0, p1, p2, p3, si, mask1;
  uint16x8x2_t r0, r1;
  uint8x16x2_t q0, q1;

  for (i = 0; i < 8; i++) {
    for (j = 0; j < 4; j++) {
#ifdef ARCH_AARCH64
      tables[i][j] = vld1q_u8(btable[i][j]);
#else
      tables[i][j].val[0] = vld1_u8(btable[i][j]);
      tables[i][j].val[1] = vld1_u8(btable[i][j] + 8);
#endif
    }
  }

  mask1 = vdupq_n_u8(0xf);

  while (dst < d_end) {
    v0 = vld1q_u32(src); src += 4;
    v1 = vld1q_u32(src); src += 4;
    v2 = vld1q_u32(src); src += 4;
    v3 = vld1q_u32(src); src += 4;

    if (altmap) {
      q0.val[0] = vreinterpretq_u8_u32(v0);
      q0.val[1] = vreinterpretq_u8_u32(v1);
      q1.val[0] = vreinterpretq_u8_u32(v2);
      q1.val[1] = vreinterpretq_u8_u32(v3);
    } else {
      r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2));
      r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3));

      q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]),
                    vreinterpretq_u8_u16(r1.val[0]));
      q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]),
                    vreinterpretq_u8_u16(r1.val[1]));
    }

    si = vandq_u8(q0.val[0], mask1);
    p0 = vqtbl1q_u8(tables[0][0], si);
    p1 = vqtbl1q_u8(tables[0][1], si);
    p2 = vqtbl1q_u8(tables[0][2], si);
    p3 = vqtbl1q_u8(tables[0][3], si);

    si = vshrq_n_u8(q0.val[0], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si));

    si = vandq_u8(q0.val[1], mask1);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si));

    si = vshrq_n_u8(q0.val[1], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si));

    si = vandq_u8(q1.val[0], mask1);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si));

    si = vshrq_n_u8(q1.val[0], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si));

    si = vandq_u8(q1.val[1], mask1);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si));

    si = vshrq_n_u8(q1.val[1], 4);
    p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si));
    p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si));
    p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si));
    p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si));

    if (altmap) {
      s0 = vreinterpretq_u32_u8(p0);
      s1 = vreinterpretq_u32_u8(p1);
      s2 = vreinterpretq_u32_u8(p2);
      s3 = vreinterpretq_u32_u8(p3);
    } else {
      q0 = vtrnq_u8(p0, p1);
      q1 = vtrnq_u8(p2, p3);

      r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]),
                     vreinterpretq_u16_u8(q1.val[0]));
      r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]),
                     vreinterpretq_u16_u8(q1.val[1]));

      s0 = vreinterpretq_u32_u16(r0.val[0]);
      s1 = vreinterpretq_u32_u16(r1.val[0]);
      s2 = vreinterpretq_u32_u16(r0.val[1]);
      s3 = vreinterpretq_u32_u16(r1.val[1]);
    }

    if (xor) {
      v0 = vld1q_u32(dst);
      v1 = vld1q_u32(dst + 4);
      v2 = vld1q_u32(dst + 8);
      v3 = vld1q_u32(dst + 12);

      s0 = veorq_u32(s0, v0);
      s1 = veorq_u32(s1, v1);
      s2 = veorq_u32(s2, v2);
      s3 = veorq_u32(s3, v3);
    }

    vst1q_u32(dst, s0);
    vst1q_u32(dst + 4, s1);
    vst1q_u32(dst + 8, s2);
    vst1q_u32(dst + 12, s3);

    dst += 16;
  }
}
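The body of the loop is the classic split-nibble table multiply: each input byte is split into low and high nibbles, each nibble indexes a 16-entry vqtbl1q_u8 table, and the partial products are XORed together. A reduced sketch of the same idea for GF(2^8) with the AES polynomial (the routine above works on 32-bit words with eight table sets; gf_mul below is a hypothetical scalar reference, and the NEON path requires AArch64):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Scalar GF(2^8) multiply over x^8 + x^4 + x^3 + x + 1, used only to
 * build the tables and to check the vector result. */
static uint8_t gf_mul(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    while (b) {
        if (b & 1) r ^= a;
        a = (uint8_t)((a << 1) ^ ((a & 0x80) ? 0x1b : 0));
        b >>= 1;
    }
    return r;
}

int main(void)
{
    const uint8_t val = 0x53;           /* arbitrary constant factor */
    uint8_t lo_tab[16], hi_tab[16], in[16], out[16];

    for (int i = 0; i < 16; i++) {
        lo_tab[i] = gf_mul(val, (uint8_t)i);          /* val * low nibble  */
        hi_tab[i] = gf_mul(val, (uint8_t)(i << 4));   /* val * high nibble */
        in[i]     = (uint8_t)(i * 29 + 3);
    }

    /* val*x = val*lo ^ val*(hi<<4): two table lookups and one XOR per 16 bytes */
    uint8x16_t x    = vld1q_u8(in);
    uint8x16_t lo   = vandq_u8(x, vdupq_n_u8(0x0f));
    uint8x16_t hi   = vshrq_n_u8(x, 4);
    uint8x16_t prod = veorq_u8(vqtbl1q_u8(vld1q_u8(lo_tab), lo),
                               vqtbl1q_u8(vld1q_u8(hi_tab), hi));
    vst1q_u8(out, prod);

    for (int i = 0; i < 16; i++)
        assert(out[i] == gf_mul(val, in[i]));
    return 0;
}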
/* Excerpt: the enclosing function and the earlier declarations of c_32x2,
 * x255_16x8, x0_8x16, x0_32x4, x1_8x16, x1_32x4, s_32x4, s_8x16, d_32x4,
 * d_8x16 and of the parameters s, d, c and l are not shown here. */
uint8x8_t ad1_8x8;
uint8x8_t alpha0_8x8;
uint8x8_t alpha1_8x8;
uint8x8_t c_8x8;
uint8x8_t d0_8x8;
uint8x8_t d1_8x8;
uint8x8_t s0_8x8;
uint8x8_t s1_8x8;
uint8x8_t sc0_8x8;
uint8x8_t sc1_8x8;

c_32x2 = vdup_n_u32(c);
c_8x8 = vreinterpret_u8_u32(c_32x2);
x255_16x8 = vdupq_n_u16(0xff);
x0_8x16 = vdupq_n_u8(0x0);
x0_32x4 = vreinterpretq_u32_u8(x0_8x16);
x1_8x16 = vdupq_n_u8(0x1);
x1_32x4 = vreinterpretq_u32_u8(x1_8x16);

DATA32 *start = d;
int size = l;
DATA32 *end = start + (size & ~3);
while (start < end) {
    s_32x4 = vld1q_u32(s);
    s_8x16 = vreinterpretq_u8_u32(s_32x4);

    d_32x4 = vld1q_u32(start);
    d_8x16 = vreinterpretq_u8_u32(d_32x4);
    d0_8x8 = vget_low_u8(d_8x16);
    d1_8x8 = vget_high_u8(d_8x16);
    /* excerpt truncated: the remainder of the blend loop is not shown */
void aom_convolve8_horiz_neon(const uint8_t *src, ptrdiff_t src_stride,
                              uint8_t *dst, ptrdiff_t dst_stride,
                              const int16_t *filter_x, int x_step_q4,
                              const int16_t *filter_y,  // unused
                              int y_step_q4,            // unused
                              int w, int h) {
  int width;
  const uint8_t *s, *psrc;
  uint8_t *d, *pdst;
  uint8x8_t d2u8, d3u8, d24u8, d25u8, d26u8, d27u8, d28u8, d29u8;
  uint32x2_t d2u32, d3u32, d28u32, d29u32, d30u32, d31u32;
  uint8x16_t q12u8, q13u8, q14u8, q15u8;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16;
  uint16x4_t d2u16, d3u16, d4u16, d5u16, d16u16, d17u16, d18u16, d19u16;
  int16x8_t q0s16;
  uint16x8_t q1u16, q2u16, q8u16, q9u16, q10u16, q11u16, q12u16, q13u16;
  int32x4_t q1s32, q2s32, q14s32, q15s32;
  uint16x8x2_t q0x2u16;
  uint8x8x2_t d0x2u8, d1x2u8;
  uint32x2x2_t d0x2u32;
  uint16x4x2_t d0x2u16, d1x2u16;
  uint32x4x2_t q0x2u32;

  assert(x_step_q4 == 16);

  (void)x_step_q4;
  (void)y_step_q4;
  (void)filter_y;

  q0s16 = vld1q_s16(filter_x);

  src -= 3;  // adjust for taps
  for (; h > 0; h -= 4, src += src_stride * 4, dst += dst_stride * 4) {
    // loop_horiz_v
    s = src;
    d24u8 = vld1_u8(s);
    s += src_stride;
    d25u8 = vld1_u8(s);
    s += src_stride;
    d26u8 = vld1_u8(s);
    s += src_stride;
    d27u8 = vld1_u8(s);

    q12u8 = vcombine_u8(d24u8, d25u8);
    q13u8 = vcombine_u8(d26u8, d27u8);

    q0x2u16 =
        vtrnq_u16(vreinterpretq_u16_u8(q12u8), vreinterpretq_u16_u8(q13u8));
    d24u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[0]));
    d25u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[0]));
    d26u8 = vreinterpret_u8_u16(vget_low_u16(q0x2u16.val[1]));
    d27u8 = vreinterpret_u8_u16(vget_high_u16(q0x2u16.val[1]));
    d0x2u8 = vtrn_u8(d24u8, d25u8);
    d1x2u8 = vtrn_u8(d26u8, d27u8);

    __builtin_prefetch(src + src_stride * 4);
    __builtin_prefetch(src + src_stride * 5);
    __builtin_prefetch(src + src_stride * 6);

    q8u16 = vmovl_u8(d0x2u8.val[0]);
    q9u16 = vmovl_u8(d0x2u8.val[1]);
    q10u16 = vmovl_u8(d1x2u8.val[0]);
    q11u16 = vmovl_u8(d1x2u8.val[1]);
    d16u16 = vget_low_u16(q8u16);
    d17u16 = vget_high_u16(q8u16);
    d18u16 = vget_low_u16(q9u16);
    d19u16 = vget_high_u16(q9u16);
    q8u16 = vcombine_u16(d16u16, d18u16);  // vswp 17 18
    q9u16 = vcombine_u16(d17u16, d19u16);

    d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
    d23s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));  // vmov 23 21

    for (width = w, psrc = src + 7, pdst = dst; width > 0;
         width -= 4, psrc += 4, pdst += 4) {  // loop_horiz
      s = psrc;
      d28u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d29u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d31u32 = vld1_dup_u32((const uint32_t *)s);
      s += src_stride;
      d30u32 = vld1_dup_u32((const uint32_t *)s);

      __builtin_prefetch(psrc + 64);

      d0x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d28u32), vreinterpret_u16_u32(d31u32));
      d1x2u16 =
          vtrn_u16(vreinterpret_u16_u32(d29u32), vreinterpret_u16_u32(d30u32));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[0]),   // d28
                       vreinterpret_u8_u16(d1x2u16.val[0]));  // d29
      d1x2u8 = vtrn_u8(vreinterpret_u8_u16(d0x2u16.val[1]),   // d31
                       vreinterpret_u8_u16(d1x2u16.val[1]));  // d30

      __builtin_prefetch(psrc + 64 + src_stride);

      q14u8 = vcombine_u8(d0x2u8.val[0], d0x2u8.val[1]);
      q15u8 = vcombine_u8(d1x2u8.val[1], d1x2u8.val[0]);
      q0x2u32 =
          vtrnq_u32(vreinterpretq_u32_u8(q14u8), vreinterpretq_u32_u8(q15u8));

      d28u8 = vreinterpret_u8_u32(vget_low_u32(q0x2u32.val[0]));
      d29u8 = vreinterpret_u8_u32(vget_high_u32(q0x2u32.val[0]));
      q12u16 = vmovl_u8(d28u8);
      q13u16 = vmovl_u8(d29u8);

      __builtin_prefetch(psrc + 64 + src_stride * 2);

      d16s16 = vreinterpret_s16_u16(vget_low_u16(q8u16));
      d17s16 = vreinterpret_s16_u16(vget_high_u16(q8u16));
      d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
      d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
      d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
      d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
      d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
      d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
      d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));

      q1s32 = MULTIPLY_BY_Q0(d16s16, d17s16, d20s16, d22s16, d18s16, d19s16,
                             d23s16, d24s16, q0s16);
      q2s32 = MULTIPLY_BY_Q0(d17s16, d20s16, d22s16, d18s16, d19s16, d23s16,
                             d24s16, d26s16, q0s16);
      q14s32 = MULTIPLY_BY_Q0(d20s16, d22s16, d18s16, d19s16, d23s16, d24s16,
                              d26s16, d27s16, q0s16);
      q15s32 = MULTIPLY_BY_Q0(d22s16, d18s16, d19s16, d23s16, d24s16, d26s16,
                              d27s16, d25s16, q0s16);

      __builtin_prefetch(psrc + 60 + src_stride * 3);

      d2u16 = vqrshrun_n_s32(q1s32, 7);
      d3u16 = vqrshrun_n_s32(q2s32, 7);
      d4u16 = vqrshrun_n_s32(q14s32, 7);
      d5u16 = vqrshrun_n_s32(q15s32, 7);

      q1u16 = vcombine_u16(d2u16, d3u16);
      q2u16 = vcombine_u16(d4u16, d5u16);

      d2u8 = vqmovn_u16(q1u16);
      d3u8 = vqmovn_u16(q2u16);

      d0x2u16 = vtrn_u16(vreinterpret_u16_u8(d2u8), vreinterpret_u16_u8(d3u8));
      d0x2u32 = vtrn_u32(vreinterpret_u32_u16(d0x2u16.val[0]),
                         vreinterpret_u32_u16(d0x2u16.val[1]));
      d0x2u8 = vtrn_u8(vreinterpret_u8_u32(d0x2u32.val[0]),
                       vreinterpret_u8_u32(d0x2u32.val[1]));

      d2u32 = vreinterpret_u32_u8(d0x2u8.val[0]);
      d3u32 = vreinterpret_u32_u8(d0x2u8.val[1]);

      d = pdst;
      vst1_lane_u32((uint32_t *)d, d2u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 0);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d2u32, 1);
      d += dst_stride;
      vst1_lane_u32((uint32_t *)d, d3u32, 1);

      q8u16 = q9u16;
      d20s16 = d23s16;
      q11u16 = q12u16;
      q9u16 = q13u16;
      d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
    }
  }
  return;
}
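After the eight-tap MULTIPLY_BY_Q0 sums, the results are 32-bit accumulators scaled by 128, and the vqrshrun_n_s32(x, 7) / vqmovn_u16 pair rounds, rescales and saturates them back to pixels. A small check of that narrowing step with invented accumulator values (sketch only, not derived from the filter above):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* vqrshrun_n_s32(x, 7): divide by 128 with rounding, saturate to an
 * unsigned 16-bit lane; vqmovn_u16 then saturates to 8 bits. */
int main(void)
{
    const int32_t acc[4]    = { 255 * 128, -42, 1000 * 128, 64 }; /* Q7 sums   */
    const uint8_t expect[4] = { 255, 0, 255, 1 };                 /* clamped   */

    uint16x4_t n16 = vqrshrun_n_s32(vld1q_s32(acc), 7);
    uint8x8_t  n8  = vqmovn_u16(vcombine_u16(n16, n16));

    uint8_t out[8];
    vst1_u8(out, n8);
    for (int i = 0; i < 4; i++)
        assert(out[i] == expect[i]);
    return 0;
}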
operator uint32x4_t () const { return vreinterpretq_u32_u8(val); }
void vp8_mbloop_filter_vertical_edge_uv_neon(unsigned char *u, int pitch,
                                             unsigned char blimit,
                                             unsigned char limit,
                                             unsigned char thresh,
                                             unsigned char *v) {
  unsigned char *us, *ud;
  unsigned char *vs, *vd;
  uint8x16_t qblimit, qlimit, qthresh, q3, q4;
  uint8x16_t q5, q6, q7, q8, q9, q10;
  uint8x8_t d6, d7, d8, d9, d10, d11, d12, d13, d14;
  uint8x8_t d15, d16, d17, d18, d19, d20, d21;
  uint32x4x2_t q2tmp0, q2tmp1, q2tmp2, q2tmp3;
  uint16x8x2_t q2tmp4, q2tmp5, q2tmp6, q2tmp7;
  uint8x16x2_t q2tmp8, q2tmp9, q2tmp10, q2tmp11;

  qblimit = vdupq_n_u8(blimit);
  qlimit = vdupq_n_u8(limit);
  qthresh = vdupq_n_u8(thresh);

  us = u - 4;
  vs = v - 4;
  d6 = vld1_u8(us);  us += pitch;
  d7 = vld1_u8(vs);  vs += pitch;
  d8 = vld1_u8(us);  us += pitch;
  d9 = vld1_u8(vs);  vs += pitch;
  d10 = vld1_u8(us); us += pitch;
  d11 = vld1_u8(vs); vs += pitch;
  d12 = vld1_u8(us); us += pitch;
  d13 = vld1_u8(vs); vs += pitch;
  d14 = vld1_u8(us); us += pitch;
  d15 = vld1_u8(vs); vs += pitch;
  d16 = vld1_u8(us); us += pitch;
  d17 = vld1_u8(vs); vs += pitch;
  d18 = vld1_u8(us); us += pitch;
  d19 = vld1_u8(vs); vs += pitch;
  d20 = vld1_u8(us);
  d21 = vld1_u8(vs);

  q3 = vcombine_u8(d6, d7);
  q4 = vcombine_u8(d8, d9);
  q5 = vcombine_u8(d10, d11);
  q6 = vcombine_u8(d12, d13);
  q7 = vcombine_u8(d14, d15);
  q8 = vcombine_u8(d16, d17);
  q9 = vcombine_u8(d18, d19);
  q10 = vcombine_u8(d20, d21);

  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  vp8_mbloop_filter_neon(qblimit, qlimit, qthresh, q3, q4, q5, q6, q7, q8, q9,
                         q10, &q4, &q5, &q6, &q7, &q8, &q9);

  q2tmp0 = vtrnq_u32(vreinterpretq_u32_u8(q3), vreinterpretq_u32_u8(q7));
  q2tmp1 = vtrnq_u32(vreinterpretq_u32_u8(q4), vreinterpretq_u32_u8(q8));
  q2tmp2 = vtrnq_u32(vreinterpretq_u32_u8(q5), vreinterpretq_u32_u8(q9));
  q2tmp3 = vtrnq_u32(vreinterpretq_u32_u8(q6), vreinterpretq_u32_u8(q10));

  q2tmp4 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[0]),
                     vreinterpretq_u16_u32(q2tmp2.val[0]));
  q2tmp5 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[0]),
                     vreinterpretq_u16_u32(q2tmp3.val[0]));
  q2tmp6 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp0.val[1]),
                     vreinterpretq_u16_u32(q2tmp2.val[1]));
  q2tmp7 = vtrnq_u16(vreinterpretq_u16_u32(q2tmp1.val[1]),
                     vreinterpretq_u16_u32(q2tmp3.val[1]));

  q2tmp8 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[0]),
                    vreinterpretq_u8_u16(q2tmp5.val[0]));
  q2tmp9 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp4.val[1]),
                    vreinterpretq_u8_u16(q2tmp5.val[1]));
  q2tmp10 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[0]),
                     vreinterpretq_u8_u16(q2tmp7.val[0]));
  q2tmp11 = vtrnq_u8(vreinterpretq_u8_u16(q2tmp6.val[1]),
                     vreinterpretq_u8_u16(q2tmp7.val[1]));

  q3 = q2tmp8.val[0];
  q4 = q2tmp8.val[1];
  q5 = q2tmp9.val[0];
  q6 = q2tmp9.val[1];
  q7 = q2tmp10.val[0];
  q8 = q2tmp10.val[1];
  q9 = q2tmp11.val[0];
  q10 = q2tmp11.val[1];

  ud = u - 4;
  vst1_u8(ud, vget_low_u8(q3));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q4));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q5));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q6));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q7));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q8));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q9));  ud += pitch;
  vst1_u8(ud, vget_low_u8(q10));

  vd = v - 4;
  vst1_u8(vd, vget_high_u8(q3));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q4));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q5));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q6));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q7));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q8));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q9));  vd += pitch;
  vst1_u8(vd, vget_high_u8(q10));
  return;
}
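The long vtrnq_u32 / vtrnq_u16 / vtrnq_u8 chains above are a register transpose: each stage swaps progressively narrower lane pairs until rows become columns. A compact sketch of the same idiom on a 4x4 matrix of 32-bit lanes (illustrative only; the loop filter transposes a 16x8 byte matrix):

#include <arm_neon.h>
#include <assert.h>
#include <stdint.h>

/* Transpose a 4x4 matrix of uint32_t with two vtrnq_u32 steps plus a
 * low/high-half recombination, and verify against the scalar result. */
int main(void)
{
    uint32_t m[4][4], t[4][4];
    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            m[r][c] = (uint32_t)(r * 10 + c);

    uint32x4_t r0 = vld1q_u32(m[0]), r1 = vld1q_u32(m[1]);
    uint32x4_t r2 = vld1q_u32(m[2]), r3 = vld1q_u32(m[3]);

    uint32x4x2_t p01 = vtrnq_u32(r0, r1);   /* interleave rows 0,1 */
    uint32x4x2_t p23 = vtrnq_u32(r2, r3);   /* interleave rows 2,3 */

    vst1q_u32(t[0], vcombine_u32(vget_low_u32(p01.val[0]),  vget_low_u32(p23.val[0])));
    vst1q_u32(t[1], vcombine_u32(vget_low_u32(p01.val[1]),  vget_low_u32(p23.val[1])));
    vst1q_u32(t[2], vcombine_u32(vget_high_u32(p01.val[0]), vget_high_u32(p23.val[0])));
    vst1q_u32(t[3], vcombine_u32(vget_high_u32(p01.val[1]), vget_high_u32(p23.val[1])));

    for (int r = 0; r < 4; r++)
        for (int c = 0; c < 4; c++)
            assert(t[r][c] == m[c][r]);
    return 0;
}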