void test_vsubQu16 (void)
{
	uint16x8_t out_uint16x8_t;
	uint16x8_t arg0_uint16x8_t;
	uint16x8_t arg1_uint16x8_t;

	out_uint16x8_t = vsubq_u16 (arg0_uint16x8_t, arg1_uint16x8_t);
}
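/* A minimal, self-contained sketch (not part of the original sources) showing
 * what vsubq_u16 computes: eight lane-wise 16-bit subtractions with modulo
 * 2^16 wraparound. Assumes <arm_neon.h> on an AArch64/NEON target. */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
	uint16_t a[8] = {10, 20, 30, 40, 50, 60, 70, 80};
	uint16_t b[8] = {1, 2, 3, 4, 5, 6, 7, 100};
	uint16_t c[8];

	uint16x8_t va = vld1q_u16(a);
	uint16x8_t vb = vld1q_u16(b);
	vst1q_u16(c, vsubq_u16(va, vb));	/* c[7] wraps: 80 - 100 = 65516 */

	for (int i = 0; i < 8; i++)
		printf("%u ", c[i]);
	printf("\n");
	return 0;
}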
/* u16x8 sub: element-wise C = A - B over a Row x Col matrix of u16,
 * eight lanes at a time, with a scalar loop for the remainder. */
void mw_neon_mm_sub_u16x8(unsigned short *A, int Row, int Col,
			  unsigned short *B, unsigned short *C)
{
	uint16x8_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	/* Main loop: process full 8-element blocks. */
	for (i = 8; i <= size; i += 8) {
		k = i - 8;
		neon_a = vld1q_u16(A + k);
		neon_b = vld1q_u16(B + k);
		neon_c = vsubq_u16(neon_a, neon_b);
		vst1q_u16(C + k, neon_c);
	}

	/* Scalar tail: handle the remaining size % 8 elements.
	 * If the main loop never ran (size < 8), i is still 8, so k = 0. */
	k = i - 8;
	for (i = 0; i < size % 8; i++)
		C[k + i] = A[k + i] - B[k + i];
}
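/* Hedged usage sketch (not part of the original file): exercising
 * mw_neon_mm_sub_u16x8, defined above, on a 3x5 matrix whose size (15) is
 * not a multiple of 8, so both the vector loop and the scalar tail run. */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
	unsigned short A[15], B[15], C[15];
	int i;

	for (i = 0; i < 15; i++) {
		A[i] = (unsigned short)(i * 10);
		B[i] = (unsigned short)i;
	}

	mw_neon_mm_sub_u16x8(A, 3, 5, B, C);

	for (i = 0; i < 15; i++)	/* expect 0, 9, 18, ... */
		printf("%u ", C[i]);
	printf("\n");
	return 0;
}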
/* Notice:
 * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST
 *   numbers of DD bits
 */
static inline uint16_t
_recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		   uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union i40e_rx_desc *rxdp;
	struct i40e_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl;

	/* mask to shuffle from desc. to mbuf */
	uint8x16_t shuf_msk = {
		0xFF, 0xFF,	/* pkt_type set as unknown */
		0xFF, 0xFF,	/* pkt_type set as unknown */
		14, 15,		/* octet 15~14, low 16 bits pkt_len */
		0xFF, 0xFF,	/* skip high 16 bits pkt_len, zero out */
		14, 15,		/* octet 15~14, 16 bits data_len */
		2, 3,		/* octet 2~3, low 16 bits vlan_macip */
		4, 5, 6, 7	/* octet 4~7, 32 bits rss */
		};

	uint8x16_t eop_check = {
		0x02, 0x00, 0x02, 0x00,
		0x02, 0x00, 0x02, 0x00,
		0x00, 0x00, 0x00, 0x00,
		0x00, 0x00, 0x00, 0x00
		};

	uint16x8_t crc_adjust = {
		0, 0,		/* ignore pkt_type field */
		rxq->crc_len,	/* sub crc on pkt_len */
		0,		/* ignore high-16bits of pkt_len */
		rxq->crc_len,	/* sub crc on data_len */
		0, 0, 0		/* ignore non-length fields */
		};

	/* nb_pkts has to be no greater than RTE_I40E_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	rte_prefetch_non_temporal(rxdp);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH)
		i40e_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.qword1.status_error_len &
			rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT)))
		return 0;

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packets in one loop
	 * [A*. mask out 4 unused dirty fields in desc]
	 * B. copy 4 mbuf pointers from sw_ring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_I40E_DESCS_PER_LOOP,
			rxdp += RTE_I40E_DESCS_PER_LOOP) {
		uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP];
		uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		uint16x8x2_t sterr_tmp1, sterr_tmp2;
		uint64x2_t mbp1, mbp2;
		uint16x8_t staterr;
		uint16x8_t tmp;
		uint64_t stat;

		int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT};

		/* B.1 load 2 mbuf pointers */
		mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load desc[3] */
		descs[3] = vld1q_u64((uint64_t *)(rxdp + 3));
		rte_rmb();

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1);

		/* B.1 load 2 more mbuf pointers */
		mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]);

		/* A.1 load the remaining descs */
		descs[2] = vld1q_u64((uint64_t *)(rxdp + 2));
		descs[1] = vld1q_u64((uint64_t *)(rxdp + 1));
		descs[0] = vld1q_u64((uint64_t *)(rxdp));

		/* B.2 copy 2 mbuf pointers into rx_pkts */
		vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* pkt 3,4 shift the pktlen field to be 16-bit aligned */
		uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]),
					    len_shl);
		descs[3] = vreinterpretq_u64_u32(len3);
		uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]),
					    len_shl);
		descs[2] = vreinterpretq_u64_u32(len2);

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk);
		pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]),
				       vreinterpretq_u16_u64(descs[3]));
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]),
				       vreinterpretq_u16_u64(descs[2]));

		/* C.2 get 4 pkts staterr value */
		staterr = vzipq_u16(sterr_tmp1.val[1],
				    sterr_tmp2.val[1]).val[0];

		desc_to_olflags_v(rxq, descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust);
		pkt_mb4 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust);
		pkt_mb3 = vreinterpretq_u8_u16(tmp);

		/* pkt 1,2 shift the pktlen field to be 16-bit aligned */
		uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]),
					    len_shl);
		descs[1] = vreinterpretq_u64_u32(len1);
		uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]),
					    len_shl);
		descs[0] = vreinterpretq_u64_u32(len0);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk);
		pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk);

		/* D.3 copy final 3,4 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1,
			 pkt_mb4);
		vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1,
			 pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust);
		pkt_mb2 = vreinterpretq_u8_u16(tmp);
		tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust);
		pkt_mb1 = vreinterpretq_u8_u16(tmp);

		/* C* extract and record EOP bit */
		if (split_packet) {
			uint8x16_t eop_shuf_mask = {
					0x00, 0x02, 0x04, 0x06,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF};
			uint8x16_t eop_bits;

			/* and with mask to extract bits, flipping 1-0 */
			eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr));
			eop_bits = vandq_u8(eop_bits, eop_check);
			/* the staterr values are not in order, as the count
			 * of DD bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask);

			/* store the resulting 32-bit value */
			vst1q_lane_u32((uint32_t *)split_packet,
				       vreinterpretq_u32_u8(eop_bits), 0);
			split_packet += RTE_I40E_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1);
		staterr = vreinterpretq_u16_s16(
				vshrq_n_s16(vreinterpretq_s16_u16(staterr),
					    I40E_UINT16_BIT - 1));
		stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);

		rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP);

		/* D.3 copy final 1,2 data to rx_pkts */
		vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1,
			 pkt_mb2);
		vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1,
			 pkt_mb1);
		desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl);

		/* C.4 calc available number of desc */
		if (unlikely(stat == 0)) {
			nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP;
		} else {
			nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT;
			break;
		}
	}

	/* Update our internal tail pointer */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
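/* Hedged sketch (standalone, not DPDK code): the DD-bit counting idiom from
 * the end of _recv_raw_pkts_vec above, isolated. Bit 0 of each of the first
 * four 16-bit lanes is treated as a descriptor-done flag; the shift-left /
 * arithmetic-shift-right pair broadcasts it across its lane, and counting
 * trailing zero bits of the inverted low 64 bits yields how many leading
 * descriptors are done. Assumes <arm_neon.h> on AArch64. */
#include <arm_neon.h>
#include <stdio.h>

static int count_dd(uint16x8_t staterr)
{
	/* Move bit 0 into the sign bit, then arithmetic-shift it back so each
	 * lane becomes 0xFFFF (DD set) or 0x0000 (DD clear). */
	staterr = vshlq_n_u16(staterr, 15);
	staterr = vreinterpretq_u16_s16(
			vshrq_n_s16(vreinterpretq_s16_u16(staterr), 15));

	/* The low 64 bits hold lanes 0..3; inverting makes the first clear DD
	 * bit the first set bit, so ctz / 16 counts the done descriptors. */
	uint64_t stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0);
	if (stat == 0)
		return 4;	/* all four DD bits set */
	return __builtin_ctzl(stat) / 16;
}

int main(void)
{
	uint16_t lanes[8] = {1, 1, 0, 1, 0, 0, 0, 0};	/* DD, DD, gap, DD */
	printf("%d\n", count_dd(vld1q_u16(lanes)));	/* prints 2 */
	return 0;
}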
inline uint16x8_t vsubq(const uint16x8_t &v0, const uint16x8_t &v1)
{
	return vsubq_u16(v0, v1);
}
/* virtio vPMD receive routine, only accepts nb_pkts >= RTE_VIRTIO_DESC_PER_LOOP
 *
 * This routine is for non-mergeable RX, one desc for each guest buffer.
 * This routine is based on the RX ring layout optimization. Each entry in the
 * avail ring points to the desc with the same index in the desc ring and this
 * will never be changed in the driver.
 *
 * - nb_pkts < RTE_VIRTIO_DESC_PER_LOOP, just return no packet
 */
uint16_t
virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
		     uint16_t nb_pkts)
{
	struct virtnet_rx *rxvq = rx_queue;
	struct virtqueue *vq = rxvq->vq;
	uint16_t nb_used;
	uint16_t desc_idx;
	struct vring_used_elem *rused;
	struct rte_mbuf **sw_ring;
	struct rte_mbuf **sw_ring_end;
	uint16_t nb_pkts_received;

	uint8x16_t shuf_msk1 = {
		0xFF, 0xFF, 0xFF, 0xFF,	/* packet type */
		4, 5, 0xFF, 0xFF,	/* pkt len */
		4, 5,			/* dat len */
		0xFF, 0xFF,		/* vlan tci */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	uint8x16_t shuf_msk2 = {
		0xFF, 0xFF, 0xFF, 0xFF,	/* packet type */
		12, 13, 0xFF, 0xFF,	/* pkt len */
		12, 13,			/* dat len */
		0xFF, 0xFF,		/* vlan tci */
		0xFF, 0xFF, 0xFF, 0xFF
	};

	/* Subtract the header length.
	 * In which case do we need the header length in used->len?
	 */
	uint16x8_t len_adjust = {
		0, 0,
		(uint16_t)vq->hw->vtnet_hdr_size, 0,
		(uint16_t)vq->hw->vtnet_hdr_size,
		0,
		0, 0
	};

	if (unlikely(nb_pkts < RTE_VIRTIO_DESC_PER_LOOP))
		return 0;

	nb_used = VIRTQUEUE_NUSED(vq);

	rte_rmb();

	if (unlikely(nb_used == 0))
		return 0;

	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_VIRTIO_DESC_PER_LOOP);
	nb_used = RTE_MIN(nb_used, nb_pkts);

	desc_idx = (uint16_t)(vq->vq_used_cons_idx & (vq->vq_nentries - 1));
	rused = &vq->vq_ring.used->ring[desc_idx];
	sw_ring = &vq->sw_ring[desc_idx];
	sw_ring_end = &vq->sw_ring[vq->vq_nentries];

	rte_prefetch_non_temporal(rused);

	if (vq->vq_free_cnt >= RTE_VIRTIO_VPMD_RX_REARM_THRESH) {
		virtio_rxq_rearm_vec(rxvq);
		if (unlikely(virtqueue_kick_prepare(vq)))
			virtqueue_notify(vq);
	}

	for (nb_pkts_received = 0; nb_pkts_received < nb_used;) {
		uint64x2_t desc[RTE_VIRTIO_DESC_PER_LOOP / 2];
		uint64x2_t mbp[RTE_VIRTIO_DESC_PER_LOOP / 2];
		uint64x2_t pkt_mb[RTE_VIRTIO_DESC_PER_LOOP];

		mbp[0] = vld1q_u64((uint64_t *)(sw_ring + 0));
		desc[0] = vld1q_u64((uint64_t *)(rused + 0));
		vst1q_u64((uint64_t *)&rx_pkts[0], mbp[0]);

		mbp[1] = vld1q_u64((uint64_t *)(sw_ring + 2));
		desc[1] = vld1q_u64((uint64_t *)(rused + 2));
		vst1q_u64((uint64_t *)&rx_pkts[2], mbp[1]);

		mbp[2] = vld1q_u64((uint64_t *)(sw_ring + 4));
		desc[2] = vld1q_u64((uint64_t *)(rused + 4));
		vst1q_u64((uint64_t *)&rx_pkts[4], mbp[2]);

		mbp[3] = vld1q_u64((uint64_t *)(sw_ring + 6));
		desc[3] = vld1q_u64((uint64_t *)(rused + 6));
		vst1q_u64((uint64_t *)&rx_pkts[6], mbp[3]);

		pkt_mb[1] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[0]), shuf_msk2));
		pkt_mb[0] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[0]), shuf_msk1));
		pkt_mb[1] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[1]), len_adjust));
		pkt_mb[0] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[0]), len_adjust));
		vst1q_u64((void *)&rx_pkts[1]->rx_descriptor_fields1,
			  pkt_mb[1]);
		vst1q_u64((void *)&rx_pkts[0]->rx_descriptor_fields1,
			  pkt_mb[0]);

		pkt_mb[3] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[1]), shuf_msk2));
		pkt_mb[2] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[1]), shuf_msk1));
		pkt_mb[3] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[3]), len_adjust));
		pkt_mb[2] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[2]), len_adjust));
		vst1q_u64((void *)&rx_pkts[3]->rx_descriptor_fields1,
			  pkt_mb[3]);
		vst1q_u64((void *)&rx_pkts[2]->rx_descriptor_fields1,
			  pkt_mb[2]);

		pkt_mb[5] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[2]), shuf_msk2));
		pkt_mb[4] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[2]), shuf_msk1));
		pkt_mb[5] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[5]), len_adjust));
		pkt_mb[4] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[4]), len_adjust));
		vst1q_u64((void *)&rx_pkts[5]->rx_descriptor_fields1,
			  pkt_mb[5]);
		vst1q_u64((void *)&rx_pkts[4]->rx_descriptor_fields1,
			  pkt_mb[4]);

		pkt_mb[7] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[3]), shuf_msk2));
		pkt_mb[6] = vreinterpretq_u64_u8(vqtbl1q_u8(
				vreinterpretq_u8_u64(desc[3]), shuf_msk1));
		pkt_mb[7] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[7]), len_adjust));
		pkt_mb[6] = vreinterpretq_u64_u16(vsubq_u16(
				vreinterpretq_u16_u64(pkt_mb[6]), len_adjust));
		vst1q_u64((void *)&rx_pkts[7]->rx_descriptor_fields1,
			  pkt_mb[7]);
		vst1q_u64((void *)&rx_pkts[6]->rx_descriptor_fields1,
			  pkt_mb[6]);

		if (unlikely(nb_used <= RTE_VIRTIO_DESC_PER_LOOP)) {
			if (sw_ring + nb_used <= sw_ring_end)
				nb_pkts_received += nb_used;
			else
				nb_pkts_received += sw_ring_end - sw_ring;
			break;
		} else {
			if (unlikely(sw_ring + RTE_VIRTIO_DESC_PER_LOOP >=
				sw_ring_end)) {
				nb_pkts_received += sw_ring_end - sw_ring;
				break;
			} else {
				nb_pkts_received += RTE_VIRTIO_DESC_PER_LOOP;

				rx_pkts += RTE_VIRTIO_DESC_PER_LOOP;
				sw_ring += RTE_VIRTIO_DESC_PER_LOOP;
				rused += RTE_VIRTIO_DESC_PER_LOOP;
				nb_used -= RTE_VIRTIO_DESC_PER_LOOP;
			}
		}
	}

	vq->vq_used_cons_idx += nb_pkts_received;
	vq->vq_free_cnt += nb_pkts_received;
	rxvq->stats.packets += nb_pkts_received;
	return nb_pkts_received;
}
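/* Hedged sketch (standalone, not driver code): the vqtbl1q_u8 idiom both RX
 * routines above rely on. Each byte of the mask selects a source byte; any
 * out-of-range index (e.g. 0xFF) yields 0, which is how shuf_msk zeroes out
 * the fields it wants to skip. Assumes <arm_neon.h> on AArch64. */
#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
	/* Pretend "descriptor": 16 bytes 0x00..0x0F. */
	uint8_t d[16];
	for (int i = 0; i < 16; i++)
		d[i] = (uint8_t)i;

	/* Keep source bytes 4..5 in lanes 0..1, zero everything else. */
	uint8x16_t msk = {4, 5, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF,
			  0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF};
	uint8x16_t out = vqtbl1q_u8(vld1q_u8(d), msk);

	uint8_t o[16];
	vst1q_u8(o, out);
	printf("%02x %02x %02x\n", o[0], o[1], o[2]);	/* 04 05 00 */
	return 0;
}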
int neon_new(DATA32 *_p0, DATA32 *_p1, DATA32 *_p2, DATA32 *_p3,
	     DATA32 *_ax, DATA32 _ay, DATA32 *result, int len)
{
	int ay = _ay;
	int i;
	DATA32 *pbuf = result;
	uint16x4_t ay_16x4;
	uint16x4_t p0_16x4;
	uint16x4_t p2_16x4;
	uint16x8_t ax_16x8;
	uint16x8_t p0_p2_16x8;
	uint16x8_t p1_p3_16x8;
	uint16x8_t x255_16x8;
	uint32x2_t p0_p2_32x2;
	uint32x2_t p1_p3_32x2;
	uint32x2_t res_32x2;
	uint8x8_t p0_p2_8x8;
	uint8x8_t p1_p3_8x8;
	uint8x8_t p2_8x8;

	ay_16x4 = vdup_n_u16(ay);
	x255_16x8 = vdupq_n_u16(0xff);

	for (i = 0; i < len; i++) {
		DATA32 p0 = *_p0++;
		DATA32 p1 = *_p1++;
		DATA32 p2 = *_p2++;
		DATA32 p3 = *_p3++;
		int ax = *_ax++;

		if (p0 | p1 | p2 | p3) {
			ax_16x8 = vdupq_n_u16(ax);

			/* pack the four source pixels as {p0, p2} and {p1, p3} */
			p0_p2_32x2 = vset_lane_u32(p0, p0_p2_32x2, 0);
			p0_p2_32x2 = vset_lane_u32(p2, p0_p2_32x2, 1);
			p1_p3_32x2 = vset_lane_u32(p1, p1_p3_32x2, 0);
			p1_p3_32x2 = vset_lane_u32(p3, p1_p3_32x2, 1);

			p0_p2_8x8 = vreinterpret_u8_u32(p0_p2_32x2);
			p1_p3_8x8 = vreinterpret_u8_u32(p1_p3_32x2);
			/* widen each 8-bit channel to 16 bits */
			p1_p3_16x8 = vmovl_u8(p1_p3_8x8);
			p0_p2_16x8 = vmovl_u8(p0_p2_8x8);

			/* horizontal lerp: p += ((q - p) * ax) >> 8 */
			p1_p3_16x8 = vsubq_u16(p1_p3_16x8, p0_p2_16x8);
			p1_p3_16x8 = vmulq_u16(p1_p3_16x8, ax_16x8);
			p1_p3_16x8 = vshrq_n_u16(p1_p3_16x8, 8);
			p1_p3_16x8 = vaddq_u16(p1_p3_16x8, p0_p2_16x8);
			p1_p3_16x8 = vandq_u16(p1_p3_16x8, x255_16x8);

			p0_16x4 = vget_low_u16(p1_p3_16x8);
			p2_16x4 = vget_high_u16(p1_p3_16x8);

			/* vertical lerp between the two horizontal results */
			p2_16x4 = vsub_u16(p2_16x4, p0_16x4);
			p2_16x4 = vmul_u16(p2_16x4, ay_16x4);
			p2_16x4 = vshr_n_u16(p2_16x4, 8);
			p2_16x4 = vadd_u16(p2_16x4, p0_16x4);

			/* only lane 1 of res_32x2 is stored below, so the low
			 * half of this combine is a don't-care; the original
			 * combined with an uninitialized temp_16x4 here, which
			 * is undefined behavior -- reuse p0_16x4 instead */
			p1_p3_16x8 = vcombine_u16(p0_16x4, p2_16x4);
			p2_8x8 = vmovn_u16(p1_p3_16x8);
			res_32x2 = vreinterpret_u32_u8(p2_8x8);
			vst1_lane_u32(pbuf++, res_32x2, 1);
		} else {
			*pbuf++ = p0;
		}
	}
	return 0;
}
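/* Hedged scalar reference (not from the original source): the per-channel
 * math the NEON loop above implements, with ax/ay as 8.8 fixed-point
 * fractions -- a bilinear lerp of the four neighbors p0..p3. Useful for
 * checking the vector path on a single pixel; it ignores the 16-bit modular
 * arithmetic of the vector version and assumes arithmetic right shift for
 * negative intermediates, as on typical targets. */
#include <stdint.h>

static uint32_t bilerp_pixel(uint32_t p0, uint32_t p1, uint32_t p2,
			     uint32_t p3, int ax, int ay)
{
	uint32_t out = 0;

	for (int shift = 0; shift < 32; shift += 8) {
		int a = (p0 >> shift) & 0xff;	/* top-left */
		int b = (p1 >> shift) & 0xff;	/* top-right */
		int c = (p2 >> shift) & 0xff;	/* bottom-left */
		int d = (p3 >> shift) & 0xff;	/* bottom-right */

		int top = a + (((b - a) * ax) >> 8);	/* horizontal lerps */
		int bot = c + (((d - c) * ax) >> 8);
		int v = top + (((bot - top) * ay) >> 8);	/* vertical lerp */

		out |= (uint32_t)(v & 0xff) << shift;
	}
	return out;
}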