static inline void i40e_rxq_rearm(struct i40e_rx_queue *rxq) { int i; uint16_t rx_id; volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *rxep = &rxq->sw_ring[rxq->rxrearm_start]; struct rte_mbuf *mb0, *mb1; uint64x2_t dma_addr0, dma_addr1; uint64x2_t zero = vdupq_n_u64(0); uint64_t paddr; rxdp = rxq->rx_ring + rxq->rxrearm_start; /* Pull 'n' more MBUFs into the software ring */ if (unlikely(rte_mempool_get_bulk(rxq->mp, (void *)rxep, RTE_I40E_RXQ_REARM_THRESH) < 0)) { if (rxq->rxrearm_nb + RTE_I40E_RXQ_REARM_THRESH >= rxq->nb_rx_desc) { for (i = 0; i < RTE_I40E_DESCS_PER_LOOP; i++) { rxep[i].mbuf = &rxq->fake_mbuf; vst1q_u64((uint64_t *)&rxdp[i].read, zero); } } rte_eth_devices[rxq->port_id].data->rx_mbuf_alloc_failed += RTE_I40E_RXQ_REARM_THRESH; return; } /* Initialize the mbufs in vector, process 2 mbufs in one loop */ for (i = 0; i < RTE_I40E_RXQ_REARM_THRESH; i += 2, rxep += 2) { mb0 = rxep[0].mbuf; mb1 = rxep[1].mbuf; paddr = mb0->buf_iova + RTE_PKTMBUF_HEADROOM; dma_addr0 = vdupq_n_u64(paddr); /* flush desc with pa dma_addr */ vst1q_u64((uint64_t *)&rxdp++->read, dma_addr0); paddr = mb1->buf_iova + RTE_PKTMBUF_HEADROOM; dma_addr1 = vdupq_n_u64(paddr); vst1q_u64((uint64_t *)&rxdp++->read, dma_addr1); } rxq->rxrearm_start += RTE_I40E_RXQ_REARM_THRESH; if (rxq->rxrearm_start >= rxq->nb_rx_desc) rxq->rxrearm_start = 0; rxq->rxrearm_nb -= RTE_I40E_RXQ_REARM_THRESH; rx_id = (uint16_t)((rxq->rxrearm_start == 0) ? (rxq->nb_rx_desc - 1) : (rxq->rxrearm_start - 1)); /* Update the tail pointer on the NIC */ I40E_PCI_REG_WRITE(rxq->qrx_tail, rx_id); }
void test_vdupQ_nu64 (void) { uint64x2_t out_uint64x2_t; uint64_t arg0_uint64_t; out_uint64x2_t = vdupq_n_u64 (arg0_uint64_t); }
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) { int i; const int16x8_t zero = vdupq_n_s16(0); const int16x8_t max = vdupq_n_s16(MAX_Y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; for (i = 0; i + 8 <= len; i += 8) { const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); const int16x8_t D = vsubq_s16(A, B); // diff_y const int16x8_t F = vaddq_s16(C, D); // new_y const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); const int16x8_t I = vabsq_s16(D); // abs(diff_y) vst1q_u16(dst + i, H); sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); } diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; dst[i] = clip_y(new_y); diff += (uint64_t)(abs(diff_y)); } return diff; }
void foo () { int8x8_t val14; int8x8_t val15; uint8x8_t val16; uint32x4_t val40; val14 = vcreate_s8 (0xff0080f6807f807fUL); val15 = vcreate_s8 (0x10807fff7f808080UL); val16 = vcgt_s8 (val14, val15); val40 = vreinterpretq_u32_u64 ( vdupq_n_u64 ( vget_lane_s64_1 ( vreinterpret_s64_u8 (val16), 0) )); }
uint64x2_t test_vdupq_n_u64(uint64_t v1) { // CHECK: test_vdupq_n_u64 return vdupq_n_u64(v1); // CHECK: dup {{v[0-9]+}}.2d, {{x[0-9]+}} }
bool CPU_ProbeNEON() { #if defined(CRYPTOPP_NO_CPU_FEATURE_PROBES) return false; #elif (CRYPTOPP_ARM_NEON_AVAILABLE) # if defined(CRYPTOPP_MS_STYLE_INLINE_ASSEMBLY) volatile bool result = true; __try { uint32_t v1[4] = {1,1,1,1}; uint32x4_t x1 = vld1q_u32(v1); uint64_t v2[2] = {1,1}; uint64x2_t x2 = vld1q_u64(v2); uint32x4_t x3 = vdupq_n_u32(2); x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0); x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3); uint64x2_t x4 = vdupq_n_u64(2); x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0); x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1); result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1)); } __except (EXCEPTION_EXECUTE_HANDLER) { return false; } return result; # else // longjmp and clobber warnings. Volatile is required. // http://github.com/weidai11/cryptopp/issues/24 and http://stackoverflow.com/q/7721854 volatile bool result = true; volatile SigHandler oldHandler = signal(SIGILL, SigIllHandler); if (oldHandler == SIG_ERR) return false; volatile sigset_t oldMask; if (sigprocmask(0, NULLPTR, (sigset_t*)&oldMask)) return false; if (setjmp(s_jmpSIGILL)) result = false; else { uint32_t v1[4] = {1,1,1,1}; uint32x4_t x1 = vld1q_u32(v1); uint64_t v2[2] = {1,1}; uint64x2_t x2 = vld1q_u64(v2); uint32x4_t x3 = {0,0,0,0}; x3 = vsetq_lane_u32(vgetq_lane_u32(x1,0),x3,0); x3 = vsetq_lane_u32(vgetq_lane_u32(x1,3),x3,3); uint64x2_t x4 = {0,0}; x4 = vsetq_lane_u64(vgetq_lane_u64(x2,0),x4,0); x4 = vsetq_lane_u64(vgetq_lane_u64(x2,1),x4,1); // Hack... GCC optimizes away the code and returns true result = !!(vgetq_lane_u32(x3,0) | vgetq_lane_u64(x4,1)); } sigprocmask(SIG_SETMASK, (sigset_t*)&oldMask, NULLPTR); signal(SIGILL, oldHandler); return result; # endif #else return false; #endif // CRYPTOPP_ARM_NEON_AVAILABLE }
inline uint64x2_t vdupq_n(const u64 & val) { return vdupq_n_u64(val); }