void test_vreinterpretQu8_u32 (void) { uint8x16_t out_uint8x16_t; uint32x4_t arg0_uint32x4_t; out_uint8x16_t = vreinterpretq_u8_u32 (arg0_uint32x4_t); }
// about twice as fast as generic void MipMap_32_neon( int width, int height, unsigned char *source, unsigned char *target ) { if(width < 8) { MipMap_32_generic(width, height, source, target); return; } int newwidth = width / 2; int newheight = height / 2; int stride = width * 4; unsigned char *s = target; unsigned char *t = source; unsigned char *u = t+stride; int y, x; for( y = 0; y < newheight; y++ ) { for( x = 0; x < newwidth; x+=4 ) { uint8x16_t a0, a1, a2, a3; memcpy(&a0, t, 16); memcpy(&a1, t+16, 16); memcpy(&a2, u, 16); memcpy(&a3, u+16, 16); // average first and second scan lines a0 = vhaddq_u8(a0, a2); a1 = vhaddq_u8(a1, a3); // repack uint32x4x2_t z = vuzpq_u32(vreinterpretq_u32_u8(a0), vreinterpretq_u32_u8(a1)); uint8x16_t d0, d1; memcpy(&d0, &z.val[0], 16), memcpy(&d1, &z.val[1], 16); // average even and odd x pixels a0 = vhaddq_u8(vreinterpretq_u8_u32(z.val[0]), vreinterpretq_u8_u32(z.val[1])); memcpy(s, &a0, 16); s+=16; t+=32; u+=32; } t += stride; u += stride; } }
static void MixColumns(void) { uint32x4_t a = vreinterpretq_u32_u8(*state); uint32x4_t b = vreinterpretq_u32_u8(xtime(*state)); uint32x4_t a3 = veorq_u32(a,b); uint32x4_t a3r = vshlq_n_u32(a3,8); a3r = vsraq_n_u32(a3r,a3,24); uint32x4_t a2 = vshlq_n_u32(a,16); a2 = vsraq_n_u32(a2,a,16); uint32x4_t a1 = vshlq_n_u32(a,24); a1 = vsraq_n_u32(a1,a,8); uint32x4_t out = veorq_u32(b,a1); out = veorq_u32(out,a2); out = veorq_u32(out,a3r); *state = vreinterpretq_u8_u32(out); }
static inline void desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4], struct rte_mbuf **rx_pkts) { uint32x4_t vlan0, vlan1, rss, l3_l4e; const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0}; uint64x2_t rearm0, rearm1, rearm2, rearm3; /* mask everything except RSS, flow director and VLAN flags * bit2 is for VLAN tag, bit11 for flow director indication * bit13:12 for RSS indication. */ const uint32x4_t rss_vlan_msk = { 0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804}; const uint32x4_t cksum_mask = { PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD}; /* map rss and vlan type to rss hash and vlan flag */ const uint8x16_t vlan_flags = { 0, 0, 0, 0, PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; const uint8x16_t rss_flags = { 0, PKT_RX_FDIR, 0, 0, 0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR, 0, 0, 0, 0, 0, 0, 0, 0}; const uint8x16_t l3_l4e_flags = { (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1, PKT_RX_IP_CKSUM_BAD >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1, (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1, (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1, (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, 0, 0, 0, 0, 0, 0, 0, 0}; vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]), vreinterpretq_u32_u64(descs[2])).val[1]; vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]), vreinterpretq_u32_u64(descs[3])).val[1]; vlan0 = vzipq_u32(vlan0, vlan1).val[0]; vlan1 = vandq_u32(vlan0, rss_vlan_msk); vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags, vreinterpretq_u8_u32(vlan1))); rss = vshrq_n_u32(vlan1, 11); rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags, vreinterpretq_u8_u32(rss))); l3_l4e = vshrq_n_u32(vlan1, 22); l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags, vreinterpretq_u8_u32(l3_l4e))); /* then we shift left 1 bit */ l3_l4e = vshlq_n_u32(l3_l4e, 1); /* we need to mask out the reduntant bits */ l3_l4e = vandq_u32(l3_l4e, cksum_mask); vlan0 = vorrq_u32(vlan0, rss); vlan0 = vorrq_u32(vlan0, l3_l4e); rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1); rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1); rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1); rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1); vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0); vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1); vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2); vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3); }
static void neon_w32_split_4_32_multiply_region(gf_t *gf, uint32_t *src, uint32_t *dst, uint32_t *d_end, uint8_t btable[8][4][16], uint32_t val, int xor, int altmap) { int i, j; #ifdef ARCH_AARCH64 uint8x16_t tables[8][4]; #else uint8x8x2_t tables[8][4]; #endif uint32x4_t v0, v1, v2, v3, s0, s1, s2, s3; uint8x16_t p0, p1, p2, p3, si, mask1; uint16x8x2_t r0, r1; uint8x16x2_t q0, q1; for (i = 0; i < 8; i++) { for (j = 0; j < 4; j++) { #ifdef ARCH_AARCH64 tables[i][j] = vld1q_u8(btable[i][j]); #else tables[i][j].val[0] = vld1_u8(btable[i][j]); tables[i][j].val[1] = vld1_u8(btable[i][j] + 8); #endif } } mask1 = vdupq_n_u8(0xf); while (dst < d_end) { v0 = vld1q_u32(src); src += 4; v1 = vld1q_u32(src); src += 4; v2 = vld1q_u32(src); src += 4; v3 = vld1q_u32(src); src += 4; if (altmap) { q0.val[0] = vreinterpretq_u8_u32(v0); q0.val[1] = vreinterpretq_u8_u32(v1); q1.val[0] = vreinterpretq_u8_u32(v2); q1.val[1] = vreinterpretq_u8_u32(v3); } else { r0 = vtrnq_u16(vreinterpretq_u16_u32(v0), vreinterpretq_u16_u32(v2)); r1 = vtrnq_u16(vreinterpretq_u16_u32(v1), vreinterpretq_u16_u32(v3)); q0 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[0]), vreinterpretq_u8_u16(r1.val[0])); q1 = vtrnq_u8(vreinterpretq_u8_u16(r0.val[1]), vreinterpretq_u8_u16(r1.val[1])); } si = vandq_u8(q0.val[0], mask1); p0 = vqtbl1q_u8(tables[0][0], si); p1 = vqtbl1q_u8(tables[0][1], si); p2 = vqtbl1q_u8(tables[0][2], si); p3 = vqtbl1q_u8(tables[0][3], si); si = vshrq_n_u8(q0.val[0], 4); p0 = veorq_u8(p0, vqtbl1q_u8(tables[1][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[1][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[1][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[1][3], si)); si = vandq_u8(q0.val[1], mask1); p0 = veorq_u8(p0, vqtbl1q_u8(tables[2][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[2][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[2][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[2][3], si)); si = vshrq_n_u8(q0.val[1], 4); p0 = veorq_u8(p0, vqtbl1q_u8(tables[3][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[3][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[3][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[3][3], si)); si = vandq_u8(q1.val[0], mask1); p0 = veorq_u8(p0, vqtbl1q_u8(tables[4][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[4][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[4][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[4][3], si)); si = vshrq_n_u8(q1.val[0], 4); p0 = veorq_u8(p0, vqtbl1q_u8(tables[5][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[5][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[5][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[5][3], si)); si = vandq_u8(q1.val[1], mask1); p0 = veorq_u8(p0, vqtbl1q_u8(tables[6][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[6][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[6][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[6][3], si)); si = vshrq_n_u8(q1.val[1], 4); p0 = veorq_u8(p0, vqtbl1q_u8(tables[7][0], si)); p1 = veorq_u8(p1, vqtbl1q_u8(tables[7][1], si)); p2 = veorq_u8(p2, vqtbl1q_u8(tables[7][2], si)); p3 = veorq_u8(p3, vqtbl1q_u8(tables[7][3], si)); if (altmap) { s0 = vreinterpretq_u32_u8(p0); s1 = vreinterpretq_u32_u8(p1); s2 = vreinterpretq_u32_u8(p2); s3 = vreinterpretq_u32_u8(p3); } else { q0 = vtrnq_u8(p0, p1); q1 = vtrnq_u8(p2, p3); r0 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[0]), vreinterpretq_u16_u8(q1.val[0])); r1 = vtrnq_u16(vreinterpretq_u16_u8(q0.val[1]), vreinterpretq_u16_u8(q1.val[1])); s0 = vreinterpretq_u32_u16(r0.val[0]); s1 = vreinterpretq_u32_u16(r1.val[0]); s2 = vreinterpretq_u32_u16(r0.val[1]); s3 = vreinterpretq_u32_u16(r1.val[1]); } if (xor) { v0 = vld1q_u32(dst); v1 = vld1q_u32(dst + 4); v2 = vld1q_u32(dst + 8); v3 = vld1q_u32(dst + 12); s0 = veorq_u32(s0, v0); s1 = veorq_u32(s1, v1); s2 = veorq_u32(s2, v2); s3 = veorq_u32(s3, v3); } vst1q_u32(dst, s0); vst1q_u32(dst + 4, s1); vst1q_u32(dst + 8, s2); vst1q_u32(dst + 12, s3); dst += 16; } }
_v16u8_( const uint32x4_t &src) { val = vreinterpretq_u8_u32(src); }