// Bulk-load one 4x3 bone matrix (12 words) from PSP memory into the cached
// bone-matrix array, starting at the current boneMatrixNumber index.
// Each source word is shifted left by 8 before storing; this drops the top
// byte and leaves the low 24 bits in the upper part of the stored word.
// NOTE(review): presumably the source words are raw GE matrix-data command
// words (command byte in the top 8 bits, 24-bit float payload below) — confirm
// against the display-list writer.
// addr: PSP address of the 12 source words; not validated here
// (GetPointerUnchecked), so the caller must have checked it.
void GPUgstate::FastLoadBoneMatrix(u32 addr) {
	const u32_le *src = (const u32_le *)Memory::GetPointerUnchecked(addr);
	u32 num = boneMatrixNumber;
	// Destination index is masked to 7 bits.
	// NOTE(review): boneMatrix appears to hold 8 matrices * 12 entries = 96
	// words; num & 0x7F can reach 127, so the caller presumably guarantees
	// num <= 84 before taking this fast path — confirm.
	u32 *dst = (u32 *)(boneMatrix + (num & 0x7F));
#ifdef _M_SSE
	// SSE path: three 128-bit rows, each shifted left 8 per 32-bit lane.
	__m128i row1 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)src), 8);
	__m128i row2 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 4)), 8);
	__m128i row3 = _mm_slli_epi32(_mm_loadu_si128((const __m128i *)(src + 8)), 8);
	if ((num & 0x3) == 0) {
		// Index is a multiple of 4, so dst is 16-byte aligned (assuming the
		// boneMatrix array itself is 16-byte aligned) — use aligned stores.
		_mm_store_si128((__m128i *)dst, row1);
		_mm_store_si128((__m128i *)(dst + 4), row2);
		_mm_store_si128((__m128i *)(dst + 8), row3);
	} else {
		_mm_storeu_si128((__m128i *)dst, row1);
		_mm_storeu_si128((__m128i *)(dst + 4), row2);
		_mm_storeu_si128((__m128i *)(dst + 8), row3);
	}
#elif PPSSPP_ARCH(ARM_NEON)
	// NEON path: same three-row shift-and-store (NEON stores need no
	// alignment distinction here).
	const uint32x4_t row1 = vshlq_n_u32(vld1q_u32(src), 8);
	const uint32x4_t row2 = vshlq_n_u32(vld1q_u32(src + 4), 8);
	const uint32x4_t row3 = vshlq_n_u32(vld1q_u32(src + 8), 8);
	vst1q_u32(dst, row1);
	vst1q_u32(dst + 4, row2);
	vst1q_u32(dst + 8, row3);
#else
	// Portable fallback.
	for (int i = 0; i < 12; i++) {
		dst[i] = src[i] << 8;
	}
#endif
	num += 12;
	// Advance the matrix index (wrapping within 7 bits) and store it back as a
	// full BONEMATRIXNUMBER command word in the global gstate.
	gstate.boneMatrixNumber = (GE_CMD_BONEMATRIXNUMBER << 24) | (num & 0x7F);
}
// Mix all four state columns at once using NEON.
// Treats the 16-byte state as four 32-bit words (one column each) and builds
// the MixColumns result from byte rotations within each word. Rotations are
// synthesized as (x << s) + (x >> (32-s)) via vshlq_n + vsraq_n; the shifted
// halves never overlap, so the accumulate-add equals an OR.
// NOTE(review): relies on xtime() (defined elsewhere) performing the per-byte
// GF(2^8) doubling of the whole state — confirm.
static void MixColumns(void)
{
	uint32x4_t col = vreinterpretq_u32_u8(*state);
	uint32x4_t col2 = vreinterpretq_u32_u8(xtime(*state));

	/* col rotated by one byte */
	uint32x4_t rot1 = vshlq_n_u32(col, 24);
	rot1 = vsraq_n_u32(rot1, col, 8);

	/* col rotated by two bytes */
	uint32x4_t rot2 = vshlq_n_u32(col, 16);
	rot2 = vsraq_n_u32(rot2, col, 16);

	/* (col ^ col2) rotated by three bytes */
	uint32x4_t sum3 = veorq_u32(col, col2);
	uint32x4_t rot3 = vshlq_n_u32(sum3, 8);
	rot3 = vsraq_n_u32(rot3, sum3, 24);

	/* combine all contributions */
	uint32x4_t mixed = veorq_u32(col2, rot1);
	mixed = veorq_u32(mixed, rot2);
	mixed = veorq_u32(mixed, rot3);
	*state = vreinterpretq_u8_u32(mixed);
}
// Salsa20 stream cipher: encrypt/decrypt (XOR with keystream) or, when m is
// NULL, write raw keystream.
//   c    - output buffer (mlen bytes)
//   m    - input to XOR with the keystream, or NULL for pure keystream
//   mlen - number of bytes to produce
//   n    - 8-byte nonce
//   k    - 32-byte key
// Returns 0.
//
// The 4x4 state is kept in the "diagonal" layout used by Bernstein's SIMD
// Salsa20 implementations: start0 holds the four constants (sigma, loaded
// from a file-scope table), start1/start2/start3 hold the key/nonce/counter
// words pre-rotated so each quarter-round operates lane-wise on whole
// vectors. The main loop processes TWO 64-byte blocks at once (diag* and
// next_diag*), with the two instruction streams hand-interleaved for
// scheduling; statement order is deliberate — do not reorder.
int crypto_stream_xor( unsigned char *c, const unsigned char *m,unsigned long long mlen, const unsigned char *n, const unsigned char *k )
{
	// Lane-select mask (all-ones in lanes 0 and 2) used by vbslq_u32 when
	// un-diagonalizing the output.
	const uint32x4_t abab = {-1,0,-1,0};
	// 64-bit increment for the block counter held in n2n3.
	const uint64x1_t nextblock = {1};
	uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
	uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
	uint32x4_t start0 = (uint32x4_t) vld1q_u8((uint8_t *) sigma);
	uint32x2_t n0n1 = (uint32x2_t) vld1_u8((uint8_t *) n);
	// n2n3 is the 64-bit block counter, starting at 0.
	uint32x2_t n2n3 = {0,0};
	// Split key into 64-bit halves, then build the diagonal input rows via
	// vext shuffles. Variable names record the two 32-bit words each holds.
	uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
	uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
	uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
	uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);
	uint32x2_t n1n0 = vext_u32(n0n1,n0n1,1);
	uint32x2_t n3n2;
	uint32x2_t n0k4 = vext_u32(n1n0,k4k5,1);
	uint32x2_t k5k0 = vext_u32(k4k5,k0k1,1);
	uint32x2_t k1n1 = vext_u32(k0k1,n1n0,1);
	uint32x2_t n2k6;
	uint32x2_t k7k2 = vext_u32(k6k7,k2k3,1);
	uint32x2_t k3n3;
	uint32x4_t start1 = vcombine_u32(k5k0,n0k4);
	// start2/start3 depend on the counter and are rebuilt per block below.
	uint32x4_t start2;
	uint32x4_t start3;
	register uint32x4_t diag0;
	register uint32x4_t diag1;
	register uint32x4_t diag2;
	register uint32x4_t diag3;
	uint32x4_t next_start2;
	uint32x4_t next_start3;
	register uint32x4_t next_diag0;
	register uint32x4_t next_diag1;
	register uint32x4_t next_diag2;
	register uint32x4_t next_diag3;
	// Post-round diagonal sums, then intermediate shuffles back to linear
	// x0..x15 order; names record which state words each vector holds.
	uint32x4_t x0x5x10x15;
	uint32x4_t x12x1x6x11;
	uint32x4_t x8x13x2x7;
	uint32x4_t x4x9x14x3;
	uint32x4_t x0x1x10x11;
	uint32x4_t x12x13x6x7;
	uint32x4_t x8x9x2x3;
	uint32x4_t x4x5x14x15;
	uint32x4_t x0x1x2x3;
	uint32x4_t x4x5x6x7;
	uint32x4_t x8x9x10x11;
	uint32x4_t x12x13x14x15;
	uint32x4_t m0m1m2m3;
	uint32x4_t m4m5m6m7;
	uint32x4_t m8m9m10m11;
	uint32x4_t m12m13m14m15;
	// Quarter-round temporaries for the current block (a*/b*) and the
	// interleaved second block (next_a*/next_b*).
	register uint32x4_t a0;
	register uint32x4_t a1;
	register uint32x4_t a2;
	register uint32x4_t a3;
	register uint32x4_t b0;
	register uint32x4_t b1;
	register uint32x4_t b2;
	register uint32x4_t b3;
	register uint32x4_t next_a0;
	register uint32x4_t next_a1;
	register uint32x4_t next_a2;
	register uint32x4_t next_a3;
	register uint32x4_t next_b0;
	register uint32x4_t next_b1;
	register uint32x4_t next_b2;
	register uint32x4_t next_b3;
	// Scratch for partial final blocks (< 64 bytes).
	unsigned char block[64];
	unsigned char *savec;
	int i;
	// flagm distinguishes XOR mode (m given) from keystream-only mode.
	int flagm = (m != 0);
	if (!mlen) return 0;
	if (mlen < 128) goto mlenatleast1;
	// ---- Two-blocks-at-a-time path (mlen >= 128) ----
	mlenatleast128:
	// Build start2/start3 for the first block from the current counter,
	// then increment the counter (64-bit add across the two lanes of n2n3).
	n3n2 = vext_u32(n2n3,n2n3,1);
	n2k6 = vext_u32(n3n2,k6k7,1);
	k3n3 = vext_u32(k2k3,n3n2,1);
	start2 = vcombine_u32(n2k6,k1n1);
	start3 = vcombine_u32(k3n3,k7k2);
	n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);
	diag0 = start0;
	diag1 = start1;
	diag2 = start2;
	diag3 = start3;
	// Same for the second (next_) block with the incremented counter.
	n3n2 = vext_u32(n2n3,n2n3,1);
	n2k6 = vext_u32(n3n2,k6k7,1);
	k3n3 = vext_u32(k2k3,n3n2,1);
	next_start2 = vcombine_u32(n2k6,k1n1);
	next_start3 = vcombine_u32(k3n3,k7k2);
	n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);
	next_diag0 = start0;
	next_diag1 = start1;
	next_diag2 = next_start2;
	next_diag3 = next_start3;
	// ROUNDS (external macro) Salsa20 rounds, two per iteration (column
	// round + row round). Each rotate is synthesized as vshlq_n followed by
	// vsriq_n (shift-right-insert). vextq_u32 re-diagonalizes between the
	// half-rounds. Statements for the two blocks are interleaved.
	for (i = ROUNDS;i > 0;i -= 2) {
		a0 = diag1 + diag0;
		b0 = vshlq_n_u32(a0,7);
		next_a0 = next_diag1 + next_diag0;
		a0 = vsriq_n_u32(b0,a0,25);
		next_b0 = vshlq_n_u32(next_a0,7);
		diag3 ^= a0;
		next_a0 = vsriq_n_u32(next_b0,next_a0,25);
		a1 = diag0 + diag3;
		next_diag3 ^= next_a0;
		b1 = vshlq_n_u32(a1,9);
		next_a1 = next_diag0 + next_diag3;
		a1 = vsriq_n_u32(b1,a1,23);
		next_b1 = vshlq_n_u32(next_a1,9);
		diag2 ^= a1;
		next_a1 = vsriq_n_u32(next_b1,next_a1,23);
		a2 = diag3 + diag2;
		diag3 = vextq_u32(diag3,diag3,3);
		next_diag2 ^= next_a1;
		b2 = vshlq_n_u32(a2,13);
		next_a2 = next_diag3 + next_diag2;
		next_diag3 = vextq_u32(next_diag3,next_diag3,3);
		a2 = vsriq_n_u32(b2,a2,19);
		next_b2 = vshlq_n_u32(next_a2,13);
		diag1 ^= a2;
		next_a2 = vsriq_n_u32(next_b2,next_a2,19);
		a3 = diag2 + diag1;
		diag2 = vextq_u32(diag2,diag2,2);
		next_diag1 ^= next_a2;
		b3 = vshlq_n_u32(a3,18);
		diag1 = vextq_u32(diag1,diag1,1);
		next_a3 = next_diag2 + next_diag1;
		next_diag2 = vextq_u32(next_diag2,next_diag2,2);
		a3 = vsriq_n_u32(b3,a3,14);
		next_b3 = vshlq_n_u32(next_a3,18);
		next_diag1 = vextq_u32(next_diag1,next_diag1,1);
		diag0 ^= a3;
		next_a3 = vsriq_n_u32(next_b3,next_a3,14);
		a0 = diag3 + diag0;
		next_diag0 ^= next_a3;
		b0 = vshlq_n_u32(a0,7);
		next_a0 = next_diag3 + next_diag0;
		a0 = vsriq_n_u32(b0,a0,25);
		next_b0 = vshlq_n_u32(next_a0,7);
		diag1 ^= a0;
		next_a0 = vsriq_n_u32(next_b0,next_a0,25);
		a1 = diag0 + diag1;
		next_diag1 ^= next_a0;
		b1 = vshlq_n_u32(a1,9);
		next_a1 = next_diag0 + next_diag1;
		a1 = vsriq_n_u32(b1,a1,23);
		next_b1 = vshlq_n_u32(next_a1,9);
		diag2 ^= a1;
		next_a1 = vsriq_n_u32(next_b1,next_a1,23);
		a2 = diag1 + diag2;
		diag1 = vextq_u32(diag1,diag1,3);
		next_diag2 ^= next_a1;
		b2 = vshlq_n_u32(a2,13);
		next_a2 = next_diag1 + next_diag2;
		next_diag1 = vextq_u32(next_diag1,next_diag1,3);
		a2 = vsriq_n_u32(b2,a2,19);
		next_b2 = vshlq_n_u32(next_a2,13);
		diag3 ^= a2;
		next_a2 = vsriq_n_u32(next_b2,next_a2,19);
		a3 = diag2 + diag3;
		diag2 = vextq_u32(diag2,diag2,2);
		next_diag3 ^= next_a2;
		b3 = vshlq_n_u32(a3,18);
		diag3 = vextq_u32(diag3,diag3,1);
		next_a3 = next_diag2 + next_diag3;
		next_diag2 = vextq_u32(next_diag2,next_diag2,2);
		a3 = vsriq_n_u32(b3,a3,14);
		next_b3 = vshlq_n_u32(next_a3,18);
		next_diag3 = vextq_u32(next_diag3,next_diag3,1);
		diag0 ^= a3;
		next_a3 = vsriq_n_u32(next_b3,next_a3,14);
		next_diag0 ^= next_a3;
	}
	// Finalize first block: add the initial state, then shuffle the diagonal
	// vectors back to linear x0..x15 order via vbslq_u32 + vcombine_u32.
	x0x5x10x15 = diag0 + start0;
	x12x1x6x11 = diag1 + start1;
	x8x13x2x7 = diag2 + start2;
	x4x9x14x3 = diag3 + start3;
	if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
	if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
	if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
	if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);
	x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);
	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));
	// XOR with the message only in XOR mode; write 64 bytes of output.
	if (flagm) x0x1x2x3 ^= m0m1m2m3;
	if (flagm) x4x5x6x7 ^= m4m5m6m7;
	if (flagm) x8x9x10x11 ^= m8m9m10m11;
	if (flagm) x12x13x14x15 ^= m12m13m14m15;
	vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
	vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
	vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
	vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);
	// Finalize second block identically (bytes 64..127 of this iteration).
	x0x5x10x15 = next_diag0 + start0;
	x12x1x6x11 = next_diag1 + start1;
	x8x13x2x7 = next_diag2 + next_start2;
	x4x9x14x3 = next_diag3 + next_start3;
	if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8(64 + (uint8_t *) m);
	if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(80 + (uint8_t *) m);
	if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(96 + (uint8_t *) m);
	if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(112 + (uint8_t *) m);
	x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);
	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));
	if (flagm) x0x1x2x3 ^= m0m1m2m3;
	if (flagm) x4x5x6x7 ^= m4m5m6m7;
	if (flagm) x8x9x10x11 ^= m8m9m10m11;
	if (flagm) x12x13x14x15 ^= m12m13m14m15;
	vst1q_u8(64 + (uint8_t *) c,(uint8x16_t) x0x1x2x3);
	vst1q_u8(80 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
	vst1q_u8(96 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
	vst1q_u8(112 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);
	mlen -= 128;
	c += 128;
	if (flagm) m += 128;
	if (mlen >= 128) goto mlenatleast128;
	// ---- Tail path: one 64-byte block per pass ----
	mlenatleast1:
	if (mlen < 64) {
		// Partial final block: stage input (zero-padded) in a scratch buffer
		// and redirect c/m at it; the real destination is saved in savec.
		if (flagm) for (i = 0;i < 64;++i) block[i] = 0;
		if (flagm) for (i = 0;i < mlen;++i) block[i] = m[i];
		savec = c;
		c = block;
		if (flagm) m = block;
	}
	// Rebuild the counter-dependent rows for this block.
	n3n2 = vext_u32(n2n3,n2n3,1);
	n2k6 = vext_u32(n3n2,k6k7,1);
	k3n3 = vext_u32(k2k3,n3n2,1);
	start2 = vcombine_u32(n2k6,k1n1);
	start3 = vcombine_u32(k3n3,k7k2);
	diag0 = start0;
	diag1 = start1;
	diag2 = start2;
	diag3 = start3;
	// Single-block round loop (same structure as above, no interleaving).
	for (i = ROUNDS;i > 0;i -= 2) {
		a0 = diag1 + diag0;
		b0 = vshlq_n_u32(a0,7);
		a0 = vsriq_n_u32(b0,a0,25);
		diag3 ^= a0;
		a1 = diag0 + diag3;
		b1 = vshlq_n_u32(a1,9);
		a1 = vsriq_n_u32(b1,a1,23);
		diag2 ^= a1;
		a2 = diag3 + diag2;
		diag3 = vextq_u32(diag3,diag3,3);
		b2 = vshlq_n_u32(a2,13);
		a2 = vsriq_n_u32(b2,a2,19);
		diag1 ^= a2;
		a3 = diag2 + diag1;
		diag2 = vextq_u32(diag2,diag2,2);
		b3 = vshlq_n_u32(a3,18);
		diag1 = vextq_u32(diag1,diag1,1);
		a3 = vsriq_n_u32(b3,a3,14);
		diag0 ^= a3;
		a0 = diag3 + diag0;
		b0 = vshlq_n_u32(a0,7);
		a0 = vsriq_n_u32(b0,a0,25);
		diag1 ^= a0;
		a1 = diag0 + diag1;
		b1 = vshlq_n_u32(a1,9);
		a1 = vsriq_n_u32(b1,a1,23);
		diag2 ^= a1;
		a2 = diag1 + diag2;
		diag1 = vextq_u32(diag1,diag1,3);
		b2 = vshlq_n_u32(a2,13);
		a2 = vsriq_n_u32(b2,a2,19);
		diag3 ^= a2;
		a3 = diag2 + diag3;
		diag2 = vextq_u32(diag2,diag2,2);
		b3 = vshlq_n_u32(a3,18);
		diag3 = vextq_u32(diag3,diag3,1);
		a3 = vsriq_n_u32(b3,a3,14);
		diag0 ^= a3;
	}
	// Finalize and emit the block, as in the two-block path.
	x0x5x10x15 = diag0 + start0;
	x12x1x6x11 = diag1 + start1;
	x8x13x2x7 = diag2 + start2;
	x4x9x14x3 = diag3 + start3;
	if (flagm) m0m1m2m3 = (uint32x4_t) vld1q_u8((uint8_t *) m);
	if (flagm) m4m5m6m7 = (uint32x4_t) vld1q_u8(16 + (uint8_t *) m);
	if (flagm) m8m9m10m11 = (uint32x4_t) vld1q_u8(32 + (uint8_t *) m);
	if (flagm) m12m13m14m15 = (uint32x4_t) vld1q_u8(48 + (uint8_t *) m);
	x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);
	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));
	if (flagm) x0x1x2x3 ^= m0m1m2m3;
	if (flagm) x4x5x6x7 ^= m4m5m6m7;
	if (flagm) x8x9x10x11 ^= m8m9m10m11;
	if (flagm) x12x13x14x15 ^= m12m13m14m15;
	vst1q_u8((uint8_t *) c,(uint8x16_t) x0x1x2x3);
	vst1q_u8(16 + (uint8_t *) c,(uint8x16_t) x4x5x6x7);
	vst1q_u8(32 + (uint8_t *) c,(uint8x16_t) x8x9x10x11);
	vst1q_u8(48 + (uint8_t *) c,(uint8x16_t) x12x13x14x15);
	if (mlen < 64) {
		// Partial block: copy only the requested bytes back to the real
		// destination (c currently points at the scratch buffer).
		for (i = 0;i < mlen;++i) savec[i] = c[i];
	}
	if (mlen <= 64) return 0;
	// More data remains: bump the counter and continue with the tail loop.
	n2n3 = (uint32x2_t) vadd_u64(nextblock,(uint64x1_t) n2n3);
	mlen -= 64;
	c += 64;
	if (flagm) m += 64;
	goto mlenatleast1;
}
/* Extract RSS/VLAN/checksum status from four i40e RX descriptors and write
 * the rearm_data (mbuf initializer + ol_flags) of the four matching mbufs.
 *
 * rxq     - receive queue; only mbuf_initializer is read here
 * descs   - four 16-byte RX descriptors, each as a uint64x2_t
 * rx_pkts - four mbufs whose rearm_data is overwritten
 *
 * Strategy: gather the status words of all four descriptors into one
 * uint32x4_t, mask the interesting bits, then translate bit groups to
 * ol_flags via vqtbl1q_u8 table lookups (AArch64 TBL). NOTE(review): the
 * lookup tables assume each post-shift lane value fits in the low nibble of
 * its lowest byte — confirm against the descriptor layout.
 */
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, uint64x2_t descs[4], struct rte_mbuf **rx_pkts)
{
	uint32x4_t vlan0, vlan1, rss, l3_l4e;
	/* rearm_data template: mbuf_initializer in the low half, ol_flags will
	 * be inserted into the high half below. */
	const uint64x2_t mbuf_init = {rxq->mbuf_initializer, 0};
	uint64x2_t rearm0, rearm1, rearm2, rearm3;

	/* mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication.
	 */
	const uint32x4_t rss_vlan_msk = {
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804};

	/* bits the l3_l4e table lookup is allowed to contribute */
	const uint32x4_t cksum_mask = {
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD};

	/* map rss and vlan type to rss hash and vlan flag */
	const uint8x16_t vlan_flags = {
			0, 0, 0, 0,
			PKT_RX_VLAN | PKT_RX_VLAN_STRIPPED, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, 0};

	const uint8x16_t rss_flags = {
			0, PKT_RX_FDIR, 0, 0,
			0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH | PKT_RX_FDIR,
			0, 0, 0, 0,
			0, 0, 0, 0};

	/* checksum flags are pre-shifted right by 1 so they fit in a byte;
	 * they are shifted back left after the lookup. */
	const uint8x16_t l3_l4e_flags = {
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			0, 0, 0, 0,
			0, 0, 0, 0};

	/* interleave the four descriptors so one vector holds the status word
	 * of each packet, one lane per packet */
	vlan0 = vzipq_u32(vreinterpretq_u32_u64(descs[0]),
			  vreinterpretq_u32_u64(descs[2])).val[1];
	vlan1 = vzipq_u32(vreinterpretq_u32_u64(descs[1]),
			  vreinterpretq_u32_u64(descs[3])).val[1];
	vlan0 = vzipq_u32(vlan0, vlan1).val[0];

	vlan1 = vandq_u32(vlan0, rss_vlan_msk);
	/* VLAN bit (bit 2) -> VLAN flags via table lookup */
	vlan0 = vreinterpretq_u32_u8(vqtbl1q_u8(vlan_flags,
						vreinterpretq_u8_u32(vlan1)));

	/* FDIR/RSS bits (11, 13:12) shifted down to index rss_flags */
	rss = vshrq_n_u32(vlan1, 11);
	rss = vreinterpretq_u32_u8(vqtbl1q_u8(rss_flags,
					      vreinterpretq_u8_u32(rss)));

	/* L3/L4 error bits (24:22) shifted down to index l3_l4e_flags */
	l3_l4e = vshrq_n_u32(vlan1, 22);
	l3_l4e = vreinterpretq_u32_u8(vqtbl1q_u8(l3_l4e_flags,
					      vreinterpretq_u8_u32(l3_l4e)));
	/* then we shift left 1 bit */
	l3_l4e = vshlq_n_u32(l3_l4e, 1);
	/* we need to mask out the redundant bits */
	l3_l4e = vandq_u32(l3_l4e, cksum_mask);

	/* merge all flag groups; vlan0 now holds the final ol_flags per lane */
	vlan0 = vorrq_u32(vlan0, rss);
	vlan0 = vorrq_u32(vlan0, l3_l4e);

	/* place each packet's ol_flags into lane 1 of its rearm template and
	 * store over the mbuf's rearm_data */
	rearm0 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 0), mbuf_init, 1);
	rearm1 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 1), mbuf_init, 1);
	rearm2 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 2), mbuf_init, 1);
	rearm3 = vsetq_lane_u64(vgetq_lane_u32(vlan0, 3), mbuf_init, 1);

	vst1q_u64((uint64_t *)&rx_pkts[0]->rearm_data, rearm0);
	vst1q_u64((uint64_t *)&rx_pkts[1]->rearm_data, rearm1);
	vst1q_u64((uint64_t *)&rx_pkts[2]->rearm_data, rearm2);
	vst1q_u64((uint64_t *)&rx_pkts[3]->rearm_data, rearm3);
}
// Salsa20 core: compute one 64-byte output block from a 16-byte input
// (nonce+counter), a 32-byte key and a 16-byte constant block.
//   out - 64-byte output
//   in  - 16-byte input words (nonce/counter)
//   k   - 32-byte key
//   c   - 16-byte constants (e.g. "expand 32-byte k")
// Returns 0.
//
// Uses the diagonal state layout from Bernstein's SIMD Salsa20: start0 holds
// the constants, start1..start3 hold key/input words shuffled so each
// quarter-round is a lane-wise vector operation. Variable names record which
// 32-bit state words each vector holds.
int crypto_core( unsigned char *out, const unsigned char *in, const unsigned char *k, const unsigned char *c )
{
	int i;
	// Lane-select mask (lanes 0 and 2) for un-diagonalizing the result.
	const uint32x4_t abab = {-1,0,-1,0};
	uint32x4_t k0k1k2k3 = (uint32x4_t) vld1q_u8((uint8_t *) k);
	uint32x4_t k4k5k6k7 = (uint32x4_t) vld1q_u8((uint8_t *) (k + 16));
	uint32x4_t c0c1c2c3 = (uint32x4_t) vld1q_u8((uint8_t *) c);
	uint32x4_t n0n1n2n3 = (uint32x4_t) vld1q_u8((uint8_t *) in);
	// Build the three diagonal rows with vext/vcombine shuffles.
	uint32x4_t n1n2n3n0 = vextq_u32(n0n1n2n3,n0n1n2n3,1);
	uint32x2_t n1n2 = vget_low_u32(n1n2n3n0);
	uint32x2_t n3n0 = vget_high_u32(n1n2n3n0);
	uint32x2_t k0k1 = vget_low_u32(k0k1k2k3);
	uint32x2_t k2k3 = vget_high_u32(k0k1k2k3);
	uint32x2_t k4k5 = vget_low_u32(k4k5k6k7);
	uint32x2_t k6k7 = vget_high_u32(k4k5k6k7);
	uint32x2_t n0k0 = vext_u32(n3n0,k0k1,1);
	uint32x2_t k0n0 = vext_u32(n0k0,n0k0,1);
	uint32x4_t k4k5k0n0 = vcombine_u32(k4k5,k0n0);
	uint32x2_t k1k6 = vext_u32(k0k1,k6k7,1);
	uint32x2_t k6k1 = vext_u32(k1k6,k1k6,1);
	uint32x4_t n1n2k6k1 = vcombine_u32(n1n2,k6k1);
	uint32x2_t k7n3 = vext_u32(k6k7,n3n0,1);
	uint32x2_t n3k7 = vext_u32(k7n3,k7n3,1);
	uint32x4_t k2k3n3k7 = vcombine_u32(k2k3,n3k7);
	uint32x4_t start0 = c0c1c2c3;
	uint32x4_t start1 = vextq_u32(k4k5k0n0,k4k5k0n0,1);
	uint32x4_t start2 = vextq_u32(n1n2k6k1,n1n2k6k1,1);
	uint32x4_t start3 = vextq_u32(k2k3n3k7,k2k3n3k7,1);
	uint32x4_t diag0 = start0;
	uint32x4_t diag1 = start1;
	uint32x4_t diag2 = start2;
	uint32x4_t diag3 = start3;
	// Post-round sums and shuffle temporaries back to linear x0..x15 order.
	uint32x4_t x0x5x10x15;
	uint32x4_t x12x1x6x11;
	uint32x4_t x8x13x2x7;
	uint32x4_t x4x9x14x3;
	uint32x4_t x0x1x10x11;
	uint32x4_t x12x13x6x7;
	uint32x4_t x8x9x2x3;
	uint32x4_t x4x5x14x15;
	uint32x4_t x0x1x2x3;
	uint32x4_t x4x5x6x7;
	uint32x4_t x8x9x10x11;
	uint32x4_t x12x13x14x15;
	uint32x4_t a0;
	uint32x4_t a1;
	uint32x4_t a2;
	uint32x4_t a3;
	// ROUNDS (external macro) rounds, two per iteration: a column round and
	// a row round. Each 32-bit rotate is vshlq_n followed by vsriq_n
	// (shift-right-insert); vextq_u32 rotates the diagonals between
	// half-rounds so the next round again operates lane-wise.
	for (i = ROUNDS; i > 0; i -= 2) {
		a0 = diag1 + diag0;
		diag3 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
		a1 = diag0 + diag3;
		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
		a2 = diag3 + diag2;
		diag1 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
		a3 = diag2 + diag1;
		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);
		diag3 = vextq_u32(diag3,diag3,3);
		diag2 = vextq_u32(diag2,diag2,2);
		diag1 = vextq_u32(diag1,diag1,1);
		a0 = diag3 + diag0;
		diag1 ^= vsriq_n_u32(vshlq_n_u32(a0,7),a0,25);
		a1 = diag0 + diag1;
		diag2 ^= vsriq_n_u32(vshlq_n_u32(a1,9),a1,23);
		a2 = diag1 + diag2;
		diag3 ^= vsriq_n_u32(vshlq_n_u32(a2,13),a2,19);
		a3 = diag2 + diag3;
		diag0 ^= vsriq_n_u32(vshlq_n_u32(a3,18),a3,14);
		diag1 = vextq_u32(diag1,diag1,3);
		diag2 = vextq_u32(diag2,diag2,2);
		diag3 = vextq_u32(diag3,diag3,1);
	}
	// Feed-forward: add the initial state, then shuffle the diagonal vectors
	// back to linear word order and store the 64-byte block.
	x0x5x10x15 = diag0 + start0;
	x12x1x6x11 = diag1 + start1;
	x8x13x2x7 = diag2 + start2;
	x4x9x14x3 = diag3 + start3;
	x0x1x10x11 = vbslq_u32(abab,x0x5x10x15,x12x1x6x11);
	x12x13x6x7 = vbslq_u32(abab,x12x1x6x11,x8x13x2x7);
	x8x9x2x3 = vbslq_u32(abab,x8x13x2x7,x4x9x14x3);
	x4x5x14x15 = vbslq_u32(abab,x4x9x14x3,x0x5x10x15);
	x0x1x2x3 = vcombine_u32(vget_low_u32(x0x1x10x11),vget_high_u32(x8x9x2x3));
	x4x5x6x7 = vcombine_u32(vget_low_u32(x4x5x14x15),vget_high_u32(x12x13x6x7));
	x8x9x10x11 = vcombine_u32(vget_low_u32(x8x9x2x3),vget_high_u32(x0x1x10x11));
	x12x13x14x15 = vcombine_u32(vget_low_u32(x12x13x6x7),vget_high_u32(x4x5x14x15));
	vst1q_u8((uint8_t *) out,(uint8x16_t) x0x1x2x3);
	vst1q_u8(16 + (uint8_t *) out,(uint8x16_t) x4x5x6x7);
	vst1q_u8(32 + (uint8_t *) out,(uint8x16_t) x8x9x10x11);
	vst1q_u8(48 + (uint8_t *) out,(uint8x16_t) x12x13x14x15);
	return 0;
}