/*
 * Run the AES inverse-cipher round sequence over one 16-byte block
 * using the ARMv8 Cryptographic Extension instructions.
 *
 * keysched holds rounds + 2 round keys: keys [0 .. rounds - 1] each
 * feed an AESD round followed by the inverse MixColumns (AESIMC),
 * key [rounds] feeds the final AESD round (no MixColumns), and
 * key [rounds + 1] is XORed in as the last whitening key.
 */
static uint8x16_t
armv8_aes_dec(int rounds, const uint8x16_t *keysched, const uint8x16_t from)
{
	uint8x16_t state = from;
	int r = 0;

	/* Peel two full AESD+AESIMC rounds per iteration. */
	while (r < rounds - 1) {
		state = vaesimcq_u8(vaesdq_u8(state, keysched[r]));
		state = vaesimcq_u8(vaesdq_u8(state, keysched[r + 1]));
		r += 2;
	}

	/* Last full round, then the final round without MixColumns. */
	state = vaesimcq_u8(vaesdq_u8(state, keysched[rounds - 1]));
	state = vaesdq_u8(state, keysched[rounds]);

	return (veorq_u8(state, keysched[rounds + 1]));
}
/*
 * Smoke test: fill two vectors with the byte patterns 0..15 and
 * 15..0, run a single AESD round on them, and return the first
 * byte of the result.
 */
int
foo(void)
{
	uint8x16_t a, b, c;

	for (int i = 0; i < 16; ++i) {
		a[i] = i;
		b[i] = 15 - i;
	}

	c = vaesdq_u8(a, b);

	return c[0];
}
// Codegen test: the single-round AES decryption intrinsic must lower to
// exactly one `aesd` instruction on the argument registers. The CHECK
// lines below are FileCheck directives — do not edit them.
uint8x16_t test_aesd(uint8x16_t data, uint8x16_t key) {
  // CHECK-LABEL: test_aesd:
  // CHECK: aesd.16b v0, v1
  return vaesdq_u8(data, key);
}
// Codegen test: vaesdq_u8 must emit an `aesd Vd.16b, Vn.16b` instruction;
// the register numbers are intentionally unconstrained. The CHECK lines
// are FileCheck directives — do not edit them.
uint8x16_t test_vaesdq_u8(uint8x16_t data, uint8x16_t key) {
  // CHECK: test_vaesdq_u8
  return vaesdq_u8(data, key);
  // CHECK: aesd {{v[0-9]+}}.16b, {{v[0-9]+}}.16b
}
/*
* Decrypt `blocks` consecutive 16-byte blocks with AES-256 using the
* ARMv8 Cryptographic Extension instructions.
*
* Fifteen round keys are loaded up front: K0..K13 from m_DK (224 bytes;
* presumably the InvMixColumns-transformed decryption schedule — confirm
* against the key-setup code) and the final whitening key K14 from m_MD.
*/
void AES_256::armv8_decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   BOTAN_ASSERT(m_DK.empty() == false, "Key was set");

   const uint8_t *skey = reinterpret_cast<const uint8_t*>(m_DK.data());
   const uint8_t *mkey = reinterpret_cast<const uint8_t*>(m_MD.data());

   const uint8x16_t K0 = vld1q_u8(skey + 0);
   const uint8x16_t K1 = vld1q_u8(skey + 16);
   const uint8x16_t K2 = vld1q_u8(skey + 32);
   const uint8x16_t K3 = vld1q_u8(skey + 48);
   const uint8x16_t K4 = vld1q_u8(skey + 64);
   const uint8x16_t K5 = vld1q_u8(skey + 80);
   const uint8x16_t K6 = vld1q_u8(skey + 96);
   const uint8x16_t K7 = vld1q_u8(skey + 112);
   const uint8x16_t K8 = vld1q_u8(skey + 128);
   const uint8x16_t K9 = vld1q_u8(skey + 144);
   const uint8x16_t K10 = vld1q_u8(skey + 160);
   const uint8x16_t K11 = vld1q_u8(skey + 176);
   const uint8x16_t K12 = vld1q_u8(skey + 192);
   const uint8x16_t K13 = vld1q_u8(skey + 208);
   const uint8x16_t K14 = vld1q_u8(mkey);

   /*
   * Main path: four blocks per iteration. AES_DEC_4_ROUNDS and
   * AES_DEC_4_LAST_ROUNDS are macros defined elsewhere in this file
   * (not visible in this chunk); they appear to apply one round,
   * resp. the final round + whitening XOR, to B0..B3 so that four
   * independent AES pipelines run in parallel — verify at the macro
   * definitions.
   */
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in+16);
      uint8x16_t B2 = vld1q_u8(in+32);
      uint8x16_t B3 = vld1q_u8(in+48);

      AES_DEC_4_ROUNDS(K0);
      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_ROUNDS(K11);
      AES_DEC_4_ROUNDS(K12);
      AES_DEC_4_LAST_ROUNDS(K13, K14);

      vst1q_u8(out, B0);
      vst1q_u8(out+16, B1);
      vst1q_u8(out+32, B2);
      vst1q_u8(out+48, B3);

      in += 16*4;
      out += 16*4;
      blocks -= 4;
      }

   /*
   * Tail: the remaining 0-3 blocks one at a time — 13 AESD+AESIMC
   * rounds (K0..K12), then the final AESD round with K13 (no inverse
   * MixColumns) and the whitening XOR with K14.
   */
   for(size_t i = 0; i != blocks; ++i)
      {
      uint8x16_t B = vld1q_u8(in+16*i);
      B = vaesimcq_u8(vaesdq_u8(B, K0));
      B = vaesimcq_u8(vaesdq_u8(B, K1));
      B = vaesimcq_u8(vaesdq_u8(B, K2));
      B = vaesimcq_u8(vaesdq_u8(B, K3));
      B = vaesimcq_u8(vaesdq_u8(B, K4));
      B = vaesimcq_u8(vaesdq_u8(B, K5));
      B = vaesimcq_u8(vaesdq_u8(B, K6));
      B = vaesimcq_u8(vaesdq_u8(B, K7));
      B = vaesimcq_u8(vaesdq_u8(B, K8));
      B = vaesimcq_u8(vaesdq_u8(B, K9));
      B = vaesimcq_u8(vaesdq_u8(B, K10));
      B = vaesimcq_u8(vaesdq_u8(B, K11));
      B = vaesimcq_u8(vaesdq_u8(B, K12));
      B = veorq_u8(vaesdq_u8(B, K13), K14);
      vst1q_u8(out+16*i, B);
      }
   }