/* * AES-256 Encryption */ void AES_256_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const { const __m128i* in_mm = (const __m128i*)in; __m128i* out_mm = (__m128i*)out; const __m128i* key_mm = (const __m128i*)&EK[0]; __m128i K0 = _mm_loadu_si128(key_mm); __m128i K1 = _mm_loadu_si128(key_mm + 1); __m128i K2 = _mm_loadu_si128(key_mm + 2); __m128i K3 = _mm_loadu_si128(key_mm + 3); __m128i K4 = _mm_loadu_si128(key_mm + 4); __m128i K5 = _mm_loadu_si128(key_mm + 5); __m128i K6 = _mm_loadu_si128(key_mm + 6); __m128i K7 = _mm_loadu_si128(key_mm + 7); __m128i K8 = _mm_loadu_si128(key_mm + 8); __m128i K9 = _mm_loadu_si128(key_mm + 9); __m128i K10 = _mm_loadu_si128(key_mm + 10); __m128i K11 = _mm_loadu_si128(key_mm + 11); __m128i K12 = _mm_loadu_si128(key_mm + 12); __m128i K13 = _mm_loadu_si128(key_mm + 13); __m128i K14 = _mm_loadu_si128(key_mm + 14); while(blocks >= 4) { __m128i B0 = _mm_loadu_si128(in_mm + 0); __m128i B1 = _mm_loadu_si128(in_mm + 1); __m128i B2 = _mm_loadu_si128(in_mm + 2); __m128i B3 = _mm_loadu_si128(in_mm + 3); B0 = _mm_xor_si128(B0, K0); B1 = _mm_xor_si128(B1, K0); B2 = _mm_xor_si128(B2, K0); B3 = _mm_xor_si128(B3, K0); AES_ENC_4_ROUNDS(K1); AES_ENC_4_ROUNDS(K2); AES_ENC_4_ROUNDS(K3); AES_ENC_4_ROUNDS(K4); AES_ENC_4_ROUNDS(K5); AES_ENC_4_ROUNDS(K6); AES_ENC_4_ROUNDS(K7); AES_ENC_4_ROUNDS(K8); AES_ENC_4_ROUNDS(K9); AES_ENC_4_ROUNDS(K10); AES_ENC_4_ROUNDS(K11); AES_ENC_4_ROUNDS(K12); AES_ENC_4_ROUNDS(K13); AES_ENC_4_LAST_ROUNDS(K14); _mm_storeu_si128(out_mm + 0, B0); _mm_storeu_si128(out_mm + 1, B1); _mm_storeu_si128(out_mm + 2, B2); _mm_storeu_si128(out_mm + 3, B3); blocks -= 4; in_mm += 4; out_mm += 4; } for(size_t i = 0; i != blocks; ++i) { __m128i B = _mm_loadu_si128(in_mm + i); B = _mm_xor_si128(B, K0); B = _mm_aesenc_si128(B, K1); B = _mm_aesenc_si128(B, K2); B = _mm_aesenc_si128(B, K3); B = _mm_aesenc_si128(B, K4); B = _mm_aesenc_si128(B, K5); B = _mm_aesenc_si128(B, K6); B = _mm_aesenc_si128(B, K7); B = _mm_aesenc_si128(B, K8); B = 
_mm_aesenc_si128(B, K9); B = _mm_aesenc_si128(B, K10); B = _mm_aesenc_si128(B, K11); B = _mm_aesenc_si128(B, K12); B = _mm_aesenc_si128(B, K13); B = _mm_aesenclast_si128(B, K14); _mm_storeu_si128(out_mm + i, B); } }
/*
* AES-128 Encryption (ARMv8 Crypto Extensions)
*
* Encrypts `blocks` 16-byte blocks from `in` to `out`. The bulk of the
* input is handled four blocks per iteration; any leftover blocks are
* processed singly. Requires the key schedule to be set beforehand.
*/
void AES_128::armv8_encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const
   {
   BOTAN_ASSERT(m_EK.empty() == false, "Key was set");

   const uint8_t* skey = reinterpret_cast<const uint8_t*>(m_EK.data());
   const uint8_t* mkey = reinterpret_cast<const uint8_t*>(m_ME.data());

   // Round keys K0..K9 come from the main schedule; K10 is the final
   // round key stored separately in m_ME
   const uint8x16_t K0 = vld1q_u8(skey + 16*0);
   const uint8x16_t K1 = vld1q_u8(skey + 16*1);
   const uint8x16_t K2 = vld1q_u8(skey + 16*2);
   const uint8x16_t K3 = vld1q_u8(skey + 16*3);
   const uint8x16_t K4 = vld1q_u8(skey + 16*4);
   const uint8x16_t K5 = vld1q_u8(skey + 16*5);
   const uint8x16_t K6 = vld1q_u8(skey + 16*6);
   const uint8x16_t K7 = vld1q_u8(skey + 16*7);
   const uint8x16_t K8 = vld1q_u8(skey + 16*8);
   const uint8x16_t K9 = vld1q_u8(skey + 16*9);
   const uint8x16_t K10 = vld1q_u8(mkey);

   // Four-block main loop
   while(blocks >= 4)
      {
      uint8x16_t B0 = vld1q_u8(in);
      uint8x16_t B1 = vld1q_u8(in + 16);
      uint8x16_t B2 = vld1q_u8(in + 32);
      uint8x16_t B3 = vld1q_u8(in + 48);

      AES_ENC_4_ROUNDS(K0);
      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_LAST_ROUNDS(K9, K10);

      vst1q_u8(out, B0);
      vst1q_u8(out + 16, B1);
      vst1q_u8(out + 32, B2);
      vst1q_u8(out + 48, B3);

      in += 64;
      out += 64;
      blocks -= 4;
      }

   // Leftover blocks, one at a time (advance the pointers as we go)
   while(blocks > 0)
      {
      uint8x16_t B = vld1q_u8(in);

      // Nine AESE+AESMC rounds, then AESE + final key XOR
      B = vaesmcq_u8(vaeseq_u8(B, K0));
      B = vaesmcq_u8(vaeseq_u8(B, K1));
      B = vaesmcq_u8(vaeseq_u8(B, K2));
      B = vaesmcq_u8(vaeseq_u8(B, K3));
      B = vaesmcq_u8(vaeseq_u8(B, K4));
      B = vaesmcq_u8(vaeseq_u8(B, K5));
      B = vaesmcq_u8(vaeseq_u8(B, K6));
      B = vaesmcq_u8(vaeseq_u8(B, K7));
      B = vaesmcq_u8(vaeseq_u8(B, K8));
      B = veorq_u8(vaeseq_u8(B, K9), K10);

      vst1q_u8(out, B);

      in += 16;
      out += 16;
      blocks--;
      }
   }