void Rijndael::blockDecryptSSE(const byte *input, size_t numBlocks, byte *outBuffer) { __m128i initVector = _mm_loadu_si128((__m128i*)m_initVector); __m128i *src=(__m128i*)input; __m128i *dest=(__m128i*)outBuffer; __m128i *rkey=(__m128i*)m_expandedKey; while (numBlocks > 0) { __m128i rl = _mm_loadu_si128(rkey + m_uRounds); __m128i d = _mm_loadu_si128(src++); __m128i v = _mm_xor_si128(rl, d); for (int i=m_uRounds-1; i>0; i--) { __m128i ri = _mm_loadu_si128(rkey + i); v = _mm_aesdec_si128(v, ri); } __m128i r0 = _mm_loadu_si128(rkey); v = _mm_aesdeclast_si128(v, r0); if (CBCMode) v = _mm_xor_si128(v, initVector); initVector = d; _mm_storeu_si128(dest++,v); numBlocks--; } _mm_storeu_si128((__m128i*)m_initVector,initVector); }
/*
 * Decrypt one 16-byte block from `in` into `out` with AES-NI.
 *
 * self->dk holds the decryption round keys and self->rounds the round count.
 * dk[0] is XORed in first, dk[1..rounds-1] feed AESDEC and dk[rounds] feeds
 * AESDECLAST. Both buffers are accessed with unaligned loads/stores.
 */
static void block_decrypt(block_state* self, const u8* in, u8* out)
{
    int r;
    __m128i state = _mm_loadu_si128((const __m128i*) in);

    /* whitening followed by the nine rounds common to every key size */
    state = _mm_xor_si128(state, self->dk[0]);
    for (r = 1; r <= 9; r++) {
        state = _mm_aesdec_si128(state, self->dk[r]);
    }

    if (self->rounds != 10) {
        /* AES-192/256: two more rounds */
        state = _mm_aesdec_si128(state, self->dk[10]);
        state = _mm_aesdec_si128(state, self->dk[11]);

        if (self->rounds == 14) {
            /* AES-256: another two rounds */
            state = _mm_aesdec_si128(state, self->dk[12]);
            state = _mm_aesdec_si128(state, self->dk[13]);
        }
    }

    state = _mm_aesdeclast_si128(state, self->dk[self->rounds]);
    _mm_storeu_si128((__m128i*) out, state);
}
void Cryptor::cbcDecrypt(const string &ciphertext, const Key &key, string *plaintext, unsigned char *schedule) { plaintext->resize(ciphertext.size()); int blocks = ciphertext.size() / 16; if (ciphertext.size() % 16) { blocks++; } __m128i tmp, tmp2, tmp3; __m128i *input = (__m128i*) ciphertext.data(); __m128i *output = (__m128i*) plaintext->data(); __m128i *keySchedule = (__m128i*) schedule; int rounds = getRounds(key.size); // Load the IV. tmp2 = _mm_loadu_si128((__m128i*) key.iv); // Swap byte-order => big-endian. if (!bigEndian) { reverse_m128i(tmp2); } for (int block = 0; block < blocks; block++) { // Get next 128-bit block. tmp = _mm_loadu_si128(&input[block]); // Swap byte-order => big-endian. if (!bigEndian) { reverse_m128i(tmp); } // Whitening step. tmp3 = _mm_xor_si128(tmp, keySchedule[0]); // Apply the AES rounds. int round = 1; for (; round < rounds; round++) { tmp3 = _mm_aesdec_si128(tmp3, keySchedule[round]); } // And the last. tmp3 = _mm_aesdeclast_si128(tmp3, keySchedule[round]); // XOR IV or last ciphertext with the ciphertext. tmp3 = _mm_xor_si128(tmp3, tmp2); // Swap byte-order => little-endian. if (!bigEndian) { reverse_m128i(tmp3); } // Save the decrypted block. _mm_storeu_si128(&output[block], tmp3); // Save the last ciphertext. tmp2 = tmp; } }
/*
 * Decrypt one 16-byte block from `in` into `out` with AES-NI.
 *
 * Uses ctx->dec_keys[0..10]: key 0 is XORed in, keys 1..9 feed AESDEC and
 * key 10 feeds AESDECLAST, i.e. the round count is fixed at 10.
 * Both buffers are accessed with unaligned loads/stores.
 *
 * The `register` storage class was dropped from the local: it is ill-formed
 * in C++17 and has no effect on modern compilers.
 */
void aesni_decrypt(aesni_ctx *ctx, const byte *in, byte *out)
{
    __m128i state = _mm_loadu_si128((const __m128i*)in);

    state = _mm_xor_si128(state, ctx->dec_keys[0]);
    for (int round = 1; round < 10; round++) {
        state = _mm_aesdec_si128(state, ctx->dec_keys[round]);
    }
    state = _mm_aesdeclast_si128(state, ctx->dec_keys[10]);

    _mm_storeu_si128((__m128i*)out, state);
}
/*
 * Decrypt `nblks` blocks in place (ECB) with AES-NI.
 *
 * The work is done round by round across all blocks rather than block by
 * block, so consecutive AES instructions operate on independent data.
 * key->rd_key is read as an array of round keys; ROUNDS(key) gives the
 * round count.
 */
inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key)
{
    const __m128i *round_keys = ((__m128i *) (key->rd_key));
    const unsigned total_rounds = ROUNDS(key);
    unsigned round = 1;
    unsigned b;

    /* whitening with round key 0 */
    for (b = 0; b < nblks; ++b) {
        blks[b] = _mm_xor_si128(blks[b], round_keys[0]);
    }

    /* middle rounds */
    while (round < total_rounds) {
        for (b = 0; b < nblks; ++b) {
            blks[b] = _mm_aesdec_si128(blks[b], round_keys[round]);
        }
        ++round;
    }

    /* final round, using the key the loop stopped on */
    for (b = 0; b < nblks; ++b) {
        blks[b] = _mm_aesdeclast_si128(blks[b], round_keys[round]);
    }
}
/*
 * Decrypt one 16-byte block from `in` into `out` with AES-NI.
 *
 * key->rd_key is read as an array of round keys and ROUNDS(key) gives the
 * round count: key 0 is XORed in, keys 1..rnds-1 feed AESDEC and the next
 * key feeds AESDECLAST.
 *
 * `in` and `out` are plain byte pointers, so they are accessed with
 * unaligned loads/stores. The aligned forms used previously fault when a
 * caller passes a buffer that is not 16-byte aligned.
 */
inline void AES_decrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
{
    int j, rnds = ROUNDS(key);
    const __m128i *sched = ((__m128i *) (key->rd_key));
    __m128i tmp = _mm_loadu_si128((const __m128i *) in);

    tmp = _mm_xor_si128(tmp, sched[0]);
    for (j = 1; j < rnds; j++)
        tmp = _mm_aesdec_si128(tmp, sched[j]);
    tmp = _mm_aesdeclast_si128(tmp, sched[j]);

    _mm_storeu_si128((__m128i *) out, tmp);
}
static __m128i AES_decrypt(__m128i in, const __m128i* expkey) { int j; __m128i tmp = byte_swap(in) ^ expkey[0]; for (j=1; j <10; j++){ tmp = _mm_aesdec_si128 (tmp,expkey[j]); } tmp = _mm_aesdeclast_si128 (tmp,expkey[10]); return byte_swap(tmp); }
/*
 * Decrypt one block with a 10-round key schedule and return the result.
 *
 * decryption_keys->keys[0] is XORed in, keys[1..9] feed AESDEC and
 * keys[10] feeds AESDECLAST.
 */
AES_AES_Block __fastcall aes_AES128_decrypt_block_(
    AES_AES_Block ciphertext,
    const AES_AES128_RoundKeys* decryption_keys)
{
    int round;
    AES_AES_Block state = _mm_xor_si128(ciphertext, decryption_keys->keys[0]);

    for (round = 1; round <= 9; ++round) {
        state = _mm_aesdec_si128(state, decryption_keys->keys[round]);
    }

    return _mm_aesdeclast_si128(state, decryption_keys->keys[10]);
}
/*
 * Decrypt one 16-byte block from `in` into `out` with a fixed 10 rounds.
 *
 * dec_key[0] is XORed in, dec_key[1..9] feed AESDEC and dec_key[10] feeds
 * AESDECLAST. Both buffers are accessed with unaligned loads/stores.
 */
void AESNI_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY dec_key)
{
    int round;
    __m128i state = _mm_loadu_si128((__m128i*)in);

    state = _mm_xor_si128(state, dec_key[0]);
    for (round = 1; round < 10; ++round) {
        state = _mm_aesdec_si128(state, dec_key[round]);
    }
    state = _mm_aesdeclast_si128(state, dec_key[10]);

    _mm_storeu_si128((__m128i*)out, state);
}
/*
 * CBC-decrypt `length` bytes from `in` into `out` with AES-NI.
 *
 *   ivec             - 16-byte IV; read only, it is NOT updated on return.
 *   length           - byte count. It is rounded UP to whole 16-byte blocks,
 *                      so when it is not a multiple of 16 the final block is
 *                      still read and written in full. NOTE(review): callers
 *                      must therefore provide buffers padded to a multiple
 *                      of 16 bytes - confirm against call sites.
 *   key              - decryption key schedule, read as 16-byte round keys
 *                      0 .. number_of_rounds.
 *   number_of_rounds - AES round count.
 *
 * Each ciphertext block is loaded before the matching output block is
 * stored.
 *
 * The block index is `unsigned long`, matching `length`, instead of a
 * signed int compared against an unsigned count (which overflows once the
 * block count exceeds INT_MAX). Round keys are fetched with unaligned loads
 * because `key` is a plain byte pointer with no alignment guarantee.
 */
static void AESNI_CBC_decrypt(const unsigned char *in,unsigned char *out,unsigned char ivec[16],unsigned long length,unsigned char *key,int number_of_rounds)
{
    const __m128i *src = (const __m128i*)in;
    __m128i *dst = (__m128i*)out;
    const __m128i *round_keys = (const __m128i*)key;
    const unsigned long blocks = length / 16 + ((length % 16) ? 1 : 0);
    __m128i feedback = _mm_loadu_si128((const __m128i*)ivec);
    unsigned long i;
    int j;

    for (i = 0; i < blocks; i++) {
        /* Keep the raw ciphertext: it chains into the next block. */
        const __m128i last_in = _mm_loadu_si128(src + i);
        __m128i data = _mm_xor_si128(last_in, _mm_loadu_si128(round_keys));

        for (j = 1; j < number_of_rounds; j++) {
            data = _mm_aesdec_si128(data, _mm_loadu_si128(round_keys + j));
        }
        data = _mm_aesdeclast_si128(data, _mm_loadu_si128(round_keys + j));

        data = _mm_xor_si128(data, feedback);
        _mm_storeu_si128(dst + i, data);
        feedback = last_in;
    }
}
/*
 * Encrypt or decrypt a single 16-byte block (ECB) with AES-NI.
 *
 *   ctx    - supplies the round keys (ctx->rk) and round count (ctx->nr).
 *   mode   - AES_ENCRYPT selects encryption; any other value decrypts.
 *   input  - 16 bytes to process (unaligned access).
 *   output - 16 bytes of result (unaligned access).
 *
 * Always returns 0.
 */
int aesni_xcryptecb( aes_context *ctx, int mode, const unsigned char input[16], unsigned char output[16] )
{
    const __m128i *rk = (__m128i *) ctx->rk;
    const int nr = ctx->nr;
    int r;
    __m128i state;

    /* This could be faster if more data was provided at once. */
    state = _mm_xor_si128( _mm_loadu_si128( (__m128i *) input ), rk[0] );

    if( mode != AES_ENCRYPT )
    {
        /* Middle rounds two at a time, then the leftover one. */
        for( r = 1; r < nr - 1; r += 2 )
        {
            state = _mm_aesdec_si128( state, rk[r] );
            state = _mm_aesdec_si128( state, rk[r + 1] );
        }
        state = _mm_aesdec_si128( state, rk[nr - 1] );
        state = _mm_aesdeclast_si128( state, rk[nr] );
    }
    else
    {
        for( r = 1; r < nr - 1; r += 2 )
        {
            state = _mm_aesenc_si128( state, rk[r] );
            state = _mm_aesenc_si128( state, rk[r + 1] );
        }
        state = _mm_aesenc_si128( state, rk[nr - 1] );
        state = _mm_aesenclast_si128( state, rk[nr] );
    }

    _mm_storeu_si128( (__m128i *) output, state );

    return( 0 );
}
/*
 * Decrypt one block in place in `buff` (ECB).
 *
 *   a    - AES context; a->rkey holds the round-key words consumed here and
 *          a->Nr the number of rounds.
 *   buff - block to decrypt; overwritten with the result.
 *
 * Two implementations are selected at compile time:
 *   - AES_NI_SUPPORT defined: a 16-byte AES-NI path.
 *   - otherwise: a table-driven path working on NB words of state
 *     (presumably NB == 4, matching the k+=4 key stride - confirm against
 *     the NB definition). MR_SMALL_AES selects a variant that derives the
 *     rotated lookups from the single `rtable` instead of using the
 *     separate rtable1..rtable3 tables.
 */
void aes_ecb_decrypt(aes *a,MR_BYTE *buff)
{
    int i,j,k;
    MR_WORD p[4],q[4],*x,*y,*t;

#ifdef AES_NI_SUPPORT
    /* Load the block and XOR in the first round key. */
    __m128i ky,m = _mm_loadu_si128((__m128i *) buff);
    ky = _mm_loadu_si128((__m128i *) &a->rkey[0]);
    m = _mm_xor_si128 (m, ky);
    k=NB;

    /* Rounds 1 .. Nr-1; k indexes the round key in words and advances by
       four words per round. */
    for (i=1;i<a->Nr;i++)
    {
        ky=_mm_loadu_si128((__m128i *) &a->rkey[k]);
        m =_mm_aesdec_si128 (m, ky);
        k+=4;
    }

    /* Final round, then write the result back over the input. */
    ky=_mm_loadu_si128((__m128i *) &a->rkey[k]);
    m=_mm_aesdeclast_si128(m, ky);
    _mm_storeu_si128((__m128i *)buff, m);
#else
    /* Pack the input bytes into words and XOR in the first round key. */
    for (i=j=0;i<NB;i++,j+=4)
    {
        p[i]=pack((MR_BYTE *)&buff[j]);
        p[i]^=a->rkey[i];
    }
    k=NB;
    x=p; y=q;

    /* State alternates between x and y */
    for (i=1;i<a->Nr;i++)
    { /* Nr is number of rounds. May be odd. */
#ifndef MR_SMALL_AES
        /* One lookup table per byte position. */
        y[0]=a->rkey[k]^rtable[MR_TOBYTE(x[0])]^
             rtable1[MR_TOBYTE(x[3]>>8)]^
             rtable2[MR_TOBYTE(x[2]>>16)]^
             rtable3[x[1]>>24];
        y[1]=a->rkey[k+1]^rtable[MR_TOBYTE(x[1])]^
             rtable1[MR_TOBYTE(x[0]>>8)]^
             rtable2[MR_TOBYTE(x[3]>>16)]^
             rtable3[x[2]>>24];
        y[2]=a->rkey[k+2]^rtable[MR_TOBYTE(x[2])]^
             rtable1[MR_TOBYTE(x[1]>>8)]^
             rtable2[MR_TOBYTE(x[0]>>16)]^
             rtable3[x[3]>>24];
        y[3]=a->rkey[k+3]^rtable[MR_TOBYTE(x[3])]^
             rtable1[MR_TOBYTE(x[2]>>8)]^
             rtable2[MR_TOBYTE(x[1]>>16)]^
             rtable3[x[0]>>24];
#else
        /* Single table; the other byte positions are obtained by rotating
           its entries. */
        y[0]=a->rkey[k]^rtable[MR_TOBYTE(x[0])]^
             ROTL8(rtable[MR_TOBYTE(x[3]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[2]>>16)])^
             ROTL24(rtable[x[1]>>24]);
        y[1]=a->rkey[k+1]^rtable[MR_TOBYTE(x[1])]^
             ROTL8(rtable[MR_TOBYTE(x[0]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[3]>>16)])^
             ROTL24(rtable[x[2]>>24]);
        y[2]=a->rkey[k+2]^rtable[MR_TOBYTE(x[2])]^
             ROTL8(rtable[MR_TOBYTE(x[1]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[0]>>16)])^
             ROTL24(rtable[x[3]>>24]);
        y[3]=a->rkey[k+3]^rtable[MR_TOBYTE(x[3])]^
             ROTL8(rtable[MR_TOBYTE(x[2]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[1]>>16)])^
             ROTL24(rtable[x[0]>>24]);
#endif
        k+=4;
        t=x; x=y; y=t; /* swap pointers */
    }

    /* Last Round */
    /* Uses the byte-wide rbsub table in place of the word tables. */
    y[0]=a->rkey[k]^(MR_WORD)rbsub[MR_TOBYTE(x[0])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[3]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[2]>>16)])^
         ROTL24((MR_WORD)rbsub[x[1]>>24]);
    y[1]=a->rkey[k+1]^(MR_WORD)rbsub[MR_TOBYTE(x[1])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[0]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[3]>>16)])^
         ROTL24((MR_WORD)rbsub[x[2]>>24]);
    y[2]=a->rkey[k+2]^(MR_WORD)rbsub[MR_TOBYTE(x[2])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[1]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[0]>>16)])^
         ROTL24((MR_WORD)rbsub[x[3]>>24]);
    y[3]=a->rkey[k+3]^(MR_WORD)rbsub[MR_TOBYTE(x[3])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[2]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[1]>>16)])^
         ROTL24((MR_WORD)rbsub[x[0]>>24]);

    /* Write the result back over the input and zero both state buffers. */
    for (i=j=0;i<NB;i++,j+=4)
    {
        unpack(y[i],(MR_BYTE *)&buff[j]);
        x[i]=y[i]=0; /* clean up stack */
    }
#endif
}
/*
 * CBC encrypt or decrypt `length` bytes with AES-NI.
 *
 *   ctx    - supplies the round keys (ctx->rk) and round count (ctx->nr).
 *   mode   - AES_ENCRYPT selects encryption; any other value decrypts.
 *   length - byte count; only length / 16 whole blocks are processed, any
 *            remainder is ignored.
 *   iv     - 16-byte chaining value; read on entry and overwritten with the
 *            last ciphertext block on return (unchanged if no block was
 *            processed).
 *   input  - source buffer (unaligned access).
 *   output - destination buffer (unaligned access).
 *
 * Always returns 0.
 */
int aesni_xcryptcbc( aes_context *ctx, int mode, size_t length, unsigned char iv[16], const unsigned char *input, unsigned char *output )
{
    const __m128i *round_keys = (__m128i *) ctx->rk;
    const int nr = ctx->nr;
    const size_t nblocks = length / 16;
    const __m128i *src = (const __m128i *) input;
    __m128i *dst = (__m128i *) output;
    __m128i chain = _mm_loadu_si128( (__m128i *) iv );
    size_t n;
    int r;

    if( mode == AES_ENCRYPT )
    {
        /* Encryption is inherently serial: each block depends on the
         * previous ciphertext, which is carried in `chain`. */
        for( n = 0; n < nblocks; n++ )
        {
            chain = _mm_xor_si128( _mm_loadu_si128( src + n ), chain );
            chain = _mm_xor_si128( chain, round_keys[0] );

            for( r = 1; r < nr - 1; r += 2 )
            {
                chain = _mm_aesenc_si128( chain, round_keys[r] );
                chain = _mm_aesenc_si128( chain, round_keys[r + 1] );
            }
            chain = _mm_aesenc_si128( chain, round_keys[nr - 1] );
            chain = _mm_aesenclast_si128( chain, round_keys[nr] );

            _mm_storeu_si128( dst + n, chain );
        }
    }
    else
    {
        const size_t quads = nblocks / 4;
        size_t q;

        /* Take advantage of pipelining by decrypting 4 blocks at once. */
        for( q = 0; q < quads; q++, src += 4, dst += 4 )
        {
            /* All four ciphertext blocks are loaded before any output is
             * stored. */
            const __m128i c0 = _mm_loadu_si128( src );
            const __m128i c1 = _mm_loadu_si128( src + 1 );
            const __m128i c2 = _mm_loadu_si128( src + 2 );
            const __m128i c3 = _mm_loadu_si128( src + 3 );
            __m128i key = round_keys[0];
            __m128i p0 = _mm_xor_si128( c0, key );
            __m128i p1 = _mm_xor_si128( c1, key );
            __m128i p2 = _mm_xor_si128( c2, key );
            __m128i p3 = _mm_xor_si128( c3, key );

            for( r = 1; r < nr; r++ )
            {
                key = round_keys[r];
                p0 = _mm_aesdec_si128( p0, key );
                p1 = _mm_aesdec_si128( p1, key );
                p2 = _mm_aesdec_si128( p2, key );
                p3 = _mm_aesdec_si128( p3, key );
            }

            key = round_keys[nr];
            p0 = _mm_aesdeclast_si128( p0, key );
            p1 = _mm_aesdeclast_si128( p1, key );
            p2 = _mm_aesdeclast_si128( p2, key );
            p3 = _mm_aesdeclast_si128( p3, key );

            /* Undo the chaining: each block is XORed with the ciphertext
             * that preceded it (the running chain value for the first). */
            p0 = _mm_xor_si128( p0, chain );
            p1 = _mm_xor_si128( p1, c0 );
            p2 = _mm_xor_si128( p2, c1 );
            p3 = _mm_xor_si128( p3, c2 );

            _mm_storeu_si128( dst, p0 );
            _mm_storeu_si128( dst + 1, p1 );
            _mm_storeu_si128( dst + 2, p2 );
            _mm_storeu_si128( dst + 3, p3 );

            chain = c3;
        }

        /* Up to three leftover blocks, one at a time. */
        for( n = quads * 4; n < nblocks; n++, src++, dst++ )
        {
            const __m128i cipher = _mm_loadu_si128( src );
            __m128i plain = _mm_xor_si128( cipher, round_keys[0] );

            for( r = 1; r < nr - 1; r += 2 )
            {
                plain = _mm_aesdec_si128( plain, round_keys[r] );
                plain = _mm_aesdec_si128( plain, round_keys[r + 1] );
            }
            plain = _mm_aesdec_si128( plain, round_keys[nr - 1] );
            plain = _mm_aesdeclast_si128( plain, round_keys[nr] );

            plain = _mm_xor_si128( plain, chain );
            _mm_storeu_si128( dst, plain );

            chain = cipher;
        }
    }

    /* Hand the chaining value back to the caller. */
    _mm_storeu_si128( (__m128i *) iv, chain );

    return( 0 );
}
/* * AES-256 Decryption */ void AES_256_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const { const __m128i* in_mm = (const __m128i*)in; __m128i* out_mm = (__m128i*)out; const __m128i* key_mm = (const __m128i*)&DK[0]; __m128i K0 = _mm_loadu_si128(key_mm); __m128i K1 = _mm_loadu_si128(key_mm + 1); __m128i K2 = _mm_loadu_si128(key_mm + 2); __m128i K3 = _mm_loadu_si128(key_mm + 3); __m128i K4 = _mm_loadu_si128(key_mm + 4); __m128i K5 = _mm_loadu_si128(key_mm + 5); __m128i K6 = _mm_loadu_si128(key_mm + 6); __m128i K7 = _mm_loadu_si128(key_mm + 7); __m128i K8 = _mm_loadu_si128(key_mm + 8); __m128i K9 = _mm_loadu_si128(key_mm + 9); __m128i K10 = _mm_loadu_si128(key_mm + 10); __m128i K11 = _mm_loadu_si128(key_mm + 11); __m128i K12 = _mm_loadu_si128(key_mm + 12); __m128i K13 = _mm_loadu_si128(key_mm + 13); __m128i K14 = _mm_loadu_si128(key_mm + 14); while(blocks >= 4) { __m128i B0 = _mm_loadu_si128(in_mm + 0); __m128i B1 = _mm_loadu_si128(in_mm + 1); __m128i B2 = _mm_loadu_si128(in_mm + 2); __m128i B3 = _mm_loadu_si128(in_mm + 3); B0 = _mm_xor_si128(B0, K0); B1 = _mm_xor_si128(B1, K0); B2 = _mm_xor_si128(B2, K0); B3 = _mm_xor_si128(B3, K0); AES_DEC_4_ROUNDS(K1); AES_DEC_4_ROUNDS(K2); AES_DEC_4_ROUNDS(K3); AES_DEC_4_ROUNDS(K4); AES_DEC_4_ROUNDS(K5); AES_DEC_4_ROUNDS(K6); AES_DEC_4_ROUNDS(K7); AES_DEC_4_ROUNDS(K8); AES_DEC_4_ROUNDS(K9); AES_DEC_4_ROUNDS(K10); AES_DEC_4_ROUNDS(K11); AES_DEC_4_ROUNDS(K12); AES_DEC_4_ROUNDS(K13); AES_DEC_4_LAST_ROUNDS(K14); _mm_storeu_si128(out_mm + 0, B0); _mm_storeu_si128(out_mm + 1, B1); _mm_storeu_si128(out_mm + 2, B2); _mm_storeu_si128(out_mm + 3, B3); blocks -= 4; in_mm += 4; out_mm += 4; } for(size_t i = 0; i != blocks; ++i) { __m128i B = _mm_loadu_si128(in_mm + i); B = _mm_xor_si128(B, K0); B = _mm_aesdec_si128(B, K1); B = _mm_aesdec_si128(B, K2); B = _mm_aesdec_si128(B, K3); B = _mm_aesdec_si128(B, K4); B = _mm_aesdec_si128(B, K5); B = _mm_aesdec_si128(B, K6); B = _mm_aesdec_si128(B, K7); B = _mm_aesdec_si128(B, K8); B = 
_mm_aesdec_si128(B, K9); B = _mm_aesdec_si128(B, K10); B = _mm_aesdec_si128(B, K11); B = _mm_aesdec_si128(B, K12); B = _mm_aesdec_si128(B, K13); B = _mm_aesdeclast_si128(B, K14); _mm_storeu_si128(out_mm + i, B); } }