/*
 * Encrypt one AES block in-register using AES-NI.  The round count is
 * fixed at compile time by the ROUND macro: 10 => AES-128, 14 => AES-256.
 * NOTE(review): if ROUND is 12 (AES-192) neither preprocessor branch is
 * taken and *out is left unwritten -- confirm ROUND is only ever 10 or 14.
 */
__inline__ static void AES_encrypt( __m128i in, __m128i *out, const __m128i *key)
{
    __m128i tmp = _mm_load_si128(&in);
    /* Initial whitening: XOR the plaintext with round key 0. */
    tmp = _mm_xor_si128(tmp, key[0]);
    /* Rounds 1-9 are common to both supported key sizes. */
    tmp = _mm_aesenc_si128(tmp, key[1]);
    tmp = _mm_aesenc_si128(tmp, key[2]);
    tmp = _mm_aesenc_si128(tmp, key[3]);
    tmp = _mm_aesenc_si128(tmp, key[4]);
    tmp = _mm_aesenc_si128(tmp, key[5]);
    tmp = _mm_aesenc_si128(tmp, key[6]);
    tmp = _mm_aesenc_si128(tmp, key[7]);
    tmp = _mm_aesenc_si128(tmp, key[8]);
    tmp = _mm_aesenc_si128(tmp, key[9]);
#if (ROUND==10)
    /* AES-128: round 10 is the final round. */
    *out = _mm_aesenclast_si128(tmp, key[10]);
#elif (ROUND==14)
    /* AES-256: four extra full rounds, then the final round. */
    tmp = _mm_aesenc_si128(tmp, key[10]);
    tmp = _mm_aesenc_si128(tmp, key[11]);
    tmp = _mm_aesenc_si128(tmp, key[12]);
    tmp = _mm_aesenc_si128(tmp, key[13]);
    *out = _mm_aesenclast_si128(tmp, key[14]);
#endif
}
/*
 * Encrypt one 16-byte block with AES-NI.  self->rounds selects the key
 * size (10 = AES-128, 12 = AES-192, 14 = AES-256) and self->ek holds
 * the expanded round keys (self->rounds + 1 of them).
 */
static void block_encrypt(block_state* self, const u8* in, u8* out)
{
    __m128i state = _mm_loadu_si128((const __m128i*) in);
    int r;

    /* Whitening with round key 0, then rounds 1 .. rounds-1. */
    state = _mm_xor_si128(state, self->ek[0]);
    for (r = 1; r < self->rounds; r++)
        state = _mm_aesenc_si128(state, self->ek[r]);

    /* Final round uses AESENCLAST (no MixColumns). */
    state = _mm_aesenclast_si128(state, self->ek[self->rounds]);
    _mm_storeu_si128((__m128i*) out, state);
}
/**
 * crypto_aes_encrypt_block_aesni(in, out, key):
 * Using the expanded AES key ${key}, encrypt the block ${in} and write the
 * resulting ciphertext to ${out}.  This implementation uses x86 AESNI
 * instructions, and should only be used if CPUSUPPORT_X86_AESNI is defined
 * and cpusupport_x86_aesni() returns nonzero.
 */
void crypto_aes_encrypt_block_aesni(const uint8_t * in, uint8_t * out,
    const void * key)
{
	const struct crypto_aes_key_aesni * _key = key;
	const __m128i * rk = _key->rkeys;
	size_t nr = _key->nr;
	size_t r;
	__m128i st;

	/* Whitening with round key 0, then nr - 1 full rounds. */
	st = _mm_loadu_si128((const __m128i *)in);
	st = _mm_xor_si128(st, rk[0]);
	for (r = 1; r < nr; r++)
		st = _mm_aesenc_si128(st, rk[r]);

	/* Final round, then store the ciphertext. */
	st = _mm_aesenclast_si128(st, rk[nr]);
	_mm_storeu_si128((__m128i *)out, st);
}
/*
 * Run a full AES-128 encryption (10 rounds) on *state in place, using
 * the expanded key schedule key[0..10].
 */
static inline void aes_enc_128(__m128i *state, __m128i *key)
{
    int r;

    *state = _mm_xor_si128(*state, key[0]);           /* whitening */
    for (r = 1; r < 10; r++)
        *state = _mm_aesenc_si128(*state, key[r]);    /* rounds 1-9 */
    *state = _mm_aesenclast_si128(*state, key[10]);   /* final round */
}
/*
 * ECB-encrypt PIPE blocks (4..8, fixed at compile time by the PIPE
 * macro) in parallel, interleaving the independent blocks so the
 * pipelined AES-NI units stay busy.  ROUND is the total round count
 * for the configured key size; key[] holds ROUND+1 round keys.
 */
__inline__ static void AES_ecb_encrypt_PIPE( __m128i *blks, const __m128i *key)
{
    unsigned j;
    /* Whitening (round key 0) for every block in the pipeline. */
    blks[0] = _mm_xor_si128(blks[0], key[0]);
    blks[1] = _mm_xor_si128(blks[1], key[0]);
    blks[2] = _mm_xor_si128(blks[2], key[0]);
    blks[3] = _mm_xor_si128(blks[3], key[0]);
#if (PIPE>=5)
    blks[4] = _mm_xor_si128(blks[4], key[0]);
#endif
#if (PIPE>=6)
    blks[5] = _mm_xor_si128(blks[5], key[0]);
#endif
#if (PIPE>=7)
    blks[6] = _mm_xor_si128(blks[6], key[0]);
#endif
#if (PIPE==8)
    blks[7] = _mm_xor_si128(blks[7], key[0]);
#endif
    /* Rounds 1 .. ROUND-1, round-major: one key load serves all blocks. */
    for (j = 1; j<ROUND; ++j) {
        blks[0] = _mm_aesenc_si128(blks[0], key[j]);
        blks[1] = _mm_aesenc_si128(blks[1], key[j]);
        blks[2] = _mm_aesenc_si128(blks[2], key[j]);
        blks[3] = _mm_aesenc_si128(blks[3], key[j]);
#if (PIPE>=5)
        blks[4] = _mm_aesenc_si128(blks[4], key[j]);
#endif
#if (PIPE>=6)
        blks[5] = _mm_aesenc_si128(blks[5], key[j]);
#endif
#if (PIPE>=7)
        blks[6] = _mm_aesenc_si128(blks[6], key[j]);
#endif
#if (PIPE==8)
        blks[7] = _mm_aesenc_si128(blks[7], key[j]);
#endif
    }
    /* Final round: j == ROUND after the loop. */
    blks[0] = _mm_aesenclast_si128(blks[0], key[j]);
    blks[1] = _mm_aesenclast_si128(blks[1], key[j]);
    blks[2] = _mm_aesenclast_si128(blks[2], key[j]);
    blks[3] = _mm_aesenclast_si128(blks[3], key[j]);
#if (PIPE>=5)
    blks[4] = _mm_aesenclast_si128(blks[4], key[j]);
#endif
#if (PIPE>=6)
    blks[5] = _mm_aesenclast_si128(blks[5], key[j]);
#endif
#if (PIPE>=7)
    blks[6] = _mm_aesenclast_si128(blks[6], key[j]);
#endif
#if (PIPE==8)
    blks[7] = _mm_aesenclast_si128(blks[7], key[j]);
#endif
}
void Cryptor::cbcEncrypt(const string &plaintext, const Key &key, string *ciphertext, unsigned char *schedule) { ciphertext->resize(plaintext.size()); int blocks = plaintext.size() / 16; if (plaintext.size() % 16) { blocks++; } __m128i tmp, tmp2, tmp3; __m128i *input = (__m128i*) plaintext.data(); __m128i *output = (__m128i*) ciphertext->data(); __m128i *keySchedule = (__m128i*) schedule; int rounds = getRounds(key.size); // Load the IV. tmp2 = _mm_loadu_si128((__m128i*) key.iv); // Swap byte-order => big-endian. if (!bigEndian) { reverse_m128i(tmp2); } for (int block = 0; block < blocks; block++) { // Get next 128-bit block. tmp = _mm_loadu_si128(&input[block]); // Swap byte-order => big-endian. if (!bigEndian) { reverse_m128i(tmp); } // XOR IV or last ciphertext with the plaintext. tmp2 = _mm_xor_si128(tmp, tmp2); // Whitening step. tmp2 = _mm_xor_si128(tmp2, keySchedule[0]); // Apply the AES rounds. int round = 1; for (; round < rounds; round++) { tmp2 = _mm_aesenc_si128(tmp2, keySchedule[round]); } // And the last. tmp2 = _mm_aesenclast_si128(tmp2, keySchedule[round]); // Swap byte-order => little-endian. tmp3 = tmp2; if (!bigEndian) { reverse_m128i(tmp3); } // Save the encrypted block. _mm_storeu_si128(&output[block], tmp3); } }
/* ECB-encrypt a single block in place with the expanded key in aesKey. */
void AES_ecb_encrypt(block *blk, AES_KEY *aesKey)
{
    const block *rk = (block *)(aesKey->rd_key);
    unsigned r, nr = ROUNDS(aesKey);

    *blk = _mm_xor_si128(*blk, rk[0]);          /* whitening */
    for (r = 1; r < nr; ++r)
        *blk = _mm_aesenc_si128(*blk, rk[r]);   /* full rounds */
    *blk = _mm_aesenclast_si128(*blk, rk[r]);   /* final round (r == nr) */
}
/*
 * Encrypt the 16-byte block at *in into *out.
 * Note: aligned load/store are used, so in and out must be 16-byte aligned.
 */
void AES_encryptC(block *in, block *out, AES_KEY *aesKey)
{
    const __m128i *rk = (__m128i *)(aesKey->rd_key);
    int r, nr = ROUNDS(aesKey);
    __m128i st = _mm_load_si128((__m128i*)in);

    st = _mm_xor_si128(st, rk[0]);            /* whitening */
    for (r = 1; r < nr; r++)
        st = _mm_aesenc_si128(st, rk[r]);     /* full rounds */
    st = _mm_aesenclast_si128(st, rk[r]);     /* final round */
    _mm_store_si128((__m128i*)out, st);
}
/*
 * ECB-encrypt nblks blocks from in[] to out[].  Blocks are processed in
 * groups of 8, interleaved so the pipelined AES-NI units stay busy; the
 * 0-7 leftover blocks are handled by a round-major tail loop.
 */
void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey)
{
    int numberOfLoops = nblks / 8;
    int blocksPipeLined = numberOfLoops * 8;
    int remainingEncrypts = nblks - blocksPipeLined;
    unsigned j, rnds = ROUNDS(aesKey);
    const block *sched = ((block *)(aesKey->rd_key));

    /* Main path: 8 blocks at a time. */
    for (int i = 0; i < numberOfLoops; i++){
        /* Whitening (round key 0) for all 8 blocks. */
        out[0 + i * 8] = _mm_xor_si128(in[0 + i * 8], sched[0]);
        out[1 + i * 8] = _mm_xor_si128(in[1 + i * 8], sched[0]);
        out[2 + i * 8] = _mm_xor_si128(in[2 + i * 8], sched[0]);
        out[3 + i * 8] = _mm_xor_si128(in[3 + i * 8], sched[0]);
        out[4 + i * 8] = _mm_xor_si128(in[4 + i * 8], sched[0]);
        out[5 + i * 8] = _mm_xor_si128(in[5 + i * 8], sched[0]);
        out[6 + i * 8] = _mm_xor_si128(in[6 + i * 8], sched[0]);
        out[7 + i * 8] = _mm_xor_si128(in[7 + i * 8], sched[0]);
        /* Rounds 1..rnds-1, round-major: one key serves all 8 blocks. */
        for (j = 1; j < rnds; ++j){
            out[0 + i * 8] = _mm_aesenc_si128(out[0 + i * 8], sched[j]);
            out[1 + i * 8] = _mm_aesenc_si128(out[1 + i * 8], sched[j]);
            out[2 + i * 8] = _mm_aesenc_si128(out[2 + i * 8], sched[j]);
            out[3 + i * 8] = _mm_aesenc_si128(out[3 + i * 8], sched[j]);
            out[4 + i * 8] = _mm_aesenc_si128(out[4 + i * 8], sched[j]);
            out[5 + i * 8] = _mm_aesenc_si128(out[5 + i * 8], sched[j]);
            out[6 + i * 8] = _mm_aesenc_si128(out[6 + i * 8], sched[j]);
            out[7 + i * 8] = _mm_aesenc_si128(out[7 + i * 8], sched[j]);
        }
        /* Final round (j == rnds after the loop). */
        out[0 + i * 8] = _mm_aesenclast_si128(out[0 + i * 8], sched[j]);
        out[1 + i * 8] = _mm_aesenclast_si128(out[1 + i * 8], sched[j]);
        out[2 + i * 8] = _mm_aesenclast_si128(out[2 + i * 8], sched[j]);
        out[3 + i * 8] = _mm_aesenclast_si128(out[3 + i * 8], sched[j]);
        out[4 + i * 8] = _mm_aesenclast_si128(out[4 + i * 8], sched[j]);
        out[5 + i * 8] = _mm_aesenclast_si128(out[5 + i * 8], sched[j]);
        out[6 + i * 8] = _mm_aesenclast_si128(out[6 + i * 8], sched[j]);
        out[7 + i * 8] = _mm_aesenclast_si128(out[7 + i * 8], sched[j]);
    }

    /* Tail: encrypt the remaining 0-7 blocks, still round-major. */
    for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
        out[i] = _mm_xor_si128(in[i], sched[0]);
    for (j = 1; j<rnds; ++j)
        for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
            out[i] = _mm_aesenc_si128(out[i], sched[j]);
    for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
        out[i] = _mm_aesenclast_si128(out[i], sched[j]);
}
/* ECB-encrypt four blocks in place, interleaved to keep the AES units busy. */
void AES_ecb_encrypt_blks_4(block *blks, AES_KEY *aesKey)
{
    const block *rk = (block *)(aesKey->rd_key);
    unsigned r, i, nr = ROUNDS(aesKey);

    for (i = 0; i < 4; ++i)                               /* whitening */
        blks[i] = _mm_xor_si128(blks[i], rk[0]);
    for (r = 1; r < nr; ++r)                              /* full rounds */
        for (i = 0; i < 4; ++i)
            blks[i] = _mm_aesenc_si128(blks[i], rk[r]);
    for (i = 0; i < 4; ++i)                               /* final round */
        blks[i] = _mm_aesenclast_si128(blks[i], rk[r]);
}
/* Encrypt one 16-byte block (AES-128, 10 rounds) from `in` to `out`
 * using the expanded keys in ctx->enc_keys[0..10]. */
void aesni_encrypt(aesni_ctx *ctx, const byte *in, byte *out)
{
    __m128i st = _mm_loadu_si128((__m128i*)in);
    int r = 1;

    st = _mm_xor_si128(st, ctx->enc_keys[0]);     /* whitening */
    while (r < 10) {
        st = _mm_aesenc_si128(st, ctx->enc_keys[r]);
        r++;
    }
    st = _mm_aesenclast_si128(st, ctx->enc_keys[10]);
    _mm_storeu_si128((__m128i*)out, st);
}
/* ECB-encrypt nblks blocks in place, round-major so each round key is
 * used once while all blocks advance together. */
void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey)
{
    const block *rk = (block *)(aesKey->rd_key);
    unsigned b, r, nr = ROUNDS(aesKey);

    for (b = 0; b < nblks; ++b)                            /* whitening */
        blks[b] = _mm_xor_si128(blks[b], rk[0]);
    for (r = 1; r < nr; ++r)                               /* full rounds */
        for (b = 0; b < nblks; ++b)
            blks[b] = _mm_aesenc_si128(blks[b], rk[r]);
    for (b = 0; b < nblks; ++b)                            /* final round */
        blks[b] = _mm_aesenclast_si128(blks[b], rk[r]);
}
/*
 * Encrypt one block from `in` to `out` with the expanded key in `key`.
 * Note: aligned load/store are used, so in and out must be 16-byte aligned.
 */
inline void AES_encrypt(const unsigned char *in, unsigned char *out,
                        const AES_KEY *key)
{
    const __m128i *rk = (__m128i *) (key->rd_key);
    int r, nr = ROUNDS(key);
    __m128i st = _mm_load_si128((__m128i *) in);

    st = _mm_xor_si128(st, rk[0]);            /* whitening */
    for (r = 1; r < nr; r++)
        st = _mm_aesenc_si128(st, rk[r]);     /* full rounds */
    st = _mm_aesenclast_si128(st, rk[r]);     /* final round */
    _mm_store_si128((__m128i *) out, st);
}
/*
 * AES-256 (14 rounds) single-block encryption: encrypt the 16 bytes at
 * `n` into `out` under the 15 round keys in rkeys[].
 * Note: aligned load/store, so n and out must be 16-byte aligned.
 */
static inline void aes256ni_encrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out)
{
    /* Whitening, then the 13 full rounds fully unrolled. */
    __m128i t = _mm_xor_si128(_mm_load_si128((const __m128i *)n), rkeys[0]);
    t = _mm_aesenc_si128(t, rkeys[1]);
    t = _mm_aesenc_si128(t, rkeys[2]);
    t = _mm_aesenc_si128(t, rkeys[3]);
    t = _mm_aesenc_si128(t, rkeys[4]);
    t = _mm_aesenc_si128(t, rkeys[5]);
    t = _mm_aesenc_si128(t, rkeys[6]);
    t = _mm_aesenc_si128(t, rkeys[7]);
    t = _mm_aesenc_si128(t, rkeys[8]);
    t = _mm_aesenc_si128(t, rkeys[9]);
    t = _mm_aesenc_si128(t, rkeys[10]);
    t = _mm_aesenc_si128(t, rkeys[11]);
    t = _mm_aesenc_si128(t, rkeys[12]);
    t = _mm_aesenc_si128(t, rkeys[13]);
    /* Final round and store. */
    _mm_store_si128((__m128i*)(out), _mm_aesenclast_si128(t, rkeys[14]));
}
/* ECB-encrypt four blocks from in[] into out[], interleaved. */
void AES_ecb_encrypt_blks_4_in_out(block *in, block *out, AES_KEY *aesKey)
{
    const block *rk = (block *)(aesKey->rd_key);
    unsigned r, i, nr = ROUNDS(aesKey);

    for (i = 0; i < 4; ++i)                              /* whitening */
        out[i] = _mm_xor_si128(in[i], rk[0]);
    for (r = 1; r < nr; ++r)                             /* full rounds */
        for (i = 0; i < 4; ++i)
            out[i] = _mm_aesenc_si128(out[i], rk[r]);
    for (i = 0; i < 4; ++i)                              /* final round */
        out[i] = _mm_aesenclast_si128(out[i], rk[r]);
}
/* AES-128 encrypt one block, byte-swapping the input before and the
 * output after the cipher (callers work in the opposite byte order). */
static __m128i AES_encrypt(__m128i in, const __m128i* expkey)
{
    __m128i st = byte_swap(in) ^ expkey[0];   /* whitening */
    int r;

    for (r = 1; r < 10; r++)
        st = _mm_aesenc_si128(st, expkey[r]);
    return byte_swap(_mm_aesenclast_si128(st, expkey[10]));
}
/* AES-128: encrypt the block `in` under the round keys k[0..10]. */
static __m128i aes_encrypt(__m128i in, __m128i* k)
{
    __m128i st = _mm_xor_si128(in, k[0]);     /* whitening */
    int r;

    for (r = 1; r < 10; r++)
        st = _mm_aesenc_si128(st, k[r]);      /* rounds 1-9 */
    return _mm_aesenclast_si128(st, k[10]);   /* final round */
}
/*
 * Return a fresh pseudo-random block: AES-128-encrypt a monotonically
 * increasing counter (the global current_rand_index) under the global
 * rand_aes_key.
 * NOTE(review): the round count is hard-coded to 10, so rand_aes_key is
 * assumed to hold an AES-128 schedule -- confirm at the key-setup site.
 */
inline block garble_random_block(void)
{
    block out;
    uint64_t *val;
    int i;

    out = garble_zero_block();
    /* Place the counter in the low 64 bits of the block (type punning
     * through a uint64_t pointer). */
    val = (uint64_t *) &out;
    val[0] = current_rand_index++;
    /* Standard AES-NI sequence: whiten, 9 full rounds, final round. */
    out = _mm_xor_si128(out, rand_aes_key.rd_key[0]);
    for (i = 1; i < 10; ++i)
        out = _mm_aesenc_si128(out, rand_aes_key.rd_key[i]);
    return _mm_aesenclast_si128(out, rand_aes_key.rd_key[i]);
}
/*
 * Apply AES rounds 1..9 plus the final round to num_blocks blocks in
 * place, hard-coded for a 10-round (AES-128) schedule.  Round-major
 * iteration keeps the AES-NI pipeline full across the blocks.
 * NOTE(review): no initial XOR with keys[0] is performed here -- callers
 * are presumably expected to whiten the blocks beforehand; confirm
 * against the call sites.
 */
static inline void aes_encrypt_n(__m128i *text, int num_blocks, __m128i *keys)
{
    int i, j;
    /* Rounds 1..9. */
    for(j = 1; j < 10 ; j++) {
        for(i = 0; i< num_blocks; i++) {
            text[i] = _mm_aesenc_si128(text[i], keys[j]);
        }
    }
    /* Final round (j == 10 after the loop). */
    for(i = 0; i < num_blocks; i++) {
        text[i] = _mm_aesenclast_si128(text[i], keys[j]);
    }
}
void Cryptor::ecbEncrypt(const string &plaintext, const Key &key, string *ciphertext, unsigned char *schedule) { // Right now we just use the same length, but it should just be // a multiple of 16. ciphertext->resize(plaintext.size()); int blocks = plaintext.size() / 16; if (plaintext.size() % 16) { blocks++; } __m128i tmp; __m128i *input = (__m128i*) plaintext.data(); __m128i *output = (__m128i*) ciphertext->data(); __m128i *keySchedule = (__m128i*) schedule; int rounds = getRounds(key.size); for (int block = 0; block < blocks; block++) { // Get next 128-bit block. tmp = _mm_loadu_si128(&input[block]); // Swap byte-order => big-endian. if (!bigEndian) { reverse_m128i(tmp); } // Whitening step. tmp = _mm_xor_si128(tmp, keySchedule[0]); // Apply the AES rounds. int round = 1; for (; round < rounds; round++) { tmp = _mm_aesenc_si128(tmp, keySchedule[round]); } // And the last. tmp = _mm_aesenclast_si128(tmp, keySchedule[round]); // Swap byte-order => little-endian. if (!bigEndian) { reverse_m128i(tmp); } // Save the encrypted block. _mm_storeu_si128(&output[block], tmp); } }
/* AES-128 block encryption: return the ciphertext of `plaintext` under
 * the 11 expanded round keys in encryption_keys->keys[0..10]. */
AES_AES_Block __fastcall aes_AES128_encrypt_block_(
    AES_AES_Block plaintext,
    const AES_AES128_RoundKeys* encryption_keys)
{
    int r;

    plaintext = _mm_xor_si128(plaintext, encryption_keys->keys[0]);
    for (r = 1; r < 10; r++)
        plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[r]);
    return _mm_aesenclast_si128(plaintext, encryption_keys->keys[10]);
}
/*
 * CBC-encrypt `length` bytes from `in` to `out` with AES-NI, using the
 * expanded schedule at `key` (number_of_rounds + 1 round keys) and the
 * 16-byte IV in `ivec`.  The byte count is rounded UP to whole blocks,
 * so `out` must have room for that many blocks; a trailing partial
 * input block is zero-padded.
 *
 * Bug fixes vs. the previous version:
 *  - loop counter was `int` while `length` is unsigned long, breaking
 *    for inputs larger than INT_MAX blocks (signed/unsigned mismatch);
 *  - the final partial block was loaded with a full 16-byte read past
 *    the end of `in`; it is now copied into a zero-padded buffer.
 */
static void AESNI_CBC_encrypt(const unsigned char *in, unsigned char *out,
                              unsigned char ivec[16], unsigned long length,
                              unsigned char *key, int number_of_rounds)
{
    __m128i feedback, data;
    unsigned long i, nblocks;
    unsigned long tail = length % 16;
    int j;

    /* Round up to whole blocks. */
    nblocks = length / 16 + (tail ? 1 : 0);

    /* feedback carries the chaining value (IV, then previous ciphertext). */
    feedback = _mm_loadu_si128((__m128i*)ivec);
    for (i = 0; i < nblocks; i++) {
        if (tail && i == nblocks - 1) {
            /* Zero-pad the final partial block instead of over-reading. */
            unsigned char last[16] = {0};
            unsigned long b;
            for (b = 0; b < tail; b++)
                last[b] = in[i * 16 + b];
            data = _mm_loadu_si128((__m128i*)last);
        } else {
            data = _mm_loadu_si128(&((__m128i*)in)[i]);
        }
        /* XOR plaintext with the chaining value, then encrypt. */
        feedback = _mm_xor_si128(data, feedback);
        feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
        for (j = 1; j < number_of_rounds; j++)
            feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
        feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
        _mm_storeu_si128(&((__m128i*)out)[i], feedback);
    }
}
/* AES-128: encrypt the 16 bytes at `in` into `out` using the expanded
 * key schedule enc_key[0..10]. */
void AESNI_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY enc_key)
{
    __m128i st = _mm_loadu_si128((__m128i*)in);
    int r;

    st = _mm_xor_si128(st, enc_key[0]);         /* whitening */
    for (r = 1; r <= 9; r++)
        st = _mm_aesenc_si128(st, enc_key[r]);  /* rounds 1-9 */
    st = _mm_aesenclast_si128(st, enc_key[10]); /* final round */
    _mm_storeu_si128((__m128i*)out, st);
}
/*
 * ECB-process one 16-byte block, encrypting or decrypting according to
 * `mode`.  Rounds are issued in pairs, so ctx->nr must be even (it is
 * 10/12/14 for AES).  Always returns 0.
 */
int aesni_xcryptecb( aes_context *ctx, int mode, const unsigned char input[16], unsigned char output[16] )
{
    const __m128i *rk = (__m128i *) ctx->rk;
    const int nr = ctx->nr;
    __m128i st;
    int r;

    /* This could be faster if more data was provided at once. */
    st = _mm_loadu_si128( (__m128i *) input );
    st = _mm_xor_si128( st, rk[0] );                 /* whitening */

    if( mode != AES_ENCRYPT )
    {
        /* Decrypt: paired AESDEC rounds, then the final AESDECLAST. */
        for( r = 1; r < nr - 1; r += 2 )
        {
            st = _mm_aesdec_si128( st, rk[r] );
            st = _mm_aesdec_si128( st, rk[r + 1] );
        }
        st = _mm_aesdec_si128( st, rk[nr - 1] );
        st = _mm_aesdeclast_si128( st, rk[nr] );
    }
    else
    {
        /* Encrypt: paired AESENC rounds, then the final AESENCLAST. */
        for( r = 1; r < nr - 1; r += 2 )
        {
            st = _mm_aesenc_si128( st, rk[r] );
            st = _mm_aesenc_si128( st, rk[r + 1] );
        }
        st = _mm_aesenc_si128( st, rk[nr - 1] );
        st = _mm_aesenclast_si128( st, rk[nr] );
    }

    _mm_storeu_si128( (__m128i *) output, st );
    return( 0 );
}
/*
 * CBC mode over AES-NI.  Encryption is inherently serial (each block
 * depends on the previous ciphertext); decryption is parallelisable and
 * is done 4 blocks at a time to exploit pipelining.  `length` is in
 * bytes and only whole 16-byte blocks (length / 16) are processed.
 * Rounds are issued in pairs in the serial paths, so ctx->nr must be
 * even (10/12/14 for AES).  The final chaining value is written back to
 * `iv` so the caller can continue a stream.  Always returns 0.
 */
int aesni_xcryptcbc( aes_context *ctx, int mode, size_t length, unsigned char iv[16], const unsigned char *input, unsigned char *output )
{
    const __m128i *subkeys = (__m128i *) ctx->rk;
    const int rounds = ctx->nr;
    const size_t blocks = length / 16;
    __m128i block0, block1, block2, block3;
    __m128i fb0, fb1, fb2, fb3;
    __m128i rk;
    __m128i last;
    size_t i;
    int j;

    /* fb0 carries the chaining value: IV, then previous ciphertext. */
    fb0 = _mm_loadu_si128( (__m128i *) iv );

    if (mode == AES_ENCRYPT )
    {
        for( i = 0 ; i < blocks; i++ )
        {
            block0 = _mm_loadu_si128( &((__m128i *) input)[i] );
            /* XOR plaintext with the chaining value, whiten, encrypt. */
            fb0 = _mm_xor_si128( block0, fb0 );
            fb0 = _mm_xor_si128( fb0, subkeys[0] );
            for( j = 1; j < rounds - 1; j += 2 )
            {
                fb0 = _mm_aesenc_si128( fb0, subkeys[j] );
                fb0 = _mm_aesenc_si128( fb0, subkeys[j + 1] );
            }
            fb0 = _mm_aesenc_si128( fb0, subkeys[rounds - 1] );
            fb0 = _mm_aesenclast_si128( fb0, subkeys[rounds] );
            _mm_storeu_si128( &((__m128i*) output)[i], fb0 );
        }
    }
    else
    {
        /* Take advantage of pipelining by decrypting 4 blocks at once. */
        for( i = 0; i < blocks / 4; i++ )
        {
            block0 = _mm_loadu_si128( (__m128i *) input + i * 4 );
            block1 = _mm_loadu_si128( (__m128i *) input + i * 4 + 1 );
            block2 = _mm_loadu_si128( (__m128i *) input + i * 4 + 2 );
            block3 = _mm_loadu_si128( (__m128i *) input + i * 4 + 3 );
            /* Keep the raw ciphertexts: they are the chaining values
             * for the following blocks. */
            fb1 = block0;
            fb2 = block1;
            fb3 = block2;
            last = block3;
            rk = subkeys[0];
            block0 = _mm_xor_si128( block0, rk );
            block1 = _mm_xor_si128( block1, rk );
            block2 = _mm_xor_si128( block2, rk );
            block3 = _mm_xor_si128( block3, rk );
            for( j = 1; j < rounds; j++ )
            {
                rk = subkeys[j];
                block0 = _mm_aesdec_si128( block0, rk );
                block1 = _mm_aesdec_si128( block1, rk );
                block2 = _mm_aesdec_si128( block2, rk );
                block3 = _mm_aesdec_si128( block3, rk );
            }
            rk = subkeys[rounds];
            block0 = _mm_aesdeclast_si128( block0, rk );
            block1 = _mm_aesdeclast_si128( block1, rk );
            block2 = _mm_aesdeclast_si128( block2, rk );
            block3 = _mm_aesdeclast_si128( block3, rk );
            /* Undo the CBC XOR with the preceding ciphertext (or IV). */
            block0 = _mm_xor_si128( block0, fb0 );
            block1 = _mm_xor_si128( block1, fb1 );
            block2 = _mm_xor_si128( block2, fb2 );
            block3 = _mm_xor_si128( block3, fb3 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4, block0 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4 + 1, block1 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4 + 2, block2 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4 + 3, block3 );
            fb0 = last;
        }
        /* Handle the remaining 0-3 blocks one at a time. */
        for( i *= 4; i < blocks; i++ )
        {
            block0 = _mm_loadu_si128( (__m128i *) input + i );
            last = block0;
            block0 = _mm_xor_si128 (last, subkeys[0] );
            for( j = 1; j < rounds - 1; j += 2 )
            {
                block0 = _mm_aesdec_si128( block0, subkeys[j] );
                block0 = _mm_aesdec_si128( block0, subkeys[j + 1] );
            }
            block0 = _mm_aesdec_si128( block0, subkeys[rounds - 1] );
            block0 = _mm_aesdeclast_si128( block0, subkeys[rounds] );
            block0 = _mm_xor_si128( block0, fb0 );
            _mm_storeu_si128( ((__m128i *) output) + i, block0 );
            fb0 = last;
        }
    }

    /* Persist the final chaining value for streaming callers. */
    _mm_storeu_si128( (__m128i *) iv, fb0 );

    return( 0 );
}
/*
 * AES-GCM decryption with tag verification.
 *  in/out  : ciphertext / plaintext, nbytes long
 *  addt    : additional authenticated data, abytes long
 *  ivec    : IV, ibytes long (fast path for the standard 96-bit IV)
 *  tag     : expected 16-byte authentication tag
 *  key     : expanded AES key schedule (nr + 1 round keys), nr rounds
 * Verifies the GHASH-based tag FIRST and only decrypts (CTR mode,
 * 4-wide pipelined) if it matches.  Returns 1 on success, 0 if the tag
 * check fails (in which case nothing is written to out).
 * NOTE(review): tag comparison via _mm_cmpeq_epi8/_mm_movemask_epi8 is
 * not constant-time in a strict sense -- acceptable for decrypt-side
 * tag checks in many threat models, but worth confirming.
 * NOTE(review): the 96-bit-IV path loads 16 bytes from ivec, reading 4
 * bytes past a 12-byte buffer before overwriting that lane -- confirm
 * callers over-allocate the IV.
 */
int AES_GCM_decrypt (const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, int nbytes, int abytes, int ibytes, const unsigned char* key, int nr)
{
    int i, j ,k;
    __m128i hlp1, hlp2, hlp3, hlp4;
    __m128i tmp1, tmp2, tmp3, tmp4;
    __m128i H, Y, T;
    __m128i *KEY = (__m128i*)key;
    __m128i ctr1, ctr2, ctr3, ctr4;
    __m128i last_block = _mm_setzero_si128();
    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m128i X = _mm_setzero_si128();

    if(ibytes == 96/8){
        /* Standard 96-bit IV: Y0 = IV || 0x00000001 (big-endian). */
        Y = _mm_loadu_si128((__m128i*)ivec);
        Y = _mm_insert_epi32(Y, 0x1000000, 3);
        /*(Compute E[ZERO, KS] and E[Y0, KS] together*/
        tmp1 = _mm_xor_si128(X, KEY[0]);
        tmp2 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr-1; j+=2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
        };
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        /* H = E(K, 0) is the GHASH key; T = E(K, Y0) masks the tag. */
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
    }
    else{
        /* Non-96-bit IV: H = E(K, 0), then Y0 = GHASH(IV). */
        tmp1 = _mm_xor_si128(X, KEY[0]);
        for(j=1; j <nr; j++) tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
        /* Zero Y (xor-with-self idiom). */
        Y = _mm_xor_si128(Y, Y);
        /* Hash the full IV blocks. */
        for(i=0; i < ibytes/16; i++){
            tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        /* Hash the zero-padded partial IV block, if any. */
        if(ibytes%16){
            for(j=0; j < ibytes%16; j++) ((unsigned char*)&last_block)[j] = ivec[i*16+j];
            tmp1 = last_block;
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        /* Append the IV bit length and finish GHASH(IV). */
        tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, 0, 1);
        Y = _mm_xor_si128(Y, tmp1);
        gfmul(Y, H, &Y);
        Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
        /*Compute E(K, Y0)*/
        tmp1 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr; j++) tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        T = _mm_aesenclast_si128(tmp1, KEY[nr]);
    }

    /* GHASH the AAD (full blocks, then a zero-padded partial block). */
    for(i=0; i<abytes/16; i++){
        tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if(abytes%16){
        last_block = _mm_setzero_si128();
        for(j=0;j<abytes%16;j++) ((unsigned char*)&last_block)[j] = addt[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X =_mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    /* GHASH the ciphertext the same way. */
    for(i=0; i<nbytes/16; i++){
        tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if(nbytes%16){
        last_block = _mm_setzero_si128();
        for(j=0; j<nbytes%16; j++) ((unsigned char*)&last_block)[j] = in[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }

    /* Final GHASH block: bit lengths of ciphertext and AAD. */
    tmp1 =_mm_insert_epi64(tmp1, nbytes*8, 0);
    tmp1 =_mm_insert_epi64(tmp1, abytes*8, 1);
    X = _mm_xor_si128(X, tmp1);
    gfmul(X, H, &X);
    X = _mm_shuffle_epi8(X, BSWAP_MASK);
    /* Tag = GHASH result XOR E(K, Y0); compare against the given tag. */
    T = _mm_xor_si128(X, T);

    if(0xffff!=_mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag))))
        return 0; //in case the authentication failed

    /* Tag OK: CTR-decrypt, 4 counters in flight. */
    ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
    ctr1 = _mm_add_epi32(ctr1, ONE);
    ctr2 = _mm_add_epi32(ctr1, ONE);
    ctr3 = _mm_add_epi32(ctr2, ONE);
    ctr4 = _mm_add_epi32(ctr3, ONE);

    for(i=0; i < nbytes/16/4; i++){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
        tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
        tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, FOUR);
        ctr2 = _mm_add_epi32(ctr2, FOUR);
        ctr3 = _mm_add_epi32(ctr3, FOUR);
        ctr4 = _mm_add_epi32(ctr4, FOUR);
        /* Encrypt the 4 counters (paired rounds), then XOR into the data. */
        tmp1 =_mm_xor_si128(tmp1, KEY[0]);
        tmp2 =_mm_xor_si128(tmp2, KEY[0]);
        tmp3 =_mm_xor_si128(tmp3, KEY[0]);
        tmp4 =_mm_xor_si128(tmp4, KEY[0]);
        for(j=1; j < nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]);
        tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]);
        tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
        tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
        tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0]));
        tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1]));
        tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2]));
        tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3]));
        _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1);
        _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2);
        _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3);
        _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
        tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
        tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
    }

    /* Remaining whole blocks, one counter at a time. */
    for(k = i*4; k < nbytes/16; k++){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, ONE);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for(j=1; j<nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
    }

    //If one partial block remains
    if(nbytes%16){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for(j=1; j<nr-1; j+=2){
            tmp1 =_mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 =_mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        last_block = tmp1;
        /* Copy only the valid tail bytes to the output. */
        for(j=0; j<nbytes%16; j++) out[k*16+j]=((unsigned char*)&last_block)[j];
    }

    return 1; //when sucessfull returns 1
}
/*
 * Encrypt one 16-byte block in place in buff[] under the expanded key
 * in a->fkey (a->Nr rounds).  Uses AES-NI when AES_NI_SUPPORT is
 * defined; otherwise falls back to a table-driven (T-table) software
 * implementation, with MR_SMALL_AES selecting a single-table variant
 * that rotates one table instead of using four.
 */
void aes_ecb_encrypt(aes *a,MR_BYTE *buff)
{
    int i,j,k;
    MR_WORD p[4],q[4],*x,*y,*t;
#ifdef AES_NI_SUPPORT
    /* AES-NI path: whiten, Nr-1 full rounds, final round.  The round
     * keys are stored as words, 4 (NB) per round, hence k += 4. */
    __m128i ky,m = _mm_loadu_si128((__m128i *) buff);
    ky = _mm_loadu_si128((__m128i *) &a->fkey[0]);
    m = _mm_xor_si128 (m, ky);
    k=NB;
    for (i=1;i<a->Nr;i++)
    {
        ky=_mm_loadu_si128((__m128i *) &a->fkey[k]);
        m =_mm_aesenc_si128(m, ky);
        k+=4;
    }
    ky=_mm_loadu_si128((__m128i *) &a->fkey[k]);
    m=_mm_aesenclast_si128(m, ky);
    _mm_storeu_si128((__m128i *)buff, m);
#else
    /* Software path: pack the block into 4 words and whiten. */
    for (i=j=0;i<NB;i++,j+=4)
    {
        p[i]=pack((MR_BYTE *)&buff[j]);
        p[i]^=a->fkey[i];
    }
    k=NB;
    x=p; y=q;
    /* State alternates between x and y */
    for (i=1;i<a->Nr;i++)
    { /* Nr is number of rounds. May be odd. */
#ifndef MR_SMALL_AES
        /* Four-table round: SubBytes+ShiftRows+MixColumns via lookups. */
        y[0]=a->fkey[k]^ftable[MR_TOBYTE(x[0])]^ ftable1[MR_TOBYTE(x[1]>>8)]^ ftable2[MR_TOBYTE(x[2]>>16)]^ ftable3[x[3]>>24];
        y[1]=a->fkey[k+1]^ftable[MR_TOBYTE(x[1])]^ ftable1[MR_TOBYTE(x[2]>>8)]^ ftable2[MR_TOBYTE(x[3]>>16)]^ ftable3[x[0]>>24];
        y[2]=a->fkey[k+2]^ftable[MR_TOBYTE(x[2])]^ ftable1[MR_TOBYTE(x[3]>>8)]^ ftable2[MR_TOBYTE(x[0]>>16)]^ ftable3[x[1]>>24];
        y[3]=a->fkey[k+3]^ftable[MR_TOBYTE(x[3])]^ ftable1[MR_TOBYTE(x[0]>>8)]^ ftable2[MR_TOBYTE(x[1]>>16)]^ ftable3[x[2]>>24];
#else
        /* Single-table round: derive the other columns by rotation. */
        y[0]=a->fkey[k]^ftable[MR_TOBYTE(x[0])]^ ROTL8(ftable[MR_TOBYTE(x[1]>>8)])^ ROTL16(ftable[MR_TOBYTE(x[2]>>16)])^ ROTL24(ftable[x[3]>>24]);
        y[1]=a->fkey[k+1]^ftable[MR_TOBYTE(x[1])]^ ROTL8(ftable[MR_TOBYTE(x[2]>>8)])^ ROTL16(ftable[MR_TOBYTE(x[3]>>16)])^ ROTL24(ftable[x[0]>>24]);
        y[2]=a->fkey[k+2]^ftable[MR_TOBYTE(x[2])]^ ROTL8(ftable[MR_TOBYTE(x[3]>>8)])^ ROTL16(ftable[MR_TOBYTE(x[0]>>16)])^ ROTL24(ftable[x[1]>>24]);
        y[3]=a->fkey[k+3]^ftable[MR_TOBYTE(x[3])]^ ROTL8(ftable[MR_TOBYTE(x[0]>>8)])^ ROTL16(ftable[MR_TOBYTE(x[1]>>16)])^ ROTL24(ftable[x[2]>>24]);
#endif
        k+=4;
        t=x; x=y; y=t; /* swap pointers */
    }
    /* Last Round */
    /* Final round uses the plain S-box (no MixColumns). */
    y[0]=a->fkey[k]^(MR_WORD)fbsub[MR_TOBYTE(x[0])]^ ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[1]>>8)])^ ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[2]>>16)])^ ROTL24((MR_WORD)fbsub[x[3]>>24]);
    y[1]=a->fkey[k+1]^(MR_WORD)fbsub[MR_TOBYTE(x[1])]^ ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[2]>>8)])^ ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[3]>>16)])^ ROTL24((MR_WORD)fbsub[x[0]>>24]);
    y[2]=a->fkey[k+2]^(MR_WORD)fbsub[MR_TOBYTE(x[2])]^ ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[3]>>8)])^ ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[0]>>16)])^ ROTL24((MR_WORD)fbsub[x[1]>>24]);
    y[3]=a->fkey[k+3]^(MR_WORD)fbsub[MR_TOBYTE(x[3])]^ ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[0]>>8)])^ ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[1]>>16)])^ ROTL24((MR_WORD)fbsub[x[2]>>24]);
    /* Unpack the result and zero the state words. */
    for (i=j=0;i<NB;i++,j+=4)
    {
        unpack(y[i],(MR_BYTE *)&buff[j]);
        x[i]=y[i]=0; /* clean up stack */
    }
#endif
}
/*
 * Encrypt ONE 16-byte block (data) under EIGHT different 128-bit keys
 * simultaneously, writing the eight ciphertexts to dataOut[0..127].
 *
 * keys    - 8 x 16 bytes of raw (unexpanded) AES-128 keys, 16-byte aligned
 *           (loaded with _mm_load_si128 - TODO confirm alignment contract).
 * data    - one 16-byte plaintext block, 16-byte aligned.
 * dataOut - 128-byte output buffer, 16-byte aligned.
 *
 * The key schedules are expanded on the fly, one round ahead of the
 * encryption, in a transposed layout (via the external transpose_state):
 * after transposition, rkey[w] holds word w of four schedules side by side,
 * so a single _mm_aesenclast_si128 + byte shuffle performs SubWord/RotWord
 * for four keys at once (the aesenclast's ShiftRows is undone by `mask`,
 * and its AddRoundKey XORs in the round constant held in mmrcon).
 * NOTE(review): this relies on transpose_state's exact word layout -
 * verify against its definition before touching the shuffle mask.
 */
void c_opt_unrolled_8x(uint8_t *keys, uint8_t *data, uint8_t *dataOut)
{
    /* Shuffle that cancels aesenclast's ShiftRows and applies RotWord. */
    __m128i mask = _mm_set_epi8(0x0C, 0x03, 0x06, 0x09, 0x08, 0x0F, 0x02, 0x05,
                                0x04, 0x0B, 0x0E, 0x01, 0x00, 0x07, 0x0A, 0x0D);
    /* Round constant replicated per key lane; doubled each round via slli. */
    __m128i mmrcon = _mm_set_epi8(0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
                                  0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00);
    /* rcon for round 9 (0x1b); shifted once more to get 0x36 for round 10. */
    __m128i mmrconFinal = _mm_set_epi8(0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1b, 0x00,
                                       0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1b, 0x00);
    __m128i key [8];    /* round keys in normal (per-key) layout */
    __m128i rkey [8];   /* same keys in transposed (per-word) layout */
    __m128i state[8];   /* eight copies of the block, one per key */
    __m128i tmp0;

    key[0] = _mm_load_si128((__m128i*)&(keys[  0]));
    key[1] = _mm_load_si128((__m128i*)&(keys[ 16]));
    key[2] = _mm_load_si128((__m128i*)&(keys[ 32]));
    key[3] = _mm_load_si128((__m128i*)&(keys[ 48]));
    key[4] = _mm_load_si128((__m128i*)&(keys[ 64]));
    key[5] = _mm_load_si128((__m128i*)&(keys[ 80]));
    key[6] = _mm_load_si128((__m128i*)&(keys[ 96]));
    key[7] = _mm_load_si128((__m128i*)&(keys[112]));
    transpose_state(rkey, key);
    transpose_state(&(rkey[4]), &(key[4]));

    /* Same plaintext feeds every lane. */
    state[0] = _mm_load_si128((__m128i*) data );
    state[1] = state[0]; state[2] = state[0]; state[3] = state[0];
    state[4] = state[0]; state[5] = state[0]; state[6] = state[0];
    state[7] = state[0];

    /* Round 0: AddRoundKey, interleaved with expanding the round-1 keys. */
    tmp0 = _mm_aesenclast_si128(rkey[3], mmrcon);   /* SubWord+rcon of last word, keys 0-3 */
    tmp0 = _mm_shuffle_epi8(tmp0, mask);
    state[0] = _mm_xor_si128(state[0], key[0]);
    state[1] = _mm_xor_si128(state[1], key[1]);
    state[2] = _mm_xor_si128(state[2], key[2]);
    state[3] = _mm_xor_si128(state[3], key[3]);
    state[4] = _mm_xor_si128(state[4], key[4]);
    state[5] = _mm_xor_si128(state[5], key[5]);
    state[6] = _mm_xor_si128(state[6], key[6]);
    state[7] = _mm_xor_si128(state[7], key[7]);
    rkey[0] = _mm_xor_si128(rkey[0], tmp0);
    tmp0 = _mm_aesenclast_si128(rkey[7], mmrcon);   /* same for keys 4-7 */
    tmp0 = _mm_shuffle_epi8(tmp0, mask);
    mmrcon = _mm_slli_epi16(mmrcon, 0x01);          /* next rcon = 2*rcon (GF doubling, no reduction needed below 0x80) */
    /* FIPS-197 word chaining: w[i] = w[i-1] ^ w[i-4] in transposed form. */
    rkey[1] = _mm_xor_si128(rkey[1], rkey[0]);
    rkey[2] = _mm_xor_si128(rkey[2], rkey[1]);
    rkey[3] = _mm_xor_si128(rkey[3], rkey[2]);
    rkey[4] = _mm_xor_si128(rkey[4], tmp0);
    rkey[5] = _mm_xor_si128(rkey[5], rkey[4]);
    rkey[6] = _mm_xor_si128(rkey[6], rkey[5]);
    rkey[7] = _mm_xor_si128(rkey[7], rkey[6]);
    _mm_prefetch((char const *)state, 0);
    transpose_state(key, rkey);                     /* back to per-key layout for aesenc */
    transpose_state(&(key[4]), &(rkey[4]));

    /* Rounds 1-7: encrypt with round key r while expanding round key r+1. */
    for (uint8_t roundCounter = 1; roundCounter < 8; roundCounter++)
    {
        tmp0 = _mm_aesenclast_si128(rkey[3], mmrcon);
        tmp0 = _mm_shuffle_epi8(tmp0, mask);
        state[0] = _mm_aesenc_si128(state[0], key[0]);
        state[1] = _mm_aesenc_si128(state[1], key[1]);
        state[2] = _mm_aesenc_si128(state[2], key[2]);
        state[3] = _mm_aesenc_si128(state[3], key[3]);
        state[4] = _mm_aesenc_si128(state[4], key[4]);
        state[5] = _mm_aesenc_si128(state[5], key[5]);
        rkey[0] = _mm_xor_si128(rkey[0], tmp0);
        tmp0 = _mm_aesenclast_si128(rkey[7], mmrcon);
        rkey[1] = _mm_xor_si128(rkey[1], rkey[0]);
        rkey[2] = _mm_xor_si128(rkey[2], rkey[1]);
        rkey[3] = _mm_xor_si128(rkey[3], rkey[2]);
        state[6] = _mm_aesenc_si128(state[6], key[6]);
        state[7] = _mm_aesenc_si128(state[7], key[7]);
        transpose_state(key, rkey);
        tmp0 = _mm_shuffle_epi8(tmp0, mask);
        mmrcon = _mm_slli_epi16(mmrcon, 0x01);
        rkey[4] = _mm_xor_si128(rkey[4], tmp0);
        rkey[5] = _mm_xor_si128(rkey[5], rkey[4]);
        rkey[6] = _mm_xor_si128(rkey[6], rkey[5]);
        rkey[7] = _mm_xor_si128(rkey[7], rkey[6]);
        _mm_prefetch((char const *)state, 0);
        transpose_state(&(key[4]), &(rkey[4]));
    }

    /* Round 8 - expand round-9 keys using rcon 0x1b (past the 0x80 wrap). */
    tmp0 = _mm_aesenclast_si128(rkey[3], mmrconFinal);
    tmp0 = _mm_shuffle_epi8(tmp0, mask);
    state[0] = _mm_aesenc_si128(state[0], key[0]);
    state[1] = _mm_aesenc_si128(state[1], key[1]);
    state[2] = _mm_aesenc_si128(state[2], key[2]);
    state[3] = _mm_aesenc_si128(state[3], key[3]);
    state[4] = _mm_aesenc_si128(state[4], key[4]);
    state[5] = _mm_aesenc_si128(state[5], key[5]);
    rkey[0] = _mm_xor_si128(rkey[0], tmp0);
    tmp0 = _mm_aesenclast_si128(rkey[7], mmrconFinal);
    rkey[1] = _mm_xor_si128(rkey[1], rkey[0]);
    rkey[2] = _mm_xor_si128(rkey[2], rkey[1]);
    rkey[3] = _mm_xor_si128(rkey[3], rkey[2]);
    state[6] = _mm_aesenc_si128(state[6], key[6]);
    state[7] = _mm_aesenc_si128(state[7], key[7]);
    transpose_state(key, rkey);
    tmp0 = _mm_shuffle_epi8(tmp0, mask);
    mmrconFinal = _mm_slli_epi16(mmrconFinal, 0x01);    /* 0x1b -> 0x36 for the last key */
    rkey[4] = _mm_xor_si128(rkey[4], tmp0);
    rkey[5] = _mm_xor_si128(rkey[5], rkey[4]);
    rkey[6] = _mm_xor_si128(rkey[6], rkey[5]);
    rkey[7] = _mm_xor_si128(rkey[7], rkey[6]);
    _mm_prefetch((char const *)state, 0);
    transpose_state(&(key[4]), &(rkey[4]));

    /* Round 9 - expand the final (round-10) keys with rcon 0x36. */
    tmp0 = _mm_aesenclast_si128(rkey[3], mmrconFinal);
    tmp0 = _mm_shuffle_epi8(tmp0, mask);
    state[0] = _mm_aesenc_si128(state[0], key[0]);
    state[1] = _mm_aesenc_si128(state[1], key[1]);
    state[2] = _mm_aesenc_si128(state[2], key[2]);
    state[3] = _mm_aesenc_si128(state[3], key[3]);
    state[4] = _mm_aesenc_si128(state[4], key[4]);
    state[5] = _mm_aesenc_si128(state[5], key[5]);
    rkey[0] = _mm_xor_si128(rkey[0], tmp0);
    tmp0 = _mm_aesenclast_si128(rkey[7], mmrconFinal);
    rkey[1] = _mm_xor_si128(rkey[1], rkey[0]);
    rkey[2] = _mm_xor_si128(rkey[2], rkey[1]);
    rkey[3] = _mm_xor_si128(rkey[3], rkey[2]);
    state[6] = _mm_aesenc_si128(state[6], key[6]);
    state[7] = _mm_aesenc_si128(state[7], key[7]);
    transpose_state(key, rkey);
    tmp0 = _mm_shuffle_epi8(tmp0, mask);

    /* Round 10 (final, no MixColumns) - lanes 0-3 while keys 4-7 finish. */
    state[0] = _mm_aesenclast_si128(state[0], key[0]);
    state[1] = _mm_aesenclast_si128(state[1], key[1]);
    state[2] = _mm_aesenclast_si128(state[2], key[2]);
    state[3] = _mm_aesenclast_si128(state[3], key[3]);
    rkey[4] = _mm_xor_si128(rkey[4], tmp0);
    rkey[5] = _mm_xor_si128(rkey[5], rkey[4]);
    rkey[6] = _mm_xor_si128(rkey[6], rkey[5]);
    rkey[7] = _mm_xor_si128(rkey[7], rkey[6]);
    transpose_state(&(key[4]), &(rkey[4]));
    state[4] = _mm_aesenclast_si128(state[4], key[4]);
    state[5] = _mm_aesenclast_si128(state[5], key[5]);
    state[6] = _mm_aesenclast_si128(state[6], key[6]);
    state[7] = _mm_aesenclast_si128(state[7], key[7]);

    _mm_store_si128((__m128i*)&(dataOut[  0]), state[0]);
    _mm_store_si128((__m128i*)&(dataOut[ 16]), state[1]);
    _mm_store_si128((__m128i*)&(dataOut[ 32]), state[2]);
    _mm_store_si128((__m128i*)&(dataOut[ 48]), state[3]);
    _mm_store_si128((__m128i*)&(dataOut[ 64]), state[4]);
    _mm_store_si128((__m128i*)&(dataOut[ 80]), state[5]);
    _mm_store_si128((__m128i*)&(dataOut[ 96]), state[6]);
    _mm_store_si128((__m128i*)&(dataOut[112]), state[7]);
}
void ENC_MSG_x8(const unsigned char *PT, unsigned char *CT, const unsigned char *TAG, const unsigned char *KS, int length) { __m128i or_mask, TWO,ctr_block, tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ONE; int i,j,remainder_loc; if (length%16) length = length/16 + 1; else length/=16; ONE = _mm_setr_epi32(1,0,0,0); TWO = _mm_setr_epi32(2,0,0,0); ctr_block = _mm_setzero_si128(); ctr_block = _mm_loadu_si128(((__m128i*)TAG)); or_mask = _mm_setr_epi32(0,0,0,0x80000000); ctr_block = _mm_or_si128(ctr_block, or_mask); for (i=0; i< (length-length%8); i=i+8) { tmp = ctr_block; tmp1 = _mm_add_epi32(ctr_block, ONE); tmp2 = _mm_add_epi32(ctr_block, TWO); tmp3 = _mm_add_epi32(tmp2, ONE); tmp4 = _mm_add_epi32(tmp2, TWO); tmp5 = _mm_add_epi32(tmp4, ONE); tmp6 = _mm_add_epi32(tmp4, TWO); tmp7 = _mm_add_epi32(tmp6, ONE); ctr_block = _mm_add_epi32(tmp6, TWO); tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]); tmp1 = _mm_xor_si128(tmp1, ((__m128i*)KS)[0]); tmp2 = _mm_xor_si128(tmp2, ((__m128i*)KS)[0]); tmp3 = _mm_xor_si128(tmp3, ((__m128i*)KS)[0]); tmp4 = _mm_xor_si128(tmp4, ((__m128i*)KS)[0]); tmp5 = _mm_xor_si128(tmp5, ((__m128i*)KS)[0]); tmp6 = _mm_xor_si128(tmp6, ((__m128i*)KS)[0]); tmp7 = _mm_xor_si128(tmp7, ((__m128i*)KS)[0]); for(j=1; j <10; j++) { tmp = _mm_aesenc_si128 (tmp, ((__m128i*)KS)[j]); tmp1 = _mm_aesenc_si128 (tmp1, ((__m128i*)KS)[j]); tmp2 = _mm_aesenc_si128 (tmp2, ((__m128i*)KS)[j]); tmp3 = _mm_aesenc_si128 (tmp3, ((__m128i*)KS)[j]); tmp4 = _mm_aesenc_si128 (tmp4, ((__m128i*)KS)[j]); tmp5 = _mm_aesenc_si128 (tmp5, ((__m128i*)KS)[j]); tmp6 = _mm_aesenc_si128 (tmp6, ((__m128i*)KS)[j]); tmp7 = _mm_aesenc_si128 (tmp7, ((__m128i*)KS)[j]); }; tmp = _mm_aesenclast_si128 (tmp, ((__m128i*)KS)[j]); tmp1 = _mm_aesenclast_si128 (tmp1, ((__m128i*)KS)[j]); tmp2 = _mm_aesenclast_si128 (tmp2, ((__m128i*)KS)[j]); tmp3 = _mm_aesenclast_si128 (tmp3, ((__m128i*)KS)[j]); tmp4 = _mm_aesenclast_si128 (tmp4, ((__m128i*)KS)[j]); tmp5 = _mm_aesenclast_si128 (tmp5, ((__m128i*)KS)[j]); tmp6 = 
_mm_aesenclast_si128 (tmp6, ((__m128i*)KS)[j]); tmp7 = _mm_aesenclast_si128 (tmp7, ((__m128i*)KS)[j]); tmp = _mm_xor_si128(tmp,_mm_loadu_si128(&((__m128i*)PT)[i])); tmp1 = _mm_xor_si128(tmp1,_mm_loadu_si128(&((__m128i*)PT)[i+1])); tmp2 = _mm_xor_si128(tmp2,_mm_loadu_si128(&((__m128i*)PT)[i+2])); tmp3 = _mm_xor_si128(tmp3,_mm_loadu_si128(&((__m128i*)PT)[i+3])); tmp4 = _mm_xor_si128(tmp4,_mm_loadu_si128(&((__m128i*)PT)[i+4])); tmp5 = _mm_xor_si128(tmp5,_mm_loadu_si128(&((__m128i*)PT)[i+5])); tmp6 = _mm_xor_si128(tmp6,_mm_loadu_si128(&((__m128i*)PT)[i+6])); tmp7 = _mm_xor_si128(tmp7,_mm_loadu_si128(&((__m128i*)PT)[i+7])); _mm_storeu_si128(&((__m128i*)CT)[i],tmp); _mm_storeu_si128(&((__m128i*)CT)[i+1],tmp1); _mm_storeu_si128(&((__m128i*)CT)[i+2],tmp2); _mm_storeu_si128(&((__m128i*)CT)[i+3],tmp3); _mm_storeu_si128(&((__m128i*)CT)[i+4],tmp4); _mm_storeu_si128(&((__m128i*)CT)[i+5],tmp5); _mm_storeu_si128(&((__m128i*)CT)[i+6],tmp6); _mm_storeu_si128(&((__m128i*)CT)[i+7],tmp7); } // handling remainder and less than 8 blocks if (length%8==0) return; // The remainder_loc is used to remember the location of our block handled remainder_loc = length-length%8; for(i=0; i < (length%8); i++) { tmp = ctr_block; ctr_block = _mm_add_epi32(ctr_block, ONE); tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]); for(j=1; j <10; j++) { tmp = _mm_aesenc_si128 (tmp, ((__m128i*)KS)[j]); }; tmp = _mm_aesenclast_si128 (tmp, ((__m128i*)KS)[j]); tmp = _mm_xor_si128(tmp,_mm_loadu_si128(&((__m128i*)PT)[remainder_loc+i])); _mm_storeu_si128 (&((__m128i*)CT)[remainder_loc+i],tmp); } }
/*
 * AES-256 Encryption
 *
 * Encrypt `blocks` consecutive 16-byte blocks from `in` to `out` (ECB-style
 * per-block processing; chaining, if any, is handled by the caller).
 * Processes four blocks at a time via the AES_ENC_4_ROUNDS /
 * AES_ENC_4_LAST_ROUNDS macros (which operate on B0..B3), then finishes
 * any remainder one block at a time.  All 15 round keys are hoisted into
 * locals once, outside both loops.  Unaligned in/out/EK are supported
 * (all loads/stores use the unaligned intrinsics).
 */
void AES_256_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
{
    const __m128i* in_mm = (const __m128i*)in;
    __m128i* out_mm = (__m128i*)out;

    const __m128i* key_mm = (const __m128i*)&EK[0];

    /* AES-256 = 14 rounds -> 15 round keys (K0 is the whitening key). */
    __m128i K0 = _mm_loadu_si128(key_mm);
    __m128i K1 = _mm_loadu_si128(key_mm + 1);
    __m128i K2 = _mm_loadu_si128(key_mm + 2);
    __m128i K3 = _mm_loadu_si128(key_mm + 3);
    __m128i K4 = _mm_loadu_si128(key_mm + 4);
    __m128i K5 = _mm_loadu_si128(key_mm + 5);
    __m128i K6 = _mm_loadu_si128(key_mm + 6);
    __m128i K7 = _mm_loadu_si128(key_mm + 7);
    __m128i K8 = _mm_loadu_si128(key_mm + 8);
    __m128i K9 = _mm_loadu_si128(key_mm + 9);
    __m128i K10 = _mm_loadu_si128(key_mm + 10);
    __m128i K11 = _mm_loadu_si128(key_mm + 11);
    __m128i K12 = _mm_loadu_si128(key_mm + 12);
    __m128i K13 = _mm_loadu_si128(key_mm + 13);
    __m128i K14 = _mm_loadu_si128(key_mm + 14);

    /* 4-way unrolled path: keeps four independent aesenc chains in flight
       to hide the instruction's latency. */
    while(blocks >= 4)
    {
        __m128i B0 = _mm_loadu_si128(in_mm + 0);
        __m128i B1 = _mm_loadu_si128(in_mm + 1);
        __m128i B2 = _mm_loadu_si128(in_mm + 2);
        __m128i B3 = _mm_loadu_si128(in_mm + 3);

        B0 = _mm_xor_si128(B0, K0);
        B1 = _mm_xor_si128(B1, K0);
        B2 = _mm_xor_si128(B2, K0);
        B3 = _mm_xor_si128(B3, K0);

        AES_ENC_4_ROUNDS(K1);
        AES_ENC_4_ROUNDS(K2);
        AES_ENC_4_ROUNDS(K3);
        AES_ENC_4_ROUNDS(K4);
        AES_ENC_4_ROUNDS(K5);
        AES_ENC_4_ROUNDS(K6);
        AES_ENC_4_ROUNDS(K7);
        AES_ENC_4_ROUNDS(K8);
        AES_ENC_4_ROUNDS(K9);
        AES_ENC_4_ROUNDS(K10);
        AES_ENC_4_ROUNDS(K11);
        AES_ENC_4_ROUNDS(K12);
        AES_ENC_4_ROUNDS(K13);
        AES_ENC_4_LAST_ROUNDS(K14);

        _mm_storeu_si128(out_mm + 0, B0);
        _mm_storeu_si128(out_mm + 1, B1);
        _mm_storeu_si128(out_mm + 2, B2);
        _mm_storeu_si128(out_mm + 3, B3);

        blocks -= 4;
        in_mm += 4;
        out_mm += 4;
    }

    /* Remaining 0-3 blocks, one at a time. */
    for(size_t i = 0; i != blocks; ++i)
    {
        __m128i B = _mm_loadu_si128(in_mm + i);

        B = _mm_xor_si128(B, K0);           /* round 0: AddRoundKey */

        B = _mm_aesenc_si128(B, K1);
        B = _mm_aesenc_si128(B, K2);
        B = _mm_aesenc_si128(B, K3);
        B = _mm_aesenc_si128(B, K4);
        B = _mm_aesenc_si128(B, K5);
        B = _mm_aesenc_si128(B, K6);
        B = _mm_aesenc_si128(B, K7);
        B = _mm_aesenc_si128(B, K8);
        B = _mm_aesenc_si128(B, K9);
        B = _mm_aesenc_si128(B, K10);
        B = _mm_aesenc_si128(B, K11);
        B = _mm_aesenc_si128(B, K12);
        B = _mm_aesenc_si128(B, K13);
        B = _mm_aesenclast_si128(B, K14);   /* final round: no MixColumns */

        _mm_storeu_si128(out_mm + i, B);
    }
}