/*
 * AEGIS-128 initialization: absorbs the 128-bit key and 128-bit IV into the
 * five-word AEGIS state by running 10 iterations of the state-update
 * function, feeding the alternating messages (key, key^iv, key, ...) per
 * the AEGIS specification.
 *
 * FIX: the original used _mm_load_si128 on the raw byte pointers `key` and
 * `iv`, which requires 16-byte alignment the caller never guarantees; the
 * unaligned loads below are safe for any pointer and identical in result.
 *
 * key   : 16-byte secret key (any alignment)
 * iv    : 16-byte nonce (any alignment)
 * state : out — the five 128-bit AEGIS state words state[0..4]
 */
void aegis128_initialization(const unsigned char *key, const unsigned char *iv, __m128i *state)
{
    int i;
    __m128i tmp;
    __m128i keytmp = _mm_loadu_si128((const __m128i*)key);
    __m128i ivtmp  = _mm_loadu_si128((const __m128i*)iv);

    state[0] = ivtmp;
    /* AEGIS constants const1 / const0 (Fibonacci-derived byte sequences). */
    state[1] = _mm_set_epi8(0xdd,0x28,0xb5,0x73,0x42,0x31,0x11,0x20,0xf1,0x2f,0xc2,0x6d,0x55,0x18,0x3d,0xdb);
    state[2] = _mm_set_epi8(0x62,0x79,0xe9,0x90,0x59,0x37,0x22,0x15,0x0d,0x08,0x05,0x03,0x02,0x01,0x01,0x00);
    state[3] = _mm_xor_si128(keytmp, _mm_set_epi8(0x62,0x79,0xe9,0x90,0x59,0x37,0x22,0x15,0x0d,0x08,0x05,0x03,0x02,0x01,0x01,0x00));
    state[4] = _mm_xor_si128(keytmp, _mm_set_epi8(0xdd,0x28,0xb5,0x73,0x42,0x31,0x11,0x20,0xf1,0x2f,0xc2,0x6d,0x55,0x18,0x3d,0xdb));
    state[0] = _mm_xor_si128(state[0], keytmp);

    /* keytmp toggles between key and key^iv each round (xor with iv flips it). */
    keytmp = _mm_xor_si128(keytmp, ivtmp);
    for (i = 0; i < 10; i++) {
        /* AEGIS state update: each word is AES-round'ed into its successor. */
        tmp      = state[4];
        state[4] = _mm_aesenc_si128(state[3], state[4]);
        state[3] = _mm_aesenc_si128(state[2], state[3]);
        state[2] = _mm_aesenc_si128(state[1], state[2]);
        state[1] = _mm_aesenc_si128(state[0], state[1]);
        state[0] = _mm_aesenc_si128(tmp, state[0]);
        /* inject this round's message word into state[0] */
        keytmp   = _mm_xor_si128(keytmp, ivtmp);
        state[0] = _mm_xor_si128(state[0], keytmp);
    }
}
/*
 * Full AES-128 encryption of *state in place.
 * key[0] is the whitening key, key[1..9] the middle round keys, and
 * key[10] the final round key (applied with aesenclast).
 */
static inline void aes_enc_128(__m128i *state, __m128i *key)
{
    int round;
    __m128i block = _mm_xor_si128(*state, key[0]);
    for (round = 1; round < 10; ++round)
        block = _mm_aesenc_si128(block, key[round]);
    *state = _mm_aesenclast_si128(block, key[10]);
}
int haraka256256(unsigned char *hash, const unsigned char *msg) { // stuff we need int i, j; __m128i s[2], tmp, rcon; __m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0); // set initial round constant rcon = _mm_set_epi32(1,1,1,1); // initialize state to msg s[0] = _mm_load_si128(&((__m128i*)msg)[0]); s[1] = _mm_load_si128(&((__m128i*)msg)[1]); //printf("= input state =\n"); //printstate256(s[0], s[1]); for (i = 0; i < ROUNDS; ++i) { // aes round(s) for (j = 0; j < AES_PER_ROUND; ++j) { s[0] = _mm_aesenc_si128(s[0], rcon); s[1] = _mm_aesenc_si128(s[1], rcon); rcon = _mm_slli_epi32(rcon, 1); } //printf("= round %d : after aes layer =\n", i); //printstate256(s[0], s[1]); // mixing tmp = _mm_unpacklo_epi32(s[0], s[1]); s[1] = _mm_unpackhi_epi32(s[0], s[1]); s[0] = tmp; //printf("= round %d : after mix layer =\n", i); //printstate256(s[0], s[1]); } //printf("= output from permutation =\n"); //printstate256(s[0], s[1]); // xor message to get DM effect s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0])); s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1])); //printf("= after feed-forward =\n"); //printstate256(s[0], s[1]); // store result _mm_storeu_si128((__m128i*)hash, s[0]); _mm_storeu_si128((__m128i*)(hash + 16), s[1]); }
/*
 * 4-round reduced AES of one 16-byte block (auxiliary permutation, not
 * full AES — there is deliberately no aesenclast round).
 * in/out may be unaligned; key holds the 5 round keys key[0..4].
 */
void AESNI_encrypt4(const uint8_t *in, uint8_t *out, const AUX_KEY key)
{
    int r;
    __m128i blk = _mm_loadu_si128((__m128i*)in);
    blk = _mm_xor_si128(blk, key[0]);
    for (r = 1; r <= 4; ++r)
        blk = _mm_aesenc_si128(blk, key[r]);
    _mm_storeu_si128((__m128i*)out, blk);
}
/*
 * Reduced-round AES of the single 128-bit value u, in place.
 * Round keys are materialized from the global subkeys64 table; note the
 * deliberate absence of aesenclast (reduced variant, matches the batch
 * version AES_reduced_batch_intr).
 */
inline void AES_reduced_opt(int128 &u)
{
    // Build the AES_ROUNDS+1 round keys from the 64-bit halves.
    __m128i rk[AES_ROUNDS + 1];
    for (unsigned r = 0; r <= AES_ROUNDS; ++r)
        rk[r] = _mm_set_epi64x(subkeys64[r][1], subkeys64[r][0]);

    __m128i block = _mm_set_epi64x(u.i1, u.i0);
    block = _mm_xor_si128(block, rk[0]);
    for (unsigned r = 1; r <= AES_ROUNDS; ++r)
        block = _mm_aesenc_si128(block, rk[r]);

    u.i0 = _mm_extract_epi64(block, 0);
    u.i1 = _mm_extract_epi64(block, 1);
}
int _declspec(noinline) _stdcall xts_aes_ni_available() { int CPUInfo[4], res = 0; __m128i enc; #ifdef _M_IX86 unsigned char fpustate[32]; #endif // check for AES-NI support via CPUID.01H:ECX.AES[bit 25] __cpuid(CPUInfo, 1); if ( CPUInfo[2] & 0x02000000 ) return 1; // Special workaround for AES-NI on Hyper-V server and virtual machines if ( (CPUInfo[2] & 0x80000000) == 0 ) return 0; __cpuid(CPUInfo, 0x40000000); if ( CPUInfo[1] != 'rciM' || CPUInfo[2] != 'foso' || CPUInfo[3] != 'vH t' ) return 0; #ifdef _M_IX86 if (save_fpu_state(fpustate) >= 0) { #endif __try { enc = _mm_aesenc_si128(_mm_set_epi32(0,1,2,3), _mm_set_epi32(4,5,6,7)); res = enc.m128i_u64[0] == 0x5f77774d4b7b7b54 && enc.m128i_u64[1] == 0x63636367427c7c58; } __except(EXCEPTION_EXECUTE_HANDLER) { res = 0; } #ifdef _M_IX86 load_fpu_state(fpustate); }
/*
** AES-128/256 batch encrypt for PIPE blocks.
** Encrypts PIPE (4..8, fixed at compile time) independent blocks in
** parallel to fill the AES-NI pipeline. `key` holds ROUND+1 round keys:
** key[0] whitening, key[1..ROUND-1] middle rounds, key[ROUND] final.
*/
__inline__ static void AES_ecb_encrypt_PIPE( __m128i *blks, const __m128i *key) {
    unsigned j;
    /* whitening: xor key[0] into every block */
    blks[0] = _mm_xor_si128(blks[0], key[0]);
    blks[1] = _mm_xor_si128(blks[1], key[0]);
    blks[2] = _mm_xor_si128(blks[2], key[0]);
    blks[3] = _mm_xor_si128(blks[3], key[0]);
#if (PIPE>=5)
    blks[4] = _mm_xor_si128(blks[4], key[0]);
#endif
#if (PIPE>=6)
    blks[5] = _mm_xor_si128(blks[5], key[0]);
#endif
#if (PIPE>=7)
    blks[6] = _mm_xor_si128(blks[6], key[0]);
#endif
#if (PIPE==8)
    blks[7] = _mm_xor_si128(blks[7], key[0]);
#endif
    /* middle rounds: one aesenc per block per round, interleaved for ILP */
    for (j = 1; j<ROUND; ++j) {
        blks[0] = _mm_aesenc_si128(blks[0], key[j]);
        blks[1] = _mm_aesenc_si128(blks[1], key[j]);
        blks[2] = _mm_aesenc_si128(blks[2], key[j]);
        blks[3] = _mm_aesenc_si128(blks[3], key[j]);
#if (PIPE>=5)
        blks[4] = _mm_aesenc_si128(blks[4], key[j]);
#endif
#if (PIPE>=6)
        blks[5] = _mm_aesenc_si128(blks[5], key[j]);
#endif
#if (PIPE>=7)
        blks[6] = _mm_aesenc_si128(blks[6], key[j]);
#endif
#if (PIPE==8)
        blks[7] = _mm_aesenc_si128(blks[7], key[j]);
#endif
    }
    /* final round: j == ROUND here, so key[j] is the last round key */
    blks[0] = _mm_aesenclast_si128(blks[0], key[j]);
    blks[1] = _mm_aesenclast_si128(blks[1], key[j]);
    blks[2] = _mm_aesenclast_si128(blks[2], key[j]);
    blks[3] = _mm_aesenclast_si128(blks[3], key[j]);
#if (PIPE>=5)
    blks[4] = _mm_aesenclast_si128(blks[4], key[j]);
#endif
#if (PIPE>=6)
    blks[5] = _mm_aesenclast_si128(blks[5], key[j]);
#endif
#if (PIPE>=7)
    blks[6] = _mm_aesenclast_si128(blks[6], key[j]);
#endif
#if (PIPE==8)
    blks[7] = _mm_aesenclast_si128(blks[7], key[j]);
#endif
}
/**
 * CBC-mode AES encryption of `plaintext` into `*ciphertext` using the
 * pre-expanded key `schedule` and the IV carried in `key.iv`.
 *
 * Each block is (optionally byte-swapped, then) xored with the previous
 * ciphertext block (IV for the first), AES-encrypted, swapped back, and
 * stored. The chaining value `tmp2` stays in swapped (big-endian) form.
 *
 * NOTE(review): when plaintext.size() is not a multiple of 16, `blocks`
 * is rounded up and the final _mm_loadu_si128 reads past the end of the
 * input string — presumably callers pad to 16 bytes; confirm.
 */
void Cryptor::cbcEncrypt(const string &plaintext, const Key &key,
                         string *ciphertext, unsigned char *schedule) {
  ciphertext->resize(plaintext.size());
  int blocks = plaintext.size() / 16;
  if (plaintext.size() % 16) {
    blocks++;  // round up to cover a trailing partial block
  }
  __m128i tmp, tmp2, tmp3;
  __m128i *input = (__m128i*) plaintext.data();
  __m128i *output = (__m128i*) ciphertext->data();
  __m128i *keySchedule = (__m128i*) schedule;
  int rounds = getRounds(key.size);
  // Load the IV.
  tmp2 = _mm_loadu_si128((__m128i*) key.iv);
  // Swap byte-order => big-endian (reverse_m128i presumably mutates in place).
  if (!bigEndian) {
    reverse_m128i(tmp2);
  }
  for (int block = 0; block < blocks; block++) {
    // Get next 128-bit block.
    tmp = _mm_loadu_si128(&input[block]);
    // Swap byte-order => big-endian.
    if (!bigEndian) {
      reverse_m128i(tmp);
    }
    // XOR IV or last ciphertext with the plaintext (CBC chaining).
    tmp2 = _mm_xor_si128(tmp, tmp2);
    // Whitening step.
    tmp2 = _mm_xor_si128(tmp2, keySchedule[0]);
    // Apply the AES rounds.
    int round = 1;
    for (; round < rounds; round++) {
      tmp2 = _mm_aesenc_si128(tmp2, keySchedule[round]);
    }
    // And the last (round == rounds here).
    tmp2 = _mm_aesenclast_si128(tmp2, keySchedule[round]);
    // Swap a copy back to little-endian for output; tmp2 keeps the
    // big-endian form as the chaining value for the next block.
    tmp3 = tmp2;
    if (!bigEndian) {
      reverse_m128i(tmp3);
    }
    // Save the encrypted block.
    _mm_storeu_si128(&output[block], tmp3);
  }
}
/*
 * Full AES-128 encryption of one block.
 * k[0] is the whitening key, k[1..9] the middle rounds, k[10] the final
 * round key. Returns the ciphertext block.
 */
static __m128i aes_encrypt(__m128i in, __m128i* k)
{
    int round;
    __m128i state = _mm_xor_si128(in, k[0]);
    for (round = 1; round < 10; ++round)
        state = _mm_aesenc_si128(state, k[round]);
    return _mm_aesenclast_si128(state, k[10]);
}
/*
 * In-place ECB encryption of a single block with the expanded key in
 * aesKey->rd_key (ROUNDS(aesKey)+1 round keys).
 */
void AES_ecb_encrypt(block *blk, AES_KEY *aesKey)
{
    const block *sched = ((block *)(aesKey->rd_key));
    unsigned rnds = ROUNDS(aesKey);
    unsigned r;
    __m128i b = _mm_xor_si128(*blk, sched[0]);
    for (r = 1; r < rnds; ++r)
        b = _mm_aesenc_si128(b, sched[r]);
    *blk = _mm_aesenclast_si128(b, sched[rnds]);
}
/*
 * ECB-encrypt one block from *in to *out using the expanded key in
 * aesKey->rd_key. Uses aligned load/store like the original — `block`
 * values are assumed 16-byte aligned.
 */
void AES_encryptC(block *in, block *out, AES_KEY *aesKey)
{
    const __m128i *sched = ((__m128i *)(aesKey->rd_key));
    int rnds = ROUNDS(aesKey);
    int r;
    __m128i state = _mm_load_si128((__m128i*)in);
    state = _mm_xor_si128(state, sched[0]);
    for (r = 1; r < rnds; r++)
        state = _mm_aesenc_si128(state, sched[r]);
    state = _mm_aesenclast_si128(state, sched[rnds]);
    _mm_store_si128((__m128i*)out, state);
}
/*
 * ECB-encrypt nblks blocks from `in` to `out`, pipelining 8 blocks at a
 * time to keep the AES units busy, then handling the <8 remaining blocks
 * with a second, narrow pass. The expanded key is aesKey->rd_key with
 * ROUNDS(aesKey)+1 round keys.
 */
void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey) {
    int numberOfLoops = nblks / 8;
    int blocksPipeLined = numberOfLoops * 8;
    int remainingEncrypts = nblks - blocksPipeLined;
    unsigned j, rnds = ROUNDS(aesKey);
    const block *sched = ((block *)(aesKey->rd_key));
    /* 8-wide pipelined main loop */
    for (int i = 0; i < numberOfLoops; i++){
        /* whitening */
        out[0 + i * 8] = _mm_xor_si128(in[0 + i * 8], sched[0]);
        out[1 + i * 8] = _mm_xor_si128(in[1 + i * 8], sched[0]);
        out[2 + i * 8] = _mm_xor_si128(in[2 + i * 8], sched[0]);
        out[3 + i * 8] = _mm_xor_si128(in[3 + i * 8], sched[0]);
        out[4 + i * 8] = _mm_xor_si128(in[4 + i * 8], sched[0]);
        out[5 + i * 8] = _mm_xor_si128(in[5 + i * 8], sched[0]);
        out[6 + i * 8] = _mm_xor_si128(in[6 + i * 8], sched[0]);
        out[7 + i * 8] = _mm_xor_si128(in[7 + i * 8], sched[0]);
        /* middle rounds, interleaved across the 8 blocks */
        for (j = 1; j < rnds; ++j){
            out[0 + i * 8] = _mm_aesenc_si128(out[0 + i * 8], sched[j]);
            out[1 + i * 8] = _mm_aesenc_si128(out[1 + i * 8], sched[j]);
            out[2 + i * 8] = _mm_aesenc_si128(out[2 + i * 8], sched[j]);
            out[3 + i * 8] = _mm_aesenc_si128(out[3 + i * 8], sched[j]);
            out[4 + i * 8] = _mm_aesenc_si128(out[4 + i * 8], sched[j]);
            out[5 + i * 8] = _mm_aesenc_si128(out[5 + i * 8], sched[j]);
            out[6 + i * 8] = _mm_aesenc_si128(out[6 + i * 8], sched[j]);
            out[7 + i * 8] = _mm_aesenc_si128(out[7 + i * 8], sched[j]);
        }
        /* final round (j == rnds here) */
        out[0 + i * 8] = _mm_aesenclast_si128(out[0 + i * 8], sched[j]);
        out[1 + i * 8] = _mm_aesenclast_si128(out[1 + i * 8], sched[j]);
        out[2 + i * 8] = _mm_aesenclast_si128(out[2 + i * 8], sched[j]);
        out[3 + i * 8] = _mm_aesenclast_si128(out[3 + i * 8], sched[j]);
        out[4 + i * 8] = _mm_aesenclast_si128(out[4 + i * 8], sched[j]);
        out[5 + i * 8] = _mm_aesenclast_si128(out[5 + i * 8], sched[j]);
        out[6 + i * 8] = _mm_aesenclast_si128(out[6 + i * 8], sched[j]);
        out[7 + i * 8] = _mm_aesenclast_si128(out[7 + i * 8], sched[j]);
    }
    /* tail: encrypt the remaining (nblks % 8) blocks round-by-round */
    for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
        out[i] = _mm_xor_si128(in[i], sched[0]);
    for (j = 1; j<rnds; ++j)
        for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
            out[i] = _mm_aesenc_si128(out[i], sched[j]);
    /* j == rnds after the loop above, so sched[j] is the last round key */
    for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
        out[i] = _mm_aesenclast_si128(out[i], sched[j]);
}
/*
 * In-place ECB encryption of exactly 4 independent blocks, interleaved to
 * fill the AES pipeline. sched holds ROUNDS(aesKey)+1 round keys.
 */
void AES_ecb_encrypt_blks_4(block *blks, AES_KEY *aesKey)
{
    const block *sched = ((block *)(aesKey->rd_key));
    unsigned rnds = ROUNDS(aesKey);
    unsigned r, b;
    /* whitening */
    for (b = 0; b < 4; ++b)
        blks[b] = _mm_xor_si128(blks[b], sched[0]);
    /* middle rounds */
    for (r = 1; r < rnds; ++r)
        for (b = 0; b < 4; ++b)
            blks[b] = _mm_aesenc_si128(blks[b], sched[r]);
    /* final round */
    for (b = 0; b < 4; ++b)
        blks[b] = _mm_aesenclast_si128(blks[b], sched[rnds]);
}
/*
 * AES-128 encryption of a single block with the pre-expanded round keys
 * in encryption_keys->keys[0..10]. Returns the ciphertext block.
 */
AES_AES_Block __fastcall aes_AES128_encrypt_block_(
    AES_AES_Block plaintext,
    const AES_AES128_RoundKeys* encryption_keys)
{
    int round;
    AES_AES_Block state = _mm_xor_si128(plaintext, encryption_keys->keys[0]);
    for (round = 1; round < 10; ++round)
        state = _mm_aesenc_si128(state, encryption_keys->keys[round]);
    return _mm_aesenclast_si128(state, encryption_keys->keys[10]);
}
/*
 * AES-128 encryption of one 16-byte block from `in` to `out` using the
 * expanded keys in ctx->enc_keys[0..10]; buffers may be unaligned.
 */
void aesni_encrypt(aesni_ctx *ctx, const byte *in, byte *out)
{
    __m128i state = _mm_loadu_si128((__m128i*)in);
    state = _mm_xor_si128(state, ctx->enc_keys[0]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[1]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[2]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[3]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[4]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[5]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[6]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[7]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[8]);
    state = _mm_aesenc_si128(state, ctx->enc_keys[9]);
    state = _mm_aesenclast_si128(state, ctx->enc_keys[10]);
    _mm_storeu_si128((__m128i*)out, state);
}
/*
 * In-place ECB encryption of nblks blocks, processed round-by-round
 * across the whole batch so independent aesenc ops can overlap.
 */
void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey)
{
    const block *sched = ((block *)(aesKey->rd_key));
    unsigned rnds = ROUNDS(aesKey);
    unsigned b, r;
    for (b = 0; b < nblks; ++b)
        blks[b] = _mm_xor_si128(blks[b], sched[0]);
    for (r = 1; r < rnds; ++r)
        for (b = 0; b < nblks; ++b)
            blks[b] = _mm_aesenc_si128(blks[b], sched[r]);
    for (b = 0; b < nblks; ++b)
        blks[b] = _mm_aesenclast_si128(blks[b], sched[rnds]);
}
/*
 * AES-256 encryption of the 16-byte block at `n` into `out` using the 15
 * expanded round keys rkeys[0..14] (1 whitening xor + 13 aesenc + 1
 * aesenclast). Keeps the original's aligned load/store — both buffers
 * must be 16-byte aligned.
 */
static inline void aes256ni_encrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out)
{
    int round;
    __m128i state = _mm_load_si128((const __m128i *)n);
    state = _mm_xor_si128(state, rkeys[0]);
#pragma unroll(13)
    for (round = 1; round < 14; round++) {
        state = _mm_aesenc_si128(state, rkeys[round]);
    }
    state = _mm_aesenclast_si128(state, rkeys[14]);
    _mm_store_si128((__m128i*)(out), state);
}
/*
 * ECB-encrypt one 16-byte block from `in` to `out` using the expanded key
 * in key->rd_key (ROUNDS(key)+1 round keys). Aligned load/store as in the
 * original — both pointers must be 16-byte aligned.
 */
inline void AES_encrypt(const unsigned char *in, unsigned char *out, const AES_KEY *key)
{
    const __m128i *sched = ((__m128i *) (key->rd_key));
    int rnds = ROUNDS(key);
    int r;
    __m128i state = _mm_load_si128((__m128i *) in);
    state = _mm_xor_si128(state, sched[0]);
    for (r = 1; r < rnds; r++)
        state = _mm_aesenc_si128(state, sched[r]);
    state = _mm_aesenclast_si128(state, sched[rnds]);
    _mm_store_si128((__m128i *) out, state);
}
/*
 * AES-128 encryption with byte-swapped input/output convention: the block
 * is byte_swap'ed before the cipher and swapped back afterwards.
 * expkey holds the 11 round keys.
 */
static __m128i AES_encrypt(__m128i in, const __m128i* expkey)
{
    int round = 1;
    __m128i state = byte_swap(in) ^ expkey[0];
    while (round < 10) {
        state = _mm_aesenc_si128(state, expkey[round]);
        ++round;
    }
    state = _mm_aesenclast_si128(state, expkey[10]);
    return byte_swap(state);
}
/*
 * ECB-encrypt exactly 4 blocks from `in` to `out`, interleaved to fill
 * the AES pipeline. sched holds ROUNDS(aesKey)+1 round keys.
 */
void AES_ecb_encrypt_blks_4_in_out(block *in, block *out, AES_KEY *aesKey)
{
    const block *sched = ((block *)(aesKey->rd_key));
    unsigned rnds = ROUNDS(aesKey);
    unsigned r, b;
    /* whitening into the output buffer */
    for (b = 0; b < 4; ++b)
        out[b] = _mm_xor_si128(in[b], sched[0]);
    /* middle rounds */
    for (r = 1; r < rnds; ++r)
        for (b = 0; b < 4; ++b)
            out[b] = _mm_aesenc_si128(out[b], sched[r]);
    /* final round */
    for (b = 0; b < 4; ++b)
        out[b] = _mm_aesenclast_si128(out[b], sched[rnds]);
}
/*
 * AEGIS-128 finalization: fold the (associated-data, message) bit lengths
 * into the state, run 7 update rounds, then xor the state words together
 * to produce the authentication tag.
 *
 * FIXES vs. original:
 *  - the length block was built with type-punning stores
 *    (((unsigned long long*)tt)[i] = ...) which violate strict aliasing;
 *    replaced with memcpy (same little-endian layout, well-defined);
 *  - _mm_load_si128/_mm_store_si128 were used on stack char arrays with
 *    no guaranteed 16-byte alignment; replaced with unaligned variants.
 *
 * maclen is assumed to be a whole number of bytes (<= 16).
 */
void aegis128_tag_generation(unsigned long long msglen, unsigned long long adlen, unsigned char maclen, unsigned char *mac, __m128i *state)
{
    int i;
    __m128i tmp;
    __m128i msgtmp;
    unsigned char t[16], tt[16];
    unsigned long long lens[2];

    /* length block: [adlen*8 || msglen*8], little-endian 64-bit words */
    lens[0] = adlen << 3;
    lens[1] = msglen << 3;
    memcpy(tt, lens, 16);

    msgtmp = _mm_loadu_si128((__m128i*)tt);
    msgtmp = _mm_xor_si128(msgtmp, state[3]);

    for (i = 0; i < 7; i++) {
        /* AEGIS state update */
        tmp      = state[4];
        state[4] = _mm_aesenc_si128(state[3], state[4]);
        state[3] = _mm_aesenc_si128(state[2], state[3]);
        state[2] = _mm_aesenc_si128(state[1], state[2]);
        state[1] = _mm_aesenc_si128(state[0], state[1]);
        state[0] = _mm_aesenc_si128(tmp, state[0]);
        /* xor the length "message" with state[0] */
        state[0] = _mm_xor_si128(state[0], msgtmp);
    }

    /* tag = state[0] ^ state[1] ^ state[2] ^ state[3] ^ state[4] */
    state[4] = _mm_xor_si128(state[4], state[3]);
    state[4] = _mm_xor_si128(state[4], state[2]);
    state[4] = _mm_xor_si128(state[4], state[1]);
    state[4] = _mm_xor_si128(state[4], state[0]);

    _mm_storeu_si128((__m128i*)t, state[4]);
    /* the mac length is assumed to be a multiple of bytes */
    memcpy(mac, t, maclen);
}
/*
 * One AEGIS-128 decryption/authentication step: recover a 16-byte
 * plaintext block from the ciphertext block, then update the state with
 * the recovered plaintext (so the authenticator sees the plaintext).
 *
 * FIX: the original used aligned _mm_load_si128/_mm_store_si128 on the
 * plain byte-pointer parameters, which fault if a caller passes an
 * unaligned buffer; switched to the unaligned variants (identical result).
 */
inline void aegis128_dec_aut_step(unsigned char *plaintextblk, const unsigned char *ciphertextblk, __m128i *state)
{
    __m128i msg = _mm_loadu_si128((const __m128i*)ciphertextblk);
    __m128i tmp = state[4];

    /* decryption: p = c ^ state[1] ^ state[4] ^ (state[2] & state[3]) */
    msg = _mm_xor_si128(msg, _mm_and_si128(state[2], state[3]));
    msg = _mm_xor_si128(msg, state[4]);
    msg = _mm_xor_si128(msg, state[1]);
    _mm_storeu_si128((__m128i*)plaintextblk, msg);

    /* AEGIS state update */
    state[4] = _mm_aesenc_si128(state[3], state[4]);
    state[3] = _mm_aesenc_si128(state[2], state[3]);
    state[2] = _mm_aesenc_si128(state[1], state[2]);
    state[1] = _mm_aesenc_si128(state[0], state[1]);
    state[0] = _mm_aesenc_si128(tmp, state[0]);

    /* the recovered plaintext is absorbed into the state */
    state[0] = _mm_xor_si128(state[0], msg);
}
/*
 * One-block ECB operation. mode selects encryption (AES_ENCRYPT) or
 * decryption; ctx->rk holds ctx->nr + 1 round keys for the chosen
 * direction. Always returns 0.
 *
 * The original stepped the middle rounds two at a time and peeled the
 * final aesenc; this is the equivalent single loop over rounds
 * 1..nr-1, followed by the last-round instruction with key nr.
 */
int aesni_xcryptecb( aes_context *ctx, int mode, const unsigned char input[16], unsigned char output[16] )
{
    const __m128i *rk = (__m128i *) ctx->rk;
    const int nr = ctx->nr;
    int r;
    __m128i blk = _mm_loadu_si128( (__m128i *) input );

    blk = _mm_xor_si128( blk, rk[0] );
    if( mode == AES_ENCRYPT )
    {
        for( r = 1; r < nr; r++ )
            blk = _mm_aesenc_si128( blk, rk[r] );
        blk = _mm_aesenclast_si128( blk, rk[nr] );
    }
    else
    {
        for( r = 1; r < nr; r++ )
            blk = _mm_aesdec_si128( blk, rk[r] );
        blk = _mm_aesdeclast_si128( blk, rk[nr] );
    }
    _mm_storeu_si128( (__m128i *) output, blk );

    return( 0 );
}
/*
 * AES-128 encryption of one 16-byte block from `in` to `out` using the
 * 11 expanded round keys in enc_key; buffers may be unaligned.
 */
void AESNI_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY enc_key)
{
    int round;
    __m128i state = _mm_loadu_si128((__m128i*)in);
    state = _mm_xor_si128(state, enc_key[0]);
    for (round = 1; round < 10; ++round)
        state = _mm_aesenc_si128(state, enc_key[round]);
    state = _mm_aesenclast_si128(state, enc_key[10]);
    _mm_storeu_si128((__m128i*)out, state);
}
/*
 * Deterministic "random" block generator: AES-128-encrypts a monotonically
 * increasing counter (current_rand_index) under the fixed key rand_aes_key
 * — i.e. AES in counter mode as a PRG.
 */
inline block garble_random_block(void)
{
    block out = garble_zero_block();
    uint64_t *val = (uint64_t *) &out;
    int round;

    /* low 64 bits carry the counter; high 64 bits stay zero */
    val[0] = current_rand_index++;

    out = _mm_xor_si128(out, rand_aes_key.rd_key[0]);
    for (round = 1; round < 10; ++round)
        out = _mm_aesenc_si128(out, rand_aes_key.rd_key[round]);
    return _mm_aesenclast_si128(out, rand_aes_key.rd_key[10]);
}
/*
 * Apply AES-128 rounds 1..10 to num_blocks blocks in place, interleaved
 * per round for pipelining. Note: there is deliberately no keys[0]
 * whitening xor here — the caller is expected to have applied it.
 */
static inline void aes_encrypt_n(__m128i *text, int num_blocks, __m128i *keys)
{
    int round, b;
    for (round = 1; round < 10; round++)
        for (b = 0; b < num_blocks; b++)
            text[b] = _mm_aesenc_si128(text[b], keys[round]);
    for (b = 0; b < num_blocks; b++)
        text[b] = _mm_aesenclast_si128(text[b], keys[10]);
}
/**
 * ECB-mode AES encryption of `plaintext` into `*ciphertext` using the
 * pre-expanded key `schedule`. Each 16-byte block is independently
 * (optionally byte-swapped, then) whitened, run through the AES rounds,
 * swapped back, and stored.
 *
 * NOTE(review): when plaintext.size() is not a multiple of 16, `blocks`
 * is rounded up and the final _mm_loadu_si128 reads past the end of the
 * input string — presumably callers pad to 16 bytes; confirm.
 */
void Cryptor::ecbEncrypt(const string &plaintext, const Key &key,
                         string *ciphertext, unsigned char *schedule) {
  // Right now we just use the same length, but it should just be
  // a multiple of 16.
  ciphertext->resize(plaintext.size());
  int blocks = plaintext.size() / 16;
  if (plaintext.size() % 16) {
    blocks++;  // round up to cover a trailing partial block
  }
  __m128i tmp;
  __m128i *input = (__m128i*) plaintext.data();
  __m128i *output = (__m128i*) ciphertext->data();
  __m128i *keySchedule = (__m128i*) schedule;
  int rounds = getRounds(key.size);
  for (int block = 0; block < blocks; block++) {
    // Get next 128-bit block.
    tmp = _mm_loadu_si128(&input[block]);
    // Swap byte-order => big-endian (reverse_m128i presumably mutates in place).
    if (!bigEndian) {
      reverse_m128i(tmp);
    }
    // Whitening step.
    tmp = _mm_xor_si128(tmp, keySchedule[0]);
    // Apply the AES rounds.
    int round = 1;
    for (; round < rounds; round++) {
      tmp = _mm_aesenc_si128(tmp, keySchedule[round]);
    }
    // And the last (round == rounds here).
    tmp = _mm_aesenclast_si128(tmp, keySchedule[round]);
    // Swap byte-order => little-endian.
    if (!bigEndian) {
      reverse_m128i(tmp);
    }
    // Save the encrypted block.
    _mm_storeu_si128(&output[block], tmp);
  }
}
/*
 * CBC-mode AES encryption of `length` bytes from `in` to `out`.
 * `ivec` is the 16-byte IV, `key` points to number_of_rounds+1 expanded
 * round keys laid out contiguously. `length` is rounded up to whole
 * blocks before the loop.
 *
 * NOTE(review): if `length` is not a multiple of 16, the final
 * _mm_loadu_si128 reads past the end of `in` and a full 16 bytes are
 * written to `out` — callers presumably pad; confirm.
 * NOTE(review): `i` is a signed int compared against the unsigned long
 * block count — fine for realistic sizes but truncates above INT_MAX blocks.
 */
static void AESNI_CBC_encrypt(const unsigned char *in, unsigned char *out,
                              unsigned char ivec[16], unsigned long length,
                              unsigned char *key, int number_of_rounds)
{
    __m128i feedback, data;
    int i, j;
    /* convert byte count to block count, rounding up */
    if (length % 16)
        length = length / 16 + 1;
    else
        length /= 16;
    feedback = _mm_loadu_si128((__m128i*)ivec);
    for (i = 0; i < length; i++) {
        data = _mm_loadu_si128(&((__m128i*)in)[i]);
        /* CBC chaining: xor plaintext with previous ciphertext (or IV) */
        feedback = _mm_xor_si128(data, feedback);
        /* whitening */
        feedback = _mm_xor_si128(feedback, ((__m128i*)key)[0]);
        for (j = 1; j < number_of_rounds; j++)
            feedback = _mm_aesenc_si128(feedback, ((__m128i*)key)[j]);
        /* final round (j == number_of_rounds here) */
        feedback = _mm_aesenclast_si128(feedback, ((__m128i*)key)[j]);
        _mm_storeu_si128(&((__m128i*)out)[i], feedback);
    }
}
/*inline*/ void AES_reduced_batch_intr(__m128i* batch, uint32_t batch_size) //Encrypts batch_size in parallel { //Round Key initialization __m128i roundkey[AES_ROUNDS + 1]; for (unsigned i = 0; i<AES_ROUNDS + 1; ++i) { roundkey[i] = _mm_set_epi64x(subkeys64[i][1], subkeys64[i][0]); } for (unsigned i = 0; i<batch_size; ++i) { batch[i] = _mm_xor_si128(batch[i], roundkey[0]); } for (unsigned j = 0; j<AES_ROUNDS; ++j) { for (unsigned i = 0; i<batch_size; ++i) { batch[i] = _mm_aesenc_si128(batch[i], roundkey[j + 1]); } } }
/*
 * AES-GCM decryption with authentication-first semantics.
 * Verifies the tag over (addt, in) and, only on success, decrypts `in`
 * into `out` using 4-way-pipelined CTR. Returns 1 on success, 0 when
 * authentication fails (in which case nothing is written to `out`).
 *
 * in/out : ciphertext / plaintext, nbytes bytes
 * addt   : additional authenticated data, abytes bytes
 * ivec   : IV, ibytes bytes (96-bit IV gets the fast path)
 * tag    : 16-byte expected tag
 * key    : nr+1 expanded AES round keys
 *
 * NOTE(review): in the 96-bit-IV path, _mm_loadu_si128 reads 16 bytes
 * from a 12-byte ivec (overread) before the counter word is overwritten.
 * NOTE(review): the partial final block is decrypted by loading a full
 * 16 bytes from `in` — confirm callers can tolerate the overread.
 */
int AES_GCM_decrypt (const unsigned char *in, unsigned char *out,
                     const unsigned char* addt, const unsigned char* ivec,
                     unsigned char *tag, int nbytes, int abytes, int ibytes,
                     const unsigned char* key, int nr)
{
    int i, j ,k;
    __m128i hlp1, hlp2, hlp3, hlp4;
    __m128i tmp1, tmp2, tmp3, tmp4;
    __m128i H, Y, T;
    __m128i *KEY = (__m128i*)key;
    __m128i ctr1, ctr2, ctr3, ctr4;
    __m128i last_block = _mm_setzero_si128();
    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
    /* byte-reversal shuffle masks: per-64-bit-lane and full 128-bit */
    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m128i X = _mm_setzero_si128();

    if(ibytes == 96/8){
        /* 96-bit IV fast path: Y0 = IV || 0x00000001 */
        Y = _mm_loadu_si128((__m128i*)ivec);
        Y = _mm_insert_epi32(Y, 0x1000000, 3);
        /* Compute H = E(K, 0) and T = E(K, Y0) together, two blocks in flight */
        tmp1 = _mm_xor_si128(X, KEY[0]);
        tmp2 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr-1; j+=2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
        };
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
    }
    else{
        /* general IV: H = E(K, 0), then Y0 = GHASH(H, IV || len(IV)) */
        tmp1 = _mm_xor_si128(X, KEY[0]);
        for(j=1; j <nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
        Y = _mm_xor_si128(Y, Y);  /* zero Y */
        /* absorb whole IV blocks */
        for(i=0; i < ibytes/16; i++){
            tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        /* absorb a zero-padded partial IV block */
        if(ibytes%16){
            for(j=0; j < ibytes%16; j++)
                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
            tmp1 = last_block;
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        /* absorb the IV bit-length block */
        tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, 0, 1);
        Y = _mm_xor_si128(Y, tmp1);
        gfmul(Y, H, &Y);
        Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
        /* T = E(K, Y0) */
        tmp1 = _mm_xor_si128(Y, KEY[0]);
        for(j=1; j < nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        T = _mm_aesenclast_si128(tmp1, KEY[nr]);
    }

    /* GHASH the additional authenticated data */
    for(i=0; i<abytes/16; i++){
        tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if(abytes%16){
        last_block = _mm_setzero_si128();
        for(j=0;j<abytes%16;j++)
            ((unsigned char*)&last_block)[j] = addt[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X =_mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    /* GHASH the ciphertext */
    for(i=0; i<nbytes/16; i++){
        tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if(nbytes%16){
        last_block = _mm_setzero_si128();
        for(j=0; j<nbytes%16; j++)
            ((unsigned char*)&last_block)[j] = in[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    /* absorb the (AAD, ciphertext) bit-length block and finish the tag */
    tmp1 =_mm_insert_epi64(tmp1, nbytes*8, 0);
    tmp1 =_mm_insert_epi64(tmp1, abytes*8, 1);
    X = _mm_xor_si128(X, tmp1);
    gfmul(X, H, &X);
    X = _mm_shuffle_epi8(X, BSWAP_MASK);
    T = _mm_xor_si128(X, T);

    /* authenticate BEFORE decrypting: bail out if the tag mismatches */
    if(0xffff!=_mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag))))
        return 0; /* in case the authentication failed */

    /* CTR decryption: counters kept byte-swapped so _mm_add_epi32 works */
    ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
    ctr1 = _mm_add_epi32(ctr1, ONE);
    ctr2 = _mm_add_epi32(ctr1, ONE);
    ctr3 = _mm_add_epi32(ctr2, ONE);
    ctr4 = _mm_add_epi32(ctr3, ONE);

    /* main loop: 4 counter blocks encrypted in parallel */
    for(i=0; i < nbytes/16/4; i++){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
        tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
        tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, FOUR);
        ctr2 = _mm_add_epi32(ctr2, FOUR);
        ctr3 = _mm_add_epi32(ctr3, FOUR);
        ctr4 = _mm_add_epi32(ctr4, FOUR);
        tmp1 =_mm_xor_si128(tmp1, KEY[0]);
        tmp2 =_mm_xor_si128(tmp2, KEY[0]);
        tmp3 =_mm_xor_si128(tmp3, KEY[0]);
        tmp4 =_mm_xor_si128(tmp4, KEY[0]);
        for(j=1; j < nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]);
        tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]);
        tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]);
        tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]);
        tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]);
        /* keystream ^ ciphertext = plaintext */
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0]));
        tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1]));
        tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2]));
        tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3]));
        _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1);
        _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2);
        _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3);
        _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
        tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
        tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
    }
    /* leftover whole blocks, one at a time */
    for(k = i*4; k < nbytes/16; k++){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, ONE);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for(j=1; j<nr-1; j+=2){
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
    }
    /* If one partial block remains, emit only its nbytes%16 bytes */
    if(nbytes%16){
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for(j=1; j<nr-1; j+=2){
            tmp1 =_mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 =_mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        last_block = tmp1;
        for(j=0; j<nbytes%16; j++)
            out[k*16+j]=((unsigned char*)&last_block)[j];
    }
    return 1; /* when successful returns 1 */
}