Example 1
/*
** AES-128/256 encrypt
*/
__inline__ static void AES_encrypt(
	__m128i in,
	__m128i *out,
	const __m128i *key)
{
	__m128i tmp = in;
	tmp = _mm_xor_si128(tmp, key[0]);
	tmp = _mm_aesenc_si128(tmp, key[1]);
	tmp = _mm_aesenc_si128(tmp, key[2]);
	tmp = _mm_aesenc_si128(tmp, key[3]);
	tmp = _mm_aesenc_si128(tmp, key[4]);
	tmp = _mm_aesenc_si128(tmp, key[5]);
	tmp = _mm_aesenc_si128(tmp, key[6]);
	tmp = _mm_aesenc_si128(tmp, key[7]);
	tmp = _mm_aesenc_si128(tmp, key[8]);
	tmp = _mm_aesenc_si128(tmp, key[9]);
#if (ROUND==10)
	*out = _mm_aesenclast_si128(tmp, key[10]);
#elif (ROUND==14)
	tmp = _mm_aesenc_si128(tmp, key[10]);
	tmp = _mm_aesenc_si128(tmp, key[11]);
	tmp = _mm_aesenc_si128(tmp, key[12]);
	tmp = _mm_aesenc_si128(tmp, key[13]);
	*out = _mm_aesenclast_si128(tmp, key[14]);
#endif
}
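Example 1 (like most of the examples below) assumes the caller has already expanded the raw key into a schedule of round keys. For reference, here is a minimal sketch of AES-128 key expansion via the AESKEYGENASSIST instruction; the helper names (aes128_expand_step, aes128_expand_key) are our own and not part of the example above:

#include <wmmintrin.h>

/* Sketch: expand a 16-byte AES-128 key into the 11 round keys consumed
 * by AES_encrypt() above. */
static __m128i aes128_expand_step(__m128i key, __m128i keygened)
{
	/* broadcast the relevant word of the keygenassist result */
	keygened = _mm_shuffle_epi32(keygened, _MM_SHUFFLE(3, 3, 3, 3));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
	return _mm_xor_si128(key, keygened);
}

#define EXPAND(k, rcon) aes128_expand_step(k, _mm_aeskeygenassist_si128(k, rcon))

static void aes128_expand_key(__m128i raw_key, __m128i key[11])
{
	key[0]  = raw_key;
	key[1]  = EXPAND(key[0], 0x01);
	key[2]  = EXPAND(key[1], 0x02);
	key[3]  = EXPAND(key[2], 0x04);
	key[4]  = EXPAND(key[3], 0x08);
	key[5]  = EXPAND(key[4], 0x10);
	key[6]  = EXPAND(key[5], 0x20);
	key[7]  = EXPAND(key[6], 0x40);
	key[8]  = EXPAND(key[7], 0x80);
	key[9]  = EXPAND(key[8], 0x1b);
	key[10] = EXPAND(key[9], 0x36);
}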
Example 2
static void block_encrypt(block_state* self, const u8* in, u8* out)
{
    __m128i m = _mm_loadu_si128((const __m128i*) in);
    /* first 9 rounds */
    m = _mm_xor_si128(m, self->ek[0]);
    m = _mm_aesenc_si128(m, self->ek[1]);
    m = _mm_aesenc_si128(m, self->ek[2]);
    m = _mm_aesenc_si128(m, self->ek[3]);
    m = _mm_aesenc_si128(m, self->ek[4]);
    m = _mm_aesenc_si128(m, self->ek[5]);
    m = _mm_aesenc_si128(m, self->ek[6]);
    m = _mm_aesenc_si128(m, self->ek[7]);
    m = _mm_aesenc_si128(m, self->ek[8]);
    m = _mm_aesenc_si128(m, self->ek[9]);
    if (self->rounds != 10) {
        /* two additional rounds for AES-192/256 */
        m = _mm_aesenc_si128(m, self->ek[10]);
        m = _mm_aesenc_si128(m, self->ek[11]);
        if (self->rounds == 14) {
            /* another two additional rounds for AES-256 */
            m = _mm_aesenc_si128(m, self->ek[12]);
            m = _mm_aesenc_si128(m, self->ek[13]);
        }
    }
    m = _mm_aesenclast_si128(m, self->ek[self->rounds]);
    _mm_storeu_si128((__m128i*) out, m);
}
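The self->rounds field here follows FIPS-197, where the round count is Nk + 6 (Nk being the key length in 32-bit words). A one-line sketch of that mapping, with a helper name of our own choosing:

#include <stddef.h>

/* Sketch: AES round count from key length in bytes (FIPS-197: Nk + 6).
 * 16-byte key -> 10 rounds, 24 -> 12, 32 -> 14. */
static int aes_rounds(size_t key_bytes)
{
    return (int)(key_bytes / 4) + 6;
}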
Example 3
/**
 * crypto_aes_encrypt_block_aesni(in, out, key):
 * Using the expanded AES key ${key}, encrypt the block ${in} and write the
 * resulting ciphertext to ${out}.  This implementation uses x86 AESNI
 * instructions, and should only be used if CPUSUPPORT_X86_AESNI is defined
 * and cpusupport_x86_aesni() returns nonzero.
 */
void
crypto_aes_encrypt_block_aesni(const uint8_t * in, uint8_t * out,
    const void * key)
{
	const struct crypto_aes_key_aesni * _key = key;
	const __m128i * aes_key = _key->rkeys;
	__m128i aes_state;
	size_t nr = _key->nr;

	aes_state = _mm_loadu_si128((const __m128i *)in);
	aes_state = _mm_xor_si128(aes_state, aes_key[0]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[1]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[2]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[3]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[4]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[5]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[6]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[7]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[8]);
	aes_state = _mm_aesenc_si128(aes_state, aes_key[9]);
	if (nr > 10) {
		aes_state = _mm_aesenc_si128(aes_state, aes_key[10]);
		aes_state = _mm_aesenc_si128(aes_state, aes_key[11]);

		if (nr > 12) {
			aes_state = _mm_aesenc_si128(aes_state, aes_key[12]);
			aes_state = _mm_aesenc_si128(aes_state, aes_key[13]);
		}
	}

	aes_state = _mm_aesenclast_si128(aes_state, aes_key[nr]);
	_mm_storeu_si128((__m128i *)out, aes_state);
}
Example 4
static inline void aes_enc_128(__m128i *state, __m128i *key)
{
	// round 0 (whitening)
	*state = _mm_xor_si128(*state, key[0]);
	// rounds 1-9
	*state = _mm_aesenc_si128(*state, key[1]);
	*state = _mm_aesenc_si128(*state, key[2]);
	*state = _mm_aesenc_si128(*state, key[3]);
	*state = _mm_aesenc_si128(*state, key[4]);
	*state = _mm_aesenc_si128(*state, key[5]);
	*state = _mm_aesenc_si128(*state, key[6]);
	*state = _mm_aesenc_si128(*state, key[7]);
	*state = _mm_aesenc_si128(*state, key[8]);
	*state = _mm_aesenc_si128(*state, key[9]);
	// round 10 (final)
	*state = _mm_aesenclast_si128(*state, key[10]);
}
Example 5
/*
** AES-128/256 batch encrypt for PIPE blocks
*/
__inline__ static void AES_ecb_encrypt_PIPE(
	__m128i *blks,
	const __m128i *key)
{
	unsigned j;
	blks[0] = _mm_xor_si128(blks[0], key[0]);
	blks[1] = _mm_xor_si128(blks[1], key[0]);
	blks[2] = _mm_xor_si128(blks[2], key[0]);
	blks[3] = _mm_xor_si128(blks[3], key[0]);
#if (PIPE>=5)
	blks[4] = _mm_xor_si128(blks[4], key[0]);
#endif
#if (PIPE>=6)
	blks[5] = _mm_xor_si128(blks[5], key[0]);
#endif
#if (PIPE>=7)
	blks[6] = _mm_xor_si128(blks[6], key[0]);
#endif
#if (PIPE==8)
	blks[7] = _mm_xor_si128(blks[7], key[0]);
#endif
	for (j = 1; j<ROUND; ++j) {
		blks[0] = _mm_aesenc_si128(blks[0], key[j]);
		blks[1] = _mm_aesenc_si128(blks[1], key[j]);
		blks[2] = _mm_aesenc_si128(blks[2], key[j]);
		blks[3] = _mm_aesenc_si128(blks[3], key[j]);
#if (PIPE>=5)
		blks[4] = _mm_aesenc_si128(blks[4], key[j]);
#endif
#if (PIPE>=6)
		blks[5] = _mm_aesenc_si128(blks[5], key[j]);
#endif
#if (PIPE>=7)
		blks[6] = _mm_aesenc_si128(blks[6], key[j]);
#endif
#if (PIPE==8)
		blks[7] = _mm_aesenc_si128(blks[7], key[j]);
#endif
	}
	blks[0] = _mm_aesenclast_si128(blks[0], key[j]);
	blks[1] = _mm_aesenclast_si128(blks[1], key[j]);
	blks[2] = _mm_aesenclast_si128(blks[2], key[j]);
	blks[3] = _mm_aesenclast_si128(blks[3], key[j]);
#if (PIPE>=5)
	blks[4] = _mm_aesenclast_si128(blks[4], key[j]);
#endif
#if (PIPE>=6)
	blks[5] = _mm_aesenclast_si128(blks[5], key[j]);
#endif
#if (PIPE>=7)
	blks[6] = _mm_aesenclast_si128(blks[6], key[j]);
#endif
#if (PIPE==8)
	blks[7] = _mm_aesenclast_si128(blks[7], key[j]);
#endif
}
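PIPE and ROUND are compile-time configuration macros: PIPE sets how many blocks are encrypted in parallel (4 to 8) and ROUND the AES round count. A hypothetical configuration and caller, assuming AES-128 (in a real build these macros must be defined before the function above):

#define ROUND 10   /* AES-128 */
#define PIPE  8    /* process 8 blocks per call */

/* Sketch: blks holds PIPE plaintext blocks; key holds the ROUND+1
 * round keys (see the key-expansion sketch after Example 1). */
static void demo_pipe(__m128i blks[PIPE], const __m128i key[ROUND + 1])
{
	AES_ecb_encrypt_PIPE(blks, key);
}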
Example 6
    void Cryptor::cbcEncrypt(const string &plaintext, const Key &key,
                             string *ciphertext,
                             unsigned char *schedule) {
      ciphertext->resize(plaintext.size());

      int blocks = plaintext.size() / 16;
      if (plaintext.size() % 16) {
        blocks++;
      }

      __m128i tmp, tmp2, tmp3;
      __m128i *input = (__m128i*) plaintext.data();
      __m128i *output = (__m128i*) ciphertext->data();      
      __m128i *keySchedule = (__m128i*) schedule;
      int rounds = getRounds(key.size);

      // Load the IV.
      tmp2 = _mm_loadu_si128((__m128i*) key.iv);

      // Swap byte-order => big-endian.
      if (!bigEndian) {        
        reverse_m128i(tmp2); 
      }      
      
      for (int block = 0; block < blocks; block++) {
        // Get next 128-bit block.
        tmp = _mm_loadu_si128(&input[block]);

        // Swap byte-order => big-endian.
        if (!bigEndian) {        
          reverse_m128i(tmp); 
        }

        // XOR IV or last ciphertext with the plaintext.
        tmp2 = _mm_xor_si128(tmp, tmp2);

        // Whitening step.
        tmp2 = _mm_xor_si128(tmp2, keySchedule[0]);

        // Apply the AES rounds.
        int round = 1;
        for (; round < rounds; round++) {
          tmp2 = _mm_aesenc_si128(tmp2, keySchedule[round]);
        }

        // And the last.
        tmp2 = _mm_aesenclast_si128(tmp2, keySchedule[round]);

        // Swap byte-order => little-endian.
        tmp3 = tmp2;
        if (!bigEndian) {        
          reverse_m128i(tmp3); 
        }
        
        // Save the encrypted block.
        _mm_storeu_si128(&output[block], tmp3);
      }
    }
Example 7
void AES_ecb_encrypt(block *blk,  AES_KEY *aesKey) {
	unsigned j, rnds = ROUNDS(aesKey);
	const block *sched = ((block *)(aesKey->rd_key));

	*blk = _mm_xor_si128(*blk, sched[0]);
	for (j = 1; j<rnds; ++j)
		*blk = _mm_aesenc_si128(*blk, sched[j]);
	*blk = _mm_aesenclast_si128(*blk, sched[j]);
}
Example 8
void AES_encryptC(block *in, block *out,  AES_KEY *aesKey)
{
	int j, rnds = ROUNDS(aesKey);
	const __m128i *sched = ((__m128i *)(aesKey->rd_key));
	__m128i tmp = _mm_load_si128((__m128i*)in);
	tmp = _mm_xor_si128(tmp, sched[0]);
	for (j = 1; j<rnds; j++)  tmp = _mm_aesenc_si128(tmp, sched[j]);
	tmp = _mm_aesenclast_si128(tmp, sched[j]);
	_mm_store_si128((__m128i*)out, tmp);
}
Example 9
void AES_ecb_encrypt_chunk_in_out(block *in, block *out, unsigned nblks, AES_KEY *aesKey) {

	int numberOfLoops = nblks / 8;
	int blocksPipeLined = numberOfLoops * 8;
	int remainingEncrypts = nblks - blocksPipeLined;

	unsigned j, rnds = ROUNDS(aesKey);
	const block *sched = ((block *)(aesKey->rd_key));

	for (int i = 0; i < numberOfLoops; i++){

		out[0 + i * 8] = _mm_xor_si128(in[0 + i * 8], sched[0]);
		out[1 + i * 8] = _mm_xor_si128(in[1 + i * 8], sched[0]);
		out[2 + i * 8] = _mm_xor_si128(in[2 + i * 8], sched[0]);
		out[3 + i * 8] = _mm_xor_si128(in[3 + i * 8], sched[0]);
		out[4 + i * 8] = _mm_xor_si128(in[4 + i * 8], sched[0]);
		out[5 + i * 8] = _mm_xor_si128(in[5 + i * 8], sched[0]);
		out[6 + i * 8] = _mm_xor_si128(in[6 + i * 8], sched[0]);
		out[7 + i * 8] = _mm_xor_si128(in[7 + i * 8], sched[0]);

		for (j = 1; j < rnds; ++j){
			out[0 + i * 8] = _mm_aesenc_si128(out[0 + i * 8], sched[j]);
			out[1 + i * 8] = _mm_aesenc_si128(out[1 + i * 8], sched[j]);
			out[2 + i * 8] = _mm_aesenc_si128(out[2 + i * 8], sched[j]);
			out[3 + i * 8] = _mm_aesenc_si128(out[3 + i * 8], sched[j]);
			out[4 + i * 8] = _mm_aesenc_si128(out[4 + i * 8], sched[j]);
			out[5 + i * 8] = _mm_aesenc_si128(out[5 + i * 8], sched[j]);
			out[6 + i * 8] = _mm_aesenc_si128(out[6 + i * 8], sched[j]);
			out[7 + i * 8] = _mm_aesenc_si128(out[7 + i * 8], sched[j]);
		}
		out[0 + i * 8] = _mm_aesenclast_si128(out[0 + i * 8], sched[j]);
		out[1 + i * 8] = _mm_aesenclast_si128(out[1 + i * 8], sched[j]);
		out[2 + i * 8] = _mm_aesenclast_si128(out[2 + i * 8], sched[j]);
		out[3 + i * 8] = _mm_aesenclast_si128(out[3 + i * 8], sched[j]);
		out[4 + i * 8] = _mm_aesenclast_si128(out[4 + i * 8], sched[j]);
		out[5 + i * 8] = _mm_aesenclast_si128(out[5 + i * 8], sched[j]);
		out[6 + i * 8] = _mm_aesenclast_si128(out[6 + i * 8], sched[j]);
		out[7 + i * 8] = _mm_aesenclast_si128(out[7 + i * 8], sched[j]);
	}

	for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
		out[i] = _mm_xor_si128(in[i], sched[0]);
	for (j = 1; j<rnds; ++j)
		for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
			out[i] = _mm_aesenc_si128(out[i], sched[j]);
	for (int i = blocksPipeLined; i<blocksPipeLined + remainingEncrypts; ++i)
		out[i] = _mm_aesenclast_si128(out[i], sched[j]);
}
Example 10
void AES_ecb_encrypt_blks_4(block *blks,  AES_KEY *aesKey) {
	unsigned j, rnds = ROUNDS(aesKey);
	const block *sched = ((block *)(aesKey->rd_key));
	blks[0] = _mm_xor_si128(blks[0], sched[0]);
	blks[1] = _mm_xor_si128(blks[1], sched[0]);
	blks[2] = _mm_xor_si128(blks[2], sched[0]);
	blks[3] = _mm_xor_si128(blks[3], sched[0]);

	for (j = 1; j < rnds; ++j){
		blks[0] = _mm_aesenc_si128(blks[0], sched[j]);
		blks[1] = _mm_aesenc_si128(blks[1], sched[j]);
		blks[2] = _mm_aesenc_si128(blks[2], sched[j]);
		blks[3] = _mm_aesenc_si128(blks[3], sched[j]);
	}
	blks[0] = _mm_aesenclast_si128(blks[0], sched[j]);
	blks[1] = _mm_aesenclast_si128(blks[1], sched[j]);
	blks[2] = _mm_aesenclast_si128(blks[2], sched[j]);
	blks[3] = _mm_aesenclast_si128(blks[3], sched[j]);
}
Example 11
File: aesni.c Project: behemot/pm
void aesni_encrypt(aesni_ctx *ctx, const byte *in, byte *out)
{
	register __m128i tmp;
	tmp = _mm_loadu_si128((__m128i*)in);
	tmp = _mm_xor_si128(tmp, ctx->enc_keys[0]);
	for (int i = 1; i < 10; i++) {
		tmp = _mm_aesenc_si128(tmp, ctx->enc_keys[i]);
	}
	tmp = _mm_aesenclast_si128(tmp, ctx->enc_keys[10]);
	_mm_storeu_si128((__m128i*)out, tmp);
}
Example 12
void AES_ecb_encrypt_blks(block *blks, unsigned nblks, AES_KEY *aesKey) {
	unsigned i, j, rnds = ROUNDS(aesKey);
	const block *sched = ((block *)(aesKey->rd_key));
	for (i = 0; i < nblks; ++i)
		blks[i] = _mm_xor_si128(blks[i], sched[0]);
	for (j = 1; j < rnds; ++j)
		for (i = 0; i < nblks; ++i)
			blks[i] = _mm_aesenc_si128(blks[i], sched[j]);
	for (i = 0; i < nblks; ++i)
		blks[i] = _mm_aesenclast_si128(blks[i], sched[j]);
}
Example 13
inline void AES_encrypt(const unsigned char *in, unsigned char *out,
		const AES_KEY *key) {
	int j, rnds = ROUNDS(key);
	const __m128i *sched = ((__m128i *) (key->rd_key));
	__m128i tmp = _mm_load_si128((__m128i *) in);
	tmp = _mm_xor_si128(tmp, sched[0]);
	for (j = 1; j < rnds; j++)
		tmp = _mm_aesenc_si128(tmp, sched[j]);
	tmp = _mm_aesenclast_si128(tmp, sched[j]);
	_mm_store_si128((__m128i *) out, tmp);
}
Example 14
static inline void aes256ni_encrypt(const __m128i rkeys[15], const unsigned char *n, unsigned char *out) {
  __m128i nv = _mm_load_si128((const __m128i *)n);
  int i;
  __m128i temp = _mm_xor_si128(nv, rkeys[0]);
#pragma unroll(13)
  for (i = 1 ; i < 14 ; i++) {
    temp = _mm_aesenc_si128(temp, rkeys[i]);
  }
  temp = _mm_aesenclast_si128(temp, rkeys[14]);
  _mm_store_si128((__m128i*)(out), temp);
}
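Unlike most examples here, this one uses the aligned _mm_load_si128/_mm_store_si128, so n and out must be 16-byte aligned or the loads will fault. A minimal caller sketch (buffer names are ours), assuming C11:

#include <stdalign.h>

/* Sketch: alignas(16) satisfies the alignment required by the aligned
 * loads above; rkeys is an expanded AES-256 schedule of 15 round keys. */
static void demo_aes256_block(const __m128i rkeys[15])
{
  alignas(16) unsigned char n[16] = {0};
  alignas(16) unsigned char out[16];
  aes256ni_encrypt(rkeys, n, out);
}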
Example 15
void AES_ecb_encrypt_blks_4_in_out(block *in, block *out,  AES_KEY *aesKey) {
	unsigned j, rnds = ROUNDS(aesKey);
	const block *sched = ((block *)(aesKey->rd_key));
	//block temp[4];

	out[0] = _mm_xor_si128(in[0], sched[0]);
	out[1] = _mm_xor_si128(in[1], sched[0]);
	out[2] = _mm_xor_si128(in[2], sched[0]);
	out[3] = _mm_xor_si128(in[3], sched[0]);

	for (j = 1; j < rnds; ++j){
		out[0] = _mm_aesenc_si128(out[0], sched[j]);
		out[1] = _mm_aesenc_si128(out[1], sched[j]);
		out[2] = _mm_aesenc_si128(out[2], sched[j]);
		out[3] = _mm_aesenc_si128(out[3], sched[j]);
	}
	out[0] = _mm_aesenclast_si128(out[0], sched[j]);
	out[1] = _mm_aesenclast_si128(out[1], sched[j]);
	out[2] = _mm_aesenclast_si128(out[2], sched[j]);
	out[3] = _mm_aesenclast_si128(out[3], sched[j]);
}
Example 16
static __m128i AES_encrypt(__m128i in,  const __m128i* expkey)
{
	int j;

	__m128i tmp = _mm_xor_si128(byte_swap(in), expkey[0]);
	for (j = 1; j < 10; j++) {
		tmp = _mm_aesenc_si128(tmp, expkey[j]);
	}
	tmp = _mm_aesenclast_si128(tmp, expkey[10]);

	return byte_swap(tmp);
}
Example 17
File: siv.c Project: medsec/riv
static __m128i aes_encrypt(__m128i in, __m128i* k)
{
    __m128i x = _mm_xor_si128(in, k[0]);
    x = _mm_aesenc_si128(x, k[1]);
    x = _mm_aesenc_si128(x, k[2]);
    x = _mm_aesenc_si128(x, k[3]);
    x = _mm_aesenc_si128(x, k[4]);
    x = _mm_aesenc_si128(x, k[5]);
    x = _mm_aesenc_si128(x, k[6]);
    x = _mm_aesenc_si128(x, k[7]);
    x = _mm_aesenc_si128(x, k[8]);
    x = _mm_aesenc_si128(x, k[9]);
    return _mm_aesenclast_si128(x, k[10]);
}
Example 18
inline block
garble_random_block(void)
{
    block out;
    uint64_t *val;
    int i;

    out = garble_zero_block();
    val = (uint64_t *) &out;
    val[0] = current_rand_index++;
    out = _mm_xor_si128(out, rand_aes_key.rd_key[0]);
    for (i = 1; i < 10; ++i)
        out = _mm_aesenc_si128(out, rand_aes_key.rd_key[i]);
    return _mm_aesenclast_si128(out, rand_aes_key.rd_key[i]);
}
Example 19
File: siv.c Project: medsec/riv
/* Note: applies AES-128 rounds 1..10 only (the round count is hardcoded);
 * the initial whitening XOR with keys[0] is expected to have been applied
 * to each block by the caller. */
static inline void aes_encrypt_n(__m128i *text, int num_blocks,
                                 __m128i *keys)
{
    int i, j;

    for(j = 1; j < 10 ; j++) {
        for(i = 0; i< num_blocks; i++) {
            text[i] = _mm_aesenc_si128(text[i], keys[j]);
        }
    }

    for(i = 0; i < num_blocks; i++) {
        text[i] = _mm_aesenclast_si128(text[i], keys[j]);
    }
}
Example 20
    void Cryptor::ecbEncrypt(const string &plaintext, const Key &key,
                             string *ciphertext,
                             unsigned char *schedule) {
      // Right now we just use the same length, but it should just be
      // a multiple of 16.
      ciphertext->resize(plaintext.size());

      int blocks = plaintext.size() / 16;
      if (plaintext.size() % 16) {
        blocks++;
      }

      __m128i tmp;
      __m128i *input = (__m128i*) plaintext.data();
      __m128i *output = (__m128i*) ciphertext->data();      
      __m128i *keySchedule = (__m128i*) schedule;
      int rounds = getRounds(key.size);
      
      for (int block = 0; block < blocks; block++) {
        // Get next 128-bit block.
        tmp = _mm_loadu_si128(&input[block]);

        // Swap byte-order => big-endian.
        if (!bigEndian) {        
          reverse_m128i(tmp); 
        }

        // Whitening step.
        tmp = _mm_xor_si128(tmp, keySchedule[0]);

        // Apply the AES rounds.
        int round = 1;
        for (; round < rounds; round++) {
          tmp = _mm_aesenc_si128(tmp, keySchedule[round]);
        }

        // And the last.
        tmp = _mm_aesenclast_si128(tmp, keySchedule[round]);

        // Swap byte-order => little-endian.        
        if (!bigEndian) {        
          reverse_m128i(tmp); 
        }
        
        // Save the encrypted block.
        _mm_storeu_si128(&output[block], tmp);
      }
    }
Example 21
AES_AES_Block __fastcall aes_AES128_encrypt_block_(
    AES_AES_Block plaintext,
    const AES_AES128_RoundKeys* encryption_keys)
{
    plaintext = _mm_xor_si128(plaintext, encryption_keys->keys[0]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[1]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[2]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[3]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[4]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[5]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[6]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[7]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[8]);
    plaintext = _mm_aesenc_si128(plaintext, encryption_keys->keys[9]);
    return _mm_aesenclast_si128(plaintext, encryption_keys->keys[10]);
}
Example 22
static void AESNI_CBC_encrypt(const unsigned char *in, unsigned char *out,unsigned char ivec[16],unsigned long length,unsigned char *key,int number_of_rounds)
{
    __m128i feedback,data;
    int i,j;
    if (length%16)
        length = length/16+1;
    else length /=16;
    feedback=_mm_loadu_si128 ((__m128i*)ivec);
    for(i=0; i < length; i++)
    {
        data = _mm_loadu_si128 (&((__m128i*)in)[i]);
        feedback = _mm_xor_si128 (data,feedback);
        feedback = _mm_xor_si128 (feedback,((__m128i*)key)[0]);
        for(j=1; j <number_of_rounds; j++) feedback = _mm_aesenc_si128 (feedback,((__m128i*)key)[j]);
        feedback = _mm_aesenclast_si128 (feedback,((__m128i*)key)[j]);
        _mm_storeu_si128 (&((__m128i*)out)[i],feedback);
    }
}
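Note that this loop cannot be pipelined the way the ECB examples are: each block's input is the previous ciphertext held in feedback, so CBC encryption is inherently serial. A hedged usage sketch, assuming an AES-128 schedule produced by a separate expansion routine:

/* Sketch: key_schedule holds the 11 AES-128 round keys (176 bytes),
 * filled by a key-expansion routine not shown here. */
static void demo_cbc(unsigned char *key_schedule)
{
    unsigned char iv[16] = {0};   /* use a random IV in practice */
    unsigned char pt[64] = {0};
    unsigned char ct[64];
    AESNI_CBC_encrypt(pt, ct, iv, sizeof(pt), key_schedule, 10);
}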
Example 23
void AESNI_encrypt(const uint8_t *in, uint8_t *out, const AES_KEY enc_key)
{
    __m128i tmp;

    tmp = _mm_loadu_si128 ((__m128i*)in);

    tmp = _mm_xor_si128 (tmp,enc_key[0]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[1]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[2]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[3]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[4]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[5]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[6]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[7]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[8]);
    tmp = _mm_aesenc_si128 (tmp, enc_key[9]);
    tmp = _mm_aesenclast_si128 (tmp, enc_key[10]);

   _mm_storeu_si128 ((__m128i*)out,tmp);
}
Example 24
int aesni_xcryptecb( aes_context *ctx,
                     int mode,
                     const unsigned char input[16],
                     unsigned char output[16] )
{
    __m128i block;
    const __m128i *subkeys = (__m128i *) ctx->rk;
    const int rounds = ctx->nr;
    int i;

    /* This could be faster if more data was provided at once. */

    block = _mm_loadu_si128( (__m128i *) input );
    block = _mm_xor_si128( block, subkeys[0] );

    if( mode == AES_ENCRYPT ) {
        for( i = 1; i < rounds - 1; i += 2 ) {
            block = _mm_aesenc_si128( block, subkeys[i] );
            block = _mm_aesenc_si128( block, subkeys[i + 1] );
        }

        block = _mm_aesenc_si128( block, subkeys[rounds - 1] );
        block = _mm_aesenclast_si128( block, subkeys[rounds] );
    } else {
        for( i = 1; i < rounds - 1; i += 2 ) {
            block = _mm_aesdec_si128( block, subkeys[i] );
            block = _mm_aesdec_si128( block, subkeys[i + 1] );
        }

        block = _mm_aesdec_si128( block, subkeys[rounds - 1] );
        block = _mm_aesdeclast_si128( block, subkeys[rounds] );
    }

    _mm_storeu_si128( (__m128i *) output, block );

    return( 0 );
}
Example 25
int aesni_xcryptcbc( aes_context *ctx,
                     int mode,
                     size_t length,
                     unsigned char iv[16],
                     const unsigned char *input,
                     unsigned char *output )
{
    const __m128i *subkeys = (__m128i *) ctx->rk;
    const int rounds = ctx->nr;
    const size_t blocks = length / 16;
    __m128i block0, block1, block2, block3;
    __m128i fb0, fb1, fb2, fb3;
    __m128i rk;
    __m128i last;
    size_t i;
    int j;

    fb0 = _mm_loadu_si128( (__m128i *) iv );

    if (mode == AES_ENCRYPT ) {
        for( i = 0 ; i < blocks; i++ ) {
            block0 = _mm_loadu_si128( &((__m128i *) input)[i] );

            fb0 = _mm_xor_si128( block0, fb0 );
            fb0 = _mm_xor_si128( fb0, subkeys[0] );

            for( j = 1; j < rounds - 1; j += 2 ) {
                fb0 = _mm_aesenc_si128( fb0, subkeys[j] );
                fb0 = _mm_aesenc_si128( fb0, subkeys[j + 1] );
            }

            fb0 = _mm_aesenc_si128( fb0, subkeys[rounds - 1] );
            fb0 = _mm_aesenclast_si128( fb0, subkeys[rounds] );

            _mm_storeu_si128( &((__m128i*) output)[i], fb0 );
        }
    } else {
        /* Take advantage of pipelining by decrypting 4 blocks at once. */

        for( i = 0; i < blocks / 4; i++ ) {
            block0 = _mm_loadu_si128( (__m128i *) input + i * 4 );
            block1 = _mm_loadu_si128( (__m128i *) input + i * 4 + 1 );
            block2 = _mm_loadu_si128( (__m128i *) input + i * 4 + 2 );
            block3 = _mm_loadu_si128( (__m128i *) input + i * 4 + 3 );

            fb1 = block0;
            fb2 = block1;
            fb3 = block2;
            last = block3;

            rk = subkeys[0];
            block0 = _mm_xor_si128( block0, rk );
            block1 = _mm_xor_si128( block1, rk );
            block2 = _mm_xor_si128( block2, rk );
            block3 = _mm_xor_si128( block3, rk );

            for( j = 1; j < rounds; j++ ) {
                rk = subkeys[j];
                block0 = _mm_aesdec_si128( block0, rk );
                block1 = _mm_aesdec_si128( block1, rk );
                block2 = _mm_aesdec_si128( block2, rk );
                block3 = _mm_aesdec_si128( block3, rk );
            }

            rk = subkeys[rounds];
            block0 = _mm_aesdeclast_si128( block0, rk );
            block1 = _mm_aesdeclast_si128( block1, rk );
            block2 = _mm_aesdeclast_si128( block2, rk );
            block3 = _mm_aesdeclast_si128( block3, rk );

            block0 = _mm_xor_si128( block0, fb0 );
            block1 = _mm_xor_si128( block1, fb1 );
            block2 = _mm_xor_si128( block2, fb2 );
            block3 = _mm_xor_si128( block3, fb3 );

            _mm_storeu_si128( ((__m128i *) output) + i * 4, block0 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4 + 1, block1 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4 + 2, block2 );
            _mm_storeu_si128( ((__m128i *) output) + i * 4 + 3, block3 );

            fb0 = last;
        }

        for( i *= 4; i < blocks; i++ ) {
            block0 = _mm_loadu_si128( (__m128i *) input + i );

            last = block0;

            block0 = _mm_xor_si128 (last, subkeys[0] );

            for( j = 1; j < rounds - 1; j += 2 ) {
                block0 = _mm_aesdec_si128( block0, subkeys[j] );
                block0 = _mm_aesdec_si128( block0, subkeys[j + 1] );
            }

            block0 = _mm_aesdec_si128( block0, subkeys[rounds - 1] );
            block0 = _mm_aesdeclast_si128( block0, subkeys[rounds] );

            block0 = _mm_xor_si128( block0, fb0 );

            _mm_storeu_si128( ((__m128i *) output) + i, block0 );

            fb0 = last;
        }
    }

    _mm_storeu_si128( (__m128i *) iv, fb0 );

    return( 0 );
}
Example 26
int AES_GCM_decrypt(const unsigned char *in,
                    unsigned char *out,
                    const unsigned char *addt,
                    const unsigned char *ivec,
                    unsigned char *tag,
                    int nbytes,
                    int abytes,
                    int ibytes,
                    const unsigned char *key,
                    int nr)
{
    int i, j, k;
    __m128i hlp1, hlp2, hlp3, hlp4;
    __m128i tmp1, tmp2, tmp3, tmp4;
    __m128i H, Y, T;
    __m128i *KEY = (__m128i*)key;
    __m128i ctr1, ctr2, ctr3, ctr4;
    __m128i last_block = _mm_setzero_si128();
    __m128i ONE = _mm_set_epi32(0, 1, 0, 0);
    __m128i FOUR = _mm_set_epi32(0, 4, 0, 0);
    __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7);
    __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15);
    __m128i X = _mm_setzero_si128();

    if (ibytes == 96/8) {
        Y = _mm_loadu_si128((__m128i*)ivec);
        Y = _mm_insert_epi32(Y, 0x1000000, 3);
        /* Compute E[ZERO, KS] and E[Y0, KS] together */
        tmp1 = _mm_xor_si128(X, KEY[0]);
        tmp2 = _mm_xor_si128(Y, KEY[0]);
        for (j = 1; j < nr - 1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        T = _mm_aesenclast_si128(tmp2, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
    }
    else {
        tmp1 = _mm_xor_si128(X, KEY[0]);
        for (j = 1; j < nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        H = _mm_aesenclast_si128(tmp1, KEY[nr]);
        H = _mm_shuffle_epi8(H, BSWAP_MASK);
        Y = _mm_setzero_si128();
        for (i = 0; i < ibytes/16; i++) {
            tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]);
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        if (ibytes%16) {
            for (j = 0; j < ibytes%16; j++)
                ((unsigned char*)&last_block)[j] = ivec[i*16+j];
            tmp1 = last_block;
            tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
            Y = _mm_xor_si128(Y, tmp1);
            gfmul(Y, H, &Y);
        }
        tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0);
        tmp1 = _mm_insert_epi64(tmp1, 0, 1);
        Y = _mm_xor_si128(Y, tmp1);
        gfmul(Y, H, &Y);
        Y = _mm_shuffle_epi8(Y, BSWAP_MASK);
        /* Compute E(K, Y0) */
        tmp1 = _mm_xor_si128(Y, KEY[0]);
        for (j = 1; j < nr; j++)
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
        T = _mm_aesenclast_si128(tmp1, KEY[nr]);
    }
    for (i = 0; i < abytes/16; i++) {
        tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if (abytes%16) {
        last_block = _mm_setzero_si128();
        for (j = 0; j < abytes%16; j++)
            ((unsigned char*)&last_block)[j] = addt[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    for (i = 0; i < nbytes/16; i++) {
        tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    if (nbytes%16) {
        last_block = _mm_setzero_si128();
        for (j = 0; j < nbytes%16; j++)
            ((unsigned char*)&last_block)[j] = in[i*16+j];
        tmp1 = last_block;
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        X = _mm_xor_si128(X, tmp1);
        gfmul(X, H, &X);
    }
    tmp1 = _mm_insert_epi64(tmp1, nbytes*8, 0);
    tmp1 = _mm_insert_epi64(tmp1, abytes*8, 1);
    X = _mm_xor_si128(X, tmp1);
    gfmul(X, H, &X);
    X = _mm_shuffle_epi8(X, BSWAP_MASK);
    T = _mm_xor_si128(X, T);
    if (0xffff != _mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag))))
        return 0; /* authentication failed */
    ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64);
    ctr1 = _mm_add_epi32(ctr1, ONE);
    ctr2 = _mm_add_epi32(ctr1, ONE);
    ctr3 = _mm_add_epi32(ctr2, ONE);
    ctr4 = _mm_add_epi32(ctr3, ONE);
    for (i = 0; i < nbytes/16/4; i++) {
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64);
        tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64);
        tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, FOUR);
        ctr2 = _mm_add_epi32(ctr2, FOUR);
        ctr3 = _mm_add_epi32(ctr3, FOUR);
        ctr4 = _mm_add_epi32(ctr4, FOUR);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        tmp2 = _mm_xor_si128(tmp2, KEY[0]);
        tmp3 = _mm_xor_si128(tmp3, KEY[0]);
        tmp4 = _mm_xor_si128(tmp4, KEY[0]);
        for (j = 1; j < nr - 1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
            tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]);
            tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]);
            tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]);
        tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]);
        tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp2 = _mm_aesenclast_si128(tmp2, KEY[nr]);
        tmp3 = _mm_aesenclast_si128(tmp3, KEY[nr]);
        tmp4 = _mm_aesenclast_si128(tmp4, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0]));
        tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1]));
        tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2]));
        tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3]));
        _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1);
        _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2);
        _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3);
        _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4);
        tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK);
        tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK);
        tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK);
        tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK);
    }
    for (k = i*4; k < nbytes/16; k++) {
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        ctr1 = _mm_add_epi32(ctr1, ONE);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for (j = 1; j < nr - 1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        _mm_storeu_si128(&((__m128i*)out)[k], tmp1);
    }
    /* if one partial block remains */
    if (nbytes%16) {
        tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64);
        tmp1 = _mm_xor_si128(tmp1, KEY[0]);
        for (j = 1; j < nr - 1; j += 2) {
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j]);
            tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]);
        }
        tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]);
        tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]);
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k]));
        last_block = tmp1;
        for (j = 0; j < nbytes%16; j++)
            out[k*16+j] = ((unsigned char*)&last_block)[j];
    }
    return 1; /* success */
}
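This example leaves gfmul() external: it multiplies two 128-bit values in GF(2^128) for the GHASH computation. For reference, a sketch following the carry-less-multiplication (PCLMULQDQ) implementation published in Intel's GCM white paper; treat it as illustrative rather than a drop-in for this project's own gfmul:

#include <wmmintrin.h>

/* Sketch: GF(2^128) multiplication for GHASH using PCLMULQDQ.
 * Computes the 256-bit carry-less product of a and b, then reduces it
 * modulo the GHASH polynomial x^128 + x^7 + x^2 + x + 1. */
void gfmul(__m128i a, __m128i b, __m128i *res)
{
    __m128i tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tmp9;

    /* four 64x64 -> 128 carry-less partial products */
    tmp3 = _mm_clmulepi64_si128(a, b, 0x00);
    tmp4 = _mm_clmulepi64_si128(a, b, 0x10);
    tmp5 = _mm_clmulepi64_si128(a, b, 0x01);
    tmp6 = _mm_clmulepi64_si128(a, b, 0x11);
    tmp4 = _mm_xor_si128(tmp4, tmp5);
    tmp5 = _mm_slli_si128(tmp4, 8);
    tmp4 = _mm_srli_si128(tmp4, 8);
    tmp3 = _mm_xor_si128(tmp3, tmp5);
    tmp6 = _mm_xor_si128(tmp6, tmp4);

    /* shift the 256-bit result left by one bit */
    tmp7 = _mm_srli_epi32(tmp3, 31);
    tmp8 = _mm_srli_epi32(tmp6, 31);
    tmp3 = _mm_slli_epi32(tmp3, 1);
    tmp6 = _mm_slli_epi32(tmp6, 1);
    tmp9 = _mm_srli_si128(tmp7, 12);
    tmp8 = _mm_slli_si128(tmp8, 4);
    tmp7 = _mm_slli_si128(tmp7, 4);
    tmp3 = _mm_or_si128(tmp3, tmp7);
    tmp6 = _mm_or_si128(tmp6, tmp8);
    tmp6 = _mm_or_si128(tmp6, tmp9);

    /* reduce modulo x^128 + x^7 + x^2 + x + 1 */
    tmp7 = _mm_slli_epi32(tmp3, 31);
    tmp8 = _mm_slli_epi32(tmp3, 30);
    tmp9 = _mm_slli_epi32(tmp3, 25);
    tmp7 = _mm_xor_si128(tmp7, tmp8);
    tmp7 = _mm_xor_si128(tmp7, tmp9);
    tmp8 = _mm_srli_si128(tmp7, 4);
    tmp7 = _mm_slli_si128(tmp7, 12);
    tmp3 = _mm_xor_si128(tmp3, tmp7);
    tmp2 = _mm_srli_epi32(tmp3, 1);
    tmp4 = _mm_srli_epi32(tmp3, 2);
    tmp5 = _mm_srli_epi32(tmp3, 7);
    tmp2 = _mm_xor_si128(tmp2, tmp4);
    tmp2 = _mm_xor_si128(tmp2, tmp5);
    tmp2 = _mm_xor_si128(tmp2, tmp8);
    tmp3 = _mm_xor_si128(tmp3, tmp2);
    tmp6 = _mm_xor_si128(tmp6, tmp3);
    *res = tmp6;
}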
Example 27
void aes_ecb_encrypt(aes *a,MR_BYTE *buff)
{
    int i,j,k;
    MR_WORD p[4],q[4],*x,*y,*t;

#ifdef AES_NI_SUPPORT
	__m128i ky,m = _mm_loadu_si128((__m128i *) buff);
	ky = _mm_loadu_si128((__m128i *) &a->fkey[0]);
    m = _mm_xor_si128       (m, ky); 
	k=NB;
	for (i=1;i<a->Nr;i++)
	{
		ky=_mm_loadu_si128((__m128i *) &a->fkey[k]);
		m =_mm_aesenc_si128(m, ky); 
		k+=4;
	}
	ky=_mm_loadu_si128((__m128i *) &a->fkey[k]);
    m=_mm_aesenclast_si128(m, ky);

    _mm_storeu_si128((__m128i *)buff, m);
#else

    for (i=j=0;i<NB;i++,j+=4)
    {
        p[i]=pack((MR_BYTE *)&buff[j]);
        p[i]^=a->fkey[i];
    }

    k=NB;
    x=p; y=q;

/* State alternates between x and y */
    for (i=1;i<a->Nr;i++)
    { /* Nr is number of rounds. May be odd. */
#ifndef MR_SMALL_AES
        y[0]=a->fkey[k]^ftable[MR_TOBYTE(x[0])]^
             ftable1[MR_TOBYTE(x[1]>>8)]^
             ftable2[MR_TOBYTE(x[2]>>16)]^
             ftable3[x[3]>>24];
        y[1]=a->fkey[k+1]^ftable[MR_TOBYTE(x[1])]^
             ftable1[MR_TOBYTE(x[2]>>8)]^
             ftable2[MR_TOBYTE(x[3]>>16)]^
             ftable3[x[0]>>24];
        y[2]=a->fkey[k+2]^ftable[MR_TOBYTE(x[2])]^
             ftable1[MR_TOBYTE(x[3]>>8)]^
             ftable2[MR_TOBYTE(x[0]>>16)]^
             ftable3[x[1]>>24];
        y[3]=a->fkey[k+3]^ftable[MR_TOBYTE(x[3])]^
             ftable1[MR_TOBYTE(x[0]>>8)]^
             ftable2[MR_TOBYTE(x[1]>>16)]^
             ftable3[x[2]>>24];
#else
        y[0]=a->fkey[k]^ftable[MR_TOBYTE(x[0])]^
             ROTL8(ftable[MR_TOBYTE(x[1]>>8)])^
             ROTL16(ftable[MR_TOBYTE(x[2]>>16)])^
             ROTL24(ftable[x[3]>>24]);
        y[1]=a->fkey[k+1]^ftable[MR_TOBYTE(x[1])]^
             ROTL8(ftable[MR_TOBYTE(x[2]>>8)])^
             ROTL16(ftable[MR_TOBYTE(x[3]>>16)])^
             ROTL24(ftable[x[0]>>24]);
        y[2]=a->fkey[k+2]^ftable[MR_TOBYTE(x[2])]^
             ROTL8(ftable[MR_TOBYTE(x[3]>>8)])^
             ROTL16(ftable[MR_TOBYTE(x[0]>>16)])^
             ROTL24(ftable[x[1]>>24]);
        y[3]=a->fkey[k+3]^ftable[MR_TOBYTE(x[3])]^
             ROTL8(ftable[MR_TOBYTE(x[0]>>8)])^
             ROTL16(ftable[MR_TOBYTE(x[1]>>16)])^
             ROTL24(ftable[x[2]>>24]);
#endif
        k+=4;
        t=x; x=y; y=t;      /* swap pointers */
    }

/* Last Round */ 
    
    y[0]=a->fkey[k]^(MR_WORD)fbsub[MR_TOBYTE(x[0])]^
         ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[1]>>8)])^
         ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[2]>>16)])^
         ROTL24((MR_WORD)fbsub[x[3]>>24]);
    y[1]=a->fkey[k+1]^(MR_WORD)fbsub[MR_TOBYTE(x[1])]^
         ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[2]>>8)])^
         ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[3]>>16)])^
         ROTL24((MR_WORD)fbsub[x[0]>>24]);
    y[2]=a->fkey[k+2]^(MR_WORD)fbsub[MR_TOBYTE(x[2])]^
         ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[3]>>8)])^
         ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[0]>>16)])^
         ROTL24((MR_WORD)fbsub[x[1]>>24]);
    y[3]=a->fkey[k+3]^(MR_WORD)fbsub[MR_TOBYTE(x[3])]^
         ROTL8((MR_WORD)fbsub[MR_TOBYTE(x[0]>>8)])^
         ROTL16((MR_WORD)fbsub[MR_TOBYTE(x[1]>>16)])^
         ROTL24((MR_WORD)fbsub[x[2]>>24]);

    for (i=j=0;i<NB;i++,j+=4)
    {
        unpack(y[i],(MR_BYTE *)&buff[j]);
        x[i]=y[i]=0;   /* clean up stack */
    }
#endif
}
Example 28
void c_opt_unrolled_8x(uint8_t *keys, uint8_t *data, uint8_t *dataOut) {

  __m128i mask        = _mm_set_epi8(0x0C, 0x03, 0x06, 0x09, 0x08, 0x0F, 0x02, 0x05,
                                     0x04, 0x0B, 0x0E, 0x01, 0x00, 0x07, 0x0A, 0x0D);

  __m128i mmrcon      = _mm_set_epi8(0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00,
                                     0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00);

  __m128i mmrconFinal = _mm_set_epi8(0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1b, 0x00,
                                     0x00, 0x00, 0x1b, 0x00, 0x00, 0x00, 0x1b, 0x00);

  __m128i key  [8];
  __m128i rkey [8];
  __m128i state[8];
  __m128i tmp0;

  key[0]  = _mm_load_si128((__m128i*)&(keys[  0]));
  key[1]  = _mm_load_si128((__m128i*)&(keys[ 16]));
  key[2]  = _mm_load_si128((__m128i*)&(keys[ 32]));
  key[3]  = _mm_load_si128((__m128i*)&(keys[ 48]));
  key[4]  = _mm_load_si128((__m128i*)&(keys[ 64]));
  key[5]  = _mm_load_si128((__m128i*)&(keys[ 80]));
  key[6]  = _mm_load_si128((__m128i*)&(keys[ 96]));
  key[7]  = _mm_load_si128((__m128i*)&(keys[112]));

  transpose_state(rkey, key);
  transpose_state(&(rkey[4]), &(key[4]));

  state[0]  = _mm_load_si128((__m128i*)  data     );
  state[1]  = state[0];
  state[2]  = state[0];
  state[3]  = state[0];
  state[4]  = state[0];
  state[5]  = state[0];
  state[6]  = state[0];
  state[7]  = state[0];

  tmp0    = _mm_aesenclast_si128(rkey[3], mmrcon);
  tmp0    = _mm_shuffle_epi8(tmp0, mask);

  state[0]   = _mm_xor_si128(state[0], key[0]);
  state[1]   = _mm_xor_si128(state[1], key[1]);
  state[2]   = _mm_xor_si128(state[2], key[2]);
  state[3]   = _mm_xor_si128(state[3], key[3]);
  state[4]   = _mm_xor_si128(state[4], key[4]);
  state[5]   = _mm_xor_si128(state[5], key[5]);
  state[6]   = _mm_xor_si128(state[6], key[6]);
  state[7]   = _mm_xor_si128(state[7], key[7]);

  rkey[0]    = _mm_xor_si128(rkey[0], tmp0);

  tmp0    = _mm_aesenclast_si128(rkey[7], mmrcon);
  tmp0    = _mm_shuffle_epi8(tmp0, mask);
  mmrcon  = _mm_slli_epi16(mmrcon, 0x01);

  rkey[1]    = _mm_xor_si128(rkey[1], rkey[0]);
  rkey[2]    = _mm_xor_si128(rkey[2], rkey[1]);
  rkey[3]    = _mm_xor_si128(rkey[3], rkey[2]);

  rkey[4]    = _mm_xor_si128(rkey[4], tmp0);
  rkey[5]    = _mm_xor_si128(rkey[5], rkey[4]);
  rkey[6]    = _mm_xor_si128(rkey[6], rkey[5]);
  rkey[7]    = _mm_xor_si128(rkey[7], rkey[6]);

  _mm_prefetch((char const *)state, 0);

  transpose_state(key, rkey);
  transpose_state(&(key[4]), &(rkey[4]));

  for (uint8_t roundCounter = 1; roundCounter < 8; roundCounter++) {

    tmp0    = _mm_aesenclast_si128(rkey[3], mmrcon);
    tmp0    = _mm_shuffle_epi8(tmp0, mask);

    state[0]   = _mm_aesenc_si128(state[0], key[0]);
    state[1]   = _mm_aesenc_si128(state[1], key[1]);
    state[2]   = _mm_aesenc_si128(state[2], key[2]);
    state[3]   = _mm_aesenc_si128(state[3], key[3]);
    state[4]   = _mm_aesenc_si128(state[4], key[4]);
    state[5]   = _mm_aesenc_si128(state[5], key[5]);

    rkey[0]    = _mm_xor_si128(rkey[0], tmp0);
    tmp0    = _mm_aesenclast_si128(rkey[7], mmrcon);

    rkey[1]    = _mm_xor_si128(rkey[1], rkey[0]);
    rkey[2]    = _mm_xor_si128(rkey[2], rkey[1]);
    rkey[3]    = _mm_xor_si128(rkey[3], rkey[2]);

    state[6]   = _mm_aesenc_si128(state[6], key[6]);
    state[7]   = _mm_aesenc_si128(state[7], key[7]);

    transpose_state(key, rkey);

    tmp0    = _mm_shuffle_epi8(tmp0, mask);
    mmrcon  = _mm_slli_epi16(mmrcon, 0x01);

    rkey[4]    = _mm_xor_si128(rkey[4], tmp0);
    rkey[5]    = _mm_xor_si128(rkey[5], rkey[4]);
    rkey[6]    = _mm_xor_si128(rkey[6], rkey[5]);
    rkey[7]    = _mm_xor_si128(rkey[7], rkey[6]);

    _mm_prefetch((char const *)state, 0);
    transpose_state(&(key[4]), &(rkey[4]));

  }

  tmp0    = _mm_aesenclast_si128(rkey[3], mmrconFinal);
  tmp0    = _mm_shuffle_epi8(tmp0, mask);

  state[0]   = _mm_aesenc_si128(state[0], key[0]);
  state[1]   = _mm_aesenc_si128(state[1], key[1]);
  state[2]   = _mm_aesenc_si128(state[2], key[2]);
  state[3]   = _mm_aesenc_si128(state[3], key[3]);
  state[4]   = _mm_aesenc_si128(state[4], key[4]);
  state[5]   = _mm_aesenc_si128(state[5], key[5]);

  rkey[0]    = _mm_xor_si128(rkey[0], tmp0);
  tmp0       = _mm_aesenclast_si128(rkey[7], mmrconFinal);

  rkey[1]    = _mm_xor_si128(rkey[1], rkey[0]);
  rkey[2]    = _mm_xor_si128(rkey[2], rkey[1]);
  rkey[3]    = _mm_xor_si128(rkey[3], rkey[2]);

  state[6]   = _mm_aesenc_si128(state[6], key[6]);
  state[7]   = _mm_aesenc_si128(state[7], key[7]);

  transpose_state(key, rkey);

  tmp0          = _mm_shuffle_epi8(tmp0, mask);
  mmrconFinal   = _mm_slli_epi16(mmrconFinal, 0x01);

  rkey[4]    = _mm_xor_si128(rkey[4], tmp0);
  rkey[5]    = _mm_xor_si128(rkey[5], rkey[4]);
  rkey[6]    = _mm_xor_si128(rkey[6], rkey[5]);
  rkey[7]    = _mm_xor_si128(rkey[7], rkey[6]);

  _mm_prefetch((char const *)state, 0);
  transpose_state(&(key[4]), &(rkey[4]));

  tmp0    = _mm_aesenclast_si128(rkey[3], mmrconFinal);
  tmp0    = _mm_shuffle_epi8(tmp0, mask);

  state[0]   = _mm_aesenc_si128(state[0], key[0]);
  state[1]   = _mm_aesenc_si128(state[1], key[1]);
  state[2]   = _mm_aesenc_si128(state[2], key[2]);
  state[3]   = _mm_aesenc_si128(state[3], key[3]);
  state[4]   = _mm_aesenc_si128(state[4], key[4]);
  state[5]   = _mm_aesenc_si128(state[5], key[5]);

  rkey[0]    = _mm_xor_si128(rkey[0], tmp0);
  tmp0       = _mm_aesenclast_si128(rkey[7], mmrconFinal);

  rkey[1]    = _mm_xor_si128(rkey[1], rkey[0]);
  rkey[2]    = _mm_xor_si128(rkey[2], rkey[1]);
  rkey[3]    = _mm_xor_si128(rkey[3], rkey[2]);

  state[6]   = _mm_aesenc_si128(state[6], key[6]);
  state[7]   = _mm_aesenc_si128(state[7], key[7]);

  transpose_state(key, rkey);
  tmp0          = _mm_shuffle_epi8(tmp0, mask);

  state[0]   = _mm_aesenclast_si128(state[0], key[0]);
  state[1]   = _mm_aesenclast_si128(state[1], key[1]);
  state[2]   = _mm_aesenclast_si128(state[2], key[2]);
  state[3]   = _mm_aesenclast_si128(state[3], key[3]);

  rkey[4]    = _mm_xor_si128(rkey[4], tmp0);
  rkey[5]    = _mm_xor_si128(rkey[5], rkey[4]);
  rkey[6]    = _mm_xor_si128(rkey[6], rkey[5]);
  rkey[7]    = _mm_xor_si128(rkey[7], rkey[6]);

  transpose_state(&(key[4]), &(rkey[4]));

  state[4]   = _mm_aesenclast_si128(state[4], key[4]);
  state[5]   = _mm_aesenclast_si128(state[5], key[5]);
  state[6]   = _mm_aesenclast_si128(state[6], key[6]);
  state[7]   = _mm_aesenclast_si128(state[7], key[7]);

  _mm_store_si128((__m128i*)&(dataOut[  0]),  state[0]);
  _mm_store_si128((__m128i*)&(dataOut[ 16]),  state[1]);
  _mm_store_si128((__m128i*)&(dataOut[ 32]),  state[2]);
  _mm_store_si128((__m128i*)&(dataOut[ 48]),  state[3]);
  _mm_store_si128((__m128i*)&(dataOut[ 64]),  state[4]);
  _mm_store_si128((__m128i*)&(dataOut[ 80]),  state[5]);
  _mm_store_si128((__m128i*)&(dataOut[ 96]),  state[6]);
  _mm_store_si128((__m128i*)&(dataOut[112]),  state[7]);

}
Example 29
void ENC_MSG_x8(const unsigned char *PT,
                      unsigned char *CT,
                      const unsigned char *TAG,
                      const unsigned char *KS,
                      int length)
{
    __m128i or_mask, TWO, ctr_block, tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ONE;
    int i, j, remainder_loc;

    if (length % 16)
        length = length/16 + 1;
    else
        length /= 16;

    ONE = _mm_setr_epi32(1,0,0,0);
    TWO = _mm_setr_epi32(2,0,0,0);
    ctr_block = _mm_loadu_si128((__m128i*)TAG);
    or_mask = _mm_setr_epi32(0,0,0,0x80000000);
    ctr_block = _mm_or_si128(ctr_block, or_mask);

    /* main loop: eight counter blocks per iteration */
    for (i = 0; i < (length - length%8); i += 8) {
        tmp  = ctr_block;
        tmp1 = _mm_add_epi32(ctr_block, ONE);
        tmp2 = _mm_add_epi32(ctr_block, TWO);
        tmp3 = _mm_add_epi32(tmp2, ONE);
        tmp4 = _mm_add_epi32(tmp2, TWO);
        tmp5 = _mm_add_epi32(tmp4, ONE);
        tmp6 = _mm_add_epi32(tmp4, TWO);
        tmp7 = _mm_add_epi32(tmp6, ONE);
        ctr_block = _mm_add_epi32(tmp6, TWO);
        tmp  = _mm_xor_si128(tmp,  ((__m128i*)KS)[0]);
        tmp1 = _mm_xor_si128(tmp1, ((__m128i*)KS)[0]);
        tmp2 = _mm_xor_si128(tmp2, ((__m128i*)KS)[0]);
        tmp3 = _mm_xor_si128(tmp3, ((__m128i*)KS)[0]);
        tmp4 = _mm_xor_si128(tmp4, ((__m128i*)KS)[0]);
        tmp5 = _mm_xor_si128(tmp5, ((__m128i*)KS)[0]);
        tmp6 = _mm_xor_si128(tmp6, ((__m128i*)KS)[0]);
        tmp7 = _mm_xor_si128(tmp7, ((__m128i*)KS)[0]);
        for (j = 1; j < 10; j++) {
            tmp  = _mm_aesenc_si128(tmp,  ((__m128i*)KS)[j]);
            tmp1 = _mm_aesenc_si128(tmp1, ((__m128i*)KS)[j]);
            tmp2 = _mm_aesenc_si128(tmp2, ((__m128i*)KS)[j]);
            tmp3 = _mm_aesenc_si128(tmp3, ((__m128i*)KS)[j]);
            tmp4 = _mm_aesenc_si128(tmp4, ((__m128i*)KS)[j]);
            tmp5 = _mm_aesenc_si128(tmp5, ((__m128i*)KS)[j]);
            tmp6 = _mm_aesenc_si128(tmp6, ((__m128i*)KS)[j]);
            tmp7 = _mm_aesenc_si128(tmp7, ((__m128i*)KS)[j]);
        }
        tmp  = _mm_aesenclast_si128(tmp,  ((__m128i*)KS)[j]);
        tmp1 = _mm_aesenclast_si128(tmp1, ((__m128i*)KS)[j]);
        tmp2 = _mm_aesenclast_si128(tmp2, ((__m128i*)KS)[j]);
        tmp3 = _mm_aesenclast_si128(tmp3, ((__m128i*)KS)[j]);
        tmp4 = _mm_aesenclast_si128(tmp4, ((__m128i*)KS)[j]);
        tmp5 = _mm_aesenclast_si128(tmp5, ((__m128i*)KS)[j]);
        tmp6 = _mm_aesenclast_si128(tmp6, ((__m128i*)KS)[j]);
        tmp7 = _mm_aesenclast_si128(tmp7, ((__m128i*)KS)[j]);
        tmp  = _mm_xor_si128(tmp,  _mm_loadu_si128(&((__m128i*)PT)[i]));
        tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)PT)[i+1]));
        tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)PT)[i+2]));
        tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)PT)[i+3]));
        tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)PT)[i+4]));
        tmp5 = _mm_xor_si128(tmp5, _mm_loadu_si128(&((__m128i*)PT)[i+5]));
        tmp6 = _mm_xor_si128(tmp6, _mm_loadu_si128(&((__m128i*)PT)[i+6]));
        tmp7 = _mm_xor_si128(tmp7, _mm_loadu_si128(&((__m128i*)PT)[i+7]));
        _mm_storeu_si128(&((__m128i*)CT)[i],   tmp);
        _mm_storeu_si128(&((__m128i*)CT)[i+1], tmp1);
        _mm_storeu_si128(&((__m128i*)CT)[i+2], tmp2);
        _mm_storeu_si128(&((__m128i*)CT)[i+3], tmp3);
        _mm_storeu_si128(&((__m128i*)CT)[i+4], tmp4);
        _mm_storeu_si128(&((__m128i*)CT)[i+5], tmp5);
        _mm_storeu_si128(&((__m128i*)CT)[i+6], tmp6);
        _mm_storeu_si128(&((__m128i*)CT)[i+7], tmp7);
    }

    /* handle the remainder (fewer than 8 blocks) */
    if (length % 8 == 0)
        return;
    /* remainder_loc marks where the 8-wide loop stopped */
    remainder_loc = length - length%8;
    for (i = 0; i < (length % 8); i++) {
        tmp = ctr_block;
        ctr_block = _mm_add_epi32(ctr_block, ONE);
        tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]);
        for (j = 1; j < 10; j++) {
            tmp = _mm_aesenc_si128(tmp, ((__m128i*)KS)[j]);
        }
        tmp = _mm_aesenclast_si128(tmp, ((__m128i*)KS)[j]);
        tmp = _mm_xor_si128(tmp, _mm_loadu_si128(&((__m128i*)PT)[remainder_loc+i]));
        _mm_storeu_si128(&((__m128i*)CT)[remainder_loc+i], tmp);
    }
}
Example 30
/*
* AES-256 Encryption
*/
void AES_256_NI::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* in_mm = (const __m128i*)in;
   __m128i* out_mm = (__m128i*)out;

   const __m128i* key_mm = (const __m128i*)&EK[0];

   __m128i K0  = _mm_loadu_si128(key_mm);
   __m128i K1  = _mm_loadu_si128(key_mm + 1);
   __m128i K2  = _mm_loadu_si128(key_mm + 2);
   __m128i K3  = _mm_loadu_si128(key_mm + 3);
   __m128i K4  = _mm_loadu_si128(key_mm + 4);
   __m128i K5  = _mm_loadu_si128(key_mm + 5);
   __m128i K6  = _mm_loadu_si128(key_mm + 6);
   __m128i K7  = _mm_loadu_si128(key_mm + 7);
   __m128i K8  = _mm_loadu_si128(key_mm + 8);
   __m128i K9  = _mm_loadu_si128(key_mm + 9);
   __m128i K10 = _mm_loadu_si128(key_mm + 10);
   __m128i K11 = _mm_loadu_si128(key_mm + 11);
   __m128i K12 = _mm_loadu_si128(key_mm + 12);
   __m128i K13 = _mm_loadu_si128(key_mm + 13);
   __m128i K14 = _mm_loadu_si128(key_mm + 14);

   while(blocks >= 4)
      {
      __m128i B0 = _mm_loadu_si128(in_mm + 0);
      __m128i B1 = _mm_loadu_si128(in_mm + 1);
      __m128i B2 = _mm_loadu_si128(in_mm + 2);
      __m128i B3 = _mm_loadu_si128(in_mm + 3);

      B0 = _mm_xor_si128(B0, K0);
      B1 = _mm_xor_si128(B1, K0);
      B2 = _mm_xor_si128(B2, K0);
      B3 = _mm_xor_si128(B3, K0);

      AES_ENC_4_ROUNDS(K1);
      AES_ENC_4_ROUNDS(K2);
      AES_ENC_4_ROUNDS(K3);
      AES_ENC_4_ROUNDS(K4);
      AES_ENC_4_ROUNDS(K5);
      AES_ENC_4_ROUNDS(K6);
      AES_ENC_4_ROUNDS(K7);
      AES_ENC_4_ROUNDS(K8);
      AES_ENC_4_ROUNDS(K9);
      AES_ENC_4_ROUNDS(K10);
      AES_ENC_4_ROUNDS(K11);
      AES_ENC_4_ROUNDS(K12);
      AES_ENC_4_ROUNDS(K13);
      AES_ENC_4_LAST_ROUNDS(K14);

      _mm_storeu_si128(out_mm + 0, B0);
      _mm_storeu_si128(out_mm + 1, B1);
      _mm_storeu_si128(out_mm + 2, B2);
      _mm_storeu_si128(out_mm + 3, B3);

      blocks -= 4;
      in_mm += 4;
      out_mm += 4;
      }

   for(size_t i = 0; i != blocks; ++i)
      {
      __m128i B = _mm_loadu_si128(in_mm + i);

      B = _mm_xor_si128(B, K0);

      B = _mm_aesenc_si128(B, K1);
      B = _mm_aesenc_si128(B, K2);
      B = _mm_aesenc_si128(B, K3);
      B = _mm_aesenc_si128(B, K4);
      B = _mm_aesenc_si128(B, K5);
      B = _mm_aesenc_si128(B, K6);
      B = _mm_aesenc_si128(B, K7);
      B = _mm_aesenc_si128(B, K8);
      B = _mm_aesenc_si128(B, K9);
      B = _mm_aesenc_si128(B, K10);
      B = _mm_aesenc_si128(B, K11);
      B = _mm_aesenc_si128(B, K12);
      B = _mm_aesenc_si128(B, K13);
      B = _mm_aesenclast_si128(B, K14);

      _mm_storeu_si128(out_mm + i, B);
      }
   }