/*
 * ECB-encrypt the whole AES blocks contained in the first `len` bytes of
 * `from`, writing the result to `to`.
 *
 *   rounds        round count; the helpers are invoked with rounds - 1
 *   key_schedule  passed through unchanged to aesni_enc8()/aesni_enc()
 *   len           byte length; only len / AES_BLOCK_LEN whole blocks are
 *                 processed, any trailing partial block is left untouched
 *   from, to      input and output byte buffers; no alignment is required
 *
 * Blocks are processed eight at a time while possible, then one at a time.
 */
void
aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len,
    const uint8_t *from, uint8_t *to)
{
	const __m128i *blocks;
	__m128i *top;
	__m128i tot;
	__m128i tout[8];
	size_t i, cnt;

	cnt = len / AES_BLOCK_LEN / 8;
	for (i = 0; i < cnt; i++) {
		blocks = (const __m128i *)from;
		top = (__m128i *)to;
		/*
		 * from/to are byte pointers with no alignment guarantee, so
		 * use explicit unaligned loads/stores (as the single-block
		 * loop below does) and let aesni_enc8() write into an
		 * aligned local array.
		 */
		aesni_enc8(rounds - 1, key_schedule,
		    _mm_loadu_si128(&blocks[0]), _mm_loadu_si128(&blocks[1]),
		    _mm_loadu_si128(&blocks[2]), _mm_loadu_si128(&blocks[3]),
		    _mm_loadu_si128(&blocks[4]), _mm_loadu_si128(&blocks[5]),
		    _mm_loadu_si128(&blocks[6]), _mm_loadu_si128(&blocks[7]),
		    tout);
		_mm_storeu_si128(&top[0], tout[0]);
		_mm_storeu_si128(&top[1], tout[1]);
		_mm_storeu_si128(&top[2], tout[2]);
		_mm_storeu_si128(&top[3], tout[3]);
		_mm_storeu_si128(&top[4], tout[4]);
		_mm_storeu_si128(&top[5], tout[5]);
		_mm_storeu_si128(&top[6], tout[6]);
		_mm_storeu_si128(&top[7], tout[7]);
		from += AES_BLOCK_LEN * 8;
		to += AES_BLOCK_LEN * 8;
	}
	/* Convert the 8-block iteration count into a single-block index. */
	i *= 8;
	cnt = len / AES_BLOCK_LEN;
	for (; i < cnt; i++) {
		tot = aesni_enc(rounds - 1, key_schedule,
		    _mm_loadu_si128((const __m128i *)from));
		_mm_storeu_si128((__m128i *)to, tot);
		from += AES_BLOCK_LEN;
		to += AES_BLOCK_LEN;
	}
}
static void aesni_crypt_xts_block8(int rounds, const __m128i *key_schedule, __m128i *tweak, const uint8_t *from, uint8_t *to, int do_encrypt) { __m128i tmptweak; __m128i a, b, c, d, e, f, g, h; __m128i tweaks[8]; __m128i tmp[8]; __m128i *top; const __m128i *fromp; tmptweak = *tweak; /* * unroll the loop. This lets gcc put values directly in the * register and saves memory accesses. */ fromp = (const __m128i *)from; #define PREPINP(v, pos) \ do { \ tweaks[(pos)] = tmptweak; \ (v) = _mm_loadu_si128(&fromp[pos]) ^ \ tmptweak; \ tmptweak = xts_crank_lfsr(tmptweak); \ } while (0) PREPINP(a, 0); PREPINP(b, 1); PREPINP(c, 2); PREPINP(d, 3); PREPINP(e, 4); PREPINP(f, 5); PREPINP(g, 6); PREPINP(h, 7); *tweak = tmptweak; if (do_encrypt) aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, tmp); else aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, tmp); top = (__m128i *)to; _mm_storeu_si128(&top[0], tmp[0] ^ tweaks[0]); _mm_storeu_si128(&top[1], tmp[1] ^ tweaks[1]); _mm_storeu_si128(&top[2], tmp[2] ^ tweaks[2]); _mm_storeu_si128(&top[3], tmp[3] ^ tweaks[3]); _mm_storeu_si128(&top[4], tmp[4] ^ tweaks[4]); _mm_storeu_si128(&top[5], tmp[5] ^ tweaks[5]); _mm_storeu_si128(&top[6], tmp[6] ^ tweaks[6]); _mm_storeu_si128(&top[7], tmp[7] ^ tweaks[7]); }
static void aesni_crypt_xts_block8(int rounds, const void *key_schedule, __m128i *tweak, const __m128i *from, __m128i *to, int do_encrypt) { __m128i tmptweak; __m128i a, b, c, d, e, f, g, h; __m128i tweaks[8]; __m128i tmp[8]; tmptweak = *tweak; /* * unroll the loop. This lets gcc put values directly in the * register and saves memory accesses. */ #define PREPINP(v, pos) \ do { \ tweaks[(pos)] = tmptweak; \ (v) = from[(pos)] ^ tmptweak; \ tmptweak = xts_crank_lfsr(tmptweak); \ } while (0) PREPINP(a, 0); PREPINP(b, 1); PREPINP(c, 2); PREPINP(d, 3); PREPINP(e, 4); PREPINP(f, 5); PREPINP(g, 6); PREPINP(h, 7); *tweak = tmptweak; if (do_encrypt) aesni_enc8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, tmp); else aesni_dec8(rounds - 1, key_schedule, a, b, c, d, e, f, g, h, tmp); to[0] = tmp[0] ^ tweaks[0]; to[1] = tmp[1] ^ tweaks[1]; to[2] = tmp[2] ^ tweaks[2]; to[3] = tmp[3] ^ tweaks[3]; to[4] = tmp[4] ^ tweaks[4]; to[5] = tmp[5] ^ tweaks[5]; to[6] = tmp[6] ^ tweaks[6]; to[7] = tmp[7] ^ tweaks[7]; }
void aesni_encrypt_ecb(int rounds, const void *key_schedule, size_t len, const uint8_t *from, uint8_t *to) { __m128i tot; __m128i tout[8]; struct blocks8 *top; const struct blocks8 *blks; size_t i, cnt; cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { blks = (const struct blocks8 *)from; top = (struct blocks8 *)to; aesni_enc8(rounds - 1, key_schedule, blks->blk[0], blks->blk[1], blks->blk[2], blks->blk[3], blks->blk[4], blks->blk[5], blks->blk[6], blks->blk[7], tout); top->blk[0] = tout[0]; top->blk[1] = tout[1]; top->blk[2] = tout[2]; top->blk[3] = tout[3]; top->blk[4] = tout[4]; top->blk[5] = tout[5]; top->blk[6] = tout[6]; top->blk[7] = tout[7]; from += AES_BLOCK_LEN * 8; to += AES_BLOCK_LEN * 8; } i *= 8; cnt = len / AES_BLOCK_LEN; for (; i < cnt; i++) { tot = aesni_enc(rounds - 1, key_schedule, _mm_loadu_si128((const __m128i *)from)); _mm_storeu_si128((__m128i *)to, tot); from += AES_BLOCK_LEN; to += AES_BLOCK_LEN; } }
void aesni_encrypt_icm(int rounds, const void *key_schedule, size_t len, const uint8_t *from, uint8_t *to, const uint8_t iv[AES_BLOCK_LEN]) { __m128i tot; __m128i tmp1, tmp2, tmp3, tmp4; __m128i tmp5, tmp6, tmp7, tmp8; __m128i ctr1, ctr2, ctr3, ctr4; __m128i ctr5, ctr6, ctr7, ctr8; __m128i BSWAP_EPI64; __m128i tout[8]; struct blocks8 *top; const struct blocks8 *blks; size_t i, cnt; BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); ctr1 = _mm_loadu_si128((__m128i*)iv); ctr1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); cnt = len / AES_BLOCK_LEN / 8; for (i = 0; i < cnt; i++) { tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); ctr2 = nextc(ctr1); tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); ctr3 = nextc(ctr2); tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); ctr4 = nextc(ctr3); tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); ctr5 = nextc(ctr4); tmp5 = _mm_shuffle_epi8(ctr5, BSWAP_EPI64); ctr6 = nextc(ctr5); tmp6 = _mm_shuffle_epi8(ctr6, BSWAP_EPI64); ctr7 = nextc(ctr6); tmp7 = _mm_shuffle_epi8(ctr7, BSWAP_EPI64); ctr8 = nextc(ctr7); tmp8 = _mm_shuffle_epi8(ctr8, BSWAP_EPI64); ctr1 = nextc(ctr8); blks = (const struct blocks8 *)from; top = (struct blocks8 *)to; aesni_enc8(rounds - 1, key_schedule, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, tmp8, tout); top->blk[0] = blks->blk[0] ^ tout[0]; top->blk[1] = blks->blk[1] ^ tout[1]; top->blk[2] = blks->blk[2] ^ tout[2]; top->blk[3] = blks->blk[3] ^ tout[3]; top->blk[4] = blks->blk[4] ^ tout[4]; top->blk[5] = blks->blk[5] ^ tout[5]; top->blk[6] = blks->blk[6] ^ tout[6]; top->blk[7] = blks->blk[7] ^ tout[7]; from += AES_BLOCK_LEN * 8; to += AES_BLOCK_LEN * 8; } i *= 8; cnt = len / AES_BLOCK_LEN; for (; i < cnt; i++) { tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); ctr1 = nextc(ctr1); tot = aesni_enc(rounds - 1, key_schedule, tmp1); tot = tot ^ _mm_loadu_si128((const __m128i *)from); _mm_storeu_si128((__m128i *)to, tot); from += AES_BLOCK_LEN; to += AES_BLOCK_LEN; } /* handle remaining partial round */ if (len % AES_BLOCK_LEN != 0) { 
tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); tot = aesni_enc(rounds - 1, key_schedule, tmp1); tot = tot ^ _mm_loadu_si128((const __m128i *)from); memcpy(to, &tot, len % AES_BLOCK_LEN); } }