// XOR num_blocks keystream blocks into the input blocks and store the result.
static inline void load_xor_store_n(__m128i* out,
                                    const __m128i* in,
                                    const __m128i* states,
                                    const size_t num_blocks) {
    for (size_t i = 0; i < num_blocks; ++i) {
        out[i] = vxor(states[i], loadu(in + i));
    }
}
/*!
 * \brief Multiply the two given vectors of long, lane by lane
 */
ETL_STATIC_INLINE(avx_simd_long) mul(avx_simd_long lhs, avx_simd_long rhs) {
    // AVX/AVX2 provide no 64-bit lane-wise multiply, so fall back to scalars.
    int64_t result[4];
    result[0] = lhs[0] * rhs[0];
    result[1] = lhs[1] * rhs[1];
    result[2] = lhs[2] * rhs[2];
    result[3] = lhs[3] * rhs[3];
    return loadu(&result[0]);
}
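/*
 * Illustrative sketch (not part of ETL): the same scalar-fallback idea
 * written with raw AVX intrinsics. There is no _mm256_mullo_epi64 below
 * AVX-512DQ, so the lanes are spilled to memory, multiplied as scalars,
 * and reloaded — which is what mul() above does behind ETL's wrappers.
 * The function name is hypothetical.
 */
#include <immintrin.h>
#include <cstdint>

static inline __m256i mullo_epi64_sketch(__m256i lhs, __m256i rhs) {
    alignas(32) int64_t a[4];
    alignas(32) int64_t b[4];
    alignas(32) int64_t r[4];
    _mm256_store_si256(reinterpret_cast<__m256i*>(a), lhs);
    _mm256_store_si256(reinterpret_cast<__m256i*>(b), rhs);
    for (int i = 0; i < 4; ++i) {
        r[i] = a[i] * b[i]; // one scalar multiply per 64-bit lane
    }
    return _mm256_load_si256(reinterpret_cast<const __m256i*>(r));
}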
// Fully unrolled variant of load_xor_store_n for exactly eight blocks.
static inline void load_xor_and_store_eight(__m128i* out,
                                            const __m128i* in,
                                            const __m128i* states) {
    out[0] = vxor(states[0], loadu(in));
    out[1] = vxor(states[1], loadu(in + 1));
    out[2] = vxor(states[2], loadu(in + 2));
    out[3] = vxor(states[3], loadu(in + 3));
    out[4] = vxor(states[4], loadu(in + 4));
    out[5] = vxor(states[5], loadu(in + 5));
    out[6] = vxor(states[6], loadu(in + 6));
    out[7] = vxor(states[7], loadu(in + 7));
}
int decrypt_final(riv_context_t* ctx,
                  const unsigned char* ciphertext,
                  const unsigned long long ciphertext_length,
                  const unsigned char* header,
                  const unsigned long long header_length,
                  const unsigned char tag[TAGLEN],
                  unsigned char* plaintext) {
    // Decrypt under the IV taken from the tag, then recompute the IV from
    // the header and the recovered plaintext and compare the two.
    const __m128i iv = loadu(tag);
    decrypt(ctx, iv, plaintext, ciphertext_length, ciphertext);

    ALIGN(16) uint8_t iv_prime[BLOCKLEN];
    clhash(&(ctx->prf_context), header, header_length, DOMAIN_0,
           plaintext, ciphertext_length, iv_prime);
    const __m128i iv_prime_ = aes_encrypt(load(iv_prime), ctx->expanded_enc_key);

    // 0 if the tags agree, -1 otherwise (see the sketch below).
    return _mm_testc_si128(iv, iv_prime_) - 1;
}
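/*
 * Illustrative sketch (assumption, not part of the RIV sources): the tag
 * comparison above relies on _mm_testc_si128(a, b), which returns 1 iff
 * (~a & b) is all-zero. Subtracting 1 therefore maps "match" to 0 and
 * "mismatch" to -1 without a branch, as this standalone check shows.
 */
#include <smmintrin.h>
#include <string.h>
#include <stdio.h>

int main(void) {
    unsigned char a[16], b[16];
    memset(a, 0xAB, sizeof(a));
    memcpy(b, a, sizeof(b));

    __m128i x = _mm_loadu_si128((const __m128i*)a);
    __m128i y = _mm_loadu_si128((const __m128i*)b);
    printf("%d\n", _mm_testc_si128(x, y) - 1); // 0: tags agree

    b[0] ^= 0x04; // set a bit in y that is clear in x
    y = _mm_loadu_si128((const __m128i*)b);
    printf("%d\n", _mm_testc_si128(x, y) - 1); // -1: tags differ

    return 0;
}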
void keysetup(riv_context_t* ctx, const unsigned char key[KEYLEN]) {
    AES_KEY expanded_sk;
    aes_expand_key(loadu(key), expanded_sk);

    // Derive the block-cipher key K = E_SK(0) and expand it for Deoxys.
    const __m128i k = aes_encrypt(zero, expanded_sk);
    store(ctx->enc_key, k);
    deoxys_keysetup(ctx->expanded_key, k);

    // Derive the CLHASH key blocks as E_SK(1), E_SK(2), ...
    uint8_t prf_key[CLHASH_KEYLEN];
    const size_t num_blocks = CLHASH_KEYLEN / BLOCKLEN;
    size_t j = 0;

    for (size_t i = 1; i <= num_blocks; ++i) {
        storeu(prf_key + j, aes_encrypt(
            _mm_setr_epi8(i, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
            expanded_sk));
        j += BLOCKLEN;
    }

    clhash_keysetup(&(ctx->prf_context), prf_key);
}
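/*
 * Illustrative sketch (assumption, not part of the RIV sources): the
 * _mm_setr_epi8(i, 0, ..., 0) call above builds the i-th counter block
 * with i in the least significant byte; for i < 256 it is the same block
 * as _mm_set_epi64x(0, i). The PRF key is thus E_SK(1) || E_SK(2) || ...
 * The helper name is hypothetical.
 */
#include <emmintrin.h>
#include <assert.h>
#include <string.h>

static void check_counter_block_layout(void) {
    for (int i = 1; i < 256; ++i) {
        const __m128i a = _mm_setr_epi8((char)i, 0, 0, 0, 0, 0, 0, 0,
                                        0, 0, 0, 0, 0, 0, 0, 0);
        const __m128i b = _mm_set_epi64x(0, i);
        assert(memcmp(&a, &b, sizeof(a)) == 0);
    }
}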
/*!
 * \brief Fill a packed vector by replicating a value
 */
ETL_STATIC_INLINE(avx_simd_complex_double<etl::complex<double>>) set(etl::complex<double> value) {
    etl::complex<double> tmp[]{value, value};
    return loadu(tmp);
}
/*!
 * \brief Fill a packed vector by replicating a value
 */
ETL_STATIC_INLINE(avx_simd_complex_float<etl::complex<float>>) set(etl::complex<float> value) {
    etl::complex<float> tmp[]{value, value, value, value};
    return loadu(tmp);
}
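/*
 * Illustrative sketch (not part of ETL): the tmp-array-plus-loadu pattern
 * in the two set() overloads above is equivalent to building the register
 * directly, replicating the (real, imag) pair across all lanes with raw
 * AVX intrinsics. The function name is hypothetical.
 */
#include <immintrin.h>
#include <complex>

static inline __m256 set_complex_float_sketch(std::complex<float> value) {
    const float re = value.real();
    const float im = value.imag();
    // four copies of (re, im) fill the eight float lanes
    return _mm256_setr_ps(re, im, re, im, re, im, re, im);
}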
inline F64vec4 loadu(const double* r) {
    F64vec4 res;
    loadu(res, const_cast<REAL_T*>(r));
    return res;
}
static inline void sct_mode(riv_context_t* ctx,
                            const unsigned char iv[DEOXYS_IVLEN],
                            const __m128i* in,
                            const uint64_t length,
                            __m128i* out) {
    const __m128i* k = ctx->expanded_key;
    uint64_t len = length;

    // ---------------------------------------------------------------------
    // The nonce serves as input to each call of the block cipher.
    // ---------------------------------------------------------------------

    const __m128i n = vxor(loadu(iv), k[0]);

    // ---------------------------------------------------------------------
    // We use r+1 tweaks to store the tweaks t_0, t_1, ..., t_r for one block
    // for r rounds:
    // tweak_ctr[r][i] = pi^{r}(i)
    // tweak_ctr[r][0] = pi^{r}(0) = 0
    // In each round, we then simply have to compute the subtweakey:
    // K[r] xor pi^{r}(T) xor pi^{r}(i)
    // ---------------------------------------------------------------------

    __m128i tweak_ctrs[DEOXYS_ROUND_KEYS * 8];
    prepare_tweak_counters(tweak_ctrs);

    // ---------------------------------------------------------------------
    // T, the initial tweak.
    // We encode the domain into the four least significant bits:
    // tweak = (0001 || tag).
    // ---------------------------------------------------------------------

    const __m128i initial_tweak = set_domain_in_tweak(
        loadu(iv + BLOCKLEN), DOMAIN_ENC
    );

    __m128i tweak_ctr_base = zero;
    __m128i tweaks[15]; // The permuted tweaks for the individual rounds.
    __m128i states[8];
    uint64_t j = 0;

    while (len >= 8 * BLOCKLEN) {
        tweaks[0] = vxor(initial_tweak, tweak_ctr_base);

        for (size_t i = 1; i < 8; ++i) {
            tweaks[i] = permute_tweak(tweaks[i - 1]);
        }

        for (size_t i = 8; i <= 14; ++i) {
            tweaks[i] = tweaks[i - 8];
        }

        deoxys_enc_eight(states, tweaks, tweak_ctrs, k, n);
        load_xor_and_store_eight(out, in, states);

        len -= 8 * BLOCKLEN;
        in += 8;
        out += 8;
        j += 8;

        // Every 256-th block, we have an overflow in the first byte and
        // have to update the next highest bytes in the counter
        // (see the sketch below).
        if ((j & 0xFF) == 0) {
            add_to_tweak(tweak_ctr_base, seight);
        } else {
            // No overflow; increment only the lowest byte in the counter.
            tweak_ctr_base = vadd(tweak_ctr_base, eight);
        }
    }

    // Process the remaining (fewer than eight) blocks, including a
    // possibly partial final block.
    tweaks[0] = vxor(initial_tweak, tweak_ctr_base);

    const size_t ceil_num_blocks = ceil(len, BLOCKLEN);
    const size_t num_blocks = len / BLOCKLEN;
    const size_t last_block = len % BLOCKLEN;

    deoxys_enc_n(states, tweaks[0], tweak_ctrs, k, ceil_num_blocks, n);
    load_xor_store_n(out, in, states, num_blocks);

    if (last_block != 0) {
        in += num_blocks;
        out += num_blocks;
        store_partial(out,
                      vxor(states[num_blocks], load_partial(in, last_block)),
                      last_block);
    }
}
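/*
 * Illustrative sketch (assumption, not part of the RIV sources): why the
 * (j & 0xFF) == 0 branch above exists. The fast path adds 8 to the lowest
 * counter byte only; after 256 blocks (32 iterations of 8) that byte wraps
 * to 0, so a carry has to ripple into the higher bytes — the presumed job
 * of add_to_tweak(tweak_ctr_base, seight). A plain byte-array model of the
 * two-speed counter (both helper names here are hypothetical):
 */
#include <stdint.h>

static void add_with_carry(uint8_t ctr[16], unsigned inc) {
    unsigned carry = inc;
    for (int i = 0; i < 16 && carry != 0; ++i) { // little-endian ripple
        carry += ctr[i];
        ctr[i] = (uint8_t)carry;
        carry >>= 8;
    }
}

static void step_counter(uint8_t ctr[16], uint64_t* j) {
    *j += 8;
    if ((*j & 0xFF) == 0) {
        add_with_carry(ctr, 8);          // slow path: low byte wraps to 0
    } else {
        ctr[0] = (uint8_t)(ctr[0] + 8);  // fast path: low byte only
    }
}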