void Threefish_512::encrypt_n(const byte in[], byte out[], size_t blocks) const { BOTAN_ASSERT(m_K.size() == 9, "Key was set"); BOTAN_ASSERT(m_T.size() == 3, "Tweak was set"); for(size_t i = 0; i != blocks; ++i) { u64bit X0 = load_le<u64bit>(in, 0); u64bit X1 = load_le<u64bit>(in, 1); u64bit X2 = load_le<u64bit>(in, 2); u64bit X3 = load_le<u64bit>(in, 3); u64bit X4 = load_le<u64bit>(in, 4); u64bit X5 = load_le<u64bit>(in, 5); u64bit X6 = load_le<u64bit>(in, 6); u64bit X7 = load_le<u64bit>(in, 7); THREEFISH_INJECT_KEY(0); THREEFISH_ENC_8_ROUNDS(1,2); THREEFISH_ENC_8_ROUNDS(3,4); THREEFISH_ENC_8_ROUNDS(5,6); THREEFISH_ENC_8_ROUNDS(7,8); THREEFISH_ENC_8_ROUNDS(9,10); THREEFISH_ENC_8_ROUNDS(11,12); THREEFISH_ENC_8_ROUNDS(13,14); THREEFISH_ENC_8_ROUNDS(15,16); THREEFISH_ENC_8_ROUNDS(17,18); store_le(out, X0, X1, X2, X3, X4, X5, X6, X7); in += 64; out += 64; } }
void Threefish_512::encrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { BOTAN_ASSERT(m_K.size() == 9, "Key was set"); BOTAN_ASSERT(m_T.size() == 3, "Tweak was set"); #if defined(BOTAN_HAS_THREEFISH_512_AVX2) if(CPUID::has_avx2()) { return avx2_encrypt_n(in, out, blocks); } #endif BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { uint64_t X0, X1, X2, X3, X4, X5, X6, X7; load_le(in + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); THREEFISH_INJECT_KEY(0); THREEFISH_ENC_8_ROUNDS(1,2); THREEFISH_ENC_8_ROUNDS(3,4); THREEFISH_ENC_8_ROUNDS(5,6); THREEFISH_ENC_8_ROUNDS(7,8); THREEFISH_ENC_8_ROUNDS(9,10); THREEFISH_ENC_8_ROUNDS(11,12); THREEFISH_ENC_8_ROUNDS(13,14); THREEFISH_ENC_8_ROUNDS(15,16); THREEFISH_ENC_8_ROUNDS(17,18); store_le(out + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); } }
void Threefish_512::skein_feedfwd(const secure_vector<u64bit>& M, const secure_vector<u64bit>& T) { BOTAN_ASSERT(m_K.size() == 9, "Key was set"); BOTAN_ASSERT(M.size() == 8, "Single block"); m_T[0] = T[0]; m_T[1] = T[1]; m_T[2] = T[0] ^ T[1]; u64bit X0 = M[0]; u64bit X1 = M[1]; u64bit X2 = M[2]; u64bit X3 = M[3]; u64bit X4 = M[4]; u64bit X5 = M[5]; u64bit X6 = M[6]; u64bit X7 = M[7]; THREEFISH_INJECT_KEY(0); THREEFISH_ENC_8_ROUNDS(1,2); THREEFISH_ENC_8_ROUNDS(3,4); THREEFISH_ENC_8_ROUNDS(5,6); THREEFISH_ENC_8_ROUNDS(7,8); THREEFISH_ENC_8_ROUNDS(9,10); THREEFISH_ENC_8_ROUNDS(11,12); THREEFISH_ENC_8_ROUNDS(13,14); THREEFISH_ENC_8_ROUNDS(15,16); THREEFISH_ENC_8_ROUNDS(17,18); m_K[0] = M[0] ^ X0; m_K[1] = M[1] ^ X1; m_K[2] = M[2] ^ X2; m_K[3] = M[3] ^ X3; m_K[4] = M[4] ^ X4; m_K[5] = M[5] ^ X5; m_K[6] = M[6] ^ X6; m_K[7] = M[7] ^ X7; m_K[8] = m_K[0] ^ m_K[1] ^ m_K[2] ^ m_K[3] ^ m_K[4] ^ m_K[5] ^ m_K[6] ^ m_K[7] ^ 0x1BD11BDAA9FC1A22; }
void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const { const u64bit* K = &get_K()[0]; const u64bit* T_64 = &get_T()[0]; const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46); const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33); const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17); const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44); const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39); const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13); const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25); const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8); #define THREEFISH_ROUND(X0, X1, SHL) \ do { \ const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ X0 = _mm256_add_epi64(X0, X1); \ X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ X1 = _mm256_xor_si256(X1, X0); \ X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ } while(0) #define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL) \ do { \ const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL); \ X0 = _mm256_add_epi64(X0, X1); \ X2 = _mm256_add_epi64(X2, X3); \ X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \ X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \ X1 = _mm256_xor_si256(X1, X0); \ X3 = _mm256_xor_si256(X3, X2); \ X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1)); \ X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1)); \ X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0)); \ X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0)); \ } while(0) #define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I) \ do { \ const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ X0 = _mm256_add_epi64(X0, K0); \ X1 = _mm256_add_epi64(X1, K1); \ X1 = _mm256_add_epi64(X1, R); \ X0 = _mm256_add_epi64(X0, T0); \ X1 = _mm256_add_epi64(X1, T1); \ R = _mm256_add_epi64(R, ONE); \ } while(0) #define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I) \ do { \ const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \ __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \ X0 = _mm256_add_epi64(X0, K0); \ X2 = _mm256_add_epi64(X2, K0); \ X1 = _mm256_add_epi64(X1, K1); \ X3 = _mm256_add_epi64(X3, K1); \ T1 = _mm256_add_epi64(T1, R); \ X0 = _mm256_add_epi64(X0, T0); \ X2 = _mm256_add_epi64(X2, T0); \ X1 = _mm256_add_epi64(X1, T1); \ X3 = _mm256_add_epi64(X3, T1); \ R = _mm256_add_epi64(R, ONE); \ } while(0) #define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2) \ do { \ THREEFISH_ROUND(X0, X1, ROTATE_1); \ THREEFISH_ROUND(X0, X1, ROTATE_2); \ THREEFISH_ROUND(X0, X1, ROTATE_3); \ THREEFISH_ROUND(X0, X1, ROTATE_4); \ THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1); \ \ THREEFISH_ROUND(X0, X1, ROTATE_5); \ THREEFISH_ROUND(X0, X1, ROTATE_6); \ THREEFISH_ROUND(X0, X1, ROTATE_7); \ THREEFISH_ROUND(X0, X1, ROTATE_8); \ THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0); \ } while(0) #define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \ do { \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1); \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2); \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3); \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4); \ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1); \ \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5); \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6); \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7); \ THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8); \ THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0); \ } while(0) /* v1.0 key schedule: 9 ymm registers (only need 2 or 3) (0,1,2,3),(4,5,6,7) [8] then mutating with vpermq */ const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]); const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]); const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]); const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]); const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]); const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]); const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]); const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]); const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]); const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0); const __m256i* in_mm = reinterpret_cast<const __m256i*>(in); __m256i* out_mm = reinterpret_cast<__m256i*>(out); while(blocks >= 2) { __m256i X0 = _mm256_loadu_si256(in_mm++); __m256i X1 = _mm256_loadu_si256(in_mm++); __m256i X2 = _mm256_loadu_si256(in_mm++); __m256i X3 = _mm256_loadu_si256(in_mm++); const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); __m256i R = _mm256_set_epi64x(0, 0, 0, 0); interleave_epi64(X0, X1); interleave_epi64(X2, X3); THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1); THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2); deinterleave_epi64(X0, X1); deinterleave_epi64(X2, X3); _mm256_storeu_si256(out_mm++, X0); _mm256_storeu_si256(out_mm++, X1); _mm256_storeu_si256(out_mm++, X2); _mm256_storeu_si256(out_mm++, X3); blocks -= 2; } for(size_t i = 0; i != blocks; ++i) { __m256i X0 = _mm256_loadu_si256(in_mm++); __m256i X1 = _mm256_loadu_si256(in_mm++); const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0); __m256i R = _mm256_set_epi64x(0, 0, 0, 0); interleave_epi64(X0, X1); THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1); THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2); deinterleave_epi64(X0, X1); _mm256_storeu_si256(out_mm++, X0); _mm256_storeu_si256(out_mm++, X1); } #undef THREEFISH_ENC_8_ROUNDS #undef THREEFISH_ROUND #undef THREEFISH_INJECT_KEY #undef THREEFISH_ENC_2_8_ROUNDS #undef THREEFISH_ROUND_2 #undef THREEFISH_INJECT_KEY_2 }
void Threefish_512::decrypt_n(const byte in[], byte out[], size_t blocks) const { BOTAN_ASSERT(m_K.size() == 9, "Key was set"); BOTAN_ASSERT(m_T.size() == 3, "Tweak was set"); #define THREEFISH_ROUND(X0,X1,X2,X3,X4,X5,X6,X7,ROT1,ROT2,ROT3,ROT4) \ do { \ X4 ^= X0; \ X5 ^= X1; \ X6 ^= X2; \ X7 ^= X3; \ X4 = rotate_right(X4, ROT1); \ X5 = rotate_right(X5, ROT2); \ X6 = rotate_right(X6, ROT3); \ X7 = rotate_right(X7, ROT4); \ X0 -= X4; \ X1 -= X5; \ X2 -= X6; \ X3 -= X7; \ } while(0) #define THREEFISH_INJECT_KEY(r) \ do { \ X0 -= m_K[(r ) % 9]; \ X1 -= m_K[(r+1) % 9]; \ X2 -= m_K[(r+2) % 9]; \ X3 -= m_K[(r+3) % 9]; \ X4 -= m_K[(r+4) % 9]; \ X5 -= m_K[(r+5) % 9] + m_T[(r ) % 3]; \ X6 -= m_K[(r+6) % 9] + m_T[(r+1) % 3]; \ X7 -= m_K[(r+7) % 9] + (r); \ } while(0) #define THREEFISH_DEC_8_ROUNDS(R1,R2) \ do { \ THREEFISH_ROUND(X6,X0,X2,X4, X1,X7,X5,X3, 8,35,56,22); \ THREEFISH_ROUND(X4,X6,X0,X2, X1,X3,X5,X7, 25,29,39,43); \ THREEFISH_ROUND(X2,X4,X6,X0, X1,X7,X5,X3, 13,50,10,17); \ THREEFISH_ROUND(X0,X2,X4,X6, X1,X3,X5,X7, 39,30,34,24); \ THREEFISH_INJECT_KEY(R1); \ \ THREEFISH_ROUND(X6,X0,X2,X4, X1,X7,X5,X3, 44, 9,54,56); \ THREEFISH_ROUND(X4,X6,X0,X2, X1,X3,X5,X7, 17,49,36,39); \ THREEFISH_ROUND(X2,X4,X6,X0, X1,X7,X5,X3, 33,27,14,42); \ THREEFISH_ROUND(X0,X2,X4,X6, X1,X3,X5,X7, 46,36,19,37); \ THREEFISH_INJECT_KEY(R2); \ } while(0) for(size_t i = 0; i != blocks; ++i) { u64bit X0 = load_le<u64bit>(in, 0); u64bit X1 = load_le<u64bit>(in, 1); u64bit X2 = load_le<u64bit>(in, 2); u64bit X3 = load_le<u64bit>(in, 3); u64bit X4 = load_le<u64bit>(in, 4); u64bit X5 = load_le<u64bit>(in, 5); u64bit X6 = load_le<u64bit>(in, 6); u64bit X7 = load_le<u64bit>(in, 7); THREEFISH_INJECT_KEY(18); THREEFISH_DEC_8_ROUNDS(17,16); THREEFISH_DEC_8_ROUNDS(15,14); THREEFISH_DEC_8_ROUNDS(13,12); THREEFISH_DEC_8_ROUNDS(11,10); THREEFISH_DEC_8_ROUNDS(9,8); THREEFISH_DEC_8_ROUNDS(7,6); THREEFISH_DEC_8_ROUNDS(5,4); THREEFISH_DEC_8_ROUNDS(3,2); THREEFISH_DEC_8_ROUNDS(1,0); store_le(out, X0, X1, X2, X3, X4, X5, X6, X7); in += 64; out += 64; } #undef THREEFISH_DEC_8_ROUNDS #undef THREEFISH_INJECT_KEY #undef THREEFISH_ROUND }
void Threefish_512::decrypt_n(const uint8_t in[], uint8_t out[], size_t blocks) const { BOTAN_ASSERT(m_K.size() == 9, "Key was set"); BOTAN_ASSERT(m_T.size() == 3, "Tweak was set"); #if defined(BOTAN_HAS_THREEFISH_512_AVX2) if(CPUID::has_avx2()) { return avx2_decrypt_n(in, out, blocks); } #endif #define THREEFISH_ROUND(X0,X1,X2,X3,X4,X5,X6,X7,ROT1,ROT2,ROT3,ROT4) \ do { \ X4 ^= X0; \ X5 ^= X1; \ X6 ^= X2; \ X7 ^= X3; \ X4 = rotate_right(X4, ROT1); \ X5 = rotate_right(X5, ROT2); \ X6 = rotate_right(X6, ROT3); \ X7 = rotate_right(X7, ROT4); \ X0 -= X4; \ X1 -= X5; \ X2 -= X6; \ X3 -= X7; \ } while(0) #define THREEFISH_INJECT_KEY(r) \ do { \ X0 -= m_K[(r ) % 9]; \ X1 -= m_K[(r+1) % 9]; \ X2 -= m_K[(r+2) % 9]; \ X3 -= m_K[(r+3) % 9]; \ X4 -= m_K[(r+4) % 9]; \ X5 -= m_K[(r+5) % 9] + m_T[(r ) % 3]; \ X6 -= m_K[(r+6) % 9] + m_T[(r+1) % 3]; \ X7 -= m_K[(r+7) % 9] + (r); \ } while(0) #define THREEFISH_DEC_8_ROUNDS(R1,R2) \ do { \ THREEFISH_ROUND(X6,X0,X2,X4, X1,X7,X5,X3, 8,35,56,22); \ THREEFISH_ROUND(X4,X6,X0,X2, X1,X3,X5,X7, 25,29,39,43); \ THREEFISH_ROUND(X2,X4,X6,X0, X1,X7,X5,X3, 13,50,10,17); \ THREEFISH_ROUND(X0,X2,X4,X6, X1,X3,X5,X7, 39,30,34,24); \ THREEFISH_INJECT_KEY(R1); \ \ THREEFISH_ROUND(X6,X0,X2,X4, X1,X7,X5,X3, 44, 9,54,56); \ THREEFISH_ROUND(X4,X6,X0,X2, X1,X3,X5,X7, 17,49,36,39); \ THREEFISH_ROUND(X2,X4,X6,X0, X1,X7,X5,X3, 33,27,14,42); \ THREEFISH_ROUND(X0,X2,X4,X6, X1,X3,X5,X7, 46,36,19,37); \ THREEFISH_INJECT_KEY(R2); \ } while(0) BOTAN_PARALLEL_FOR(size_t i = 0; i < blocks; ++i) { uint64_t X0, X1, X2, X3, X4, X5, X6, X7; load_le(in + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); THREEFISH_INJECT_KEY(18); THREEFISH_DEC_8_ROUNDS(17,16); THREEFISH_DEC_8_ROUNDS(15,14); THREEFISH_DEC_8_ROUNDS(13,12); THREEFISH_DEC_8_ROUNDS(11,10); THREEFISH_DEC_8_ROUNDS(9,8); THREEFISH_DEC_8_ROUNDS(7,6); THREEFISH_DEC_8_ROUNDS(5,4); THREEFISH_DEC_8_ROUNDS(3,2); THREEFISH_DEC_8_ROUNDS(1,0); store_le(out + BLOCK_SIZE*i, X0, X1, X2, X3, X4, X5, X6, X7); } #undef THREEFISH_DEC_8_ROUNDS #undef THREEFISH_INJECT_KEY #undef THREEFISH_ROUND }