void ChaCha_Policy<R>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; while (iterationCount--) { x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7]; x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11]; x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15]; for (int i = static_cast<int>(ROUNDS); i > 0; i -= 2) { CHACHA_QUARTER_ROUND(x0, x4, x8, x12); CHACHA_QUARTER_ROUND(x1, x5, x9, x13); CHACHA_QUARTER_ROUND(x2, x6, x10, x14); CHACHA_QUARTER_ROUND(x3, x7, x11, x15); CHACHA_QUARTER_ROUND(x0, x5, x10, x15); CHACHA_QUARTER_ROUND(x1, x6, x11, x12); CHACHA_QUARTER_ROUND(x2, x7, x8, x13); CHACHA_QUARTER_ROUND(x3, x4, x9, x14); } #undef CHACHA_OUTPUT #define CHACHA_OUTPUT(x){\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 0, x0 + m_state[0]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 1, x1 + m_state[1]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 2, x2 + m_state[2]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 3, x3 + m_state[3]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 4, x4 + m_state[4]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 5, x5 + m_state[5]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 6, x6 + m_state[6]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 7, x7 + m_state[7]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 8, x8 + m_state[8]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 9, x9 + m_state[9]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 10, x10 + m_state[10]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 11, x11 + m_state[11]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 12, x12 + m_state[12]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 13, x13 + m_state[13]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 14, x14 + m_state[14]);\ CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, LITTLE_ENDIAN_ORDER, 15, x15 + m_state[15]);} #ifndef CRYPTOPP_DOXYGEN_PROCESSING CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION); #endif ++m_state[12]; m_state[13] += static_cast<word32>(m_state[12] == 0); } }
void ChaCha::chacha(byte output[64], const u32bit input[16]) { u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3], x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7], x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11], x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15]; #define CHACHA_QUARTER_ROUND(a, b, c, d) \ do { \ a += b; d ^= a; d = rotate_left(d, 16); \ c += d; b ^= c; b = rotate_left(b, 12); \ a += b; d ^= a; d = rotate_left(d, 8); \ c += d; b ^= c; b = rotate_left(b, 7); \ } while(0) for(size_t i = 0; i != 10; ++i) { CHACHA_QUARTER_ROUND(x00, x04, x08, x12); CHACHA_QUARTER_ROUND(x01, x05, x09, x13); CHACHA_QUARTER_ROUND(x02, x06, x10, x14); CHACHA_QUARTER_ROUND(x03, x07, x11, x15); CHACHA_QUARTER_ROUND(x00, x05, x10, x15); CHACHA_QUARTER_ROUND(x01, x06, x11, x12); CHACHA_QUARTER_ROUND(x02, x07, x08, x13); CHACHA_QUARTER_ROUND(x03, x04, x09, x14); } #undef CHACHA_QUARTER_ROUND store_le(x00 + input[ 0], output + 4 * 0); store_le(x01 + input[ 1], output + 4 * 1); store_le(x02 + input[ 2], output + 4 * 2); store_le(x03 + input[ 3], output + 4 * 3); store_le(x04 + input[ 4], output + 4 * 4); store_le(x05 + input[ 5], output + 4 * 5); store_le(x06 + input[ 6], output + 4 * 6); store_le(x07 + input[ 7], output + 4 * 7); store_le(x08 + input[ 8], output + 4 * 8); store_le(x09 + input[ 9], output + 4 * 9); store_le(x10 + input[10], output + 4 * 10); store_le(x11 + input[11], output + 4 * 11); store_le(x12 + input[12], output + 4 * 12); store_le(x13 + input[13], output + 4 * 13); store_le(x14 + input[14], output + 4 * 14); store_le(x15 + input[15], output + 4 * 15); }
//static void ChaCha::chacha_x4(byte output[64*4], u32bit input[16], size_t rounds) { BOTAN_ASSERT(rounds % 2 == 0, "Valid rounds"); #if defined(BOTAN_HAS_CHACHA_SSE2) if(CPUID::has_sse2()) { return ChaCha::chacha_sse2_x4(output, input, rounds); } #endif // TODO interleave rounds for(size_t i = 0; i != 4; ++i) { u32bit x00 = input[ 0], x01 = input[ 1], x02 = input[ 2], x03 = input[ 3], x04 = input[ 4], x05 = input[ 5], x06 = input[ 6], x07 = input[ 7], x08 = input[ 8], x09 = input[ 9], x10 = input[10], x11 = input[11], x12 = input[12], x13 = input[13], x14 = input[14], x15 = input[15]; #define CHACHA_QUARTER_ROUND(a, b, c, d) \ do { \ a += b; d ^= a; d = rotate_left(d, 16); \ c += d; b ^= c; b = rotate_left(b, 12); \ a += b; d ^= a; d = rotate_left(d, 8); \ c += d; b ^= c; b = rotate_left(b, 7); \ } while(0) for(size_t r = 0; r != rounds / 2; ++r) { CHACHA_QUARTER_ROUND(x00, x04, x08, x12); CHACHA_QUARTER_ROUND(x01, x05, x09, x13); CHACHA_QUARTER_ROUND(x02, x06, x10, x14); CHACHA_QUARTER_ROUND(x03, x07, x11, x15); CHACHA_QUARTER_ROUND(x00, x05, x10, x15); CHACHA_QUARTER_ROUND(x01, x06, x11, x12); CHACHA_QUARTER_ROUND(x02, x07, x08, x13); CHACHA_QUARTER_ROUND(x03, x04, x09, x14); } #undef CHACHA_QUARTER_ROUND x00 += input[0]; x01 += input[1]; x02 += input[2]; x03 += input[3]; x04 += input[4]; x05 += input[5]; x06 += input[6]; x07 += input[7]; x08 += input[8]; x09 += input[9]; x10 += input[10]; x11 += input[11]; x12 += input[12]; x13 += input[13]; x14 += input[14]; x15 += input[15]; store_le(x00, output + 64 * i + 4 * 0); store_le(x01, output + 64 * i + 4 * 1); store_le(x02, output + 64 * i + 4 * 2); store_le(x03, output + 64 * i + 4 * 3); store_le(x04, output + 64 * i + 4 * 4); store_le(x05, output + 64 * i + 4 * 5); store_le(x06, output + 64 * i + 4 * 6); store_le(x07, output + 64 * i + 4 * 7); store_le(x08, output + 64 * i + 4 * 8); store_le(x09, output + 64 * i + 4 * 9); store_le(x10, output + 64 * i + 4 * 10); store_le(x11, output + 64 * i + 4 * 11); store_le(x12, output + 64 * i + 4 * 12); store_le(x13, output + 64 * i + 4 * 13); store_le(x14, output + 64 * i + 4 * 14); store_le(x15, output + 64 * i + 4 * 15); input[12]++; input[13] += input[12] < i; // carry? } }
// OperateKeystream always produces a key stream. The key stream is written // to output. Optionally a message may be supplied to xor with the key stream. // The message is input, and output = output ^ input. void ChaCha_Policy::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount) { do { #if (CRYPTOPP_AVX2_AVAILABLE) if (HasAVX2()) { while (iterationCount >= 8 && MultiBlockSafe(8)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_AVX2(m_state, xorInput ? input : NULLPTR, output, m_rounds); // MultiBlockSafe avoids overflow on the counter words m_state[12] += 8; //if (m_state[12] < 8) // m_state[13]++; input += (!!xorInput) * 8 * BYTES_PER_ITERATION; output += 8 * BYTES_PER_ITERATION; iterationCount -= 8; } } #endif #if (CRYPTOPP_SSE2_INTRIN_AVAILABLE || CRYPTOPP_SSE2_ASM_AVAILABLE) if (HasSSE2()) { while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_SSE2(m_state, xorInput ? input : NULLPTR, output, m_rounds); // MultiBlockSafe avoids overflow on the counter words m_state[12] += 4; //if (m_state[12] < 4) // m_state[13]++; input += (!!xorInput)*4*BYTES_PER_ITERATION; output += 4*BYTES_PER_ITERATION; iterationCount -= 4; } } #endif #if (CRYPTOPP_ARM_NEON_AVAILABLE) if (HasNEON()) { while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_NEON(m_state, xorInput ? input : NULLPTR, output, m_rounds); // MultiBlockSafe avoids overflow on the counter words m_state[12] += 4; //if (m_state[12] < 4) // m_state[13]++; input += (!!xorInput)*4*BYTES_PER_ITERATION; output += 4*BYTES_PER_ITERATION; iterationCount -= 4; } } #endif #if (CRYPTOPP_ALTIVEC_AVAILABLE) if (HasAltivec()) { while (iterationCount >= 4 && MultiBlockSafe(4)) { const bool xorInput = (operation & INPUT_NULL) != INPUT_NULL; ChaCha_OperateKeystream_POWER7(m_state, xorInput ? input : NULLPTR, output, m_rounds); // MultiBlockSafe avoids overflow on the counter words m_state[12] += 4; //if (m_state[12] < 4) // m_state[13]++; input += (!!xorInput)*4*BYTES_PER_ITERATION; output += 4*BYTES_PER_ITERATION; iterationCount -= 4; } } #endif if (iterationCount) { word32 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12, x13, x14, x15; x0 = m_state[0]; x1 = m_state[1]; x2 = m_state[2]; x3 = m_state[3]; x4 = m_state[4]; x5 = m_state[5]; x6 = m_state[6]; x7 = m_state[7]; x8 = m_state[8]; x9 = m_state[9]; x10 = m_state[10]; x11 = m_state[11]; x12 = m_state[12]; x13 = m_state[13]; x14 = m_state[14]; x15 = m_state[15]; for (int i = static_cast<int>(m_rounds); i > 0; i -= 2) { CHACHA_QUARTER_ROUND(x0, x4, x8, x12); CHACHA_QUARTER_ROUND(x1, x5, x9, x13); CHACHA_QUARTER_ROUND(x2, x6, x10, x14); CHACHA_QUARTER_ROUND(x3, x7, x11, x15); CHACHA_QUARTER_ROUND(x0, x5, x10, x15); CHACHA_QUARTER_ROUND(x1, x6, x11, x12); CHACHA_QUARTER_ROUND(x2, x7, x8, x13); CHACHA_QUARTER_ROUND(x3, x4, x9, x14); } CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(CHACHA_OUTPUT, BYTES_PER_ITERATION); if (++m_state[12] == 0) m_state[13]++; } // We may re-enter a SIMD keystream operation from here. } while (iterationCount--); }