__m128i mul_check(void) { __m128i x; //x = _mm_set_epi64x(0x0f0e0d0c0b0a0908, 0x0706050403020100); x = _mm_set_epi64x(0x0000000000000000, 0x00000000000000f0); //x = _mm_set_epi64x(0x0f0e0d030b0a0908, 0x0706050403020100); x = M128(x); return x; }
/*
 * ECHO compression function: absorbs uBlockCount message blocks into the
 * chaining value held in ctx->state.
 *
 * The 4x4 __m128i matrix _state holds the ECHO state of sixteen 128-bit
 * words. The first uHashSize/256 columns are the chaining value; the
 * remaining columns are filled from the message each block. Two code paths
 * exist: an AES-NI path (HAVE_AES_NI) using ECHO_ROUND_UNROLL2, and a
 * vector-permute fallback using ECHO_SUB_AND_MIX, which ping-pongs between
 * _state and _state2 so each loop iteration performs two ECHO rounds.
 *
 * ctx          - hash state; reads/updates state, k (round-key counter),
 *                const1536, uHashSize, uRounds, uBlockLength.
 * pmsg         - message bytes; uBlockCount * ctx->uBlockLength bytes are
 *                consumed (unaligned loads, so no alignment requirement).
 * uBlockCount  - number of full blocks to process.
 *
 * NOTE(review): t3, t4, s1, s2, s3, ktemp and k1 look unused here but are
 * presumably referenced inside the ECHO_* / TRANSFORM macros -- confirm
 * against the macro definitions before removing.
 */
void Compress(hashState_echo *ctx, const unsigned char *pmsg, unsigned int uBlockCount)
{
	unsigned int r, b, i, j;
	__m128i t1, t2, t3, t4, s1, s2, s3, k1, ktemp;
	__m128i _state[4][4], _state2[4][4], _statebackup[4][4];

	/* Load the chaining value: uHashSize/256 columns (1 for ECHO-256
	   output sizes, 2 for ECHO-512). */
	for(i = 0; i < 4; i++)
		for(j = 0; j < ctx->uHashSize / 256; j++)
			_state[i][j] = ctx->state[i][j];

#if HAVE_AES_NI
	// transform cv
	/* Move the chaining value into the alternate basis expected by the
	   AES-NI round code (_k_ipt = input transform constants). */
	for(i = 0; i < 4; i++)
		for(j = 0; j < ctx->uHashSize / 256; j++)
		{
			TRANSFORM(_state[i][j], _k_ipt, t1, t2);
		}
#endif

	for(b = 0; b < uBlockCount; b++)
	{
		/* Advance the key/counter by the block bit-length constant. */
		ctx->k = _mm_add_epi64(ctx->k, ctx->const1536);

		// load message
		/* Message fills the columns after the chaining value. */
		for(j = ctx->uHashSize / 256; j < 4; j++)
		{
			for(i = 0; i < 4; i++)
			{
				_state[i][j] = _mm_loadu_si128((__m128i*)pmsg + 4 * (j - (ctx->uHashSize / 256)) + i);
#if HAVE_AES_NI
				// transform message
				TRANSFORM(_state[i][j], _k_ipt, t1, t2);
#endif
			}
		}

		// save state
		/* Keep a copy for the feed-forward XOR after the rounds. */
		SAVESTATE(_statebackup, _state);

		k1 = ctx->k;

#if HAVE_AES_NI
		/* Each ECHO_ROUND_UNROLL2 performs two rounds, hence uRounds/2. */
		for(r = 0; r < ctx->uRounds / 2; r++)
		{
			ECHO_ROUND_UNROLL2;
		}
#else
		/* Fallback path: two rounds per iteration, ping-ponging between
		   _state (source) and _state2 (accumulator). The destination is
		   zeroed first because ECHO_SUB_AND_MIX accumulates (XORs) its
		   MixColumns contributions into it. The row rotation of ECHO's
		   BigShiftRows is folded into the destination row indices. */
		for(r = 0; r < ctx->uRounds / 2; r++)
		{
			/* Round 1: _state -> _state2. */
			_state2[0][0] = M128(zero);
			_state2[1][0] = M128(zero);
			_state2[2][0] = M128(zero);
			_state2[3][0] = M128(zero);
			_state2[0][1] = M128(zero);
			_state2[1][1] = M128(zero);
			_state2[2][1] = M128(zero);
			_state2[3][1] = M128(zero);
			_state2[0][2] = M128(zero);
			_state2[1][2] = M128(zero);
			_state2[2][2] = M128(zero);
			_state2[3][2] = M128(zero);
			_state2[0][3] = M128(zero);
			_state2[1][3] = M128(zero);
			_state2[2][3] = M128(zero);
			_state2[3][3] = M128(zero);
			ECHO_SUB_AND_MIX(_state, 0, 0, _state2, 0, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 0, _state2, 3, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 0, _state2, 2, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 0, _state2, 1, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state, 0, 1, _state2, 1, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 1, _state2, 0, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 1, _state2, 3, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 1, _state2, 2, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state, 0, 2, _state2, 2, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 2, _state2, 1, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 2, _state2, 0, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 2, _state2, 3, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state, 0, 3, _state2, 3, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state, 1, 3, _state2, 2, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state, 2, 3, _state2, 1, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state, 3, 3, _state2, 0, 3, 0, 1, 2);
			/* Round 2: _state2 -> _state (same schedule, roles swapped). */
			_state[0][0] = M128(zero);
			_state[1][0] = M128(zero);
			_state[2][0] = M128(zero);
			_state[3][0] = M128(zero);
			_state[0][1] = M128(zero);
			_state[1][1] = M128(zero);
			_state[2][1] = M128(zero);
			_state[3][1] = M128(zero);
			_state[0][2] = M128(zero);
			_state[1][2] = M128(zero);
			_state[2][2] = M128(zero);
			_state[3][2] = M128(zero);
			_state[0][3] = M128(zero);
			_state[1][3] = M128(zero);
			_state[2][3] = M128(zero);
			_state[3][3] = M128(zero);
			ECHO_SUB_AND_MIX(_state2, 0, 0, _state, 0, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 0, _state, 3, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 0, _state, 2, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 0, _state, 1, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state2, 0, 1, _state, 1, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 1, _state, 0, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 1, _state, 3, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 1, _state, 2, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state2, 0, 2, _state, 2, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 2, _state, 1, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 2, _state, 0, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 2, _state, 3, 3, 0, 1, 2);
			ECHO_SUB_AND_MIX(_state2, 0, 3, _state, 3, 0, 1, 2, 3);
			ECHO_SUB_AND_MIX(_state2, 1, 3, _state, 2, 1, 2, 3, 0);
			ECHO_SUB_AND_MIX(_state2, 2, 3, _state, 1, 2, 3, 0, 1);
			ECHO_SUB_AND_MIX(_state2, 3, 3, _state, 0, 3, 0, 1, 2);
		}
#endif

		/* BigFinal: compress the 4 columns back down onto the chaining
		   value columns and XOR in the pre-round backup (feed-forward). */
		if(ctx->uHashSize == 256)
		{
			/* 256-bit CV: everything folds into column 0. */
			for(i = 0; i < 4; i++)
			{
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][1]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][3]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][1]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][3]);
			}
		}
		else
		{
			/* 512-bit CV: columns {0,2} fold into 0, {1,3} into 1. */
			for(i = 0; i < 4; i++)
			{
				_state[i][0] = _mm_xor_si128(_state[i][0], _state[i][2]);
				_state[i][1] = _mm_xor_si128(_state[i][1], _state[i][3]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][0]);
				_state[i][0] = _mm_xor_si128(_state[i][0], _statebackup[i][2]);
				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][1]);
				_state[i][1] = _mm_xor_si128(_state[i][1], _statebackup[i][3]);
			}
		}

		pmsg += ctx->uBlockLength;
	}

#if HAVE_AES_NI
	// transform state
	/* Undo the input basis change before storing (_k_opt = output
	   transform constants). */
	for(i = 0; i < 4; i++)
		for(j = 0; j < 4; j++)
		{
			TRANSFORM(_state[i][j], _k_opt, t1, t2);
		}
#endif

	SAVESTATE(ctx->state, _state);
}
/*
 * Simulates an implementation of a Littlun-like cipher many times, for
 * speed evaluation.
 *
 * The long integer loop and the chain of SB/M128 applications before the
 * timed region serve two purposes: they warm up the core and they make
 * `dum` and `x` data-dependent on real work, so the compiler cannot
 * dead-code-eliminate the benchmark. `dum` is printed and `x` is returned
 * for the same reason. The eight cipher rounds inside the rdtsc window are
 * currently commented out, so the measured interval is the empty baseline;
 * uncomment them to time the cipher itself.
 *
 * Returns the final state vector (value is irrelevant; it only anchors the
 * computation against optimization).
 *
 * Fix vs. previous revision: the empty parameter list `()` (an obsolescent
 * unprototyped declaration) is now the proper C prototype `(void)`.
 */
__m128i dummy_cipher_eval(void)
{
	__m128i x, k;
	unsigned long long tick1, tick2, dum;

	x = _mm_set_epi64x(0x0001020304050607, 0x08090a0b0c0d0e0f);
	k = _mm_set_epi64x(0x0a030d0c0f050607, 0x08090a000c0d0e0f);

	/* Warm-up / anti-DCE busy loop (~2^31 iterations, intentional). */
	dum = 0;
	for (tick1 = 0; tick1 < 1ull << 31; tick1++)
		dum += 2*tick1 & (~tick1 | (tick1 >> 2));

	/* 16 substitution+mix applications, also outside the timed window. */
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);
	x = SB(x); x = M128(x);

	tick1 = rdtsc();
//	// r1
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r2
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r3
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r4
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r5
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r6
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r7
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// r8
//	x = _mm_xor_si128(x,k);
//	x = SB(x);
//	x = M128(x);
//	// final
//	x = _mm_xor_si128(x,k);
	tick2 = rdtsc();

	printf("%llu ~ %llu cycles\n\n", dum, tick2 - tick1);

	return x;
}