/**
 * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
 * receive Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
 * typically have their internal state initialized with zeros, Blake2b's G function
 * has a fixed point: if the internal state and message are both filled with zeros, the
 * resulting permutation will always be a block filled with zeros; this happens because
 * Blake2b does not use the constants originally employed in Blake2 inside its G function,
 * relying on the IV for avoiding possible fixed points.
 *
 * @param state The 1024-bit array to be initialized
 */
inline void initState(uint64_t state[/*16*/]) {
#ifdef __AVX2__
    //Use unaligned stores: uint64_t[16] is only guaranteed 8-byte alignment, while the
    //previous aligned dereferences (*(__m256i*)&state[i]) require 32-byte alignment and
    //are undefined behavior (vmovdqa faults) on a merely 8-byte-aligned buffer.
    _mm256_storeu_si256((__m256i*)&state[0], _mm256_setzero_si256());
    _mm256_storeu_si256((__m256i*)&state[4], _mm256_setzero_si256());
    _mm256_storeu_si256((__m256i*)&state[8],
                        _mm256_set_epi64x(blake2b_IV[3], blake2b_IV[2],
                                          blake2b_IV[1], blake2b_IV[0]));
    _mm256_storeu_si256((__m256i*)&state[12],
                        _mm256_set_epi64x(blake2b_IV[7], blake2b_IV[6],
                                          blake2b_IV[5], blake2b_IV[4]));
    //AVX is around the same number of instructions as unoptimized
    //#elif defined __AVX__
#else
    //First 512 bits are zeros
    memset(state, 0, 64);
    //Remainder BLOCK_LEN_BLAKE2_SAFE_BYTES are reserved to the IV
    state[8]  = blake2b_IV[0];
    state[9]  = blake2b_IV[1];
    state[10] = blake2b_IV[2];
    state[11] = blake2b_IV[3];
    state[12] = blake2b_IV[4];
    state[13] = blake2b_IV[5];
    state[14] = blake2b_IV[6];
    state[15] = blake2b_IV[7];
#endif
}
//int64 vec3l& vec3l::Sub(int64 v) { __m256i vxmm = _mm256_set_epi64x(0, v, v, v); __m256i xmm = _mm256_set_epi64x(0, z, y, x); xmm = _mm256_sub_epi64(xmm, vxmm); x = M256L(xmm, 0); y = M256L(xmm, 1); z = M256L(xmm, 2); return *this; }
/// Component-wise subtraction of another vec3l, performed with one
/// AVX2 subtract (the unused top lane is padded with zero).
vec3l& vec3l::Sub(const vec3l& v) {
    __m256i otherVec = _mm256_set_epi64x(0, v.z, v.y, v.x);
    __m256i selfVec  = _mm256_set_epi64x(0, z, y, x);
    selfVec = _mm256_sub_epi64(selfVec, otherVec);
    x = M256L(selfVec, 0);
    y = M256L(selfVec, 1);
    z = M256L(selfVec, 2);
    return *this;
}
/* Reference check for _mm256_cmpgt_epi64: compare two hand-picked
   vectors and abort if the intrinsic disagrees with the scalar model
   (all-ones for greater-than, all-zeros otherwise, per lane). */
static void
avx2_test (void)
{
  union256i_q res, lhs, rhs;
  long long int expected[4];
  int lane;

  lhs.x = _mm256_set_epi64x (1, 2, 3, 4);
  rhs.x = _mm256_set_epi64x (88, 44, 3, 220000);

  res.x = _mm256_cmpgt_epi64 (lhs.x, rhs.x);

  for (lane = 0; lane < 4; lane++)
    expected[lane] = (lhs.a[lane] > rhs.a[lane]) ? -1 : 0;

  if (check_union256i_q (res, expected))
    abort ();
}
//Expands the 128-bit key k into the 40 per-round bitsliced round keys rk.
//Each round: extract the current TK1 state as the round key, xor the
//round-constant mask into the slices selected by RC[j], then apply the
//TK1 update permutation.
//
//The original code heap-allocated a 32-byte scratch buffer with an
//unchecked malloc (NULL dereference on allocation failure); a fixed-size
//stack buffer is both safe and cheaper.
void key_schedule(const unsigned char *k, u256 rk[40][16]) {
	int i, j;
	u256 tk1[32], tmp[32], rc;
	unsigned char tmp_key[32]; //two side-by-side copies of the 128-bit key

	for(i = 0; i < 2; i++)
		memcpy(tmp_key + 16*i, k, 16);
	pack_key(tk1, tmp_key);

	//Round-constant mask; loop-invariant, so computed once instead of per round
	rc = _mm256_set_epi64x(0x000000FF000000FFull, 0x000000FF000000FFull,
	                       0x000000FF000000FFull, 0x000000FF000000FFull);

	for(j = 0; j < 40; j++) {
		//Extract round key
		for(i = 0; i < 16; i++){
			rk[j][i] = tk1[i];
		}
		//Add constant into key: each bit of RC[j] flips one slice
		if(RC[j]>>5 & 1) rk[j][14] = XOR(rk[j][14], rc);
		if(RC[j]>>4 & 1) rk[j][15] = XOR(rk[j][15], rc);
		if(RC[j]>>3 & 1) rk[j][4]  = XOR(rk[j][4], rc);
		if(RC[j]>>2 & 1) rk[j][5]  = XOR(rk[j][5], rc);
		if(RC[j]>>1 & 1) rk[j][6]  = XOR(rk[j][6], rc);
		if(RC[j]>>0 & 1) rk[j][7]  = XOR(rk[j][7], rc);
		//Update TK1: keep a copy of the first half in the second half of tmp
		for(i = 0; i < 16; i++){
			tmp[16 + i] = tk1[0 + i];
		}
		//Apply bit permutation (byte shuffles interleave the two TK1 halves)
		for(i = 0; i < 8; i++){
			tmp[0 + i] = XOR(_mm256_shuffle_epi8(tk1[16 + i],
					_mm256_set_epi8(0xff,28,0xff,29,0xff,24,0xff,25,0xff,20,0xff,21,0xff,16,0xff,17,0xff,12,0xff,13,0xff,8,0xff,9,0xff,4,0xff,5,0xff,0,0xff,1)),
				_mm256_shuffle_epi8(tk1[24 + i],
					_mm256_set_epi8(29,0xff,31,0xff,25,0xff,27,0xff,21,0xff,23,0xff,17,0xff,19,0xff,13,0xff,15,0xff,9,0xff,11,0xff,5,0xff,7,0xff,1,0xff,3,0xff)));
			tmp[8 + i] = XOR(_mm256_shuffle_epi8(tk1[16 + i],
					_mm256_set_epi8(31,0xff,0xff,30,27,0xff,0xff,26,23,0xff,0xff,22,19,0xff,0xff,18,15,0xff,0xff,14,11,0xff,0xff,10,7,0xff,0xff,6,3,0xff,0xff,2)),
				_mm256_shuffle_epi8(tk1[24 + i],
					_mm256_set_epi8(0xff,28,30,0xff,0xff,24,26,0xff,0xff,20,22,0xff,0xff,16,18,0xff,0xff,12,14,0xff,0xff,8,10,0xff,0xff,4,6,0xff,0xff,0,2,0xff)));
		}
		for(i = 0; i < 32; i++){
			tk1[i] = tmp[i];
		}
	}
}
/* Broadcasts x into all four 64-bit lanes of a 256-bit vector.
   The declaration previously relied on an implicit int return type,
   which is invalid C99/C++ and wrong for a __m256i result. */
__m256i
foo (long long x)
{
  return _mm256_set_epi64x (x, x, x, x);
}
/*
* Encrypts `blocks` consecutive 64-byte Threefish-512 blocks from `in` into
* `out` using AVX2. Each 512-bit block lives in two ymm registers (even words
* in X0, odd words in X1 after interleave_epi64); a two-block fast path keeps
* a pair of blocks in flight in four registers, and a tail loop handles a
* remaining single block. Helper macros implement the MIX rounds, the word
* permutation and the subkey injection; they are #undef'd before returning.
*/
void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const {
   const u64bit* K = &get_K()[0];      // key schedule words K[0..8]
   const u64bit* T_64 = &get_T()[0];   // tweak words T_64[0..2]

   // Per-round rotation amounts (one 64-bit lane per MIX pair) for the
   // eight distinct Threefish-512 round types.
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

/* One round on one block: add, rotate (built from variable shifts, since
   AVX2 has no 64-bit vector rotate), xor, then permute word order. */
#define THREEFISH_ROUND(X0, X1, SHL)                                                 \
   do {                                                                              \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);             \
      X0 = _mm256_add_epi64(X0, X1);                                                 \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR));  \
      X1 = _mm256_xor_si256(X1, X0);                                                 \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                    \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                    \
   } while(0)

/* Same round applied to two independent blocks (X0,X1) and (X2,X3). */
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                                       \
   do {                                                                              \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);             \
      X0 = _mm256_add_epi64(X0, X1);                                                 \
      X2 = _mm256_add_epi64(X2, X3);                                                 \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR));  \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR));  \
      X1 = _mm256_xor_si256(X1, X0);                                                 \
      X3 = _mm256_xor_si256(X3, X2);                                                 \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                    \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                    \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                    \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                    \
   } while(0)

/* Subkey injection: add the two key vectors, the tweak words selected by
   T0I/T1I (broadcast out of T via vpermq) and the running round counter R,
   then advance R.  R is added into the lane holding the last word. */
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                    \
   do {                                                                      \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                         \
      X1 = _mm256_add_epi64(X1, K1);                                         \
      X1 = _mm256_add_epi64(X1, R);                                          \
      X0 = _mm256_add_epi64(X0, T0);                                         \
      X1 = _mm256_add_epi64(X1, T1);                                         \
      R = _mm256_add_epi64(R, ONE);                                          \
   } while(0)

/* Two-block variant of the subkey injection; folds R into T1 once so the
   counter is added to both blocks with a single extra addition each. */
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)          \
   do {                                                                      \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0));   \
      X0 = _mm256_add_epi64(X0, K0);                                         \
      X2 = _mm256_add_epi64(X2, K0);                                         \
      X1 = _mm256_add_epi64(X1, K1);                                         \
      X3 = _mm256_add_epi64(X3, K1);                                         \
      T1 = _mm256_add_epi64(T1, R);                                          \
      X0 = _mm256_add_epi64(X0, T0);                                         \
      X2 = _mm256_add_epi64(X2, T0);                                         \
      X1 = _mm256_add_epi64(X1, T1);                                         \
      X3 = _mm256_add_epi64(X3, T1);                                         \
      R = _mm256_add_epi64(R, ONE);                                          \
   } while(0)

/* Eight rounds (two groups of four) plus the two subkey injections that
   follow each group. */
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)  \
   do {                                                            \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                           \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                           \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                           \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                           \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);             \
                                                                   \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                           \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                           \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                           \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                           \
      THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0);             \
   } while(0)

/* Two-block variant of the eight-round group. */
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do {                                                            \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                 \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                 \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                 \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                 \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1);   \
                                                                   \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                 \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                 \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                 \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                 \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0);   \
   } while(0)

   /*
   v1.0 key schedule: 9 ymm registers (only need 2 or 3)
   (0,1,2,3),(4,5,6,7) [8]
   then mutating with vpermq
   */
   // The nine round-key vectors: consecutive rotations of the key-word
   // sequence, pre-split into even/odd word positions.
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   // Increment for the round counter lane used by the injection macros.
   const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   // Fast path: encrypt two blocks per iteration.
   while(blocks >= 2) {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
      __m256i R = _mm256_set_epi64x(0, 0, 0, 0); // subkey/round counter

      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3);

      // 72 rounds = 9 groups of 8; key/tweak indices rotate each group.
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
   }

   // Tail: at most one remaining block.
   for(size_t i = 0; i != blocks; ++i) {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);
      __m256i R = _mm256_set_epi64x(0, 0, 0, 0); // subkey/round counter

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
   }

#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
}
parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int64_t score = NEG_INF; __m256i vNegInf = _mm256_set1_epi64x(NEG_INF); __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 8); /* shift in a 0 */ __m256i vOpen = _mm256_set1_epi64x(open); __m256i vGap = _mm256_set1_epi64x(gap); __m256i vOne = _mm256_set1_epi64x(1); __m256i vN = _mm256_set1_epi64x(N); __m256i vNegOne = _mm256_set1_epi64x(-1); __m256i vI = _mm256_set_epi64x(0,1,2,3); __m256i vJreset = _mm256_set_epi64x(0,-1,-2,-3); __m256i vMaxScore = vNegInf; __m256i vEndI = vNegInf; __m256i vEndJ = vNegInf; __m256i vILimit = _mm256_set1_epi64x(s1Len); __m256i vILimit1 = _mm256_sub_epi64(vILimit, vOne); __m256i vJLimit = _mm256_set1_epi64x(s2Len); __m256i vJLimit1 = _mm256_sub_epi64(vJLimit, vOne); /* convert _s1 from char to int in range 0-23 */ for (i=0; i<s1Len; ++i) { s1[i] = matrix->mapper[(unsigned char)_s1[i]]; } /* pad back of s1 with dummy values */
/* Loads four 64-bit values from v (v[0] in the lowest lane) into a
   256-bit vector.  The declaration previously relied on an implicit int
   return type, which is invalid C99/C++ and wrong for a __m256i result. */
__m256i
foo (long long *v)
{
  return _mm256_set_epi64x (v[3], v[2], v[1], v[0]);
}
// Precomputes every SIMD constant the permutation uses: the byte-shuffle
// masks implementing the per-row rotations (plus their inverses), and, for
// each round of p_1 and p_4, one mask vector per bit of that round's
// constant.  A round whose constant has bit k set gets the 0xFF00 lane
// mask in p*_constants_bitk[round]; otherwise the entry is all-zero.
void Initialize() {
    /*
    Round constants for p_1: 01, 02, 05, 0a, 15, 0b, 17, 0e, 1d, 1b, 16, 0c
    Round constants for p_2: 18, 11, 03, 07, 0f, 1f
    Round constants for p_3: 1e, 1c, 19, 13, 06, 0d
    */
    // NOTE(review): decoding the masks below, the constants implied for the
    // last rounds of p_4 (indices 8, 9 and 11 read as 0x13, 0x19, 0x0f) do
    // not match the comment's p_3 list (0x19, 0x13, ..., 0x0d) — verify
    // against the cipher specification before relying on either.

    // Rotates the four 64-bit rows of the first register left by 0/1/2/4
    // bytes respectively (byte indices are within-lane for vpshufb).
    shuffleControlMaskFirstReg = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, //0
        9, 10, 11, 12, 13, 14, 15, 8, //1
        18, 19, 20, 21, 22, 23, 16, 17, //2
        28, 29, 30, 31, 24, 25, 26, 27); //4
    // Second register only rotates its first row (by 7); remaining rows zeroed.
    shuffleControlMaskSecondReg = _mm256_setr_epi8(
        7, 0, 1, 2, 3, 4, 5, 6, //7
        255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
        255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
        255, 255, 255, 255, 255, 255, 255, 255); //Setting it to 0xFF makes shuffle zero the bits
    // Inverse rotations (right by 0/1/2/4 bytes) for the first register.
    invShuffleControlMaskFirstReg = _mm256_setr_epi8(
        0, 1, 2, 3, 4, 5, 6, 7, //0
        15, 8, 9, 10, 11, 12, 13, 14, //1
        22, 23, 16, 17, 18, 19, 20, 21, //2
        28, 29, 30, 31, 24, 25, 26, 27); //4
    // Inverse rotation (right by 7) for the second register's first row.
    invShuffleControlMaskSecondReg = _mm256_setr_epi8(
        1, 2, 3, 4, 5, 6, 7, 0, //7
        255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
        255, 255, 255, 255, 255, 255, 255, 255,
        255, 255, 255, 255, 255, 255, 255, 255);
    m256iAllOne = _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF);
    //Set the bits to 1111'1111 in the column two, second row byte, if the roundconstant has a onebit on this indice
    //p1
    // p_1, bit 0 of each round constant
    p1_constants_bit0[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit0[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[3] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit0[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[7] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit0[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit0[10] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit0[11] = _mm256_set_epi64x(0, 0, 0, 0);
    // p_1, bit 1 of each round constant
    p1_constants_bit1[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit1[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[2] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit1[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[4] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit1[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[8] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit1[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit1[11] = _mm256_set_epi64x(0, 0, 0, 0);
    // p_1, bit 2 of each round constant
    p1_constants_bit2[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit2[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit2[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit2[3] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit2[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit2[5] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit2[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit2[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit2[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit2[9] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit2[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit2[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    // p_1, bit 3 of each round constant
    p1_constants_bit3[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit3[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit3[2] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit3[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit3[4] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit3[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit3[6] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit3[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit3[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit3[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit3[10] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit3[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    // p_1, bit 4 of each round constant
    p1_constants_bit4[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit4[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit4[2] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit4[3] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit4[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit4[5] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit4[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit4[7] = _mm256_set_epi64x(0, 0, 0, 0);
    p1_constants_bit4[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit4[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit4[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p1_constants_bit4[11] = _mm256_set_epi64x(0, 0, 0, 0);
    // p_4 (p_2 followed by p_3 rounds), bit 0 of each round constant
    p4_constants_bit0[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit0[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[6] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit0[7] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit0[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit0[10] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit0[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    // p_4, bit 1 of each round constant
    p4_constants_bit1[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit1[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit1[2] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[7] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit1[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[9] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit1[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit1[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    // p_4, bit 2 of each round constant
    p4_constants_bit2[0] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit2[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit2[2] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit2[3] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit2[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit2[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit2[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit2[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit2[8] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit2[9] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit2[10] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit2[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    // p_4, bit 3 of each round constant
    p4_constants_bit3[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit3[1] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit3[2] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit3[3] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit3[4] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit3[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit3[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit3[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit3[8] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit3[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit3[10] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit3[11] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    // p_4, bit 4 of each round constant
    p4_constants_bit4[0] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[1] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[2] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit4[3] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit4[4] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit4[5] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[6] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[7] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[8] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[9] = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
    p4_constants_bit4[10] = _mm256_set_epi64x(0, 0, 0, 0);
    p4_constants_bit4[11] = _mm256_set_epi64x(0, 0, 0, 0);
}
//Encrypts 64 blocks in bitsliced form: x[0..31] holds the state as 32
//256-bit slices (8 bit-planes x 4 row groups), rk the 40 precomputed round
//keys. Each of the 40 rounds applies SubBytes (logic-gate S-box on the
//slices), AddConstant, AddKey, ShiftRows and MixColumns in place.
void encrypt_64blocks(u256 x[32], u256 rk[40][16]) {
	int i, j;
	u256 rc, tmp[8];
	//Mask of the state bytes that receive the c2 round constant
	rc = _mm256_set_epi64x(0x000000FF000000FFull, 0x000000FF000000FFull, 0x000000FF000000FFull, 0x000000FF000000FFull);
	for(i = 0; i < 40; i++){
		//SubBytes
		//The S-box is evaluated as XOR/NOR gate logic on the 8 bit-planes of
		//each row group j; inner XOR/NOR terms are repeated (not factored into
		//temporaries) so every tmp[] reads only the pre-round x[] values.
		for(j = 0; j < 4; j++) {
			tmp[7] = XOR(x[2 + 8*j], NOR(XOR(x[3 + 8*j], NOR(x[0 + 8*j],x[1 + 8*j])),XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j]))));
			tmp[6] = XOR(x[3 + 8*j], NOR(x[0 + 8*j],x[1 + 8*j]));
			tmp[5] = XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j]));
			tmp[4] = XOR(x[4 + 8*j], NOR(XOR(x[2 + 8*j], NOR(XOR(x[3 + 8*j], NOR(x[0 + 8*j],x[1 + 8*j])),XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j])))),XOR(x[3 + 8*j], NOR(x[0 + 8*j],x[1 + 8*j]))));
			tmp[3] = XOR(x[6 + 8*j], NOR(XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j])),x[4 + 8*j]));
			tmp[2] = XOR(x[1 + 8*j], NOR(x[5 + 8*j],x[6 + 8*j]));
			tmp[1] = XOR(x[0 + 8*j], NOR(XOR(x[1 + 8*j], NOR(x[5 + 8*j],x[6 + 8*j])),XOR(x[2 + 8*j], NOR(XOR(x[3 + 8*j], NOR(x[0 + 8*j],x[1 + 8*j])),XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j]))))));
			tmp[0] = XOR(x[5 + 8*j], NOR(XOR(x[6 + 8*j], NOR(XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j])),x[4 + 8*j])),XOR(x[0 + 8*j], NOR(XOR(x[1 + 8*j], NOR(x[5 + 8*j],x[6 + 8*j])),XOR(x[2 + 8*j], NOR(XOR(x[3 + 8*j], NOR(x[0 + 8*j],x[1 + 8*j])),XOR(x[7 + 8*j], NOR(x[4 + 8*j],x[5 + 8*j]))))))));
			//Write back with the bit-plane order reversed (tmp[7] -> plane 0)
			x[0 + 8*j] = tmp[7];
			x[1 + 8*j] = tmp[6];
			x[2 + 8*j] = tmp[5];
			x[3 + 8*j] = tmp[4];
			x[4 + 8*j] = tmp[3];
			x[5 + 8*j] = tmp[2];
			x[6 + 8*j] = tmp[1];
			x[7 + 8*j] = tmp[0];
		}
		//AddConstant
		//This only adds c2. The other constants are added with the key
		x[22] = XOR(x[22], rc);
		//AddKey (round key covers slices 0..15 only)
		x[0] = XOR(x[0], rk[i][0]);
		x[1] = XOR(x[1], rk[i][1]);
		x[2] = XOR(x[2], rk[i][2]);
		x[3] = XOR(x[3], rk[i][3]);
		x[4] = XOR(x[4], rk[i][4]);
		x[5] = XOR(x[5], rk[i][5]);
		x[6] = XOR(x[6], rk[i][6]);
		x[7] = XOR(x[7], rk[i][7]);
		x[8] = XOR(x[8], rk[i][8]);
		x[9] = XOR(x[9], rk[i][9]);
		x[10] = XOR(x[10], rk[i][10]);
		x[11] = XOR(x[11], rk[i][11]);
		x[12] = XOR(x[12], rk[i][12]);
		x[13] = XOR(x[13], rk[i][13]);
		x[14] = XOR(x[14], rk[i][14]);
		x[15] = XOR(x[15], rk[i][15]);
		//ShiftRows: rows 1/2/3 (slices 8-15, 16-23, 24-31) rotate by 1/2/3
		x[8] = SR1(x[8]);
		x[16] = SR2(x[16]);
		x[24] = SR3(x[24]);
		x[9] = SR1(x[9]);
		x[17] = SR2(x[17]);
		x[25] = SR3(x[25]);
		x[10] = SR1(x[10]);
		x[18] = SR2(x[18]);
		x[26] = SR3(x[26]);
		x[11] = SR1(x[11]);
		x[19] = SR2(x[19]);
		x[27] = SR3(x[27]);
		x[12] = SR1(x[12]);
		x[20] = SR2(x[20]);
		x[28] = SR3(x[28]);
		x[13] = SR1(x[13]);
		x[21] = SR2(x[21]);
		x[29] = SR3(x[29]);
		x[14] = SR1(x[14]);
		x[22] = SR2(x[22]);
		x[30] = SR3(x[30]);
		x[15] = SR1(x[15]);
		x[23] = SR2(x[23]);
		x[31] = SR3(x[31]);
		//MixColumns: xor-ladder over the four rows with a simultaneous
		//row rotation; tmp holds the old row 3 so it can feed row 0 last.
		tmp[0] = x[24];
		tmp[1] = x[25];
		tmp[2] = x[26];
		tmp[3] = x[27];
		tmp[4] = x[28];
		tmp[5] = x[29];
		tmp[6] = x[30];
		tmp[7] = x[31];
		x[24] = XOR(x[16], x[0]);
		x[28] = XOR(x[20], x[4]);
		x[25] = XOR(x[17], x[1]);
		x[29] = XOR(x[21], x[5]);
		x[26] = XOR(x[18], x[2]);
		x[30] = XOR(x[22], x[6]);
		x[27] = XOR(x[19], x[3]);
		x[31] = XOR(x[23], x[7]);
		x[16] = XOR(x[8], x[16]);
		x[20] = XOR(x[12], x[20]);
		x[17] = XOR(x[9], x[17]);
		x[21] = XOR(x[13], x[21]);
		x[18] = XOR(x[10], x[18]);
		x[22] = XOR(x[14], x[22]);
		x[19] = XOR(x[11], x[19]);
		x[23] = XOR(x[15], x[23]);
		x[8] = x[0];
		x[12] = x[4];
		x[9] = x[1];
		x[13] = x[5];
		x[10] = x[2];
		x[14] = x[6];
		x[11] = x[3];
		x[15] = x[7];
		x[0] = XOR(tmp[0], x[24]);
		x[4] = XOR(tmp[4], x[28]);
		x[1] = XOR(tmp[1], x[25]);
		x[5] = XOR(tmp[5], x[29]);
		x[2] = XOR(tmp[2], x[26]);
		x[6] = XOR(tmp[6], x[30]);
		x[3] = XOR(tmp[3], x[27]);
		x[7] = XOR(tmp[7], x[31]);
	}
}