Beispiel #1
0
/**
 * Initializes the Sponge State. The first 512 bits are set to zeros and the remainder
 * receives Blake2b's IV as per Blake2b's specification. <b>Note:</b> Even though sponges
 * typically have their internal state initialized with zeros, Blake2b's G function
 * has a fixed point: if the internal state and message are both filled with zeros, the
 * resulting permutation will always be a block filled with zeros; this happens because
 * Blake2b does not use the constants originally employed in Blake2 inside its G function,
 * relying on the IV for avoiding possible fixed points.
 *
 * The AVX2 path uses unaligned stores, so `state` only needs the natural alignment of
 * uint64_t; both paths write exactly the same 128 bytes.
 *
 * @param state         The 1024-bit array (16 x uint64_t) to be initialized
 */
inline void initState(uint64_t state[/*16*/]) {
#ifdef __AVX2__
  /* Words 0..7: zeros. */
  _mm256_storeu_si256((__m256i*)(&state[0]), _mm256_setzero_si256());
  _mm256_storeu_si256((__m256i*)(&state[4]), _mm256_setzero_si256());

  /* Words 8..15: IV[0..7]. _mm256_set_epi64x takes its lanes highest-first. */
  _mm256_storeu_si256((__m256i*)(&state[8]),
                      _mm256_set_epi64x(blake2b_IV[3],
                                        blake2b_IV[2],
                                        blake2b_IV[1],
                                        blake2b_IV[0]));
  _mm256_storeu_si256((__m256i*)(&state[12]),
                      _mm256_set_epi64x(blake2b_IV[7],
                                        blake2b_IV[6],
                                        blake2b_IV[5],
                                        blake2b_IV[4]));

//AVX is around the same number of instructions as unoptimized
//#elif defined __AVX__

#else
  int i;

  //First 512 bits are zeros
  memset(state, 0, 64);
  //Remaining 512 bits hold the IV
  for (i = 0; i < 8; i++) {
    state[8 + i] = blake2b_IV[i];
  }
#endif
}
Beispiel #2
0
//int64
vec3l& vec3l::Sub(int64 v) {
	// Subtracts the scalar v from every component in one 4-lane AVX2 operation.
	// Lanes 0..2 carry x, y, z; lane 3 is zero on both sides and never read back.
	const __m256i scalarLanes = _mm256_set_epi64x(0, v, v, v);
	const __m256i componentLanes = _mm256_set_epi64x(0, z, y, x);
	__m256i difference = _mm256_sub_epi64(componentLanes, scalarLanes);

	x = M256L(difference, 0);
	y = M256L(difference, 1);
	z = M256L(difference, 2);

	return *this;
}
Beispiel #3
0
vec3l& vec3l::Sub(const vec3l& v) {
	// Component-wise subtraction (this -= v) done as a single 4-lane AVX2 subtract.
	// Lanes 0..2 carry x, y, z; lane 3 is zero on both sides and never read back.
	const __m256i rhsLanes = _mm256_set_epi64x(0, v.z, v.y, v.x);
	const __m256i lhsLanes = _mm256_set_epi64x(0, z, y, x);
	__m256i difference = _mm256_sub_epi64(lhsLanes, rhsLanes);

	x = M256L(difference, 0);
	y = M256L(difference, 1);
	z = M256L(difference, 2);

	return *this;
}
Beispiel #4
0
/* Checks _mm256_cmpgt_epi64 against a scalar per-lane reference:
   each result lane must be -1 where lhs > rhs and 0 otherwise.
   Aborts when check_union256i_q reports a mismatch.  */
static void
avx2_test (void)
{
  union256i_q lhs, rhs, actual;
  long long int expected[4];
  int lane = 0;

  lhs.x = _mm256_set_epi64x (1, 2, 3, 4);
  rhs.x = _mm256_set_epi64x (88, 44, 3, 220000);

  actual.x = _mm256_cmpgt_epi64 (lhs.x, rhs.x);

  /* Scalar reference, one 64-bit lane at a time.  */
  while (lane < 4)
    {
      if (lhs.a[lane] > rhs.a[lane])
	expected[lane] = -1;
      else
	expected[lane] = 0;
      lane++;
    }

  if (check_union256i_q (actual, expected))
    abort ();
}
Beispiel #5
0
/*
 * Derives the 40 sets of 16 round-key planes from a 16-byte key.
 *
 * The key is copied twice into a 32-byte buffer and handed to pack_key() to
 * fill the 32-plane state tk1. Then, for each of the 40 rounds j:
 *   1. planes 0..15 of tk1 are copied to rk[j];
 *   2. bits 5..0 of RC[j] are folded in by XORing a fixed mask into
 *      rk[j][14], rk[j][15], rk[j][4], rk[j][5], rk[j][6], rk[j][7]
 *      (in that bit order, one plane per set bit);
 *   3. tk1 is updated: its old planes 0..15 become planes 16..31, and the new
 *      planes 0..15 are byte-shuffled combinations of the old planes 16..31.
 *
 * k   points to at least 16 readable key bytes.
 * rk  receives the round keys; every rk[j][0..15] is overwritten.
 */
void key_schedule(const unsigned char *k, u256 rk[40][16]) {
  int i, j;
  u256 tk1[32], tmp[32];

  /* Fixed-size scratch buffer on the stack: no allocation that can fail,
     nothing to free on any exit path. */
  unsigned char tmp_key[32];

  /* Mask XORed into a key plane for every set round-constant bit. */
  u256 rc = _mm256_set_epi64x(0x000000FF000000FFull,
                              0x000000FF000000FFull,
                              0x000000FF000000FFull,
                              0x000000FF000000FFull);

  /* Byte-shuffle controls for the plane update (0xff selects a zero byte).
     They are loop-invariant, so they are built once here. */
  const __m256i lo_from_16 = _mm256_set_epi8(0xff,28,0xff,29,0xff,24,0xff,25,0xff,20,0xff,21,0xff,16,0xff,17,0xff,12,0xff,13,0xff,8,0xff,9,0xff,4,0xff,5,0xff,0,0xff,1);
  const __m256i lo_from_24 = _mm256_set_epi8(29,0xff,31,0xff,25,0xff,27,0xff,21,0xff,23,0xff,17,0xff,19,0xff,13,0xff,15,0xff,9,0xff,11,0xff,5,0xff,7,0xff,1,0xff,3,0xff);
  const __m256i hi_from_16 = _mm256_set_epi8(31,0xff,0xff,30,27,0xff,0xff,26,23,0xff,0xff,22,19,0xff,0xff,18,15,0xff,0xff,14,11,0xff,0xff,10,7,0xff,0xff,6,3,0xff,0xff,2);
  const __m256i hi_from_24 = _mm256_set_epi8(0xff,28,30,0xff,0xff,24,26,0xff,0xff,20,22,0xff,0xff,16,18,0xff,0xff,12,14,0xff,0xff,8,10,0xff,0xff,4,6,0xff,0xff,0,2,0xff);

  /* Two back-to-back copies of the 16-byte key. */
  for(i = 0; i < 2; i++)
    memcpy(tmp_key + 16*i, k, 16);
  pack_key(tk1, tmp_key);

  for(j = 0; j < 40; j++) {
    //Extract round key
    for(i = 0; i < 16; i++){
      rk[j][i] = tk1[i];
    }

    //Add constant into key
    if((RC[j]>>5) & 1)
      rk[j][14] = XOR(rk[j][14], rc);
    if((RC[j]>>4) & 1)
      rk[j][15] = XOR(rk[j][15], rc);
    if((RC[j]>>3) & 1)
      rk[j][4] = XOR(rk[j][4], rc);
    if((RC[j]>>2) & 1)
      rk[j][5] = XOR(rk[j][5], rc);
    if((RC[j]>>1) & 1)
      rk[j][6] = XOR(rk[j][6], rc);
    if((RC[j]>>0) & 1)
      rk[j][7] = XOR(rk[j][7], rc);

    //Update TK1: the old first half moves to the second half unchanged
    for(i = 0; i < 16; i++){
      tmp[16 + i] = tk1[0 + i];
    }

    //Apply bit permutation: the new first half is built from the old second half
    for(i = 0; i < 8; i++){
      tmp[0 + i] = XOR(_mm256_shuffle_epi8(tk1[16  + i], lo_from_16),
                       _mm256_shuffle_epi8(tk1[24  + i], lo_from_24));
      tmp[8 + i] = XOR(_mm256_shuffle_epi8(tk1[16  + i], hi_from_16),
                       _mm256_shuffle_epi8(tk1[24  + i], hi_from_24));
    }

    for(i = 0; i < 32; i++){
      tk1[i] = tmp[i];
    }
  }
}
Beispiel #6
0
/* Returns a 256-bit vector whose four 64-bit lanes all hold x.  */
__m256i
foo (long long x)
{
  return _mm256_set_epi64x (x, x, x, x);
}
Beispiel #7
0
/**
* Encrypts `blocks` consecutive 64-byte blocks from `in` into `out` using AVX2.
*
* Blocks are processed two at a time while at least two remain; a final odd
* block is processed on its own. Each block gets an initial key injection
* followed by nine groups of eight rounds, with a key injection after every
* fourth round.
*
* @param in      input bytes; 64 * blocks bytes are read with unaligned loads
* @param out     output bytes; 64 * blocks bytes are written with unaligned stores
* @param blocks  number of 64-byte blocks to process
*/
void Threefish_512_AVX2::encrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const u64bit* K = &get_K()[0];
   const u64bit* T_64 = &get_T()[0];

   // Per-lane left-rotation amounts; ROTATE_n is used by the n-th round of each 8-round group.
   const __m256i ROTATE_1 = _mm256_set_epi64x(37,19,36,46);
   const __m256i ROTATE_2 = _mm256_set_epi64x(42,14,27,33);
   const __m256i ROTATE_3 = _mm256_set_epi64x(39,36,49,17);
   const __m256i ROTATE_4 = _mm256_set_epi64x(56,54, 9,44);
   const __m256i ROTATE_5 = _mm256_set_epi64x(24,34,30,39);
   const __m256i ROTATE_6 = _mm256_set_epi64x(17,10,50,13);
   const __m256i ROTATE_7 = _mm256_set_epi64x(43,39,29,25);
   const __m256i ROTATE_8 = _mm256_set_epi64x(22,56,35, 8);

   // One round on a single block: X0 += X1; X1 = rotl(X1, SHL) ^ X0 per 64-bit lane
   // (the rotate is built from a variable left shift OR'ed with a right shift by 64 - SHL);
   // afterwards the lanes of X0 and X1 are permuted.
#define THREEFISH_ROUND(X0, X1, SHL)                                                \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

   // Same round applied to two independent blocks, (X0, X1) and (X2, X3), with the
   // operations of the two blocks interleaved statement by statement.
#define THREEFISH_ROUND_2(X0, X1, X2, X3, SHL)                           \
   do {                                                                             \
      const __m256i SHR = _mm256_sub_epi64(_mm256_set1_epi64x(64), SHL);            \
      X0 = _mm256_add_epi64(X0, X1);                                                \
      X2 = _mm256_add_epi64(X2, X3);                                                \
      X1 = _mm256_or_si256(_mm256_sllv_epi64(X1, SHL), _mm256_srlv_epi64(X1, SHR)); \
      X3 = _mm256_or_si256(_mm256_sllv_epi64(X3, SHL), _mm256_srlv_epi64(X3, SHR)); \
      X1 = _mm256_xor_si256(X1, X0);                                                \
      X3 = _mm256_xor_si256(X3, X2);                                                \
      X0 = _mm256_permute4x64_epi64(X0, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X2 = _mm256_permute4x64_epi64(X2, _MM_SHUFFLE(0, 3, 2, 1));                   \
      X1 = _mm256_permute4x64_epi64(X1, _MM_SHUFFLE(1, 2, 3, 0));                   \
      X3 = _mm256_permute4x64_epi64(X3, _MM_SHUFFLE(1, 2, 3, 0));                   \
   } while(0)

   // Key injection for one block: X0 += K0 + T0 and X1 += K1 + R + T1, then R += ONE.
   // T0 places lane T0I of T in the highest lane; T1 places lane T1I of T in the
   // second-highest lane; every other lane of T0/T1 is lane 0 of T, which is zero.
   // Uses the variables T and ONE from the expansion site.
#define THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, T0I, T1I)                        \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      const __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X1 = _mm256_add_epi64(X1, R);                                              \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      R = _mm256_add_epi64(R, ONE);                                              \
   } while(0)

   // Key injection for two blocks at once. R is folded into T1 first so the same
   // sum is added to both X1 and X3.
#define THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, T0I, T1I)              \
   do {                                                                          \
      const __m256i T0 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(T0I, 0, 0, 0)); \
      __m256i T1 = _mm256_permute4x64_epi64(T, _MM_SHUFFLE(0, T1I, 0, 0)); \
      X0 = _mm256_add_epi64(X0, K0);                                             \
      X2 = _mm256_add_epi64(X2, K0);                                             \
      X1 = _mm256_add_epi64(X1, K1);                                             \
      X3 = _mm256_add_epi64(X3, K1);                                             \
      T1 = _mm256_add_epi64(T1, R);                                              \
      X0 = _mm256_add_epi64(X0, T0);                                             \
      X2 = _mm256_add_epi64(X2, T0);                                             \
      X1 = _mm256_add_epi64(X1, T1);                                             \
      X3 = _mm256_add_epi64(X3, T1);                                             \
      R = _mm256_add_epi64(R, ONE);                                              \
   } while(0)

   // Eight rounds on one block: four rounds, inject (K1, K2) with tweak lanes (T0, T1),
   // four more rounds, inject (K2, K3) with tweak lanes (T2, T0).
#define THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1, K2, K3, T0, T1, T2)        \
   do {                                                        \
      THREEFISH_ROUND(X0, X1, ROTATE_1);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_2);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_3);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_4);                       \
      THREEFISH_INJECT_KEY(X0, X1, R, K1, K2, T0, T1);         \
                                                               \
      THREEFISH_ROUND(X0, X1, ROTATE_5);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_6);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_7);                       \
      THREEFISH_ROUND(X0, X1, ROTATE_8);                       \
      THREEFISH_INJECT_KEY(X0, X1, R, K2, K3, T2, T0);         \
   } while(0)

   // Two-block variant of THREEFISH_ENC_8_ROUNDS.
#define THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1, K2, K3, T0, T1, T2) \
   do {                                                                  \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_1);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_2);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_3);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_4);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K1, K2, T0, T1);         \
                                                                         \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_5);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_6);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_7);                       \
      THREEFISH_ROUND_2(X0, X1, X2, X3, ROTATE_8);                       \
      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K2, K3, T2, T0);         \
   } while(0)

   /*
   v1.0 key schedule: 9 ymm registers (only need 2 or 3)
   (0,1,2,3),(4,5,6,7) [8]
   then mutating with vpermq
   */
   // Lane m (0 = lowest) of Kn holds K[(n + 2*m) % 9].
   const __m256i K0 = _mm256_set_epi64x(K[6], K[4], K[2], K[0]);
   const __m256i K1 = _mm256_set_epi64x(K[7], K[5], K[3], K[1]);
   const __m256i K2 = _mm256_set_epi64x(K[8], K[6], K[4], K[2]);
   const __m256i K3 = _mm256_set_epi64x(K[0], K[7], K[5], K[3]);
   const __m256i K4 = _mm256_set_epi64x(K[1], K[8], K[6], K[4]);
   const __m256i K5 = _mm256_set_epi64x(K[2], K[0], K[7], K[5]);
   const __m256i K6 = _mm256_set_epi64x(K[3], K[1], K[8], K[6]);
   const __m256i K7 = _mm256_set_epi64x(K[4], K[2], K[0], K[7]);
   const __m256i K8 = _mm256_set_epi64x(K[5], K[3], K[1], K[8]);

   // Round-counter increment: 1 in the highest lane only.
   const __m256i ONE = _mm256_set_epi64x(1, 0, 0, 0);

   const __m256i* in_mm = reinterpret_cast<const __m256i*>(in);
   __m256i* out_mm = reinterpret_cast<__m256i*>(out);

   // Main loop: two 64-byte blocks (four 256-bit vectors) per iteration.
   while(blocks >= 2)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);
      __m256i X2 = _mm256_loadu_si256(in_mm++);
      __m256i X3 = _mm256_loadu_si256(in_mm++);

      // Tweak lanes, lowest to highest: 0, T_64[2], T_64[1], T_64[0].
      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      // Key-injection counter, starts at zero for every block pair.
      __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

      // interleave_epi64 is defined elsewhere; presumably it regroups each block's
      // words into the layout the round macros expect (verify against its definition).
      interleave_epi64(X0, X1);
      interleave_epi64(X2, X3);

      THREEFISH_INJECT_KEY_2(X0, X1, X2, X3, R, K0, K1, 2, 3);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K5,K6,K7, 3, 1, 2);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K2,K3,K4, 3, 1, 2);

      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_2_8_ROUNDS(X0, X1, X2, X3, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);
      deinterleave_epi64(X2, X3);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      _mm256_storeu_si256(out_mm++, X2);
      _mm256_storeu_si256(out_mm++, X3);

      blocks -= 2;
      }

   // Tail: at most one block remains after the two-block loop.
   for(size_t i = 0; i != blocks; ++i)
      {
      __m256i X0 = _mm256_loadu_si256(in_mm++);
      __m256i X1 = _mm256_loadu_si256(in_mm++);

      const __m256i T = _mm256_set_epi64x(T_64[0], T_64[1], T_64[2], 0);

      __m256i R = _mm256_set_epi64x(0, 0, 0, 0);

      interleave_epi64(X0, X1);

      THREEFISH_INJECT_KEY(X0, X1, R, K0, K1, 2, 3);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K1,K2,K3, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K3,K4,K5, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K5,K6,K7, 3, 1, 2);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K7,K8,K0, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K0,K1,K2, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K2,K3,K4, 3, 1, 2);

      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K4,K5,K6, 1, 2, 3);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K6,K7,K8, 2, 3, 1);
      THREEFISH_ENC_8_ROUNDS(X0, X1, R, K8,K0,K1, 3, 1, 2);

      deinterleave_epi64(X0, X1);

      _mm256_storeu_si256(out_mm++, X0);
      _mm256_storeu_si256(out_mm++, X1);
      }

   // The helper macros are local to this function.
#undef THREEFISH_ENC_8_ROUNDS
#undef THREEFISH_ROUND
#undef THREEFISH_INJECT_KEY
#undef THREEFISH_ENC_2_8_ROUNDS
#undef THREEFISH_ROUND_2
#undef THREEFISH_INJECT_KEY_2
   }
    parasail_result_t *result = parasail_result_new();
#endif
#endif
    int32_t i = 0;
    int32_t j = 0;
    int32_t end_query = 0;
    int32_t end_ref = 0;
    int64_t score = NEG_INF;
    __m256i vNegInf = _mm256_set1_epi64x(NEG_INF);
    __m256i vNegInf0 = _mm256_srli_si256_rpl(vNegInf, 8); /* shift in a 0 */
    __m256i vOpen = _mm256_set1_epi64x(open);
    __m256i vGap  = _mm256_set1_epi64x(gap);
    __m256i vOne = _mm256_set1_epi64x(1);
    __m256i vN = _mm256_set1_epi64x(N);
    __m256i vNegOne = _mm256_set1_epi64x(-1);
    __m256i vI = _mm256_set_epi64x(0,1,2,3);
    __m256i vJreset = _mm256_set_epi64x(0,-1,-2,-3);
    __m256i vMaxScore = vNegInf;
    __m256i vEndI = vNegInf;
    __m256i vEndJ = vNegInf;
    __m256i vILimit = _mm256_set1_epi64x(s1Len);
    __m256i vILimit1 = _mm256_sub_epi64(vILimit, vOne);
    __m256i vJLimit = _mm256_set1_epi64x(s2Len);
    __m256i vJLimit1 = _mm256_sub_epi64(vJLimit, vOne);
    

    /* convert _s1 from char to int in range 0-23 */
    for (i=0; i<s1Len; ++i) {
        s1[i] = matrix->mapper[(unsigned char)_s1[i]];
    }
    /* pad back of s1 with dummy values */
Beispiel #9
0
/* Returns a 256-bit vector whose 64-bit lanes 0..3 hold v[0]..v[3].
   v must point to at least four readable elements.  */
__m256i
foo (long long *v)
{
  return _mm256_set_epi64x (v[3], v[2], v[1], v[0]);
}
Beispiel #10
0
/*
Fills the global shuffle-control masks, the all-ones vector and the bitsliced
round-constant tables.

Each pN_constants_bitB[r] is either all zero or has the byte 0xFF at bits 8..15
of 64-bit lane 1, depending on whether bit B of the constant for round r is set.
The tables are generated from the constant lists below instead of being written
out entry by entry, so they cannot drift from the listed sequence.
*/
void Initialize() {
	/*
	Round constants for p_1:
	01, 02, 05, 0a, 15, 0b, 17, 0e, 1d, 1b, 16, 0c

	Round constants for p_2:
	18, 11, 03, 07, 0f, 1f

	Round constants for p_3:
	1e, 1c, 19, 13, 06, 0d

	The 12-entry p4 tables hold the p_2 sequence followed by the p_3 sequence.
	*/
	static const unsigned char p1RoundConstants[12] = {
		0x01, 0x02, 0x05, 0x0a, 0x15, 0x0b, 0x17, 0x0e, 0x1d, 0x1b, 0x16, 0x0c };
	static const unsigned char p4RoundConstants[12] = {
		0x18, 0x11, 0x03, 0x07, 0x0f, 0x1f, 0x1e, 0x1c, 0x19, 0x13, 0x06, 0x0d };

	//Trailing comments give the byte rotation applied inside each 64-bit lane
	shuffleControlMaskFirstReg = _mm256_setr_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, //rotate by 0
		9, 10, 11, 12, 13, 14, 15, 8, //rotate by 1
		18, 19, 20, 21, 22, 23, 16, 17, //rotate by 2
		28, 29, 30, 31, 24, 25, 26, 27); //rotate by 4
	shuffleControlMaskSecondReg = _mm256_setr_epi8(
		7, 0, 1, 2, 3, 4, 5, 6, //rotate by 7
		255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
		255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
		255, 255, 255, 255, 255, 255, 255, 255); //Setting it to 0xFF makes shuffle zero the bits

	//Inverse masks: each lane is rotated the opposite way
	invShuffleControlMaskFirstReg = _mm256_setr_epi8(
		0, 1, 2, 3, 4, 5, 6, 7, //rotate by 0
		15, 8, 9, 10, 11, 12, 13, 14, //undo rotate by 1
		22, 23, 16, 17, 18, 19, 20, 21, //undo rotate by 2
		28, 29, 30, 31, 24, 25, 26, 27); //undo rotate by 4
	invShuffleControlMaskSecondReg = _mm256_setr_epi8(
		1, 2, 3, 4, 5, 6, 7, 0, //undo rotate by 7
		255, 255, 255, 255, 255, 255, 255, 255, //Setting it to 0xFF makes shuffle zero the bits
		255, 255, 255, 255, 255, 255, 255, 255,
		255, 255, 255, 255, 255, 255, 255, 255);

	m256iAllOne = _mm256_set1_epi64x(0xFFFFFFFFFFFFFFFF);

	//Set the bits to 1111'1111 in the column two, second row byte, if the roundconstant has a onebit on this indice
	const __m256i bitSet = _mm256_set_epi64x(0, 0, 0b0000000000000000000000000000000000000000000000001111111100000000, 0);
	const __m256i bitClear = _mm256_set_epi64x(0, 0, 0, 0);

	for (int round = 0; round < 12; ++round) {
		//p1
		const unsigned int rc1 = p1RoundConstants[round];
		p1_constants_bit0[round] = (rc1 & 0x01u) ? bitSet : bitClear;
		p1_constants_bit1[round] = (rc1 & 0x02u) ? bitSet : bitClear;
		p1_constants_bit2[round] = (rc1 & 0x04u) ? bitSet : bitClear;
		p1_constants_bit3[round] = (rc1 & 0x08u) ? bitSet : bitClear;
		p1_constants_bit4[round] = (rc1 & 0x10u) ? bitSet : bitClear;

		//p4
		//NOTE(review): the previous hand-written tables encoded rounds 8, 9 and 11
		//as 0x13, 0x19 and 0x0f, which disagrees with the sequence listed above
		//(0x19, 0x13, 0x0d). Confirm against known-answer test vectors.
		const unsigned int rc4 = p4RoundConstants[round];
		p4_constants_bit0[round] = (rc4 & 0x01u) ? bitSet : bitClear;
		p4_constants_bit1[round] = (rc4 & 0x02u) ? bitSet : bitClear;
		p4_constants_bit2[round] = (rc4 & 0x04u) ? bitSet : bitClear;
		p4_constants_bit3[round] = (rc4 & 0x08u) ? bitSet : bitClear;
		p4_constants_bit4[round] = (rc4 & 0x10u) ? bitSet : bitClear;
	}
}
Beispiel #11
0
/*
 * Runs 40 cipher rounds in place over the bitsliced state x.
 *
 * x   32 planes, handled as four groups of eight planes
 *     (x[0..7], x[8..15], x[16..23], x[24..31]).
 * rk  per-round key planes; rk[round][0..15] is XORed into x[0..15].
 *
 * Every round performs, in order: SubBytes, AddConstant, AddKey, ShiftRows
 * and MixColumns.
 */
void encrypt_64blocks(u256 x[32], u256 rk[40][16]) {

  int round, group, k;
  u256 c2_mask, saved;

  c2_mask = _mm256_set_epi64x(0x000000FF000000FFull,
                              0x000000FF000000FFull,
                              0x000000FF000000FFull,
                              0x000000FF000000FFull);

  for(round = 0; round < 40; round++){
    //SubBytes: applied independently to each group of eight planes.
    //All eight outputs are computed from the old planes before any is stored.
    for(group = 0; group < 4; group++) {
      u256 *s = x + 8*group;

      u256 out1 = XOR(s[3], NOR(s[0], s[1]));
      u256 out2 = XOR(s[7], NOR(s[4], s[5]));
      u256 out0 = XOR(s[2], NOR(out1, out2));
      u256 out3 = XOR(s[4], NOR(out0, out1));
      u256 out4 = XOR(s[6], NOR(out2, s[4]));
      u256 out5 = XOR(s[1], NOR(s[5], s[6]));
      u256 out6 = XOR(s[0], NOR(out5, out0));
      u256 out7 = XOR(s[5], NOR(out4, out6));

      s[0] = out0;
      s[1] = out1;
      s[2] = out2;
      s[3] = out3;
      s[4] = out4;
      s[5] = out5;
      s[6] = out6;
      s[7] = out7;
    }

    //AddConstant
    //This only adds c2. The other constants are added with the key
    x[22] = XOR(x[22], c2_mask);

    //AddKey: only the first sixteen planes receive key material
    for(k = 0; k < 16; k++) {
      x[k] = XOR(x[k], rk[round][k]);
    }

    //ShiftRows: plane groups 1, 2 and 3 go through SR1, SR2 and SR3
    for(k = 0; k < 8; k++) {
      x[8 + k]  = SR1(x[8 + k]);
      x[16 + k] = SR2(x[16 + k]);
      x[24 + k] = SR3(x[24 + k]);
    }

    //MixColumns: each k only touches planes k, 8+k, 16+k and 24+k, so the
    //four update steps can be done one column of planes at a time.
    for(k = 0; k < 8; k++) {
      saved = x[24 + k];

      x[24 + k] = XOR(x[16 + k], x[k]);
      x[16 + k] = XOR(x[8 + k], x[16 + k]);
      x[8 + k]  = x[k];
      x[k]      = XOR(saved, x[24 + k]);
    }
  }
}