예제 #1
0
파일: speck256.c 프로젝트: odzhan/tinycrypt
/*
 * Encrypt one two-word block in place, deriving each round key inside the
 * loop instead of reading a precomputed schedule.
 *
 * key: four 64-bit words, accessed through a uint64_t pointer.
 * in:  two 64-bit words, accessed through a uint64_t pointer and
 *      overwritten with the result.
 */
void speck128_encryptx(
    const void *key, 
    void *in)
{
  const uint64_t *kw  = (const uint64_t*)key;
  uint64_t       *blk = (uint64_t*)in;

  // rk is the current round key; l0..l2 are the remaining key words
  uint64_t rk = kw[0], l0 = kw[1], l1 = kw[2], l2 = kw[3];
  uint64_t lo = blk[0], hi = blk[1];
  uint64_t rnd, tmp;

  for (rnd = 0; rnd < 34; rnd++)
  {
    // apply the current round key to the block
    hi = (ROTR64(hi, 8) + lo) ^ rk;
    lo =  ROTL64(lo, 3) ^ hi;

    // derive the round key for the next iteration
    l0 = (ROTR64(l0, 8) + rk) ^ rnd;
    rk =  ROTL64(rk, 3) ^ l0;

    // cycle the three spare key words
    // (XCHG presumably swaps its first two arguments using the third
    //  as scratch -- confirm against the macro definition)
    XCHG(l2, l1, tmp);
    XCHG(l2, l0, tmp);
  }

  // write the result back over the input
  blk[0] = lo;
  blk[1] = hi;
}
예제 #2
0
파일: speck256.c 프로젝트: odzhan/tinycrypt
/*
 * Run 34 rounds over one two-word block in place, using a precomputed
 * table of 34 round keys.
 *
 * enc:  SPECK_DECRYPT selects the inverse rounds (round keys consumed from
 *       last to first); any other value selects the forward rounds.
 * in:   two 64-bit words, overwritten with the result.
 * keys: 34 64-bit round keys.
 */
void speck128_encrypt(
    int enc, 
    void *in,
    const void *keys)
{
  const uint64_t *rk  = (const uint64_t*)keys;
  uint64_t       *blk = (uint64_t*)in;
  uint64_t lo = blk[0];
  uint64_t hi = blk[1];
  int r;

  if (enc == SPECK_DECRYPT)
  {
    // inverse rounds walk the schedule backwards: rk[33] .. rk[0]
    for (r = 34 - 1; r >= 0; r--)
    {
      lo = ROTR64(lo  ^ hi, 3);
      hi = ROTL64((hi ^ rk[r]) - lo, 8);
    }
  }
  else
  {
    for (r = 0; r < 34; r++)
    {
      hi = (ROTR64(hi, 8) + lo) ^ rk[r];
      lo = ROTL64(lo, 3)  ^ hi;
    }
  }

  // write the result back over the input
  blk[0] = lo;
  blk[1] = hi;
}
예제 #3
0
파일: speck256.c 프로젝트: odzhan/tinycrypt
/*
 * Expand a four-word key into 34 round keys.
 *
 * in:  four 64-bit key words, accessed through a uint64_t pointer.
 * out: receives 34 64-bit round keys (out[0] is the first key word
 *      unchanged).
 */
void speck128_setkey(
    const void *in, 
    void *out)
{
  const uint64_t *kw    = (const uint64_t*)in;
  uint64_t       *sched = (uint64_t*)out;
  uint64_t rk = kw[0], l0 = kw[1], l1 = kw[2], l2 = kw[3];
  uint64_t rnd, tmp;

  for (rnd = 0; rnd != 34; ++rnd)
  {
    // emit the current round key before advancing the schedule
    sched[rnd] = rk;

    l0 = (ROTR64(l0, 8) + rk) ^ rnd;
    rk = ROTL64(rk,  3) ^ l0;

    // cycle the three spare key words
    // (XCHG presumably swaps its first two arguments using the third
    //  as scratch -- confirm against the macro definition)
    XCHG(l2, l1, tmp);
    XCHG(l2, l0, tmp);
  }
}
예제 #4
0
파일: sha3.c 프로젝트: Wagnerp/hashdeep
// Apply `rounds` rounds of the Keccak-f round function to the 25-lane
// state in place. Round r reads keccakf_rndc[r], so `rounds` must not
// exceed the length of that table (defined outside this function).
static void keccakf(uint64_t st[25], int rounds)
{
  uint64_t parity[5];
  uint64_t carry;
  int rnd;

  for (rnd = 0; rnd < rounds; rnd++)
  {
    int x, y, step;

    // Theta: column parities, then fold the neighbouring parities
    // into every lane of each column
    for (x = 0; x < 5; x++)
    {
      parity[x] = st[x];
      for (y = 5; y < 25; y += 5)
      {
        parity[x] ^= st[y + x];
      }
    }

    for (x = 0; x < 5; x++)
    {
      carry = parity[(x + 4) % 5] ^ ROTL64(parity[(x + 1) % 5], 1);
      for (y = 0; y < 25; y += 5)
      {
        st[y + x] ^= carry;
      }
    }

    // Rho Pi: follow the lane cycle given by keccakf_piln, rotating the
    // lane carried from the previous step before it is stored
    carry = st[1];
    for (step = 0; step < 24; step++)
    {
      const int dst = keccakf_piln[step];
      const uint64_t displaced = st[dst];

      st[dst] = ROTL64(carry, keccakf_rotc[step]);
      carry = displaced;
    }

    // Chi: per row, combine each lane with its two right-hand neighbours,
    // working from a snapshot of the row
    for (y = 0; y < 25; y += 5)
    {
      uint64_t *row = &st[y];

      for (x = 0; x < 5; x++)
      {
        parity[x] = row[x];
      }
      for (x = 0; x < 5; x++)
      {
        row[x] ^= (~parity[(x + 1) % 5]) & parity[(x + 2) % 5];
      }
    }

    // Iota
    st[0] ^= keccakf_rndc[rnd];
  }
}
예제 #5
0
파일: siphash2.c 프로젝트: odzhan/tinycrypt
/*
 * Compute a 64-bit SipHash tag over `inlen` bytes of `in`.
 *
 * out:   receives one 64-bit word (written through a uint64_t pointer).
 * in:    message bytes; consumed least-significant byte first.
 * inlen: message length in bytes. A negative length absorbs no message
 *        words at all and goes straight to finalisation.
 * key:   two 64-bit words, read through a uint64_t pointer.
 *
 * The message is absorbed 8 bytes at a time with perm(0, ...). The final
 * word carries the 0..7 leftover bytes in its low bytes and the low 8 bits
 * of the length in its top byte. perm(1, ...) then finalises the state.
 */
void siphash24x(void *out, void const *in, 
  int inlen, void const *key)
{
  uint64_t v[4];
  uint64_t k0, k1, x=0, b=inlen;
  int      r, i, len=inlen;
  const uint8_t *p=(const uint8_t*)in;
  
  // the length byte lives in the top 8 bits of the final word
  b <<= 56;
  
  k0 = ((const uint64_t*)key)[0];
  k1 = ((const uint64_t*)key)[1];
  
  v[0] = k0 ^ 0x736F6D6570736575ULL;
  v[1] = k1 ^ 0x646F72616E646F6DULL;
  v[2] = k0 ^ 0x6C7967656E657261ULL;
  v[3] = k1 ^ 0x7465646279746573ULL;

  // process all bytes; the loop is left explicitly after the final word,
  // because len never drops below zero and so cannot end the loop itself
  while (len >= 0)
  {
    // gather the next 8 bytes, or whatever remains
    x = 0;
    r = (len < 8) ? len : 8;
    for (i=0; i<r; i++) {
      // each byte enters at the bottom and is rotated to the top, so
      // after r bytes the data sits in the top r bytes of x
      x |= *p++;
      x  = ROTR64(x, 8);
    }
    len -= r;

    // if this was the last block
    if (len<=0) {
      if (r<8) {
        // bring the r tail bytes down to the low end; skipped for an
        // empty tail, where x is already 0 and a rotate by 0 may shift
        // by the full word width depending on how ROTL64 is defined
        if (r != 0) {
          x = ROTL64(x, (r << 3));
        }
        x |= b;
      } else {
        // a full final block is absorbed as-is, followed by a word
        // holding only the length byte
        perm(0, x, v);
        x=b;
      }
      perm(0, x, v);
      break;
    }
    // ordinary full block
    perm(0, x, v);
  }
  // finalisation (x is not used by perm when its first argument is set)
  perm(1, x, v);
  
  ((uint64_t*)out)[0] = v[0] ^ v[1] ^ v[2] ^ v[3];
}
예제 #6
0
/*
 * SHA3 theta() step: XOR into every lane of column x the parity of
 * column x+4 and the 1-bit left rotation of the parity of column x+1
 * (column indices taken modulo 5).
 */
static void Sha3Theta(uint64_t *A)
{
	uint64_t parity[5], mix;
	unsigned int col, row;

	/* column parities, taken before any lane is modified */
	for (col = 0; col < 5; col++) {
		parity[col] = A[col];
		for (row = 5; row < 25; row += 5) {
			parity[col] ^= A[col + row];
		}
	}

	for (col = 0; col < 5; col++) {
		mix = ROTL64(parity[(col + 1) % 5], 1) ^ parity[(col + 4) % 5];
		for (row = 0; row < 25; row += 5) {
			A[col + row] ^= mix;
		}
	}
}
예제 #7
0
파일: keccak.c 프로젝트: madwyn/keccak
/* Applies `_rounds` rounds of the Keccak-f round function to the 25-lane
 * state `_s` in place. Round r reads keccakf_rndc[r], so `_rounds` must not
 * exceed the length of that table (defined outside this function). */
void keccakf(uint64_t _s[25], const int _rounds){
    uint64_t col[5];
    uint64_t carry, saved;
    int r, x, y, step, dst;

    for(r = 0; r < _rounds; r++){
        /* Theta: column parities first, then mix them into each column */
        for(x = 0; x < 5; x++){
            col[x] = _s[x];
            for(y = 5; y < 25; y += 5)
                col[x] ^= _s[x + y];
        }
        for(x = 0; x < 5; x++){
            carry = col[(x + 4) % 5] ^ ROTL64(col[(x + 1) % 5], 1);
            _s[x]      ^= carry;
            _s[x + 5]  ^= carry;
            _s[x + 10] ^= carry;
            _s[x + 15] ^= carry;
            _s[x + 20] ^= carry;
        }

        /* Rho and Pi: walk the lane cycle in keccakf_piln, storing the
         * rotated lane carried over from the previous step */
        carry = _s[1];
        for(step = 0; step < 24; step++){
            dst = keccakf_piln[step];
            saved = _s[dst];
            _s[dst] = ROTL64(carry, keccakf_rotc[step]);
            carry = saved;
        }

        /* Chi: updated in place per row; only the first two lanes need
         * saving because they are read again after being overwritten */
        for(y = 0; y < 25; y += 5){
            saved = _s[y];
            carry = _s[y + 1];
            _s[y]     ^= (~_s[y + 1]) & _s[y + 2];
            _s[y + 1] ^= (~_s[y + 2]) & _s[y + 3];
            _s[y + 2] ^= (~_s[y + 3]) & _s[y + 4];
            _s[y + 3] ^= (~_s[y + 4]) & saved;
            _s[y + 4] ^= (~saved) & carry;
        }

        /* Iota */
        _s[0] ^= keccakf_rndc[r];
    }
}
예제 #8
0
파일: siphash2.c 프로젝트: odzhan/tinycrypt
/*
 * Mix the four-word state `v` in place.
 *
 * last == 0: XOR `x` into v[3], run 2 rounds, then XOR `x` into v[0].
 * last != 0: XOR 0xff into v[2] and run (2 << last) rounds, i.e. 4 rounds
 *            for last == 1; `x` is ignored.
 */
void perm(int last, uint64_t x, uint64_t* v)
{
  uint64_t a = v[0];
  uint64_t b = v[1];
  uint64_t c = v[2];
  uint64_t d = v[3];
  int remaining = 2 << last;

  if (!last) {
    d ^= x;
  } else {
    c ^= 0xff;
  }

  do {
    // first half-round
    a += b;
    b = ROTL64(b, 13);
    b ^= a;
    a = ROTL64(a, 32);

    c += d;
    d = ROTL64(d, 16);
    d ^= c;

    // second half-round
    c += b;
    b = ROTL64(b, 17);
    b ^= c;
    c = ROTL64(c, 32);

    a += d;
    d = ROTL64(d, 21);
    d ^= a;
  } while (--remaining);

  if (!last) {
    a ^= x;
  }

  v[0] = a;
  v[1] = b;
  v[2] = c;
  v[3] = d;
}
예제 #9
0
파일: c_keccak.c 프로젝트: baudy2/xmr-stak
// Apply `rounds` rounds of the Keccak-f round function to the 25-lane
// state in place. Round r reads keccakf_rndc[r], so `rounds` must not
// exceed the length of that table (defined outside this function).
void keccakf(uint64_t st[25], int rounds)
{
	uint64_t parity[5];
	uint64_t t, first, second;
	int lane, row, rnd;

	for (rnd = 0; rnd < rounds; ++rnd) {

		// Theta: column parities, then mix into every lane of the column
		for (lane = 0; lane < 5; ++lane) {
			parity[lane] = st[lane] ^ st[lane + 5] ^ st[lane + 10] ^ st[lane + 15] ^ st[lane + 20];
		}

		for (lane = 0; lane < 5; ++lane) {
			t = parity[(lane + 4) % 5] ^ ROTL64(parity[(lane + 1) % 5], 1);
			for (row = 0; row < 25; row += 5) {
				st[row + lane] ^= t;
			}
		}

		// Rho Pi: one 24-step chain. Each statement overwrites the lane
		// that the previous statement has just read, so the order is fixed.
		t = st[1];
		st[ 1] = ROTL64(st[ 6], 44);
		st[ 6] = ROTL64(st[ 9], 20);
		st[ 9] = ROTL64(st[22], 61);
		st[22] = ROTL64(st[14], 39);
		st[14] = ROTL64(st[20], 18);
		st[20] = ROTL64(st[ 2], 62);
		st[ 2] = ROTL64(st[12], 43);
		st[12] = ROTL64(st[13], 25);
		st[13] = ROTL64(st[19],  8);
		st[19] = ROTL64(st[23], 56);
		st[23] = ROTL64(st[15], 41);
		st[15] = ROTL64(st[ 4], 27);
		st[ 4] = ROTL64(st[24], 14);
		st[24] = ROTL64(st[21],  2);
		st[21] = ROTL64(st[ 8], 55);
		st[ 8] = ROTL64(st[16], 45);
		st[16] = ROTL64(st[ 5], 36);
		st[ 5] = ROTL64(st[ 3], 28);
		st[ 3] = ROTL64(st[18], 21);
		st[18] = ROTL64(st[17], 15);
		st[17] = ROTL64(st[11], 10);
		st[11] = ROTL64(st[ 7],  6);
		st[ 7] = ROTL64(st[10],  3);
		st[10] = ROTL64(t, 1);

		// Chi: every row is updated in place. Only the first two lanes
		// are saved, since they are read again after being overwritten.
		for (row = 0; row < 25; row += 5) {
			uint64_t *r = &st[row];

			first  = r[0];
			second = r[1];

			r[0] ^= (~r[1]) & r[2];
			r[1] ^= (~r[2]) & r[3];
			r[2] ^= (~r[3]) & r[4];
			r[3] ^= (~r[4]) & first;
			r[4] ^= (~first) & second;
		}

		// Iota
		st[0] ^= keccakf_rndc[rnd];
	}
}
예제 #10
0
/*
 * Hash `len` bytes at `key` with the given seed and store two 64-bit words
 * through `out` (out[0] = h1, out[1] = h2).
 *
 * The input is consumed as 16-byte blocks fetched through getblock64,
 * followed by a tail of up to 15 bytes.
 */
void
murmur_hash_process3_x64_128(const char * key, uint32_t len, uint32_t seed, void *out)
{
  const uint64_t c1 = (uint64_t)BIG_CONSTANT(0x87c37b91114253d5);
  const uint64_t c2 = (uint64_t)BIG_CONSTANT(0x4cf5ad432745937f);

  const uint8_t * data = (const uint8_t*)key;
  const int nblocks = len / 16;
  const uint64_t * blocks = (const uint64_t *)(data);
  const uint8_t * tail = (const uint8_t*)(data + nblocks*16);
  const uint32_t rem = len & 15;
  uint64_t * result = (uint64_t*)out;

  uint64_t h1 = seed;
  uint64_t h2 = seed;
  uint64_t k1, k2;
  uint32_t pos;
  int blk;

  //----------
  // body: two 64-bit words per 16-byte block

  for(blk = 0; blk < nblocks; blk++)
  {
    k1 = getblock64(blocks,blk*2+0);
    k2 = getblock64(blocks,blk*2+1);

    k1 *= c1;
    k1  = ROTL64(k1,31);
    k1 *= c2;
    h1 ^= k1;

    h1 = ROTL64(h1,27);
    h1 += h2;
    h1 = h1*5+0x52dce729;

    k2 *= c2;
    k2  = ROTL64(k2,33);
    k2 *= c1;
    h2 ^= k2;

    h2 = ROTL64(h2,31);
    h2 += h1;
    h2 = h2*5+0x38495ab5;
  }

  //----------
  // tail: bytes 8..14 feed k2, bytes 0..7 feed k1, each byte placed at
  // bit offset 8 * (index within its word)

  k1 = 0;
  k2 = 0;

  if(rem > 8)
  {
    for(pos = rem; pos > 8; pos--)
      k2 ^= ((uint64_t)tail[pos - 1]) << (8 * (pos - 9));

    k2 *= c2;
    k2  = ROTL64(k2,33);
    k2 *= c1;
    h2 ^= k2;
  }

  if(rem > 0)
  {
    for(pos = (rem < 8) ? rem : 8; pos > 0; pos--)
      k1 ^= ((uint64_t)tail[pos - 1]) << (8 * (pos - 1));

    k1 *= c1;
    k1  = ROTL64(k1,31);
    k1 *= c2;
    h1 ^= k1;
  }

  //----------
  // finalization

  h1 ^= len;
  h2 ^= len;

  h1 += h2;
  h2 += h1;

  h1 = fmix64(h1);
  h2 = fmix64(h2);

  h1 += h2;
  h2 += h1;

  result[0] = h1;
  result[1] = h2;
}
예제 #11
0
/*
 * Absorb one input block into the sponge state and run the permutation.
 *
 * XORs SCRYPT_HASH_BLOCK_SIZE bytes from `in` into S->state, 8 bytes at a
 * time via U8TO64_LE (presumably a little-endian 64-bit load -- confirm
 * against the macro), then applies 24 rounds of theta, rho/pi, chi and iota
 * in place. All steps are fully unrolled over the 25-word state.
 */
static void
keccak_block(scrypt_hash_state *S, const uint8_t *in) {
	size_t i;
	/* t: column parities, u: theta deltas, v/w: saved lanes for rho/pi and chi */
	uint64_t *s = S->state, t[5], u[5], v, w;

	/* absorb input */
	for (i = 0; i < SCRYPT_HASH_BLOCK_SIZE / 8; i++, in += 8)
		s[i] ^= U8TO64_LE(in);
	
	for (i = 0; i < 24; i++) {
		/* theta: c = a[0,i] ^ a[1,i] ^ .. a[4,i] */
		t[0] = s[0] ^ s[5] ^ s[10] ^ s[15] ^ s[20];
		t[1] = s[1] ^ s[6] ^ s[11] ^ s[16] ^ s[21];
		t[2] = s[2] ^ s[7] ^ s[12] ^ s[17] ^ s[22];
		t[3] = s[3] ^ s[8] ^ s[13] ^ s[18] ^ s[23];
		t[4] = s[4] ^ s[9] ^ s[14] ^ s[19] ^ s[24];

		/* theta: d[i] = c[i+4] ^ rotl(c[i+1],1) */
		u[0] = t[4] ^ ROTL64(t[1], 1);
		u[1] = t[0] ^ ROTL64(t[2], 1);
		u[2] = t[1] ^ ROTL64(t[3], 1);
		u[3] = t[2] ^ ROTL64(t[4], 1);
		u[4] = t[3] ^ ROTL64(t[0], 1);

		/* theta: a[0,i], a[1,i], .. a[4,i] ^= d[i] */
		s[0] ^= u[0]; s[5] ^= u[0]; s[10] ^= u[0]; s[15] ^= u[0]; s[20] ^= u[0];
		s[1] ^= u[1]; s[6] ^= u[1]; s[11] ^= u[1]; s[16] ^= u[1]; s[21] ^= u[1];
		s[2] ^= u[2]; s[7] ^= u[2]; s[12] ^= u[2]; s[17] ^= u[2]; s[22] ^= u[2];
		s[3] ^= u[3]; s[8] ^= u[3]; s[13] ^= u[3]; s[18] ^= u[3]; s[23] ^= u[3];
		s[4] ^= u[4]; s[9] ^= u[4]; s[14] ^= u[4]; s[19] ^= u[4]; s[24] ^= u[4];

		/* rho pi: b[..] = rotl(a[..], ..) */
		/* One chain: each statement overwrites the lane the previous one
		   just read, so the order must not change. s[1] is saved in v and
		   stored last. */
		v = s[ 1];
		s[ 1] = ROTL64(s[ 6], 44);
		s[ 6] = ROTL64(s[ 9], 20);
		s[ 9] = ROTL64(s[22], 61);
		s[22] = ROTL64(s[14], 39);
		s[14] = ROTL64(s[20], 18);
		s[20] = ROTL64(s[ 2], 62);
		s[ 2] = ROTL64(s[12], 43);
		s[12] = ROTL64(s[13], 25);
		s[13] = ROTL64(s[19],  8);
		s[19] = ROTL64(s[23], 56);
		s[23] = ROTL64(s[15], 41);
		s[15] = ROTL64(s[ 4], 27);
		s[ 4] = ROTL64(s[24], 14);
		s[24] = ROTL64(s[21],  2);
		s[21] = ROTL64(s[ 8], 55);
		s[ 8] = ROTL64(s[16], 45);
		s[16] = ROTL64(s[ 5], 36);
		s[ 5] = ROTL64(s[ 3], 28);
		s[ 3] = ROTL64(s[18], 21);
		s[18] = ROTL64(s[17], 15);
		s[17] = ROTL64(s[11], 10);
		s[11] = ROTL64(s[ 7],  6);
		s[ 7] = ROTL64(s[10],  3);
		s[10] = ROTL64(    v,  1);

		/* chi: a[i,j] ^= ~b[i,j+1] & b[i,j+2] */
		/* Done in place, one row per line: v and w keep the row's first two
		   lanes, which are read again after they have been overwritten. */
		v = s[ 0]; w = s[ 1]; s[ 0] ^= (~w) & s[ 2]; s[ 1] ^= (~s[ 2]) & s[ 3]; s[ 2] ^= (~s[ 3]) & s[ 4]; s[ 3] ^= (~s[ 4]) & v; s[ 4] ^= (~v) & w;
		v = s[ 5]; w = s[ 6]; s[ 5] ^= (~w) & s[ 7]; s[ 6] ^= (~s[ 7]) & s[ 8]; s[ 7] ^= (~s[ 8]) & s[ 9]; s[ 8] ^= (~s[ 9]) & v; s[ 9] ^= (~v) & w;
		v = s[10]; w = s[11]; s[10] ^= (~w) & s[12]; s[11] ^= (~s[12]) & s[13]; s[12] ^= (~s[13]) & s[14]; s[13] ^= (~s[14]) & v; s[14] ^= (~v) & w;
		v = s[15]; w = s[16]; s[15] ^= (~w) & s[17]; s[16] ^= (~s[17]) & s[18]; s[17] ^= (~s[18]) & s[19]; s[18] ^= (~s[19]) & v; s[19] ^= (~v) & w;
		v = s[20]; w = s[21]; s[20] ^= (~w) & s[22]; s[21] ^= (~s[22]) & s[23]; s[22] ^= (~s[23]) & s[24]; s[23] ^= (~s[24]) & v; s[24] ^= (~v) & w;

		/* iota: a[0,0] ^= round constant */
		s[0] ^= keccak_round_constants[i];
	}
}
예제 #12
0
/*
 * Run NumberOfRounds rounds over the 25-word state in place. Each round
 * applies theta, rho, pi, chi and iota, in that order.
 */
static void Sha3Permutation(uint64_t *state)
{
	/* rho() rotation count for each lane; lane 0 is not rotated */
	static const unsigned char rho_offset[25] = {
		 0,  1, 62, 28, 27,
		36, 44,  6, 55, 20,
		 3, 10, 43, 25, 39,
		41, 45, 15, 21,  8,
		18,  2, 61, 56, 14
	};
	int rnd;
	int lane;

	for (rnd = 0; rnd < NumberOfRounds; rnd++)
	{
		Sha3Theta(state);

		/* apply SHA3 rho() transformation: each lane is rotated
		 * independently, so the iteration order does not matter */
		for (lane = 1; lane < 25; lane++)
		{
			state[lane] = ROTL64(state[lane], rho_offset[lane]);
		}

		Sha3Pi(state);
		Sha3Chi(state);

		/* apply iota(state, round) */
		state[0] ^= Sha3RoundConstants[rnd];
	}
}
예제 #13
0
/* Apply SHA3_ROUNDS rounds of the permutation to state->a, in place.

   Theta, rho and pi are fused: each lane is XORed with its column's
   theta value, rotated, and stored at its permuted position in a single
   statement. Chi and iota follow. The column parities needed by the next
   round's theta are accumulated while chi runs.

   NOTE(review): ROTL64 is called here with the rotation count as its
   FIRST argument and the value as its second. Confirm against the macro
   definition before changing any call. */
void
sha3_permute (struct sha3_state *state)
{
  /* Per-round constants XORed into A[0] (iota step). */
  static const uint64_t rc[SHA3_ROUNDS] = {
    0x0000000000000001ULL, 0X0000000000008082ULL,
    0X800000000000808AULL, 0X8000000080008000ULL,
    0X000000000000808BULL, 0X0000000080000001ULL,
    0X8000000080008081ULL, 0X8000000000008009ULL,
    0X000000000000008AULL, 0X0000000000000088ULL,
    0X0000000080008009ULL, 0X000000008000000AULL,
    0X000000008000808BULL, 0X800000000000008BULL,
    0X8000000000008089ULL, 0X8000000000008003ULL,
    0X8000000000008002ULL, 0X8000000000000080ULL,
    0X000000000000800AULL, 0X800000008000000AULL,
    0X8000000080008081ULL, 0X8000000000008080ULL,
    0X0000000080000001ULL, 0X8000000080008008ULL,
  };

  /* Original permutation:
     
       0,10,20, 5,15,
      16, 1,11,21, 6,
       7,17, 2,12,22,
      23, 8,18, 3,13,
      14,24, 9,19, 4

     Rotation counts:

       0,  1, 62, 28, 27,
      36, 44,  6, 55, 20,
       3, 10, 43, 25, 39,
      41, 45, 15, 21,  8,
      18,  2, 61, 56, 14,
  */

  /* In-place implementation. Permutation done as a long sequence of
     25 moves "following" the permutation.

      T <--  1
      1 <--  6
      6 <--  9
      9 <-- 22
     22 <-- 14
     14 <-- 20
     20 <--  2
      2 <-- 12
     12 <-- 13
     13 <-- 19
     19 <-- 23
     23 <-- 15
     15 <--  4
      4 <-- 24
     24 <-- 21
     21 <--  8
      8 <-- 16
     16 <--  5
      5 <--  3
      3 <-- 18
     18 <-- 17
     17 <-- 11
     11 <--  7
      7 <-- 10
     10 <--  T

  */
  /* C: column parities; D: theta deltas, reused as scratch for chi;
     T: rotated lane 1, held until lane 10 is free; X: lane after theta. */
  uint64_t C[5], D[5], T, X;
  unsigned i, y;

#define A state->a

  /* Column parities for the first round; later rounds get them from the
     chi step at the bottom of the loop. */
  C[0] = A[0] ^ A[5+0] ^ A[10+0] ^ A[15+0] ^ A[20+0];
  C[1] = A[1] ^ A[5+1] ^ A[10+1] ^ A[15+1] ^ A[20+1];
  C[2] = A[2] ^ A[5+2] ^ A[10+2] ^ A[15+2] ^ A[20+2];
  C[3] = A[3] ^ A[5+3] ^ A[10+3] ^ A[15+3] ^ A[20+3];
  C[4] = A[4] ^ A[5+4] ^ A[10+4] ^ A[15+4] ^ A[20+4];

  for (i = 0; i < SHA3_ROUNDS; i++)
    {
      /* Theta deltas: D[x] = C[x-1] ^ rotl(C[x+1], 1), indices mod 5. */
      D[0] = C[4] ^ ROTL64(1, C[1]);
      D[1] = C[0] ^ ROTL64(1, C[2]);
      D[2] = C[1] ^ ROTL64(1, C[3]);
      D[3] = C[2] ^ ROTL64(1, C[4]);
      D[4] = C[3] ^ ROTL64(1, C[0]);

      /* Fused theta + rho + pi. Lane 0 is neither rotated nor moved.
	 Every other statement reads a source lane, applies that lane's
	 theta delta (D index = source index mod 5), rotates it, and
	 stores it in the lane read by the previous statement, so the
	 order must not change. */
      A[0] ^= D[0];
      X = A[ 1] ^ D[1];     T = ROTL64(1, X);
      X = A[ 6] ^ D[1]; A[ 1] = ROTL64 (44, X);
      X = A[ 9] ^ D[4]; A[ 6] = ROTL64 (20, X);
      X = A[22] ^ D[2]; A[ 9] = ROTL64 (61, X);
      X = A[14] ^ D[4]; A[22] = ROTL64 (39, X);
      X = A[20] ^ D[0]; A[14] = ROTL64 (18, X);
      X = A[ 2] ^ D[2]; A[20] = ROTL64 (62, X);
      X = A[12] ^ D[2]; A[ 2] = ROTL64 (43, X);
      X = A[13] ^ D[3]; A[12] = ROTL64 (25, X);
      X = A[19] ^ D[4]; A[13] = ROTL64 ( 8, X);
      X = A[23] ^ D[3]; A[19] = ROTL64 (56, X);
      X = A[15] ^ D[0]; A[23] = ROTL64 (41, X);
      X = A[ 4] ^ D[4]; A[15] = ROTL64 (27, X);
      X = A[24] ^ D[4]; A[ 4] = ROTL64 (14, X);
      X = A[21] ^ D[1]; A[24] = ROTL64 ( 2, X);
      X = A[ 8] ^ D[3]; A[21] = ROTL64 (55, X); /* row 4 done */
      X = A[16] ^ D[1]; A[ 8] = ROTL64 (45, X);
      X = A[ 5] ^ D[0]; A[16] = ROTL64 (36, X);
      X = A[ 3] ^ D[3]; A[ 5] = ROTL64 (28, X);
      X = A[18] ^ D[3]; A[ 3] = ROTL64 (21, X); /* row 0 done */
      X = A[17] ^ D[2]; A[18] = ROTL64 (15, X);
      X = A[11] ^ D[1]; A[17] = ROTL64 (10, X); /* row 3 done */
      X = A[ 7] ^ D[2]; A[11] = ROTL64 ( 6, X); /* row 1 done */
      X = A[10] ^ D[0]; A[ 7] = ROTL64 ( 3, X);
      A[10] = T;				/* row 2 done */

      /* Chi on row 0, with D reused as scratch for the ~a & b terms. */
      D[0] = ~A[1] & A[2];
      D[1] = ~A[2] & A[3];
      D[2] = ~A[3] & A[4];
      D[3] = ~A[4] & A[0];
      D[4] = ~A[0] & A[1];

      /* Iota is folded into the row-0 update of A[0]. C restarts from
	 the new row 0 to build the next round's column parities. */
      A[0] ^= D[0] ^ rc[i]; C[0] = A[0];
      A[1] ^= D[1]; C[1] = A[1];
      A[2] ^= D[2]; C[2] = A[2];
      A[3] ^= D[3]; C[3] = A[3];
      A[4] ^= D[4]; C[4] = A[4];

      /* Chi on rows 1..4, accumulating each updated lane into C. */
      for (y = 5; y < 25; y+= 5)
	{
	  D[0] = ~A[y+1] & A[y+2];
	  D[1] = ~A[y+2] & A[y+3];
	  D[2] = ~A[y+3] & A[y+4];
	  D[3] = ~A[y+4] & A[y+0];
	  D[4] = ~A[y+0] & A[y+1];

	  A[y+0] ^= D[0]; C[0] ^= A[y+0];
	  A[y+1] ^= D[1]; C[1] ^= A[y+1];
	  A[y+2] ^= D[2]; C[2] ^= A[y+2];
	  A[y+3] ^= D[3]; C[3] ^= A[y+3];
	  A[y+4] ^= D[4]; C[4] ^= A[y+4];
	}
    }
#undef A
}