Exemplo n.º 1
0
static void sha256_compress(unsigned int* iv, const uint8_t* data) {
  unsigned int a, b, c, d, e, f, g, h;
  unsigned int s0, s1;
  unsigned int t1, t2;
  unsigned int work_space[16];
  unsigned int n;
  unsigned int i;

  a = iv[0];
  b = iv[1];
  c = iv[2];
  d = iv[3];
  e = iv[4];
  f = iv[5];
  g = iv[6];
  h = iv[7];

  for (i = 0; i < 16; ++i) {
    n = BigEndian(&data);
    t1 = work_space[i] = n;
    t1 += h + Sigma1(e) + Ch(e, f, g) + k256[i];
    t2 = Sigma0(a) + Maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + t1;
    d = c;
    c = b;
    b = a;
    a = t1 + t2;
  }

  for (; i < 64; ++i) {
    s0 = work_space[(i + 1) & 0x0f];
    s0 = sigma0(s0);
    s1 = work_space[(i + 14) & 0x0f];
    s1 = sigma1(s1);

    t1 = work_space[i & 0xf] += s0 + s1 + work_space[(i + 9) & 0xf];
    t1 += h + Sigma1(e) + Ch(e, f, g) + k256[i];
    t2 = Sigma0(a) + Maj(a, b, c);
    h = g;
    g = f;
    f = e;
    e = d + t1;
    d = c;
    c = b;
    b = a;
    a = t1 + t2;
  }

  iv[0] += a;
  iv[1] += b;
  iv[2] += c;
  iv[3] += d;
  iv[4] += e;
  iv[5] += f;
  iv[6] += g;
  iv[7] += h;
}
Exemplo n.º 2
0
static void sha256_block (SHA256_CTX *ctx, const void *in, size_t num, int host)
	{
	unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2;
	SHA_LONG	X[16];
	int i;
	const unsigned char *data=in;

			while (num--) {

	a = ctx->h[0];	b = ctx->h[1];	c = ctx->h[2];	d = ctx->h[3];
	e = ctx->h[4];	f = ctx->h[5];	g = ctx->h[6];	h = ctx->h[7];

	if (host)
		{
		const SHA_LONG *W=(const SHA_LONG *)data;

		for (i=0;i<16;i++)
			{
			T1 = X[i] = W[i];
			T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
			T2 = Sigma0(a) + Maj(a,b,c);
			h = g;	g = f;	f = e;	e = d + T1;
			d = c;	c = b;	b = a;	a = T1 + T2;
			}

		data += SHA256_CBLOCK;
		}
	else
		{
		SHA_LONG l;

		for (i=0;i<16;i++)
			{
			HOST_c2l(data,l); T1 = X[i] = l;
			T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
			T2 = Sigma0(a) + Maj(a,b,c);
			h = g;	g = f;	f = e;	e = d + T1;
			d = c;	c = b;	b = a;	a = T1 + T2;
			}
		}

	for (;i<64;i++)
		{
		s0 = X[(i+1)&0x0f];	s0 = sigma0(s0);
		s1 = X[(i+14)&0x0f];	s1 = sigma1(s1);

		T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
		T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;	g = f;	f = e;	e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	ctx->h[0] += a;	ctx->h[1] += b;	ctx->h[2] += c;	ctx->h[3] += d;
	ctx->h[4] += e;	ctx->h[5] += f;	ctx->h[6] += g;	ctx->h[7] += h;

			}
}
/**
 * sha512 compression function - 32-bit machines
 * @param res The resulting hash value
 * @param hash The chaining input value
 * @param in The message input
 */
void sha512_comp (hashblock res, const hashblock hash, const messageblock in)
	{
	const uint64_t *W=in;
	uint64_t	A,E,T;
	uint64_t	X[9+80],*F;
	uint64_t H[8];
	int i;

   for (i = 0; i < SHA512_DIGEST_LENGTH/8; i++) {
	   H[i]=PULL64(hash[i*8]);
	}

	F    = X+80;
	A    = H[0];	F[1] = H[1];
	F[2] = H[2];	F[3] = H[3];
	E    = H[4];	F[5] = H[5];
	F[6] = H[6];	F[7] = H[7];

	for (i=0;i<16;i++,F--)
		{
#ifdef B_ENDIAN
		T = W[i];
#else
		T = PULL64(W[i]);
#endif
		F[0] = A;
		F[4] = E;
		F[8] = T;
		T   += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
		E    = F[3] + T;
		A    = T + Sigma0(A) + Maj(A,F[1],F[2]);
		}

	for (;i<80;i++,F--)
		{
		T    = sigma0(F[8+16-1]);
		T   += sigma1(F[8+16-14]);
		T   += F[8+16] + F[8+16-9];

		F[0] = A;
		F[4] = E;
		F[8] = T;
		T   += F[7] + Sigma1(E) + Ch(E,F[5],F[6]) + K512[i];
		E    = F[3] + T;
		A    = T + Sigma0(A) + Maj(A,F[1],F[2]);
		}

	H[0] += A;		H[1] += F[1];
	H[2] += F[2];	H[3] += F[3];
	H[4] += E;		H[5] += F[5];
	H[6] += F[6];	H[7] += F[7];

   for (i = 0; i < SHA512_DIGEST_LENGTH/8; i++) {
	   PUSH64(H[i],res[i*8]);
	}

	}
Exemplo n.º 4
0
/*
 * This code should give better results on 32-bit CPU with less than
 * ~24 registers, both size and performance wise...
 */
void sha512_block_data_order(uint64_t *state, const uint64_t *W, size_t num) {
  uint64_t A, E, T;
  uint64_t X[9 + 80], *F;
  int i;

  while (num--) {
    F = X + 80;
    A = state[0];
    F[1] = state[1];
    F[2] = state[2];
    F[3] = state[3];
    E = state[4];
    F[5] = state[5];
    F[6] = state[6];
    F[7] = state[7];

    for (i = 0; i < 16; i++, F--) {
      T = from_be_u64(W[i]);
      F[0] = A;
      F[4] = E;
      F[8] = T;
      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
      E = F[3] + T;
      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
    }

    for (; i < 80; i++, F--) {
      T = sigma0(F[8 + 16 - 1]);
      T += sigma1(F[8 + 16 - 14]);
      T += F[8 + 16] + F[8 + 16 - 9];

      F[0] = A;
      F[4] = E;
      F[8] = T;
      T += F[7] + Sigma1(E) + Ch(E, F[5], F[6]) + K512[i];
      E = F[3] + T;
      A = T + Sigma0(A) + Maj(A, F[1], F[2]);
    }

    state[0] += A;
    state[1] += F[1];
    state[2] += F[2];
    state[3] += F[3];
    state[4] += E;
    state[5] += F[5];
    state[6] += F[6];
    state[7] += F[7];

    W += 16;
  }
}
/**
 * sha512 compression function - 64-bit machines
 * @param res The resulting hash value
 * @param hash The chaining input value
 * @param in The message input
 */
void sha512_comp (hashblock res, const hashblock hash, const messageblock in)
	{
    // CHANGE type casting added due to c++
	const uint64_t *W=reinterpret_cast<const uint64_t*>(in);
	uint64_t	a,b,c,d,e,f,g,h,s0,s1,T1,T2;
	uint64_t	X[16];
	uint64_t  H[8];
	int i;

   for (i = 0; i < SHA512_DIGEST_LENGTH/8; i++) {
	   H[i]=PULL64(hash[i*8]);
	}


	a = H[0];	b = H[1];	c = H[2];	d = H[3];
	e = H[4];	f = H[5];	g = H[6];	h = H[7];

	for (i=0;i<16;i++)
		{
#ifdef B_ENDIAN
		T1 = X[i] = W[i];
#else
		T1 = X[i] = PULL64(W[i]);
#endif
		T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;	g = f;	f = e;	e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	for (;i<80;i++)
		{
		s0 = X[(i+1)&0x0f];	s0 = sigma0(s0);
		s1 = X[(i+14)&0x0f];	s1 = sigma1(s1);

		T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
		T1 += h + Sigma1(e) + Ch(e,f,g) + K512[i];
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;	g = f;	f = e;	e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	H[0] += a;	H[1] += b;	H[2] += c;	H[3] += d;
	H[4] += e;	H[5] += f;	H[6] += g;	H[7] += h;

   for (i = 0; i < SHA512_DIGEST_LENGTH/8; i++) {
	   PUSH64(H[i],res[i*8]);
	}

	}
Exemplo n.º 6
0
/** One round of SHA-256. */
void inline __attribute__((always_inline)) Round(__m256i a, __m256i b, __m256i c, __m256i& d, __m256i e, __m256i f, __m256i g, __m256i& h, __m256i k)
{
    __m256i t1 = Add(h, Sigma1(e), Ch(e, f, g), k);
    __m256i t2 = Add(Sigma0(a), Maj(a, b, c));
    d = Add(d, t1);
    h = Add(t1, t2);
}
Exemplo n.º 7
0
static void
SHA256Transform(uint32_t *H, const uint8_t *cp)
{
	uint32_t a, b, c, d, e, f, g, h, t, T1, T2, W[64];

	for (t = 0; t < 16; t++, cp += 4)
		W[t] = (cp[0] << 24) | (cp[1] << 16) | (cp[2] << 8) | cp[3];

	for (t = 16; t < 64; t++)
		W[t] = sigma1(W[t - 2]) + W[t - 7] +
		    sigma0(W[t - 15]) + W[t - 16];

	a = H[0]; b = H[1]; c = H[2]; d = H[3];
	e = H[4]; f = H[5]; g = H[6]; h = H[7];

	for (t = 0; t < 64; t++) {
		T1 = h + SIGMA1(e) + Ch(e, f, g) + SHA256_K[t] + W[t];
		T2 = SIGMA0(a) + Maj(a, b, c);
		h = g; g = f; f = e; e = d + T1;
		d = c; c = b; b = a; a = T1 + T2;
	}

	H[0] += a; H[1] += b; H[2] += c; H[3] += d;
	H[4] += e; H[5] += f; H[6] += g; H[7] += h;
}
void HashSHA256Block(void* hash_block, SHA256_Context* ctx)
{
	unsigned int a,b,c,d,e,f,g,h,T1,T2,i;
	unsigned int w[0x40];
	unsigned char* block = (unsigned char*)hash_block;
	a = ctx->h0; b = ctx->h1;
	c = ctx->h2; d = ctx->h3;
	e = ctx->h4; f = ctx->h5;
	g = ctx->h6; h = ctx->h7;
	for (i = 0; i < 16; i++) w[i] = BSWAP(*(unsigned int*)(block + i * 4));
	for (i = 16; i < 64; i++) w[i] = SigmaS1(w[i-2]) + w[i-7] + SigmaS0(w[i-15]) + w[i-16];
	for (i = 0; i < 64; i++)
	{
		T1 = h + SigmaB1(e) + Ch(e,f,g) + sha256_constant[i] + w[i];
		T2 = SigmaB0(a) + Maj(a,b,c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;
		c = b;
		b = a;
		a = T1 + T2;
	}
	ctx->h0 += a;
	ctx->h1 += b;
	ctx->h2 += c;
	ctx->h3 += d;
	ctx->h4 += e;
	ctx->h5 += f;
	ctx->h6 += g;
	ctx->h7 += h;
	a = b = c = d = e = f = g = h = T1 = T2 = 0;
	memset(w,0,0x100);
}
/*****************************************
 *       sha256 compression function     *
 *                                       *
 *   H   points to chaining input        *
 *   in  points to the message input     *
 *                                       *
 *****************************************/
void sha256_comp (hashblock res, const hashblock hash, const void *in)
	{
	uint32_t a,b,c,d,e,f,g,h,s0,s1,T1,T2;
	uint32_t    H[8];
	uint32_t	X[16],l;
	int i;
    // CHANGE type casting added due to c++
	const unsigned char *data=static_cast<const unsigned char*>(in);

	for (i = 0; i < SHA256_DIGEST_LENGTH/4; i++) {
	   HOST_c2l(hash, H[i]);
	}

	a = H[0];	b = H[1];	c = H[2];	d = H[3];
	e = H[4];	f = H[5];	g = H[6];	h = H[7];

	for (i=0;i<16;i++)
		{
		HOST_c2l(data,l); T1 = X[i] = l;
		T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;
		g = f;
		f = e;
		e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	for (;i<64;i++)
		{
		s0 = X[(i+1)&0x0f];	s0 = sigma0(s0);
		s1 = X[(i+14)&0x0f];	s1 = sigma1(s1);

		T1 = X[i&0xf] += s0 + s1 + X[(i+9)&0xf];
		T1 += h + Sigma1(e) + Ch(e,f,g) + K256[i];
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;	g = f;	f = e;	e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	H[0] += a;	H[1] += b;	H[2] += c;	H[3] += d;
	H[4] += e;	H[5] += f;	H[6] += g;	H[7] += h;

	for (i = 0; i < SHA256_DIGEST_LENGTH/4; i++) {
	   HOST_l2c(H[i], res);
	}
}
Exemplo n.º 10
0
void  sha512_compress(psDigestContext_t * md, unsigned char *buf)
#endif
{
	uint64 S[8], W[80], t0, t1;
    int i;

    /* copy state into S */
    for (i = 0; i < 8; i++) {
        S[i] = md->sha512.state[i];
    }

    /* copy the state into 1024-bits into W[0..15] */
    for (i = 0; i < 16; i++) {
		LOAD64H(W[i], buf + (8*i));
    }

    /* fill W[16..79] */
    for (i = 16; i < 80; i++) {
        W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16];
    }        

    /* Compress */
#ifndef PS_SHA512_IMPROVE_PERF_INCREASE_CODESIZE
    for (i = 0; i < 80; i++) {
        t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K[i] + W[i];
		t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);
        S[7] = S[6];
        S[6] = S[5];
        S[5] = S[4];
        S[4] = S[3] + t0;
        S[3] = S[2];
        S[2] = S[1];
        S[1] = S[0];
        S[0] = t0 + t1;
    }
#else
#define RND(a,b,c,d,e,f,g,h,i)                    \
     t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];   \
     t1 = Sigma0(a) + Maj(a, b, c);                  \
     d += t0;                                        \
     h  = t0 + t1;

     for (i = 0; i < 80; i += 8) {
         RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
         RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
         RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
         RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
         RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
         RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
         RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
         RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
     }
#endif /* PS_SHA512_IMPROVE_PERF_INCREASE_CODESIZE */

	/* feedback */
	for (i = 0; i < 8; i++) {
        md->sha512.state[i] = md->sha512.state[i] + S[i];
    }
}
Exemplo n.º 11
0
void sha256_block_data_order (SHA256_CTX *ctx, const void *in)
	{
	unsigned MD32_REG_T a,b,c,d,e,f,g,h,s0,s1,T1,T2,t;
	SHA_LONG	X[16],l,Ki;
	int i;
	const unsigned char *data=in;

	a = ctx->h[0];	b = ctx->h[1];	c = ctx->h[2];	d = ctx->h[3];
	e = ctx->h[4];	f = ctx->h[5];	g = ctx->h[6];	h = ctx->h[7];

	for (i=0;i<16;i++)
		{
		HOST_c2l(data,l); X[i] = l;
		Ki=K256[i];
		T1 = l + h + Sigma1(e) + Ch(e,f,g) + Ki;
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;	g = f;	f = e;	e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	for (;i<64;i++)
		{
		s0 = X[(i+1)&0x0f];	s0 = sigma0(s0);
		s1 = X[(i+14)&0x0f];	s1 = sigma1(s1);

		T1 = X[i&0xf];
		t = X[(i+9)&0xf];
		T1 += s0 + s1 + t;
                X[i&0xf] = T1;
		Ki=K256[i];
		T1 += h + Sigma1(e) + Ch(e,f,g) + Ki;
		T2 = Sigma0(a) + Maj(a,b,c);
		h = g;	g = f;	f = e;	e = d + T1;
		d = c;	c = b;	b = a;	a = T1 + T2;
		}

	t=ctx->h[0]; ctx->h[0]=t+a;
	t=ctx->h[1]; ctx->h[1]=t+b;
	t=ctx->h[2]; ctx->h[2]=t+c;
	t=ctx->h[3]; ctx->h[3]=t+d;
	t=ctx->h[4]; ctx->h[4]=t+e;
	t=ctx->h[5]; ctx->h[5]=t+f;
	t=ctx->h[6]; ctx->h[6]=t+g;
	t=ctx->h[7]; ctx->h[7]=t+h;
       return;
}
Exemplo n.º 12
0
void
_sha2block128(uchar *p, ulong len, uint64 *s)
{
	uint64 a, b, c, d, e, f, g, h, t1, t2;
	uint64 *kp, *wp;
	uint64 w[80];
	uchar *end;

	/* at this point, we have a multiple of 64 bytes */
	for(end = p+len; p < end;){
		a = s[0];
		b = s[1];
		c = s[2];
		d = s[3];
		e = s[4];
		f = s[5];
		g = s[6];
		h = s[7];

		for(wp = w; wp < &w[16]; wp++, p += 8)
			wp[0] = ((vlong)p[0])<<56 | ((vlong)p[1])<<48 |
				((vlong)p[2])<<40 | ((vlong)p[3])<<32 |
				p[4] << 24 | p[5] << 16 | p[6] << 8 | p[7];
		for(; wp < &w[80]; wp++) {
			uint64 s0, s1;

			s0 = sigma0(wp[-15]);
			s1 = sigma1(wp[-2]);
//			wp[0] = sigma1(wp[-2]) + wp[-7] + sigma0(wp[-15]) + wp[-16];
			wp[0] = s1 + wp[-7] + s0 + wp[-16];
		}

		for(kp = K512, wp = w; wp < &w[80]; ) {
			t1 = h + SIGMA1(e) + Ch(e,f,g) + *kp++ + *wp++;
			t2 = SIGMA0(a) + Maj(a,b,c);
			h = g;
			g = f;
			f = e;
			e = d + t1;
			d = c;
			c = b;
			b = a;
			a = t1 + t2;
		}

		/* save state */
		s[0] += a;
		s[1] += b;
		s[2] += c;
		s[3] += d;
		s[4] += e;
		s[5] += f;
		s[6] += g;
		s[7] += h;
	}
}
Exemplo n.º 13
0
static void
sha512_transform(u64 *state, const u8 *input)
{
	u64 a, b, c, d, e, f, g, h, t1, t2;

	int i;
	u64 W[16];

	/* load the state into our registers */
	a=state[0];   b=state[1];   c=state[2];   d=state[3];
	e=state[4];   f=state[5];   g=state[6];   h=state[7];

	/* now iterate */
	for (i=0; i<80; i+=8) {
		if (!(i & 8)) {
			int j;

			if (i < 16) {
				/* load the input */
				for (j = 0; j < 16; j++)
					LOAD_OP(i + j, W, input);
			} else {
				for (j = 0; j < 16; j++) {
					BLEND_OP(i + j, W);
				}
			}
		}

		t1 = h + e1(e) + Ch(e,f,g) + sha512_K[i  ] + W[(i & 15)];
		t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
		t1 = g + e1(d) + Ch(d,e,f) + sha512_K[i+1] + W[(i & 15) + 1];
		t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
		t1 = f + e1(c) + Ch(c,d,e) + sha512_K[i+2] + W[(i & 15) + 2];
		t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
		t1 = e + e1(b) + Ch(b,c,d) + sha512_K[i+3] + W[(i & 15) + 3];
		t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
		t1 = d + e1(a) + Ch(a,b,c) + sha512_K[i+4] + W[(i & 15) + 4];
		t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
		t1 = c + e1(h) + Ch(h,a,b) + sha512_K[i+5] + W[(i & 15) + 5];
		t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
		t1 = b + e1(g) + Ch(g,h,a) + sha512_K[i+6] + W[(i & 15) + 6];
		t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
		t1 = a + e1(f) + Ch(f,g,h) + sha512_K[i+7] + W[(i & 15) + 7];
		t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;
	}

	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
	state[4] += e; state[5] += f; state[6] += g; state[7] += h;

	/* erase our data */
	a = b = c = d = e = f = g = h = t1 = t2 = 0;
}
Exemplo n.º 14
0
static void sha256_transform(sha256_context *ctx)
{
  uint32 W[64]; // Words of message schedule.
  uint32 v[8];  // FIPS a, b, c, d, e, f, g, h working variables.

  if (ctx == NULL) // Clean variables and return.
  {
    cleandata(v,sizeof(v));
    cleandata(W,sizeof(W));
    return;
  }

  // Prepare message schedule. Loop unrolling provides some small gain here.
  W[0] =  b2i(ctx->Data + 0 * 4 );   W[1] =  b2i(ctx->Data + 1 * 4 );
  W[2] =  b2i(ctx->Data + 2 * 4 );   W[3] =  b2i(ctx->Data + 3 * 4 );
  W[4] =  b2i(ctx->Data + 4 * 4 );   W[5] =  b2i(ctx->Data + 5 * 4 );
  W[6] =  b2i(ctx->Data + 6 * 4 );   W[7] =  b2i(ctx->Data + 7 * 4 );
  W[8] =  b2i(ctx->Data + 8 * 4 );   W[9] =  b2i(ctx->Data + 9 * 4 );
  W[10] = b2i(ctx->Data + 10 * 4 );  W[11] = b2i(ctx->Data + 11 * 4 );
  W[12] = b2i(ctx->Data + 12 * 4 );  W[13] = b2i(ctx->Data + 13 * 4 );
  W[14] = b2i(ctx->Data + 14 * 4 );  W[15] = b2i(ctx->Data + 15 * 4 );

  for (uint I = 16; I < 64; I++)
    W[I] = sg1(W[I-2]) + W[I-7] + sg0(W[I-15]) + W[I-16];

  uint32 *H=ctx->H;
  for (uint I = 0; I < 8; I++)
    v[I]=H[I];

  // MSVC -O2 partially unrolls this loop automatically.
  for (uint I = 0; I < 64; I++)
  {
    uint T1 = v[7] + Sg1(v[4]) + Ch(v[4], v[5], v[6]) + K[I] + W[I];

    // It is possible to eliminate variable copying if we unroll loop
    // and rename variables every time. But my test did not show any speed
    // gain on i7 for such full or partial unrolling.
    v[7] = v[6];
    v[6] = v[5];
    v[5] = v[4];
    v[4] = v[3] + T1;

    // It works a little faster when moved here from beginning of loop.
    uint T2 = Sg0(v[0]) + Maj(v[0], v[1], v[2]);

    v[3] = v[2];
    v[2] = v[1];
    v[1] = v[0];
    v[0] = T1 + T2;
  }

  for (uint I = 0; I < 8; I++)
    H[I]+=v[I];
}
Exemplo n.º 15
0
/*
========================================================================
Routine Description:
    SHA256 computation for one block (512 bits)

Arguments:
    pSHA_CTX    Pointer to SHA256_CTX_STRUC

Return Value:
    None

Note:
    None
========================================================================
*/
VOID RT_SHA256_Hash (
    IN  SHA256_CTX_STRUC *pSHA_CTX)
{
    uint32_t W_i,t;
    uint32_t W[64];
    uint32_t a,b,c,d,e,f,g,h,T1,T2;

    /* Prepare the message schedule, {W_i}, 0 < t < 15 */
    memmove(W, pSHA_CTX->Block, SHA256_BLOCK_SIZE);
    for (W_i = 0; W_i < 16; W_i++)
        W[W_i] = cpu2be32(W[W_i]); /* Endian Swap */
        /* End of for */

    /* SHA256 hash computation */
    /* Initialize the working variables */
    a = pSHA_CTX->HashValue[0];
    b = pSHA_CTX->HashValue[1];
    c = pSHA_CTX->HashValue[2];
    d = pSHA_CTX->HashValue[3];
    e = pSHA_CTX->HashValue[4];
    f = pSHA_CTX->HashValue[5];
    g = pSHA_CTX->HashValue[6];
    h = pSHA_CTX->HashValue[7];

    /* 64 rounds */
    for (t = 0;t < 64;t++) {
        if (t > 15) /* Prepare the message schedule, {W_i}, 16 < t < 63 */
            W[t] = Sigma_256_1(W[t-2]) + W[t-7] + Sigma_256_0(W[t-15]) + W[t-16];
            /* End of if */
        T1 = h + Zsigma_256_1(e) + Ch(e,f,g) + SHA256_K[t] + W[t];
        T2 = Zsigma_256_0(a) + Maj(a,b,c);
        h = g;
        g = f;
        f = e;
        e = d + T1;
        d = c;
        c = b;
        b = a;
        a = T1 + T2;
     } /* End of for */

     /* Compute the i^th intermediate hash value H^(i) */
     pSHA_CTX->HashValue[0] += a;
     pSHA_CTX->HashValue[1] += b;
     pSHA_CTX->HashValue[2] += c;
     pSHA_CTX->HashValue[3] += d;
     pSHA_CTX->HashValue[4] += e;
     pSHA_CTX->HashValue[5] += f;
     pSHA_CTX->HashValue[6] += g;
     pSHA_CTX->HashValue[7] += h;

    memset(pSHA_CTX->Block, 0, SHA256_BLOCK_SIZE);
    pSHA_CTX->BlockLen = 0;
} /* End of RT_SHA256_Hash */
Exemplo n.º 16
0
void
_sha2block64(uchar *p, ulong len, uint32 *s)
{
	uint32 a, b, c, d, e, f, g, h, t1, t2;
	uint32 *kp, *wp;
	uint32 w[64];
	uchar *end;

	/* at this point, we have a multiple of 64 bytes */
	for(end = p+len; p < end;){
		a = s[0];
		b = s[1];
		c = s[2];
		d = s[3];
		e = s[4];
		f = s[5];
		g = s[6];
		h = s[7];

		for(wp = w; wp < &w[16]; wp++, p += 4)
			wp[0] = p[0] << 24 | p[1] << 16 | p[2] << 8 | p[3];
		for(; wp < &w[64]; wp++)
			wp[0] = sigma1(wp[-2]) + wp[-7] +
				sigma0(wp[-15]) + wp[-16];

		for(kp = K256, wp = w; wp < &w[64]; ) {
			t1 = h + SIGMA1(e) + Ch(e,f,g) + *kp++ + *wp++;
			t2 = SIGMA0(a) + Maj(a,b,c);
			h = g;
			g = f;
			f = e;
			e = d + t1;
			d = c;
			c = b;
			b = a;
			a = t1 + t2;
		}

		/* save state */
		s[0] += a;
		s[1] += b;
		s[2] += c;
		s[3] += d;
		s[4] += e;
		s[5] += f;
		s[6] += g;
		s[7] += h;
	}
}
Exemplo n.º 17
0
static void
processblock(struct sha512 *s, const uint8_t *buf)
{
	uint64_t W[80], t1, t2, a, b, c, d, e, f, g, h;
	int i;

	for (i = 0; i < 16; i++) {
		W[i] = (uint64_t)buf[8*i]<<56;
		W[i] |= (uint64_t)buf[8*i+1]<<48;
		W[i] |= (uint64_t)buf[8*i+2]<<40;
		W[i] |= (uint64_t)buf[8*i+3]<<32;
		W[i] |= (uint64_t)buf[8*i+4]<<24;
		W[i] |= (uint64_t)buf[8*i+5]<<16;
		W[i] |= (uint64_t)buf[8*i+6]<<8;
		W[i] |= buf[8*i+7];
	}
	for (; i < 80; i++)
		W[i] = R1(W[i-2]) + W[i-7] + R0(W[i-15]) + W[i-16];
	a = s->h[0];
	b = s->h[1];
	c = s->h[2];
	d = s->h[3];
	e = s->h[4];
	f = s->h[5];
	g = s->h[6];
	h = s->h[7];
	for (i = 0; i < 80; i++) {
		t1 = h + S1(e) + Ch(e,f,g) + K[i] + W[i];
		t2 = S0(a) + Maj(a,b,c);
		h = g;
		g = f;
		f = e;
		e = d + t1;
		d = c;
		c = b;
		b = a;
		a = t1 + t2;
	}
	s->h[0] += a;
	s->h[1] += b;
	s->h[2] += c;
	s->h[3] += d;
	s->h[4] += e;
	s->h[5] += f;
	s->h[6] += g;
	s->h[7] += h;
}
Exemplo n.º 18
0
static void SHA256_transform( SHA256_ctx* ctx )
{
   int t;
   unsigned int A = ctx->H[ 0 ];
   unsigned int B = ctx->H[ 1 ];
   unsigned int C = ctx->H[ 2 ];
   unsigned int D = ctx->H[ 3 ];
   unsigned int E = ctx->H[ 4 ];
   unsigned int F = ctx->H[ 5 ];
   unsigned int G = ctx->H[ 6 ];
   unsigned int H = ctx->H[ 7 ];
   unsigned int T1, T2;
   unsigned int W[ 64 ];

   memcpy( W, ctx->M, 64 );

   for ( t = 16; t < 64; t++ )
   {
      W[ t ] = sig1(W[t-2]) + W[t-7] + sig0(W[t-15]) + W[t-16];
   }

   for ( t = 0; t < 64; t++ )
   {
      T1 = H + SIG1(E) + Ch(E,F,G) + K[t] + W[t];
      T2 = SIG0(A) + Maj(A,B,C);
      H = G;
      G = F;
      F = E;
      E = D + T1;
      D = C;
      C = B;
      B = A;
      A = T1 + T2;
   }


   ctx->H[ 0 ] += A;
   ctx->H[ 1 ] += B;
   ctx->H[ 2 ] += C;
   ctx->H[ 3 ] += D;
   ctx->H[ 4 ] += E;
   ctx->H[ 5 ] += F;
   ctx->H[ 6 ] += G;
   ctx->H[ 7 ] += H;
}
Exemplo n.º 19
0
static void sha256_compress(hash_state * md)
#endif
{
    unsigned long S[8], W[64], t0, t1;
    int i;

    _ARGCHK(md != NULL);

    /* copy state into S */
    for (i = 0; i < 8; i++)
        S[i] = md->sha256.state[i];

    /* copy the state into 512-bits into W[0..15] */
    for (i = 0; i < 16; i++) {
        LOAD32H(W[i], md->sha256.buf + (4*i));
    }

    /* fill W[16..63] */
    for (i = 16; i < 64; i++) {
        W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16];
    }        

    /* Compress */
    for (i = 0; i < 64; i++) {
        t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K[i] + W[i];
        t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);
        S[7] = S[6];
        S[6] = S[5];
        S[5] = S[4];
        S[4] = S[3] + t0;
        S[3] = S[2];
        S[2] = S[1];
        S[1] = S[0];
        S[0] = t0 + t1;
    }

    /* feedback */
    for (i = 0; i < 8; i++) {
        md->sha256.state[i] = md->sha256.state[i] + S[i];
    }

}
Exemplo n.º 20
0
void GflSHA256::Generate(void)
{
	int i;
	DWORD W[SHA256_WORK];
	DWORD Hash[SHA256_WORK + SHA256_HASH];

	for(i = 0; i < SHA256_BLOCK; i++) W[i] = ReverseEndian(m_aBlock[i]);
	for(i = SHA256_BLOCK; i < SHA256_WORK; i++) W[i] = s1(W[i - 2]) + W[i - 7] + s0(W[i - 15]) + W[i - 16];

	for(i = 0; i < SHA256_HASH; i++) Hash[SHA256_WORK + i] = m_dwH[i];
	DWORD *pHash = &Hash[SHA256_WORK];
	DWORD dwT1, dwT2;
	for(i = 0; i < SHA256_WORK; i++){
		pHash--;
		dwT1 = pHash[8] + S1(pHash[5]) + Ch(pHash[5], pHash[6], pHash[7]) + c_dwK[i] + W[i];
		dwT2 = S0(pHash[1]) + Maj(pHash[1], pHash[2], pHash[3]);
		pHash[0] = dwT1 + dwT2;
		pHash[4] += dwT1;
	}
	for(i = 0; i < SHA256_HASH; i++) m_dwH[i] += pHash[i];
}
Exemplo n.º 21
0
static void shs_transform(sha256 *sh)
{   /* basic transformation step */
    mr_unsign32 a,b,c,d,e,f,g,h,t1,t2;
    int j;
    for (j=16; j<64; j++)
        sh->w[j]=theta1(sh->w[j-2])+sh->w[j-7]+theta0(sh->w[j-15])+sh->w[j-16];

    a=sh->h[0];
    b=sh->h[1];
    c=sh->h[2];
    d=sh->h[3];
    e=sh->h[4];
    f=sh->h[5];
    g=sh->h[6];
    h=sh->h[7];

    for (j=0; j<64; j++)
    {   /* 64 times - mush it up */
        t1=h+Sig1(e)+Ch(e,f,g)+K[j]+sh->w[j];
        t2=Sig0(a)+Maj(a,b,c);
        h=g;
        g=f;
        f=e;
        e=d+t1;
        d=c;
        c=b;
        b=a;
        a=t1+t2;
    }
    sh->h[0]+=a;
    sh->h[1]+=b;
    sh->h[2]+=c;
    sh->h[3]+=d;
    sh->h[4]+=e;
    sh->h[5]+=f;
    sh->h[6]+=g;
    sh->h[7]+=h;
}
Exemplo n.º 22
0
static void myF(myu64 *state, const myu64 *w, int k_index)
{
  myu64 t, t1, t2;
  unsigned char i;
  Ch(&t, state+4, state+5, state+6);
  Sigma(&t1, state+4, 14, 18, 23);
  bigint_add64(t1.v, t1.v, (state+7)->v);  
  bigint_add64(t1.v, t1.v, t.v);  
  for(i=0;i<8;i++)
    t.v[i] = pgm_read_byte((unsigned char *)roundconstants_pgm+k_index*8+i);
  bigint_add64(t1.v, t1.v, t.v);  
  bigint_add64(t1.v, t1.v, w->v);  
  Sigma(&t2, state+0, 28, 34, 5);
  Maj(&t,state+0,state+1,state+2);
  bigint_add64(t2.v, t2.v, t.v);    
  *(state+7) = *(state+6);
  *(state+6) = *(state+5);
  *(state+5) = *(state+4);
  bigint_add64((state+4)->v, (state+3)->v, t1.v);  
  *(state+3) = *(state+2);
  *(state+2) = *(state+1);
  *(state+1) = *(state+0);
  bigint_add64((state+0)->v, t1.v, t2.v);
}
Exemplo n.º 23
0
/* Process LEN bytes of BUFFER, accumulating context into CTX.
   It is assumed that LEN % 128 == 0.  */
void
sha512_process_block (const void *buffer, size_t len, struct sha512_ctx *ctx)
{
  const uint64_t *words = buffer;
  size_t nwords = len / sizeof (uint64_t);
  uint64_t a = ctx->H[0];
  uint64_t b = ctx->H[1];
  uint64_t c = ctx->H[2];
  uint64_t d = ctx->H[3];
  uint64_t e = ctx->H[4];
  uint64_t f = ctx->H[5];
  uint64_t g = ctx->H[6];
  uint64_t h = ctx->H[7];

  /* First increment the byte count.  FIPS 180-2 specifies the possible
     length of the file up to 2^128 bits.  Here we only compute the
     number of bytes.  Do a double word increment.  */
#ifdef USE_TOTAL128
  ctx->total128 += len;
#else
  uint64_t lolen = len;
  ctx->total[TOTAL128_low] += lolen;
  ctx->total[TOTAL128_high] += ((len >> 31 >> 31 >> 2)
				+ (ctx->total[TOTAL128_low] < lolen));
#endif

  /* Process all bytes in the buffer with 128 bytes in each round of
     the loop.  */
  while (nwords > 0)
    {
      uint64_t W[80];
      uint64_t a_save = a;
      uint64_t b_save = b;
      uint64_t c_save = c;
      uint64_t d_save = d;
      uint64_t e_save = e;
      uint64_t f_save = f;
      uint64_t g_save = g;
      uint64_t h_save = h;

      /* Operators defined in FIPS 180-2:4.1.2.  */
#define Ch(x, y, z) ((x & y) ^ (~x & z))
#define Maj(x, y, z) ((x & y) ^ (x & z) ^ (y & z))
#define S0(x) (CYCLIC (x, 28) ^ CYCLIC (x, 34) ^ CYCLIC (x, 39))
#define S1(x) (CYCLIC (x, 14) ^ CYCLIC (x, 18) ^ CYCLIC (x, 41))
#define R0(x) (CYCLIC (x, 1) ^ CYCLIC (x, 8) ^ (x >> 7))
#define R1(x) (CYCLIC (x, 19) ^ CYCLIC (x, 61) ^ (x >> 6))

      /* It is unfortunate that C does not provide an operator for
	 cyclic rotation.  Hope the C compiler is smart enough.  */
#define CYCLIC(w, s) ((w >> s) | (w << (64 - s)))

      /* Compute the message schedule according to FIPS 180-2:6.3.2 step 2.  */
      for (unsigned int t = 0; t < 16; ++t)
	{
	  W[t] = SWAP (*words);
	  ++words;
	}
      for (unsigned int t = 16; t < 80; ++t)
	W[t] = R1 (W[t - 2]) + W[t - 7] + R0 (W[t - 15]) + W[t - 16];

      /* The actual computation according to FIPS 180-2:6.3.2 step 3.  */
      for (unsigned int t = 0; t < 80; ++t)
	{
	  uint64_t T1 = h + S1 (e) + Ch (e, f, g) + K[t] + W[t];
	  uint64_t T2 = S0 (a) + Maj (a, b, c);
	  h = g;
	  g = f;
	  f = e;
	  e = d + T1;
	  d = c;
	  c = b;
	  b = a;
	  a = T1 + T2;
	}

      /* Add the starting values of the context according to FIPS 180-2:6.3.2
	 step 4.  */
      a += a_save;
      b += b_save;
      c += c_save;
      d += d_save;
      e += e_save;
      f += f_save;
      g += g_save;
      h += h_save;

      /* Prepare for the next round.  */
      nwords -= 16;
    }

  /* Put checksum in context given as argument.  */
  ctx->H[0] = a;
  ctx->H[1] = b;
  ctx->H[2] = c;
  ctx->H[3] = d;
  ctx->H[4] = e;
  ctx->H[5] = f;
  ctx->H[6] = g;
  ctx->H[7] = h;
}
Exemplo n.º 24
0
static int  sha512_compress(hash_state * md, unsigned char *buf)
#endif
{
    ulong64 S[8], W[80], t0, t1;
    int i;

    /* copy state into S */
    for (i = 0; i < 8; i++) {
        S[i] = md->sha512.state[i];
    }

    /* copy the state into 1024-bits into W[0..15] */
    for (i = 0; i < 16; i++) {
        LOAD64H(W[i], buf + (8*i));
    }

    /* fill W[16..79] */
    for (i = 16; i < 80; i++) {
        W[i] = Gamma1(W[i - 2]) + W[i - 7] + Gamma0(W[i - 15]) + W[i - 16];
    }        

    /* Compress */
#ifdef LTC_SMALL_CODE
    for (i = 0; i < 80; i++) {
        t0 = S[7] + Sigma1(S[4]) + Ch(S[4], S[5], S[6]) + K[i] + W[i];
        t1 = Sigma0(S[0]) + Maj(S[0], S[1], S[2]);
        S[7] = S[6];
        S[6] = S[5];
        S[5] = S[4];
        S[4] = S[3] + t0;
        S[3] = S[2];
        S[2] = S[1];
        S[1] = S[0];
        S[0] = t0 + t1;
    }
#else
#define RND(a,b,c,d,e,f,g,h,i)                    \
     t0 = h + Sigma1(e) + Ch(e, f, g) + K[i] + W[i];   \
     t1 = Sigma0(a) + Maj(a, b, c);                  \
     d += t0;                                        \
     h  = t0 + t1;

     for (i = 0; i < 80; i += 8) {
         RND(S[0],S[1],S[2],S[3],S[4],S[5],S[6],S[7],i+0);
         RND(S[7],S[0],S[1],S[2],S[3],S[4],S[5],S[6],i+1);
         RND(S[6],S[7],S[0],S[1],S[2],S[3],S[4],S[5],i+2);
         RND(S[5],S[6],S[7],S[0],S[1],S[2],S[3],S[4],i+3);
         RND(S[4],S[5],S[6],S[7],S[0],S[1],S[2],S[3],i+4);
         RND(S[3],S[4],S[5],S[6],S[7],S[0],S[1],S[2],i+5);
         RND(S[2],S[3],S[4],S[5],S[6],S[7],S[0],S[1],i+6);
         RND(S[1],S[2],S[3],S[4],S[5],S[6],S[7],S[0],i+7);
     }
#endif     


    /* feedback */
    for (i = 0; i < 8; i++) {
        md->sha512.state[i] = md->sha512.state[i] + S[i];
    }

    return CRYPT_OK;
}
Exemplo n.º 25
0
static void sha512_transform(uint64_t *state, const uint8_t buffer[128])
{
    uint64_t a, b, c, d, e, f, g, h;
    uint64_t block[80];
    uint64_t T1;
    int i;

    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];
#if CONFIG_SMALL
    for (i = 0; i < 80; i++) {
        uint64_t T2;
        if (i < 16)
            T1 = blk0(i);
        else
            T1 = blk(i);
        T1 += h + Sigma1_512(e) + Ch(e, f, g) + K512[i];
        T2 = Sigma0_512(a) + Maj(a, b, c);
        h = g;
        g = f;
        f = e;
        e = d + T1;
        d = c;
        c = b;
        b = a;
        a = T1 + T2;
    }
#else

#define R512_0 \
    ROUND512_0_TO_15(a, b, c, d, e, f, g, h); \
    ROUND512_0_TO_15(h, a, b, c, d, e, f, g); \
    ROUND512_0_TO_15(g, h, a, b, c, d, e, f); \
    ROUND512_0_TO_15(f, g, h, a, b, c, d, e); \
    ROUND512_0_TO_15(e, f, g, h, a, b, c, d); \
    ROUND512_0_TO_15(d, e, f, g, h, a, b, c); \
    ROUND512_0_TO_15(c, d, e, f, g, h, a, b); \
    ROUND512_0_TO_15(b, c, d, e, f, g, h, a)

    i = 0;
    R512_0; R512_0;

#define R512_16 \
    ROUND512_16_TO_80(a, b, c, d, e, f, g, h); \
    ROUND512_16_TO_80(h, a, b, c, d, e, f, g); \
    ROUND512_16_TO_80(g, h, a, b, c, d, e, f); \
    ROUND512_16_TO_80(f, g, h, a, b, c, d, e); \
    ROUND512_16_TO_80(e, f, g, h, a, b, c, d); \
    ROUND512_16_TO_80(d, e, f, g, h, a, b, c); \
    ROUND512_16_TO_80(c, d, e, f, g, h, a, b); \
    ROUND512_16_TO_80(b, c, d, e, f, g, h, a)

    R512_16; R512_16; R512_16; R512_16;
    R512_16; R512_16; R512_16; R512_16;
#endif
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
}
Exemplo n.º 26
0
Arquivo: sha.c Projeto: AVbin/libav
static void sha256_transform(uint32_t *state, const uint8_t buffer[64])
{
    unsigned int i, a, b, c, d, e, f, g, h;
    uint32_t block[64];
    uint32_t T1;

    a = state[0];
    b = state[1];
    c = state[2];
    d = state[3];
    e = state[4];
    f = state[5];
    g = state[6];
    h = state[7];
#if CONFIG_SMALL
    for (i = 0; i < 64; i++) {
        uint32_t T2;
        if (i < 16)
            T1 = blk0(i);
        else
            T1 = blk(i);
        T1 += h + Sigma1_256(e) + Ch(e, f, g) + K256[i];
        T2 = Sigma0_256(a) + Maj(a, b, c);
        h = g;
        g = f;
        f = e;
        e = d + T1;
        d = c;
        c = b;
        b = a;
        a = T1 + T2;
    }
#else
    for (i = 0; i < 16;) {
        ROUND256_0_TO_15(a, b, c, d, e, f, g, h);
        ROUND256_0_TO_15(h, a, b, c, d, e, f, g);
        ROUND256_0_TO_15(g, h, a, b, c, d, e, f);
        ROUND256_0_TO_15(f, g, h, a, b, c, d, e);
        ROUND256_0_TO_15(e, f, g, h, a, b, c, d);
        ROUND256_0_TO_15(d, e, f, g, h, a, b, c);
        ROUND256_0_TO_15(c, d, e, f, g, h, a, b);
        ROUND256_0_TO_15(b, c, d, e, f, g, h, a);
    }

    for (; i < 64;) {
        ROUND256_16_TO_63(a, b, c, d, e, f, g, h);
        ROUND256_16_TO_63(h, a, b, c, d, e, f, g);
        ROUND256_16_TO_63(g, h, a, b, c, d, e, f);
        ROUND256_16_TO_63(f, g, h, a, b, c, d, e);
        ROUND256_16_TO_63(e, f, g, h, a, b, c, d);
        ROUND256_16_TO_63(d, e, f, g, h, a, b, c);
        ROUND256_16_TO_63(c, d, e, f, g, h, a, b);
        ROUND256_16_TO_63(b, c, d, e, f, g, h, a);
    }
#endif
    state[0] += a;
    state[1] += b;
    state[2] += c;
    state[3] += d;
    state[4] += e;
    state[5] += f;
    state[6] += g;
    state[7] += h;
}
Exemplo n.º 27
0
/****************
 * Transform the message W which consists of 16 64-bit-words
 */
static void
transform (SHA512_CONTEXT *hd, const unsigned char *data)
{
  u64 a, b, c, d, e, f, g, h;
  u64 w[80];
  int t;
  static const u64 k[] =
    {
      U64_C(0x428a2f98d728ae22), U64_C(0x7137449123ef65cd),
      U64_C(0xb5c0fbcfec4d3b2f), U64_C(0xe9b5dba58189dbbc),
      U64_C(0x3956c25bf348b538), U64_C(0x59f111f1b605d019),
      U64_C(0x923f82a4af194f9b), U64_C(0xab1c5ed5da6d8118),
      U64_C(0xd807aa98a3030242), U64_C(0x12835b0145706fbe),
      U64_C(0x243185be4ee4b28c), U64_C(0x550c7dc3d5ffb4e2),
      U64_C(0x72be5d74f27b896f), U64_C(0x80deb1fe3b1696b1),
      U64_C(0x9bdc06a725c71235), U64_C(0xc19bf174cf692694),
      U64_C(0xe49b69c19ef14ad2), U64_C(0xefbe4786384f25e3),
      U64_C(0x0fc19dc68b8cd5b5), U64_C(0x240ca1cc77ac9c65),
      U64_C(0x2de92c6f592b0275), U64_C(0x4a7484aa6ea6e483),
      U64_C(0x5cb0a9dcbd41fbd4), U64_C(0x76f988da831153b5),
      U64_C(0x983e5152ee66dfab), U64_C(0xa831c66d2db43210),
      U64_C(0xb00327c898fb213f), U64_C(0xbf597fc7beef0ee4),
      U64_C(0xc6e00bf33da88fc2), U64_C(0xd5a79147930aa725),
      U64_C(0x06ca6351e003826f), U64_C(0x142929670a0e6e70),
      U64_C(0x27b70a8546d22ffc), U64_C(0x2e1b21385c26c926),
      U64_C(0x4d2c6dfc5ac42aed), U64_C(0x53380d139d95b3df),
      U64_C(0x650a73548baf63de), U64_C(0x766a0abb3c77b2a8),
      U64_C(0x81c2c92e47edaee6), U64_C(0x92722c851482353b),
      U64_C(0xa2bfe8a14cf10364), U64_C(0xa81a664bbc423001),
      U64_C(0xc24b8b70d0f89791), U64_C(0xc76c51a30654be30),
      U64_C(0xd192e819d6ef5218), U64_C(0xd69906245565a910),
      U64_C(0xf40e35855771202a), U64_C(0x106aa07032bbd1b8),
      U64_C(0x19a4c116b8d2d0c8), U64_C(0x1e376c085141ab53),
      U64_C(0x2748774cdf8eeb99), U64_C(0x34b0bcb5e19b48a8),
      U64_C(0x391c0cb3c5c95a63), U64_C(0x4ed8aa4ae3418acb),
      U64_C(0x5b9cca4f7763e373), U64_C(0x682e6ff3d6b2b8a3),
      U64_C(0x748f82ee5defb2fc), U64_C(0x78a5636f43172f60),
      U64_C(0x84c87814a1f0ab72), U64_C(0x8cc702081a6439ec),
      U64_C(0x90befffa23631e28), U64_C(0xa4506cebde82bde9),
      U64_C(0xbef9a3f7b2c67915), U64_C(0xc67178f2e372532b),
      U64_C(0xca273eceea26619c), U64_C(0xd186b8c721c0c207),
      U64_C(0xeada7dd6cde0eb1e), U64_C(0xf57d4f7fee6ed178),
      U64_C(0x06f067aa72176fba), U64_C(0x0a637dc5a2c898a6),
      U64_C(0x113f9804bef90dae), U64_C(0x1b710b35131c471b),
      U64_C(0x28db77f523047d84), U64_C(0x32caab7b40c72493),
      U64_C(0x3c9ebe0a15c9bebc), U64_C(0x431d67c49c100d4c),
      U64_C(0x4cc5d4becb3e42b6), U64_C(0x597f299cfc657e2a),
      U64_C(0x5fcb6fab3ad6faec), U64_C(0x6c44198c4a475817)
    };

  /* get values from the chaining vars */
  a = hd->h0;
  b = hd->h1;
  c = hd->h2;
  d = hd->h3;
  e = hd->h4;
  f = hd->h5;
  g = hd->h6;
  h = hd->h7;

#ifdef WORDS_BIGENDIAN
  memcpy (w, data, 128);
#else
  {
    int i;
    byte *p2;

    for (i = 0, p2 = (byte *) w; i < 16; i++, p2 += 8)
      {
	p2[7] = *data++;
	p2[6] = *data++;
	p2[5] = *data++;
	p2[4] = *data++;
	p2[3] = *data++;
	p2[2] = *data++;
	p2[1] = *data++;
	p2[0] = *data++;
      }
  }
#endif

#define S0(x) (ROTR((x),1) ^ ROTR((x),8) ^ ((x)>>7))
#define S1(x) (ROTR((x),19) ^ ROTR((x),61) ^ ((x)>>6))

  for (t = 16; t < 80; t++)
    w[t] = S1 (w[t - 2]) + w[t - 7] + S0 (w[t - 15]) + w[t - 16];


  for (t = 0; t < 80; )
    {
      u64 t1, t2;

      /* Performance on a AMD Athlon(tm) Dual Core Processor 4050e
         with gcc 4.3.3 using gcry_md_hash_buffer of each 10000 bytes
         initialized to 0,1,2,3...255,0,... and 1000 iterations:

         Not unrolled with macros:  440ms
         Unrolled with macros:      350ms
         Unrolled with inline:      330ms
      */
#if 1 /* Not unrolled.  */
      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t];
      t2 = Sum0 (a) + Maj (a, b, c);
      h = g;
      g = f;
      f = e;
      e = d + t1;
      d = c;
      c = b;
      b = a;
      a = t1 + t2;
      t++;
#else /* Unrolled to interweave the chain variables.  */
      t1 = h + Sum1 (e) + Ch (e, f, g) + k[t] + w[t];
      t2 = Sum0 (a) + Maj (a, b, c);
      d += t1;
      h  = t1 + t2;

      t1 = g + Sum1 (d) + Ch (d, e, f) + k[t+1] + w[t+1];
      t2 = Sum0 (h) + Maj (h, a, b);
      c += t1;
      g  = t1 + t2;

      t1 = f + Sum1 (c) + Ch (c, d, e) + k[t+2] + w[t+2];
      t2 = Sum0 (g) + Maj (g, h, a);
      b += t1;
      f  = t1 + t2;

      t1 = e + Sum1 (b) + Ch (b, c, d) + k[t+3] + w[t+3];
      t2 = Sum0 (f) + Maj (f, g, h);
      a += t1;
      e  = t1 + t2;

      t1 = d + Sum1 (a) + Ch (a, b, c) + k[t+4] + w[t+4];
      t2 = Sum0 (e) + Maj (e, f, g);
      h += t1;
      d  = t1 + t2;

      t1 = c + Sum1 (h) + Ch (h, a, b) + k[t+5] + w[t+5];
      t2 = Sum0 (d) + Maj (d, e, f);
      g += t1;
      c  = t1 + t2;

      t1 = b + Sum1 (g) + Ch (g, h, a) + k[t+6] + w[t+6];
      t2 = Sum0 (c) + Maj (c, d, e);
      f += t1;
      b  = t1 + t2;

      t1 = a + Sum1 (f) + Ch (f, g, h) + k[t+7] + w[t+7];
      t2 = Sum0 (b) + Maj (b, c, d);
      e += t1;
      a  = t1 + t2;

      t += 8;
#endif
    }

  /* Update chaining vars.  */
  hd->h0 += a;
  hd->h1 += b;
  hd->h2 += c;
  hd->h3 += d;
  hd->h4 += e;
  hd->h5 += f;
  hd->h6 += g;
  hd->h7 += h;
}
Exemplo n.º 28
0
static void sha256_transform(u32 *state, const u8 *input)
{
	u32 a, b, c, d, e, f, g, h, t1, t2;
	u32 W[64];
	int i;

	/* load the input */
	for (i = 0; i < 16; i++)
		LOAD_OP(i, W, input);

	/* now blend */
	for (i = 16; i < 64; i++)
		BLEND_OP(i, W);

	/* load the state into our registers */
	a=state[0];  b=state[1];  c=state[2];  d=state[3];
	e=state[4];  f=state[5];  g=state[6];  h=state[7];

	/* now iterate */
	t1 = h + e1(e) + Ch(e,f,g) + 0x428a2f98 + W[ 0];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0x71374491 + W[ 1];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0xb5c0fbcf + W[ 2];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0xe9b5dba5 + W[ 3];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0x3956c25b + W[ 4];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0x59f111f1 + W[ 5];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0x923f82a4 + W[ 6];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0xab1c5ed5 + W[ 7];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0xd807aa98 + W[ 8];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0x12835b01 + W[ 9];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0x243185be + W[10];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0x550c7dc3 + W[11];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0x72be5d74 + W[12];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0x80deb1fe + W[13];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0x9bdc06a7 + W[14];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0xc19bf174 + W[15];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0xe49b69c1 + W[16];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0xefbe4786 + W[17];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0x0fc19dc6 + W[18];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0x240ca1cc + W[19];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0x2de92c6f + W[20];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0x4a7484aa + W[21];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0x5cb0a9dc + W[22];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0x76f988da + W[23];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0x983e5152 + W[24];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0xa831c66d + W[25];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0xb00327c8 + W[26];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0xbf597fc7 + W[27];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0xc6e00bf3 + W[28];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0xd5a79147 + W[29];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0x06ca6351 + W[30];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0x14292967 + W[31];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0x27b70a85 + W[32];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0x2e1b2138 + W[33];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0x4d2c6dfc + W[34];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0x53380d13 + W[35];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0x650a7354 + W[36];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0x766a0abb + W[37];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0x81c2c92e + W[38];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0x92722c85 + W[39];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0xa2bfe8a1 + W[40];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0xa81a664b + W[41];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0xc24b8b70 + W[42];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0xc76c51a3 + W[43];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0xd192e819 + W[44];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0xd6990624 + W[45];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0xf40e3585 + W[46];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0x106aa070 + W[47];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0x19a4c116 + W[48];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0x1e376c08 + W[49];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0x2748774c + W[50];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0x34b0bcb5 + W[51];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0x391c0cb3 + W[52];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0x4ed8aa4a + W[53];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0x5b9cca4f + W[54];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0x682e6ff3 + W[55];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	t1 = h + e1(e) + Ch(e,f,g) + 0x748f82ee + W[56];
	t2 = e0(a) + Maj(a,b,c);    d+=t1;    h=t1+t2;
	t1 = g + e1(d) + Ch(d,e,f) + 0x78a5636f + W[57];
	t2 = e0(h) + Maj(h,a,b);    c+=t1;    g=t1+t2;
	t1 = f + e1(c) + Ch(c,d,e) + 0x84c87814 + W[58];
	t2 = e0(g) + Maj(g,h,a);    b+=t1;    f=t1+t2;
	t1 = e + e1(b) + Ch(b,c,d) + 0x8cc70208 + W[59];
	t2 = e0(f) + Maj(f,g,h);    a+=t1;    e=t1+t2;
	t1 = d + e1(a) + Ch(a,b,c) + 0x90befffa + W[60];
	t2 = e0(e) + Maj(e,f,g);    h+=t1;    d=t1+t2;
	t1 = c + e1(h) + Ch(h,a,b) + 0xa4506ceb + W[61];
	t2 = e0(d) + Maj(d,e,f);    g+=t1;    c=t1+t2;
	t1 = b + e1(g) + Ch(g,h,a) + 0xbef9a3f7 + W[62];
	t2 = e0(c) + Maj(c,d,e);    f+=t1;    b=t1+t2;
	t1 = a + e1(f) + Ch(f,g,h) + 0xc67178f2 + W[63];
	t2 = e0(b) + Maj(b,c,d);    e+=t1;    a=t1+t2;

	state[0] += a; state[1] += b; state[2] += c; state[3] += d;
	state[4] += e; state[5] += f; state[6] += g; state[7] += h;

	/* clear any sensitive info... */
	a = b = c = d = e = f = g = h = t1 = t2 = 0;
	memset(W, 0, 64 * sizeof(u32));
}
Exemplo n.º 29
0
static void SHA1_HashBlock(SHA1_CONTEXT *p)
{
    int t, j;
    UINT32 W[80], a, b, c, d, e, T;

    // Prepare Message Schedule, {W sub t}.
    //
    for (t = 0, j = 0; t <= 15; t++, j += 4)
    {
        W[t] = (p->block[j  ] << 24)
             | (p->block[j+1] << 16)
             | (p->block[j+2] <<  8)
             | (p->block[j+3]      );
    }
    for (t = 16; t <= 79; t++)
    {
        W[t] = ROTL(W[t-3] ^ W[t-8] ^ W[t-14] ^ W[t-16], 1);
    }

    a = p->H[0];
    b = p->H[1];
    c = p->H[2];
    d = p->H[3];
    e = p->H[4];

    for (t =  0; t <= 19; t++)
    {
        T = ROTL(a,5) + Ch(b,c,d) + e + 0x5A827999 + W[t];
        e = d;
        d = c;
        c = ROTL(b,30);
        b = a;
        a = T;
    }
    for (t = 20; t <= 39; t++)
    {
        T = ROTL(a,5) + Parity(b,c,d) + e + 0x6ED9EBA1 + W[t];
        e = d;
        d = c;
        c = ROTL(b,30);
        b = a;
        a = T;
    }
    for (t = 40; t <= 59; t++)
    {
        T = ROTL(a,5) + Maj(b,c,d) + e + 0x8F1BBCDC + W[t];
        e = d;
        d = c;
        c = ROTL(b,30);
        b = a;
        a = T;
    }
    for (t = 60; t <= 79; t++)
    {
        T = ROTL(a,5) + Parity(b,c,d) + e + 0xCA62C1D6 + W[t];
        e = d;
        d = c;
        c = ROTL(b,30);
        b = a;
        a = T;
    }

    p->H[0] += a;
    p->H[1] += b;
    p->H[2] += c;
    p->H[3] += d;
    p->H[4] += e;
}
Exemplo n.º 30
0
static inline void sha256_transform(__m128i *state, __m128i *block, __m128i *dst)
{
    __m128i W[64], t1, t2;

    W[0]  = block[ 0];
    W[1]  = block[ 1];
    W[2]  = block[ 2];
    W[3]  = block[ 3];
    W[4]  = block[ 4];
    W[5]  = block[ 5];
    W[6]  = block[ 6];
    W[7]  = block[ 7];
    W[8]  = block[ 8];
    W[9]  = block[ 9];
    W[10] = block[10];
    W[11] = block[11];
    W[12] = block[12];
    W[13] = block[13];
    W[14] = block[14];
    W[15] = block[15];

    W[16] = add4(sigma1(W[16 - 2]), W[16 - 7], sigma0(W[16 - 15]), W[16 - 16]);
    W[17] = add4(sigma1(W[17 - 2]), W[17 - 7], sigma0(W[17 - 15]), W[17 - 16]);
    W[18] = add4(sigma1(W[18 - 2]), W[18 - 7], sigma0(W[18 - 15]), W[18 - 16]);
    W[19] = add4(sigma1(W[19 - 2]), W[19 - 7], sigma0(W[19 - 15]), W[19 - 16]);
    W[20] = add4(sigma1(W[20 - 2]), W[20 - 7], sigma0(W[20 - 15]), W[20 - 16]);
    W[21] = add4(sigma1(W[21 - 2]), W[21 - 7], sigma0(W[21 - 15]), W[21 - 16]);
    W[22] = add4(sigma1(W[22 - 2]), W[22 - 7], sigma0(W[22 - 15]), W[22 - 16]);
    W[23] = add4(sigma1(W[23 - 2]), W[23 - 7], sigma0(W[23 - 15]), W[23 - 16]);
    W[24] = add4(sigma1(W[24 - 2]), W[24 - 7], sigma0(W[24 - 15]), W[24 - 16]);
    W[25] = add4(sigma1(W[25 - 2]), W[25 - 7], sigma0(W[25 - 15]), W[25 - 16]);
    W[26] = add4(sigma1(W[26 - 2]), W[26 - 7], sigma0(W[26 - 15]), W[26 - 16]);
    W[27] = add4(sigma1(W[27 - 2]), W[27 - 7], sigma0(W[27 - 15]), W[27 - 16]);
    W[28] = add4(sigma1(W[28 - 2]), W[28 - 7], sigma0(W[28 - 15]), W[28 - 16]);
    W[29] = add4(sigma1(W[29 - 2]), W[29 - 7], sigma0(W[29 - 15]), W[29 - 16]);
    W[30] = add4(sigma1(W[30 - 2]), W[30 - 7], sigma0(W[30 - 15]), W[30 - 16]);
    W[31] = add4(sigma1(W[31 - 2]), W[31 - 7], sigma0(W[31 - 15]), W[31 - 16]);
    W[32] = add4(sigma1(W[32 - 2]), W[32 - 7], sigma0(W[32 - 15]), W[32 - 16]);
    W[33] = add4(sigma1(W[33 - 2]), W[33 - 7], sigma0(W[33 - 15]), W[33 - 16]);
    W[34] = add4(sigma1(W[34 - 2]), W[34 - 7], sigma0(W[34 - 15]), W[34 - 16]);
    W[35] = add4(sigma1(W[35 - 2]), W[35 - 7], sigma0(W[35 - 15]), W[35 - 16]);
    W[36] = add4(sigma1(W[36 - 2]), W[36 - 7], sigma0(W[36 - 15]), W[36 - 16]);
    W[37] = add4(sigma1(W[37 - 2]), W[37 - 7], sigma0(W[37 - 15]), W[37 - 16]);
    W[38] = add4(sigma1(W[38 - 2]), W[38 - 7], sigma0(W[38 - 15]), W[38 - 16]);
    W[39] = add4(sigma1(W[39 - 2]), W[39 - 7], sigma0(W[39 - 15]), W[39 - 16]);
    W[40] = add4(sigma1(W[40 - 2]), W[40 - 7], sigma0(W[40 - 15]), W[40 - 16]);
    W[41] = add4(sigma1(W[41 - 2]), W[41 - 7], sigma0(W[41 - 15]), W[41 - 16]);
    W[42] = add4(sigma1(W[42 - 2]), W[42 - 7], sigma0(W[42 - 15]), W[42 - 16]);
    W[43] = add4(sigma1(W[43 - 2]), W[43 - 7], sigma0(W[43 - 15]), W[43 - 16]);
    W[44] = add4(sigma1(W[44 - 2]), W[44 - 7], sigma0(W[44 - 15]), W[44 - 16]);
    W[45] = add4(sigma1(W[45 - 2]), W[45 - 7], sigma0(W[45 - 15]), W[45 - 16]);
    W[46] = add4(sigma1(W[46 - 2]), W[46 - 7], sigma0(W[46 - 15]), W[46 - 16]);
    W[47] = add4(sigma1(W[47 - 2]), W[47 - 7], sigma0(W[47 - 15]), W[47 - 16]);
    W[48] = add4(sigma1(W[48 - 2]), W[48 - 7], sigma0(W[48 - 15]), W[48 - 16]);
    W[49] = add4(sigma1(W[49 - 2]), W[49 - 7], sigma0(W[49 - 15]), W[49 - 16]);
    W[50] = add4(sigma1(W[50 - 2]), W[50 - 7], sigma0(W[50 - 15]), W[50 - 16]);
    W[51] = add4(sigma1(W[51 - 2]), W[51 - 7], sigma0(W[51 - 15]), W[51 - 16]);
    W[52] = add4(sigma1(W[52 - 2]), W[52 - 7], sigma0(W[52 - 15]), W[52 - 16]);
    W[53] = add4(sigma1(W[53 - 2]), W[53 - 7], sigma0(W[53 - 15]), W[53 - 16]);
    W[54] = add4(sigma1(W[54 - 2]), W[54 - 7], sigma0(W[54 - 15]), W[54 - 16]);
    W[55] = add4(sigma1(W[55 - 2]), W[55 - 7], sigma0(W[55 - 15]), W[55 - 16]);
    W[56] = add4(sigma1(W[56 - 2]), W[56 - 7], sigma0(W[56 - 15]), W[56 - 16]);
    W[57] = add4(sigma1(W[57 - 2]), W[57 - 7], sigma0(W[57 - 15]), W[57 - 16]);
    W[58] = add4(sigma1(W[58 - 2]), W[58 - 7], sigma0(W[58 - 15]), W[58 - 16]);
    W[59] = add4(sigma1(W[59 - 2]), W[59 - 7], sigma0(W[59 - 15]), W[59 - 16]);
    W[60] = add4(sigma1(W[60 - 2]), W[60 - 7], sigma0(W[60 - 15]), W[60 - 16]);
    W[61] = add4(sigma1(W[61 - 2]), W[61 - 7], sigma0(W[61 - 15]), W[61 - 16]);
    W[62] = add4(sigma1(W[62 - 2]), W[62 - 7], sigma0(W[62 - 15]), W[62 - 16]);
    W[63] = add4(sigma1(W[63 - 2]), W[63 - 7], sigma0(W[63 - 15]), W[63 - 16]);

    // read existing state
    __m128i a = state[0];
    __m128i b = state[1];
    __m128i c = state[2];
    __m128i d = state[3];
    __m128i e = state[4];
    __m128i f = state[5];
    __m128i g = state[6];
    __m128i h = state[7];

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0x428a2f98), W[0]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0x71374491), W[1]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0xb5c0fbcf), W[2]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0xe9b5dba5), W[3]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0x3956c25b), W[4]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0x59f111f1), W[5]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0x923f82a4), W[6]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0xab1c5ed5), W[7]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0xd807aa98), W[8]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0x12835b01), W[9]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0x243185be), W[10]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0x550c7dc3), W[11]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0x72be5d74), W[12]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0x80deb1fe), W[13]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0x9bdc06a7), W[14]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0xc19bf174), W[15]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0xe49b69c1), W[16]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0xefbe4786), W[17]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0x0fc19dc6), W[18]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0x240ca1cc), W[19]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0x2de92c6f), W[20]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0x4a7484aa), W[21]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0x5cb0a9dc), W[22]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0x76f988da), W[23]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0x983e5152), W[24]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0xa831c66d), W[25]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0xb00327c8), W[26]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0xbf597fc7), W[27]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0xc6e00bf3), W[28]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0xd5a79147), W[29]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0x06ca6351), W[30]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0x14292967), W[31]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0x27b70a85), W[32]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0x2e1b2138), W[33]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0x4d2c6dfc), W[34]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0x53380d13), W[35]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0x650a7354), W[36]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0x766a0abb), W[37]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0x81c2c92e), W[38]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0x92722c85), W[39]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0xa2bfe8a1), W[40]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0xa81a664b), W[41]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0xc24b8b70), W[42]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0xc76c51a3), W[43]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0xd192e819), W[44]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0xd6990624), W[45]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0xf40e3585), W[46]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0x106aa070), W[47]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0x19a4c116), W[48]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0x1e376c08), W[49]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0x2748774c), W[50]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0x34b0bcb5), W[51]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0x391c0cb3), W[52]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0x4ed8aa4a), W[53]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0x5b9cca4f), W[54]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0x682e6ff3), W[55]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    t1 = add5(h, Sigma1(e), Ch(e, f, g), _mm_set1_epi32(0x748f82ee), W[56]);
    t2 = add2(Sigma0(a), Maj(a, b, c));
    d = add2(d, t1);
    h = add2(t1, t2);
    t1 = add5(g, Sigma1(d), Ch(d, e, f), _mm_set1_epi32(0x78a5636f), W[57]);
    t2 = add2(Sigma0(h), Maj(h, a, b));
    c = add2(c, t1);
    g = add2(t1, t2);
    t1 = add5(f, Sigma1(c), Ch(c, d, e), _mm_set1_epi32(0x84c87814), W[58]);
    t2 = add2(Sigma0(g), Maj(g, h, a));
    b = add2(b, t1);
    f = add2(t1, t2);
    t1 = add5(e, Sigma1(b), Ch(b, c, d), _mm_set1_epi32(0x8cc70208), W[59]);
    t2 = add2(Sigma0(f), Maj(f, g, h));
    a = add2(a, t1);
    e = add2(t1, t2);
    t1 = add5(d, Sigma1(a), Ch(a, b, c), _mm_set1_epi32(0x90befffa), W[60]);
    t2 = add2(Sigma0(e), Maj(e, f, g));
    h = add2(h, t1);
    d = add2(t1, t2);
    t1 = add5(c, Sigma1(h), Ch(h, a, b), _mm_set1_epi32(0xa4506ceb), W[61]);
    t2 = add2(Sigma0(d), Maj(d, e, f));
    g = add2(g, t1);
    c = add2(t1, t2);
    t1 = add5(b, Sigma1(g), Ch(g, h, a), _mm_set1_epi32(0xbef9a3f7), W[62]);
    t2 = add2(Sigma0(c), Maj(c, d, e));
    f = add2(f, t1);
    b = add2(t1, t2);
    t1 = add5(a, Sigma1(f), Ch(f, g, h), _mm_set1_epi32(0xc67178f2), W[63]);
    t2 = add2(Sigma0(b), Maj(b, c, d));
    e = add2(e, t1);
    a = add2(t1, t2);

    dst[0] = add2(state[0], a);
    dst[1] = add2(state[1], b);
    dst[2] = add2(state[2], c);
    dst[3] = add2(state[3], d);
    dst[4] = add2(state[4], e);
    dst[5] = add2(state[5], f);
    dst[6] = add2(state[6], g);
    dst[7] = add2(state[7], h);
}