Example #1
__m64 unsigned_add3( const __m64 *a, const __m64 *b, __m64 *result, unsigned long count )
{
  __m64 sum, _a, _b;
  unsigned int i;

  _a = a[0];
  _b = b[0];

  sum = _mm_add_si64( _a, _b );
  for( i = 1; i < count; i++ )
  {
   result[i-1] = sum;
   _a = a[i];
   _b = b[i];
   sum = _mm_add_si64( _a, _b );
  }
  return sum;
}
Example #2
__m64
unsigned_add3 (const __m64 * a, const __m64 * b, size_t count)
{
  __m64 sum = { 0, 0 };

  if (count > 0)
    sum = _mm_add_si64 (a[count-1], b[count-1]);

  return sum;
}
Example #3
__m64
unsigned_add3 (const __m64 * a, const __m64 * b, unsigned int count)
{
  __m64 sum;
  unsigned int i;

  for (i = 1; i < count; i++)
    sum = _mm_add_si64 (a[i], b[i]);

  return sum;
}
Example #4
__m64
unsigned_add3 (const __m64 * a, const __m64 * b,
	       __m64 * result, unsigned int count)
{
  __m64 _a, _b, one, sum, carry, onesCarry;

  unsigned int i;

  carry = _mm_setzero_si64 ();

  one = _mm_cmpeq_pi8 (carry, carry);
  one = _mm_sub_si64 (carry, one);

  for (i = 0; i < count; i++)
    {
      _a = a[i];
      _b = b[i];

      sum = _mm_add_si64 (_a, _b);
      sum = _mm_add_si64 (sum, carry);

      result[i] = sum;

      onesCarry = _mm_and_si64 (_mm_xor_si64 (_a, _b), carry);
      onesCarry = _mm_or_si64 (_mm_and_si64 (_a, _b), onesCarry);
      onesCarry = _mm_and_si64 (onesCarry, one);

      _a = _mm_srli_si64 (_a, 1);
      _b = _mm_srli_si64 (_b, 1);

      carry = _mm_add_si64 (_mm_add_si64 (_a, _b), onesCarry);
      carry = _mm_srli_si64 (carry, 63);
    }

  return carry;
}
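Example #4 reconstructs the carry of each 64-bit addition from the operands' low bits and their shifted-down high halves, since MMX offers no unsigned 64-bit compare or carry flag; the carry is then fed into the next element, so the function adds two multi-word numbers and returns the final carry out. A minimal, hypothetical test harness (the main function, the input values and the buffer sizes below are illustrative only, and it assumes Example #4's unsigned_add3 is defined above in the same file):

#include <stdio.h>
#include <string.h>
#include <mmintrin.h>

int main(void)
{
    __m64 a[2], b[2], r[2], carry;
    unsigned long long v[3];

    a[0] = _mm_set_pi32(-1, -1);     /* 0xFFFFFFFFFFFFFFFF */
    a[1] = _mm_set_pi32(0, 1);
    b[0] = _mm_set_pi32(0, 1);       /* a[0] + b[0] wraps, producing a carry */
    b[1] = _mm_set_pi32(0, 2);

    carry = unsigned_add3(a, b, r, 2);

    memcpy(&v[0], &r[0], 8);
    memcpy(&v[1], &r[1], 8);
    memcpy(&v[2], &carry, 8);
    _mm_empty();                     /* clear MMX state before further FPU/library work */

    printf("r[0]=%016llx r[1]=%016llx carry=%llu\n", v[0], v[1], v[2]);
    return 0;
}

With these inputs the wraparound in element 0 carries into element 1, so r[0] comes out as 0, r[1] as 4, and the returned carry as 0.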
Example #5
void sha384Process(register sha384Param* sp)
{
	#ifdef OPTIMIZE_SSE2 
	
	# if defined(_MSC_VER) || defined (__INTEL_COMPILER)
	static const __m64 MASK = { 0x00FF00FF00FF00FF };
	# elif defined(__GNUC__)
	static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF };
	# else
	#  error
	# endif

	__m64 a, b, c, d, e, f, g, h, temp;
	register       __m64 *w;
	register const __m64 *k;
	register byte t;

	w = (__m64*) sp->data;
	t = 16;
	while (t--)
	{
		temp = *w;
		*(w++) = _m_pxor(
				_mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8),
				_m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27)
			);
	}

	t = 64;
	while (t--)
	{
		temp = _mm_add_si64(_mm_add_si64(sig1(w[-2]), w[-7]), _mm_add_si64(sig0(w[-15]), w[-16]));
		*(w++) = temp;
	}

	w = (__m64*) sp->h;

	a = w[0]; b = w[1]; c = w[2]; d = w[3];
	e = w[4]; f = w[5]; g = w[6]; h = w[7];

	w = (__m64*) sp->data;
	k = (__m64*) SHA2_64BIT_K;

	#else

	register uint64_t a, b, c, d, e, f, g, h, temp;
	register       uint64_t *w;
	register const uint64_t *k;
	register byte t;

	# if WORDS_BIGENDIAN
	w = sp->data + 16;
	# else
	w = sp->data;
	t = 16;
	while (t--)
	{
		temp = swapu64(*w);
		*(w++) = temp;
	}
	# endif

	t = 64;
	while (t--)
	{
		temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16];
		*(w++) = temp;
	}

	w = sp->data;

	a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3];
	e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7];

	k = SHA2_64BIT_K;
	#endif

	ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]);
	ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]);
	ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]);
	ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]);
	ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]);
	ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]);
	ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]);
	ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]);
	ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]);
	ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]);
	ROUND(g,h,a,b,c,d,e,f,w[10],k[10]);
	ROUND(f,g,h,a,b,c,d,e,w[11],k[11]);
	ROUND(e,f,g,h,a,b,c,d,w[12],k[12]);
	ROUND(d,e,f,g,h,a,b,c,w[13],k[13]);
	ROUND(c,d,e,f,g,h,a,b,w[14],k[14]);
	ROUND(b,c,d,e,f,g,h,a,w[15],k[15]);
	ROUND(a,b,c,d,e,f,g,h,w[16],k[16]);
	ROUND(h,a,b,c,d,e,f,g,w[17],k[17]);
	ROUND(g,h,a,b,c,d,e,f,w[18],k[18]);
	ROUND(f,g,h,a,b,c,d,e,w[19],k[19]);
	ROUND(e,f,g,h,a,b,c,d,w[20],k[20]);
	ROUND(d,e,f,g,h,a,b,c,w[21],k[21]);
	ROUND(c,d,e,f,g,h,a,b,w[22],k[22]);
	ROUND(b,c,d,e,f,g,h,a,w[23],k[23]);
	ROUND(a,b,c,d,e,f,g,h,w[24],k[24]);
	ROUND(h,a,b,c,d,e,f,g,w[25],k[25]);
	ROUND(g,h,a,b,c,d,e,f,w[26],k[26]);
	ROUND(f,g,h,a,b,c,d,e,w[27],k[27]);
	ROUND(e,f,g,h,a,b,c,d,w[28],k[28]);
	ROUND(d,e,f,g,h,a,b,c,w[29],k[29]);
	ROUND(c,d,e,f,g,h,a,b,w[30],k[30]);
	ROUND(b,c,d,e,f,g,h,a,w[31],k[31]);
	ROUND(a,b,c,d,e,f,g,h,w[32],k[32]);
	ROUND(h,a,b,c,d,e,f,g,w[33],k[33]);
	ROUND(g,h,a,b,c,d,e,f,w[34],k[34]);
	ROUND(f,g,h,a,b,c,d,e,w[35],k[35]);
	ROUND(e,f,g,h,a,b,c,d,w[36],k[36]);
	ROUND(d,e,f,g,h,a,b,c,w[37],k[37]);
	ROUND(c,d,e,f,g,h,a,b,w[38],k[38]);
	ROUND(b,c,d,e,f,g,h,a,w[39],k[39]);
	ROUND(a,b,c,d,e,f,g,h,w[40],k[40]);
	ROUND(h,a,b,c,d,e,f,g,w[41],k[41]);
	ROUND(g,h,a,b,c,d,e,f,w[42],k[42]);
	ROUND(f,g,h,a,b,c,d,e,w[43],k[43]);
	ROUND(e,f,g,h,a,b,c,d,w[44],k[44]);
	ROUND(d,e,f,g,h,a,b,c,w[45],k[45]);
	ROUND(c,d,e,f,g,h,a,b,w[46],k[46]);
	ROUND(b,c,d,e,f,g,h,a,w[47],k[47]);
	ROUND(a,b,c,d,e,f,g,h,w[48],k[48]);
	ROUND(h,a,b,c,d,e,f,g,w[49],k[49]);
	ROUND(g,h,a,b,c,d,e,f,w[50],k[50]);
	ROUND(f,g,h,a,b,c,d,e,w[51],k[51]);
	ROUND(e,f,g,h,a,b,c,d,w[52],k[52]);
	ROUND(d,e,f,g,h,a,b,c,w[53],k[53]);
	ROUND(c,d,e,f,g,h,a,b,w[54],k[54]);
	ROUND(b,c,d,e,f,g,h,a,w[55],k[55]);
	ROUND(a,b,c,d,e,f,g,h,w[56],k[56]);
	ROUND(h,a,b,c,d,e,f,g,w[57],k[57]);
	ROUND(g,h,a,b,c,d,e,f,w[58],k[58]);
	ROUND(f,g,h,a,b,c,d,e,w[59],k[59]);
	ROUND(e,f,g,h,a,b,c,d,w[60],k[60]);
	ROUND(d,e,f,g,h,a,b,c,w[61],k[61]);
	ROUND(c,d,e,f,g,h,a,b,w[62],k[62]);
	ROUND(b,c,d,e,f,g,h,a,w[63],k[63]);
	ROUND(a,b,c,d,e,f,g,h,w[64],k[64]);
	ROUND(h,a,b,c,d,e,f,g,w[65],k[65]);
	ROUND(g,h,a,b,c,d,e,f,w[66],k[66]);
	ROUND(f,g,h,a,b,c,d,e,w[67],k[67]);
	ROUND(e,f,g,h,a,b,c,d,w[68],k[68]);
	ROUND(d,e,f,g,h,a,b,c,w[69],k[69]);
	ROUND(c,d,e,f,g,h,a,b,w[70],k[70]);
	ROUND(b,c,d,e,f,g,h,a,w[71],k[71]);
	ROUND(a,b,c,d,e,f,g,h,w[72],k[72]);
	ROUND(h,a,b,c,d,e,f,g,w[73],k[73]);
	ROUND(g,h,a,b,c,d,e,f,w[74],k[74]);
	ROUND(f,g,h,a,b,c,d,e,w[75],k[75]);
	ROUND(e,f,g,h,a,b,c,d,w[76],k[76]);
	ROUND(d,e,f,g,h,a,b,c,w[77],k[77]);
	ROUND(c,d,e,f,g,h,a,b,w[78],k[78]);
	ROUND(b,c,d,e,f,g,h,a,w[79],k[79]);

	#ifdef OPTIMIZE_SSE2
	w = (__m64*) sp->h;
	w[0] = _mm_add_si64(w[0], a);
	w[1] = _mm_add_si64(w[1], b);
	w[2] = _mm_add_si64(w[2], c);
	w[3] = _mm_add_si64(w[3], d);
	w[4] = _mm_add_si64(w[4], e);
	w[5] = _mm_add_si64(w[5], f);
	w[6] = _mm_add_si64(w[6], g);
	w[7] = _mm_add_si64(w[7], h);
	_mm_empty();
	#else
	sp->h[0] += a;
	sp->h[1] += b;
	sp->h[2] += c;
	sp->h[3] += d;
	sp->h[4] += e;
	sp->h[5] += f;
	sp->h[6] += g;
	sp->h[7] += h;
	#endif
}
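For orientation: ROUND, sig0, sig1, swapu64 and SHA2_64BIT_K are defined elsewhere in the same beecrypt source file (the SSE2 path provides __m64 variants of them), so they do not appear in this listing. The underlying computation is the standard SHA-512 compression round; the following is a generic scalar sketch of that round, not beecrypt's actual macros, and the names are chosen here for illustration:

#include <stdint.h>

/* Generic SHA-512 round, for orientation only; beecrypt's real ROUND/sig0/sig1
 * live elsewhere in sha512.c and the SSE2 path operates on __m64 values. */
#define ROTR64(x,n)  (((x) >> (n)) | ((x) << (64 - (n))))

#define SIG0(x)      (ROTR64(x,28) ^ ROTR64(x,34) ^ ROTR64(x,39))   /* big Sigma0 */
#define SIG1(x)      (ROTR64(x,14) ^ ROTR64(x,18) ^ ROTR64(x,41))   /* big Sigma1 */
#define sig0(x)      (ROTR64(x, 1) ^ ROTR64(x, 8) ^ ((x) >> 7))     /* message schedule */
#define sig1(x)      (ROTR64(x,19) ^ ROTR64(x,61) ^ ((x) >> 6))

#define CH(x,y,z)    (((x) & (y)) ^ (~(x) & (z)))
#define MAJ(x,y,z)   (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))

/* One compression round; the rotation of the working variables a..h is done by
 * permuting the macro arguments from call to call, as in the 80 ROUND lines above. */
#define ROUND(a,b,c,d,e,f,g,h,w,k)                              \
    do {                                                        \
        uint64_t T1 = (h) + SIG1(e) + CH(e,f,g) + (k) + (w);    \
        uint64_t T2 = SIG0(a) + MAJ(a,b,c);                     \
        (d) += T1;                                              \
        (h)  = T1 + T2;                                         \
    } while (0)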
Example #6
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,size_t blkCnt,size_t byteCntAdd)
{
    __m64  kw[12];                         /* key schedule words : chaining vars + tweak */
#define ks (kw + 3)                        /* ks[0..8]: chaining-variable key schedule (assumed alias into kw, as in the Skein reference code) */
#define ts (kw)                            /* ts[0..2]: tweak schedule (assumed alias) */
    __m64  X0,X1,X2,X3,X4,X5,X6,X7;        /* local copy of vars, for speed */
    __m64  w[8];                          /* local copy of input block */
    __m64  z1;
    __m64  z3;
    __m64  z5;
    __m64  z7;

    ts[0] = ctx->T[0];
    ts[1] = ctx->T[1];
    do {
        ts[0] = _mm_add_si64(ts[0],_mm_set_pi32(0,byteCntAdd));

	z1 = SKEIN_KS_PARITY;
        ks[0] = ctx->X[0];
	z1 = _mm_xor_si64(z1,ks[0]);
        ks[1] = ctx->X[1];
	z1 = _mm_xor_si64(z1,ks[1]);
        ks[2] = ctx->X[2];
	z1 = _mm_xor_si64(z1,ks[2]);
        ks[3] = ctx->X[3];
	z1 = _mm_xor_si64(z1,ks[3]);
        ks[4] = ctx->X[4];
	z1 = _mm_xor_si64(z1,ks[4]);
        ks[5] = ctx->X[5];
	z1 = _mm_xor_si64(z1,ks[5]);
        ks[6] = ctx->X[6];
	z1 = _mm_xor_si64(z1,ks[6]);
        ks[7] = ctx->X[7];
	z1 = _mm_xor_si64(z1,ks[7]);
	ks[8] = z1;

        ts[2] = _mm_xor_si64(ts[0],ts[1]);

        X0 = ((__m64 *) blkPtr)[0];
        X1 = ((__m64 *) blkPtr)[1];
        X2 = ((__m64 *) blkPtr)[2];
        X3 = ((__m64 *) blkPtr)[3];
        X4 = ((__m64 *) blkPtr)[4];
        X5 = ((__m64 *) blkPtr)[5];
        X6 = ((__m64 *) blkPtr)[6];
        X7 = ((__m64 *) blkPtr)[7];

        w[0] = X0;
        w[1] = X1;
        w[2] = X2;
        w[3] = X3;
        w[4] = X4;
        w[5] = X5;
        w[6] = X6;
        w[7] = X7;

        X0 = _mm_add_si64(X0,ks[0]);
        X1 = _mm_add_si64(X1,ks[1]);
        X2 = _mm_add_si64(X2,ks[2]);
        X3 = _mm_add_si64(X3,ks[3]);
        X4 = _mm_add_si64(X4,ks[4]);
        X5 = _mm_add_si64(X5,_mm_add_si64(ks[5],ts[0]));
        X6 = _mm_add_si64(X6,_mm_add_si64(ks[6],ts[1]));
        X7 = _mm_add_si64(X7,ks[7]);

        blkPtr += 64;

#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    X##p0 = _mm_add_si64(X##p0,X##p1); \
      X##p2 = _mm_add_si64(X##p2,X##p3); \
        X##p4 = _mm_add_si64(X##p4,X##p5); \
          X##p6 = _mm_add_si64(X##p6,X##p7); \
    z1 = X##p1; \
    X##p1 = _m_psrlqi(X##p1,64-ROT##_0); \
    z1 = _m_psllqi(z1,ROT##_0); \
    X##p1 = _mm_or_si64(X##p1,z1); \
      z3 = X##p3; \
      X##p3 = _m_psrlqi(X##p3,64-ROT##_1); \
      z3 = _m_psllqi(z3,ROT##_1); \
      X##p3 = _mm_or_si64(X##p3,z3); \
        z5 = X##p5; \
        z5 = _m_psllqi(z5,ROT##_2); \
        X##p5 = _m_psrlqi(X##p5,64-ROT##_2); \
        X##p5 = _mm_or_si64(X##p5,z5); \
          z7 = X##p7; \
          X##p7 = _m_psrlqi(X##p7,64-ROT##_3); \
          z7 = _m_psllqi(z7,ROT##_3); \
          X##p7 = _mm_or_si64(X##p7,z7); \
    X##p1 = _mm_xor_si64(X##p1,X##p0); \
      X##p3 = _mm_xor_si64(X##p3,X##p2); \
        X##p5 = _mm_xor_si64(X##p5,X##p4); \
          X##p7 = _mm_xor_si64(X##p7,X##p6); \

#define I512(R)                                                     \
    X0 = _mm_add_si64(X0,ks[((R)+1) % 9]);   /* inject the key schedule value */  \
    X1 = _mm_add_si64(X1,ks[((R)+2) % 9]);                                        \
    X2 = _mm_add_si64(X2,ks[((R)+3) % 9]);                                        \
    X3 = _mm_add_si64(X3,ks[((R)+4) % 9]);                                        \
    X4 = _mm_add_si64(X4,ks[((R)+5) % 9]);                                        \
    X5 = _mm_add_si64(X5,_mm_add_si64(ks[((R)+6) % 9],ts[((R)+1) % 3]));          \
    X6 = _mm_add_si64(X6,_mm_add_si64(ks[((R)+7) % 9],ts[((R)+2) % 3]));          \
    X7 = _mm_add_si64(X7,_mm_add_si64(ks[((R)+8) % 9],_mm_set_pi32(0,(R)+1)));     \

#define R512_8_rounds(R) \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);

        R512_8_rounds( 0);
        R512_8_rounds( 1);
        R512_8_rounds( 2);
        R512_8_rounds( 3);
        R512_8_rounds( 4);
        R512_8_rounds( 5);
        R512_8_rounds( 6);
        R512_8_rounds( 7);
        R512_8_rounds( 8);

        ctx->X[0] = _mm_xor_si64(X0,w[0]);
        ctx->X[1] = _mm_xor_si64(X1,w[1]);
        ctx->X[2] = _mm_xor_si64(X2,w[2]);
        ctx->X[3] = _mm_xor_si64(X3,w[3]);
        ctx->X[4] = _mm_xor_si64(X4,w[4]);
        ctx->X[5] = _mm_xor_si64(X5,w[5]);
        ctx->X[6] = _mm_xor_si64(X6,w[6]);
        ctx->X[7] = _mm_xor_si64(X7,w[7]);

        ts[1] = _mm_and_si64(ts[1],_mm_set_pi32(~(((uint32)  64 ) << 24),~0));
    } while (--blkCnt);
    ctx->T[0] = ts[0];
    ctx->T[1] = ts[1];
}
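A note on the R512 macro above: MMX has no rotate instruction, so each Skein rotation is assembled from a right shift by 64-ROT, a left shift by ROT, and an OR, which together form a 64-bit left rotate. In scalar terms (a hypothetical helper, shown only to make the shift pattern explicit):

#include <stdint.h>

/* Scalar equivalent of the psrlq/psllq/por triple applied to each of the four
 * rotated words in R512: a left rotate by rot bits (assumes 0 < rot < 64). */
static inline uint64_t rotl64(uint64_t x, unsigned rot)
{
    return (x >> (64 - rot)) | (x << rot);
}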
Example #7
__m64 test_mm_add_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_add_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
  return _mm_add_si64(a, b);
}
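Finally, a minimal self-contained program, illustrative only and not drawn from any of the projects above, showing the modular 64-bit addition that _mm_add_si64 performs:

#include <stdio.h>
#include <string.h>
#include <mmintrin.h>

int main(void)
{
    /* 0xFFFFFFFFFFFFFFFF + 2 wraps around to 1: _mm_add_si64 adds the single
     * 64-bit lane modulo 2^64, with no carry flag and no saturation. */
    __m64 a = _mm_set_pi32(-1, -1);
    __m64 b = _mm_set_pi32(0, 2);
    __m64 s = _mm_add_si64(a, b);

    unsigned long long v;
    memcpy(&v, &s, sizeof v);
    _mm_empty();                     /* leave MMX state before any x87/FPU use */

    printf("sum = %llu\n", v);       /* prints 1 */
    return 0;
}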