Example #1
0
/* Run test () on two packed 32-bit vectors and compare its result with
   a reference computed by adding the lanes one at a time; abort on any
   difference.  */
static void
TEST (void)
{
  __m64_union lhs, rhs;
  __m64_union actual, expected;
  int lane = 0;

  lhs.as_m64 = _mm_set_pi32 (30, 90);
  rhs.as_m64 = _mm_set_pi32 (76, -100);

  /* Scalar reference: one 32-bit lane per iteration.  */
  while (lane < 2)
    {
      expected.as_int[lane] = lhs.as_int[lane] + rhs.as_int[lane];
      lane++;
    }

  /* Value produced by the operation under test.  */
  actual.as_m64 = test (lhs.as_m64, rhs.as_m64);

  if (actual.as_m64 != expected.as_m64)
    abort ();
}
Example #2
0
/* Run test () on two packed 32-bit vectors and compare its result with
   a reference in which each lane is -1 when the first operand's lane is
   greater than the second's and 0 otherwise; abort on any difference.  */
static void
TEST (void)
{
  __m64_union lhs, rhs;
  __m64_union actual, expected;
  int lane = 0;

  lhs.as_m64 = _mm_set_pi32 (99, 25);
  rhs.as_m64 = _mm_set_pi32 (98, -100);

  /* Scalar reference: all-ones mask for "greater than", zero otherwise.  */
  while (lane < 2)
    {
      if (lhs.as_int[lane] > rhs.as_int[lane])
        expected.as_int[lane] = -1;
      else
        expected.as_int[lane] = 0;
      lane++;
    }

  /* Value produced by the operation under test.  */
  actual.as_m64 = test (lhs.as_m64, rhs.as_m64);

  if (actual.as_m64 != expected.as_m64)
    abort ();
}
Example #3
0
/*
 * One-shot hash of the inlen bytes at in.
 *
 * out   - receives ctx.X[0..3], i.e. four __m64 words (32 bytes).
 * in    - message bytes; may be unused when inlen is 0.
 * inlen - message length in bytes.
 *
 * Always returns 0.
 *
 * The state is seeded from IV, all but the last 1..64 message bytes are
 * fed through Skein_512_Process_Block as whole 64-byte blocks, the
 * remainder is zero-padded into ctx.b and processed as the last message
 * block, and one further block of zeros is processed to produce the
 * output words.
 *
 * NOTE(review): the values 112, 128 and 255 placed in the top byte of
 * T[1] are presumably the Skein first/final/block-type tweak flags --
 * confirm against the Skein specification.
 */
int crypto_hash
    (
    unsigned char *out,
    const unsigned char *in,
    unsigned long long inlen
    )

{
    Skein_512_Ctxt_t ctx;
    __m64 *out64 = (__m64 *) out;

    /* Start from the precomputed initial chaining value and tweak. */
    memcpy(ctx.X,IV,sizeof(ctx.X));
    ctx.T[0] = _mm_set_pi32(0,0);
    ctx.T[1] = _mm_set_pi32(((uint32) 112) << 24,0);

    /* Bulk part: (inlen-1)/64 full blocks, so 1..64 bytes always remain. */
    if (inlen > 64) {
        size_t n = (inlen-1) / 64;
        Skein_512_Process_Block(&ctx,in,n,64);
        inlen -= n * 64;
        in    += n * 64;
    }

    /* Last message block: zero-pad the tail and OR 128<<24 into T[1]. */
    memset(ctx.b,0,sizeof(ctx.b));
    if (inlen) memcpy(ctx.b,in,inlen);
    ctx.T[1] = _mm_or_si64(ctx.T[1],_mm_set_pi32(((uint32) 128) << 24,0));
    Skein_512_Process_Block(&ctx,ctx.b,1,inlen);

    /* Output stage: fresh tweak, all-zero block, sizeof(uint64) bytes counted. */
    memset(ctx.b,0,sizeof(ctx.b));
    ctx.T[0] = _mm_set_pi32(0,0);
    ctx.T[1] = _mm_set_pi32(((uint32) 255) << 24,0);
    Skein_512_Process_Block(&ctx,ctx.b,1,sizeof(uint64));

    out64[0] = ctx.X[0];
    out64[1] = ctx.X[1];
    out64[2] = ctx.X[2];
    out64[3] = ctx.X[3];

    /*
     * Leave MMX state before returning: without this the x87 register
     * stack is still tagged as in use and a caller's floating-point code
     * can misbehave on targets where __m64 lives in real MMX registers.
     */
    _mm_empty();

    return 0;
}
Example #4
0
/*
 * Compress blkCnt consecutive 64-byte blocks starting at blkPtr into the
 * chaining state ctx->X[0..7], using and updating the tweak ctx->T[0..1].
 *
 * ctx        - X[] is read as the key for each block and overwritten with
 *              the result; T[] is loaded on entry and stored back on exit.
 * blkPtr     - input data, read as eight __m64 words per block.
 * blkCnt     - number of blocks; the do/while runs its body before testing
 *              --blkCnt, so this must be at least 1.
 * byteCntAdd - amount added to ts[0] before each block is processed.
 *
 * NOTE(review): ks and ts are not declared in this function; presumably
 * they are macros aliasing into kw[] defined elsewhere -- confirm.
 * NOTE(review): no _mm_empty() is issued in this function; confirm that
 * callers take care of leaving MMX state where that matters.
 */
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,size_t blkCnt,size_t byteCntAdd)
{
    __m64  kw[12];                         /* key schedule words : chaining vars + tweak */
    __m64  X0,X1,X2,X3,X4,X5,X6,X7;        /* local copy of vars, for speed */
    __m64  w[8];                          /* local copy of input block */
    /* scratch registers used by the rotations inside R512 */
    __m64  z1;
    __m64  z3;
    __m64  z5;
    __m64  z7;

    ts[0] = ctx->T[0];
    ts[1] = ctx->T[1];
    do {
        /* advance ts[0] by byteCntAdd (placed in the low 32 bits, high half 0) */
        ts[0] = _mm_add_si64(ts[0],_mm_set_pi32(0,byteCntAdd));

        /* ks[0..7] = current chaining words;
           ks[8] = SKEIN_KS_PARITY ^ ks[0] ^ ... ^ ks[7], accumulated in z1 */
	z1 = SKEIN_KS_PARITY;
        ks[0] = ctx->X[0];
	z1 = _mm_xor_si64(z1,ks[0]);
        ks[1] = ctx->X[1];
	z1 = _mm_xor_si64(z1,ks[1]);
        ks[2] = ctx->X[2];
	z1 = _mm_xor_si64(z1,ks[2]);
        ks[3] = ctx->X[3];
	z1 = _mm_xor_si64(z1,ks[3]);
        ks[4] = ctx->X[4];
	z1 = _mm_xor_si64(z1,ks[4]);
        ks[5] = ctx->X[5];
	z1 = _mm_xor_si64(z1,ks[5]);
        ks[6] = ctx->X[6];
	z1 = _mm_xor_si64(z1,ks[6]);
        ks[7] = ctx->X[7];
	z1 = _mm_xor_si64(z1,ks[7]);
	ks[8] = z1;

        /* third tweak word is the XOR of the first two */
        ts[2] = _mm_xor_si64(ts[0],ts[1]);

        /* load the block as eight 64-bit words (blkPtr viewed as __m64 *) */
        X0 = 0[(__m64 *) blkPtr];
        X1 = 1[(__m64 *) blkPtr];
        X2 = 2[(__m64 *) blkPtr];
        X3 = 3[(__m64 *) blkPtr];
        X4 = 4[(__m64 *) blkPtr];
        X5 = 5[(__m64 *) blkPtr];
        X6 = 6[(__m64 *) blkPtr];
        X7 = 7[(__m64 *) blkPtr];

        /* keep the unmodified input words for the final XOR below */
        w[0] = X0;
        w[1] = X1;
        w[2] = X2;
        w[3] = X3;
        w[4] = X4;
        w[5] = X5;
        w[6] = X6;
        w[7] = X7;

        /* initial key injection: add ks[0..7]; words 5 and 6 also get ts[0], ts[1] */
        X0 = _mm_add_si64(X0,ks[0]);
        X1 = _mm_add_si64(X1,ks[1]);
        X2 = _mm_add_si64(X2,ks[2]);
        X3 = _mm_add_si64(X3,ks[3]);
        X4 = _mm_add_si64(X4,ks[4]);
        X5 = _mm_add_si64(X5,_mm_add_si64(ks[5],ts[0]));
        X6 = _mm_add_si64(X6,_mm_add_si64(ks[6],ts[1]));
        X7 = _mm_add_si64(X7,ks[7]);

        blkPtr += 64;

/*
 * R512: one round over four word pairs (p0,p1) (p2,p3) (p4,p5) (p6,p7).
 * For each pair: Xeven += Xodd; Xodd is rotated left by ROT_0..ROT_3
 * (built as (Xodd >> (64-ROT_n)) | (Xodd << ROT_n)); then Xodd ^= Xeven.
 * z1/z3/z5/z7 hold the left-shifted halves. rNum is not used in the
 * expansion.
 */
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    X##p0 = _mm_add_si64(X##p0,X##p1); \
      X##p2 = _mm_add_si64(X##p2,X##p3); \
        X##p4 = _mm_add_si64(X##p4,X##p5); \
          X##p6 = _mm_add_si64(X##p6,X##p7); \
    z1 = X##p1; \
    X##p1 = _m_psrlqi(X##p1,64-ROT##_0); \
    z1 = _m_psllqi(z1,ROT##_0); \
    X##p1 = _mm_or_si64(X##p1,z1); \
      z3 = X##p3; \
      X##p3 = _m_psrlqi(X##p3,64-ROT##_1); \
      z3 = _m_psllqi(z3,ROT##_1); \
      X##p3 = _mm_or_si64(X##p3,z3); \
        z5 = X##p5; \
        z5 = _m_psllqi(z5,ROT##_2); \
        X##p5 = _m_psrlqi(X##p5,64-ROT##_2); \
        X##p5 = _mm_or_si64(X##p5,z5); \
          z7 = X##p7; \
          X##p7 = _m_psrlqi(X##p7,64-ROT##_3); \
          z7 = _m_psllqi(z7,ROT##_3); \
          X##p7 = _mm_or_si64(X##p7,z7); \
    X##p1 = _mm_xor_si64(X##p1,X##p0); \
      X##p3 = _mm_xor_si64(X##p3,X##p2); \
        X##p5 = _mm_xor_si64(X##p5,X##p4); \
          X##p7 = _mm_xor_si64(X##p7,X##p6); \

/*
 * I512: key injection number R. Word i gets ks[(R+1+i) % 9]; words 5 and 6
 * additionally get ts[(R+1) % 3] and ts[(R+2) % 3]; word 7 additionally
 * gets the constant R+1 (in the low 32 bits).
 */
#define I512(R)                                                     \
    X0 = _mm_add_si64(X0,ks[((R)+1) % 9]);   /* inject the key schedule value */  \
    X1 = _mm_add_si64(X1,ks[((R)+2) % 9]);                                        \
    X2 = _mm_add_si64(X2,ks[((R)+3) % 9]);                                        \
    X3 = _mm_add_si64(X3,ks[((R)+4) % 9]);                                        \
    X4 = _mm_add_si64(X4,ks[((R)+5) % 9]);                                        \
    X5 = _mm_add_si64(X5,_mm_add_si64(ks[((R)+6) % 9],ts[((R)+1) % 3]));          \
    X6 = _mm_add_si64(X6,_mm_add_si64(ks[((R)+7) % 9],ts[((R)+2) % 3]));          \
    X7 = _mm_add_si64(X7,_mm_add_si64(ks[((R)+8) % 9],_mm_set_pi32(0,(R)+1)));     \

/*
 * R512_8_rounds: eight R512 rounds using rotation sets R_512_0..R_512_7,
 * with a key injection (I512) after the fourth and after the eighth.
 */
#define R512_8_rounds(R) \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);

        /* 9 groups of 8 rounds = 72 rounds, fully unrolled */
        R512_8_rounds( 0);
        R512_8_rounds( 1);
        R512_8_rounds( 2);
        R512_8_rounds( 3);
        R512_8_rounds( 4);
        R512_8_rounds( 5);
        R512_8_rounds( 6);
        R512_8_rounds( 7);
        R512_8_rounds( 8);

        /* new chaining value = round output XOR the saved input words */
        ctx->X[0] = _mm_xor_si64(X0,w[0]);
        ctx->X[1] = _mm_xor_si64(X1,w[1]);
        ctx->X[2] = _mm_xor_si64(X2,w[2]);
        ctx->X[3] = _mm_xor_si64(X3,w[3]);
        ctx->X[4] = _mm_xor_si64(X4,w[4]);
        ctx->X[5] = _mm_xor_si64(X5,w[5]);
        ctx->X[6] = _mm_xor_si64(X6,w[6]);
        ctx->X[7] = _mm_xor_si64(X7,w[7]);

        /* clear bit 30 of the high 32-bit half of ts[1] (low half kept);
           presumably this drops a "first block" flag -- confirm */
        ts[1] = _mm_and_si64(ts[1],_mm_set_pi32(~(((uint32)  64 ) << 24),~0));
    } while (--blkCnt);
    /* write the updated tweak back for the next call */
    ctx->T[0] = ts[0];
    ctx->T[1] = ts[1];
}
Example #5
0
// Codegen test wrapper: returns _mm_set_pi32(a, b) unchanged so the
// compiler's lowering of that intrinsic can be inspected. The directive
// comments inside the body expect two insertelement instructions on a
// <2 x i32> vector (presumably one per argument; they are matched by the
// test harness, so keep them exactly as written).
__m64 test_mm_set_pi32(int a, int b) {
  // CHECK-LABEL: test_mm_set_pi32
  // CHECK: insertelement <2 x i32>
  // CHECK: insertelement <2 x i32>
  return _mm_set_pi32(a, b);
}