static void
TEST (void)
{
  __m64_union u, s1, s2;
  __m64_union e;
  int i;

  s1.as_m64 = _mm_set_pi32 (30, 90);
  s2.as_m64 = _mm_set_pi32 (76, -100);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  /* Expected: element-wise 32-bit addition, computed with scalar
     arithmetic through the union.  */
  for (i = 0; i < 2; i++)
    e.as_int[i] = s1.as_int[i] + s2.as_int[i];

  if (u.as_m64 != e.as_m64)
    abort ();
}
static void
TEST (void)
{
  __m64_union u, s1, s2;
  __m64_union e;
  int i;

  s1.as_m64 = _mm_set_pi32 (99, 25);
  s2.as_m64 = _mm_set_pi32 (98, -100);
  u.as_m64 = test (s1.as_m64, s2.as_m64);

  /* Expected: element-wise signed greater-than compare, yielding an
     all-ones mask (-1) where true and zero where false.  */
  for (i = 0; i < 2; i++)
    e.as_int[i] = (s1.as_int[i] > s2.as_int[i]) ? -1 : 0;

  if (u.as_m64 != e.as_m64)
    abort ();
}
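/* Both tests above rely on the same pattern: __m64_union lets the
   expected result be computed with plain scalar arithmetic and then
   compared against the intrinsic-based routine under test.  A minimal
   self-contained sketch of that pattern, checking _mm_add_pi32
   directly; the union type, function name, and standalone harness are
   assumptions for illustration, not part of the original testsuite.  */
#include <mmintrin.h>
#include <stdlib.h>

typedef union
{
  __m64 as_m64;
  int as_int[2];
} m64_check_union;      /* hypothetical stand-in for __m64_union */

static void
check_add_pi32 (void)
{
  m64_check_union u, s1, s2, e;
  int i;

  s1.as_m64 = _mm_set_pi32 (1, 2);
  s2.as_m64 = _mm_set_pi32 (3, 4);
  u.as_m64 = _mm_add_pi32 (s1.as_m64, s2.as_m64);

  /* Scalar reference computation through the union.  */
  for (i = 0; i < 2; i++)
    e.as_int[i] = s1.as_int[i] + s2.as_int[i];

  /* Compare lane by lane; on x86 __m64 is a vector type, so direct
     == / != comparison is not portable.  */
  for (i = 0; i < 2; i++)
    if (u.as_int[i] != e.as_int[i])
      abort ();

  _mm_empty ();         /* leave the MMX/x87 state clean on x86 */
}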
int crypto_hash(unsigned char *out, const unsigned char *in,
                unsigned long long inlen)
{
  Skein_512_Ctxt_t ctx;

  /* Initialize chaining values and the tweak; the per-block flag byte
     lives in the top byte of T[1] (0x70 = FIRST | message type).  */
  memcpy(ctx.X,IV,sizeof(ctx.X));
  ctx.T[0] = _mm_set_pi32(0,0);
  ctx.T[1] = _mm_set_pi32(((uint32) 112) << 24,0);

  /* Process all full blocks except the last one.  */
  if (inlen > 64) {
    size_t n = (inlen-1) / 64;
    Skein_512_Process_Block(&ctx,in,n,64);
    inlen -= n * 64;
    in += n * 64;
  }

  /* Zero-pad and process the final message block (0x80 = FINAL).  */
  memset(ctx.b,0,sizeof(ctx.b));
  if (inlen) memcpy(ctx.b,in,inlen);
  ctx.T[1] = _mm_or_si64(ctx.T[1],_mm_set_pi32(((uint32) 128) << 24,0));
  Skein_512_Process_Block(&ctx,ctx.b,1,inlen);

  /* Output transform (0xFF = FIRST | FINAL | output type).  */
  memset(ctx.b,0,sizeof(ctx.b));
  ctx.T[0] = _mm_set_pi32(0,0);
  ctx.T[1] = _mm_set_pi32(((uint32) 255) << 24,0);
  Skein_512_Process_Block(&ctx,ctx.b,1,sizeof(uint64));

  ((__m64 *) out)[0] = ctx.X[0];
  ((__m64 *) out)[1] = ctx.X[1];
  ((__m64 *) out)[2] = ctx.X[2];
  ((__m64 *) out)[3] = ctx.X[3];
  return 0;
}
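/* The tweak setup above packs the flag byte into the most significant
   byte of tweak word T[1].  _mm_set_pi32's first argument becomes the
   high 32 bits of the __m64, so (flags << 24) lands in bits 56..63 of
   the 64-bit word.  A sketch of that packing; the helper and macro
   names are illustrative, not from the original source.  */
#include <mmintrin.h>

#define T1_FIRST_MSG 112u  /* 0x70: FIRST flag | message-type field */
#define T1_FINAL     128u  /* 0x80: FINAL flag */
#define T1_OUT_BLOCK 255u  /* 0xFF: FIRST | FINAL | output-type field */

static __m64 tweak_word1(unsigned int flags)
{
  /* High 32 bits = flags << 24, so the flag byte ends up as the top
     byte of the 64-bit tweak word.  */
  return _mm_set_pi32((int)(flags << 24), 0);
}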
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,
                                    size_t blkCnt,size_t byteCntAdd)
{
  __m64 kw[12];       /* key schedule words : chaining vars + tweak */
#define ts (kw)       /* kw[0..2]: the three tweak words */
#define ks (kw + 3)   /* kw[3..11]: the nine key schedule words */
  __m64 X0,X1,X2,X3,X4,X5,X6,X7;  /* local copy of vars, for speed */
  __m64 w[8];         /* local copy of input block */
  __m64 z1;
  __m64 z3;
  __m64 z5;
  __m64 z7;

  ts[0] = ctx->T[0];
  ts[1] = ctx->T[1];

  do {
    /* Advance the byte count in tweak word 0.  */
    ts[0] = _mm_add_si64(ts[0],_mm_set_pi32(0,byteCntAdd));

    /* Build the key schedule: ks[8] is the XOR of all eight chaining
       words with the key schedule parity constant.  */
    z1 = SKEIN_KS_PARITY;
    ks[0] = ctx->X[0]; z1 = _mm_xor_si64(z1,ks[0]);
    ks[1] = ctx->X[1]; z1 = _mm_xor_si64(z1,ks[1]);
    ks[2] = ctx->X[2]; z1 = _mm_xor_si64(z1,ks[2]);
    ks[3] = ctx->X[3]; z1 = _mm_xor_si64(z1,ks[3]);
    ks[4] = ctx->X[4]; z1 = _mm_xor_si64(z1,ks[4]);
    ks[5] = ctx->X[5]; z1 = _mm_xor_si64(z1,ks[5]);
    ks[6] = ctx->X[6]; z1 = _mm_xor_si64(z1,ks[6]);
    ks[7] = ctx->X[7]; z1 = _mm_xor_si64(z1,ks[7]);
    ks[8] = z1;

    ts[2] = _mm_xor_si64(ts[0],ts[1]);

    /* Load the input block; keep a copy in w[] for the feed-forward.  */
    X0 = ((__m64 *) blkPtr)[0];
    X1 = ((__m64 *) blkPtr)[1];
    X2 = ((__m64 *) blkPtr)[2];
    X3 = ((__m64 *) blkPtr)[3];
    X4 = ((__m64 *) blkPtr)[4];
    X5 = ((__m64 *) blkPtr)[5];
    X6 = ((__m64 *) blkPtr)[6];
    X7 = ((__m64 *) blkPtr)[7];

    w[0] = X0; w[1] = X1; w[2] = X2; w[3] = X3;
    w[4] = X4; w[5] = X5; w[6] = X6; w[7] = X7;

    /* Inject the first key schedule value (subkey 0).  */
    X0 = _mm_add_si64(X0,ks[0]);
    X1 = _mm_add_si64(X1,ks[1]);
    X2 = _mm_add_si64(X2,ks[2]);
    X3 = _mm_add_si64(X3,ks[3]);
    X4 = _mm_add_si64(X4,ks[4]);
    X5 = _mm_add_si64(X5,_mm_add_si64(ks[5],ts[0]));
    X6 = _mm_add_si64(X6,_mm_add_si64(ks[6],ts[1]));
    X7 = _mm_add_si64(X7,ks[7]);

    blkPtr += 64;

/* One Threefish-512 round on four MIX pairs: 64-bit add, rotate left,
   XOR.  MMX has no 64-bit rotate, so each rotate is synthesized from
   two shifts and an OR.  rNum identifies the round but is unused.  */
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)  \
    X##p0 = _mm_add_si64(X##p0,X##p1);          \
    X##p2 = _mm_add_si64(X##p2,X##p3);          \
    X##p4 = _mm_add_si64(X##p4,X##p5);          \
    X##p6 = _mm_add_si64(X##p6,X##p7);          \
    z1 = X##p1;                                 \
    X##p1 = _m_psrlqi(X##p1,64-ROT##_0);        \
    z1 = _m_psllqi(z1,ROT##_0);                 \
    X##p1 = _mm_or_si64(X##p1,z1);              \
    z3 = X##p3;                                 \
    X##p3 = _m_psrlqi(X##p3,64-ROT##_1);        \
    z3 = _m_psllqi(z3,ROT##_1);                 \
    X##p3 = _mm_or_si64(X##p3,z3);              \
    z5 = X##p5;                                 \
    z5 = _m_psllqi(z5,ROT##_2);                 \
    X##p5 = _m_psrlqi(X##p5,64-ROT##_2);        \
    X##p5 = _mm_or_si64(X##p5,z5);              \
    z7 = X##p7;                                 \
    X##p7 = _m_psrlqi(X##p7,64-ROT##_3);        \
    z7 = _m_psllqi(z7,ROT##_3);                 \
    X##p7 = _mm_or_si64(X##p7,z7);              \
    X##p1 = _mm_xor_si64(X##p1,X##p0);          \
    X##p3 = _mm_xor_si64(X##p3,X##p2);          \
    X##p5 = _mm_xor_si64(X##p5,X##p4);          \
    X##p7 = _mm_xor_si64(X##p7,X##p6);

/* Inject the key schedule value for subkey R+1: ks[] indices rotate
   mod 9, ts[] indices mod 3, and the subkey counter is added into the
   last word.  */
#define I512(R)                                                               \
    X0 = _mm_add_si64(X0,ks[((R)+1) % 9]);                                    \
    X1 = _mm_add_si64(X1,ks[((R)+2) % 9]);                                    \
    X2 = _mm_add_si64(X2,ks[((R)+3) % 9]);                                    \
    X3 = _mm_add_si64(X3,ks[((R)+4) % 9]);                                    \
    X4 = _mm_add_si64(X4,ks[((R)+5) % 9]);                                    \
    X5 = _mm_add_si64(X5,_mm_add_si64(ks[((R)+6) % 9],ts[((R)+1) % 3]));      \
    X6 = _mm_add_si64(X6,_mm_add_si64(ks[((R)+7) % 9],ts[((R)+2) % 3]));      \
    X7 = _mm_add_si64(X7,_mm_add_si64(ks[((R)+8) % 9],_mm_set_pi32(0,(R)+1)));

/* Eight rounds plus two key injections; the Threefish-512 word
   permutation is folded into the argument order of each R512 call.  */
#define R512_8_rounds(R)                        \
    R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);     \
    R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);     \
    R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);     \
    R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);     \
    I512(2*(R));                                \
    R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);     \
    R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);     \
    R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);     \
    R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);     \
    I512(2*(R)+1);

    /* 9 x 8 = 72 rounds total.  */
    R512_8_rounds( 0);
    R512_8_rounds( 1);
    R512_8_rounds( 2);
    R512_8_rounds( 3);
    R512_8_rounds( 4);
    R512_8_rounds( 5);
    R512_8_rounds( 6);
    R512_8_rounds( 7);
    R512_8_rounds( 8);

    /* Feed-forward: XOR the input block back in, then clear the
       FIRST-block flag (0x40 in the top byte) from the tweak.  */
    ctx->X[0] = _mm_xor_si64(X0,w[0]);
    ctx->X[1] = _mm_xor_si64(X1,w[1]);
    ctx->X[2] = _mm_xor_si64(X2,w[2]);
    ctx->X[3] = _mm_xor_si64(X3,w[3]);
    ctx->X[4] = _mm_xor_si64(X4,w[4]);
    ctx->X[5] = _mm_xor_si64(X5,w[5]);
    ctx->X[6] = _mm_xor_si64(X6,w[6]);
    ctx->X[7] = _mm_xor_si64(X7,w[7]);

    ts[1] = _mm_and_si64(ts[1],_mm_set_pi32(~(((uint32) 64) << 24),~0));
  } while (--blkCnt);

  ctx->T[0] = ts[0];
  ctx->T[1] = ts[1];
}
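/* The R512 macro above builds a 64-bit rotate-left from two shifts and
   an OR, since MMX has no 64-bit rotate instruction.  The same idiom,
   isolated as a helper for clarity; the function name is illustrative,
   and the round macro keeps the sequence inlined for speed.  */
#include <mmintrin.h>

static __m64 rotl64_mmx(__m64 x, int n)   /* requires 0 < n < 64 */
{
  __m64 hi = _m_psllqi(x, n);        /* low bits shifted into place     */
  __m64 lo = _m_psrlqi(x, 64 - n);   /* high bits wrapped to the bottom */
  return _mm_or_si64(hi, lo);
}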
__m64 test_mm_set_pi32(int a, int b) {
  // CHECK-LABEL: test_mm_set_pi32
  // CHECK: insertelement <2 x i32>
  // CHECK: insertelement <2 x i32>
  return _mm_set_pi32(a, b);
}
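// The two insertelement operations the CHECK lines match reflect
// _mm_set_pi32's lane order: the first argument becomes the high
// 32-bit element, the second the low one.  A small sketch that checks
// the order through a union; the union and function name are
// illustrative, not part of this test file.
#include <mmintrin.h>

union m64_lanes { __m64 v; int i32[2]; };

int check_set_pi32_order(void) {
  union m64_lanes u;
  u.v = _mm_set_pi32(7, 3);
  int ok = (u.i32[0] == 3) && (u.i32[1] == 7);  // low lane 3, high lane 7
  _mm_empty();  // clear MMX state before returning to FP code
  return ok;
}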