/*
 * Software-pipelined variant: each pairwise sum is computed one iteration
 * ahead of its store, overlapping load/add latency with the store of the
 * previous result. The final sum is returned rather than stored, so
 * result[count-1] is left to the caller.
 */
__m64 unsigned_add3(const __m64 *a, const __m64 *b,
                    __m64 *result, unsigned long count)
{
    __m64 sum, _a, _b;
    unsigned int i;

    _a = a[0];
    _b = b[0];
    sum = _mm_add_si64(_a, _b);

    for (i = 1; i < count; i++) {
        result[i - 1] = sum;        /* store the previous pairwise sum */
        _a = a[i];
        _b = b[i];
        sum = _mm_add_si64(_a, _b); /* start the next sum early */
    }
    return sum;
}
/*
 * Reduced variant: only the sum of the final element pair is computed;
 * zero is returned for an empty input.
 */
__m64 unsigned_add3(const __m64 *a, const __m64 *b, size_t count)
{
    __m64 sum = { 0, 0 };

    if (count > 0)
        sum = _mm_add_si64(a[count - 1], b[count - 1]);
    return sum;
}
/*
 * Loop-form variant: each pairwise sum overwrites the last, so only
 * a[count-1] + b[count-1] survives. sum is zero-initialized to avoid
 * returning an uninitialized value when count <= 1.
 */
__m64 unsigned_add3(const __m64 *a, const __m64 *b, unsigned int count)
{
    __m64 sum = _mm_setzero_si64();
    unsigned int i;

    for (i = 1; i < count; i++)
        sum = _mm_add_si64(a[i], b[i]);
    return sum;
}
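/*
 * For reference: what the three variants above compute, written as plain
 * scalar C. This is a sketch for clarity only (the scalar names are ours,
 * not from the original sources): the pipelined version stores every
 * pairwise sum and returns the last, while the two reduced variants
 * collapse to the sum of the final element pair.
 */
#include <stdint.h>
#include <stddef.h>

uint64_t add3_scalar_pipelined(const uint64_t *a, const uint64_t *b,
                               uint64_t *result, size_t count)
{
    uint64_t sum = a[0] + b[0];
    size_t i;

    for (i = 1; i < count; i++) {
        result[i - 1] = sum;    /* store the previous iteration's sum */
        sum = a[i] + b[i];      /* compute the next sum ahead of its store */
    }
    return sum;                 /* storing result[count-1] is the caller's job */
}

uint64_t add3_scalar_reduced(const uint64_t *a, const uint64_t *b, size_t count)
{
    return count ? a[count - 1] + b[count - 1] : 0;
}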
/*
 * Multi-precision add with carry propagation. MMX has no unsigned 64-bit
 * compare, so the carry-out of each limb addition is recomputed: first the
 * carry out of bit 0 ((a & b) | ((a ^ b) & carry_in), masked to bit 0),
 * then (a >> 1) + (b >> 1) is added to it so that bit 63 of that sum
 * equals the carry-out of the full 64-bit addition.
 */
__m64 unsigned_add3(const __m64 *a, const __m64 *b,
                    __m64 *result, unsigned int count)
{
    __m64 _a, _b, one, sum, carry, onesCarry;
    unsigned int i;

    carry = _mm_setzero_si64();
    one = _mm_cmpeq_pi8(carry, carry);  /* all-ones */
    one = _mm_sub_si64(carry, one);     /* 0 - (-1) = 1, built without a load */

    for (i = 0; i < count; i++) {
        _a = a[i];
        _b = b[i];
        sum = _mm_add_si64(_a, _b);
        sum = _mm_add_si64(sum, carry);
        result[i] = sum;

        /* carry out of bit 0: generate (a & b) or propagate ((a ^ b) & cin) */
        onesCarry = _mm_and_si64(_mm_xor_si64(_a, _b), carry);
        onesCarry = _mm_or_si64(_mm_and_si64(_a, _b), onesCarry);
        onesCarry = _mm_and_si64(onesCarry, one);

        /* (a >> 1) + (b >> 1) + carry_bit0: bit 63 is the limb's carry-out */
        _a = _mm_srli_si64(_a, 1);
        _b = _mm_srli_si64(_b, 1);
        carry = _mm_add_si64(_mm_add_si64(_a, _b), onesCarry);
        carry = _mm_srli_si64(carry, 63);
    }
    return carry;
}
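/*
 * A minimal cross-check for the carry-propagating version above: add a
 * 128-bit number held as two 64-bit limbs and compare against a plain
 * scalar add-with-carry. This harness is our own sketch, not part of the
 * original source; it assumes an x86 target with MMX enabled and uses
 * memcpy to move data in and out of __m64 without aliasing issues.
 */
#include <mmintrin.h>
#include <stdint.h>
#include <string.h>
#include <stdio.h>

int main(void)
{
    uint64_t sa[2] = { 0xFFFFFFFFFFFFFFFFull, 0x0123456789ABCDEFull };
    uint64_t sb[2] = { 0x0000000000000001ull, 0xFEDCBA9876543210ull };
    uint64_t sr[2], mr[2], mcarry;
    __m64 a[2], b[2], r[2], c64;
    unsigned carry = 0, i;

    /* scalar reference: limb-wise add with carry */
    for (i = 0; i < 2; i++) {
        uint64_t s = sa[i] + sb[i];
        unsigned cout = s < sa[i];  /* overflow on a + b */
        s += carry;
        cout |= s < carry;          /* overflow on adding the carry-in */
        sr[i] = s;
        carry = cout;
    }

    memcpy(a, sa, sizeof sa);
    memcpy(b, sb, sizeof sb);
    c64 = unsigned_add3(a, b, r, 2);
    memcpy(mr, r, sizeof mr);
    memcpy(&mcarry, &c64, sizeof mcarry);
    _mm_empty();                    /* leave MMX state before calling printf */

    printf("%s\n", (mr[0] == sr[0] && mr[1] == sr[1] && mcarry == carry)
           ? "match" : "MISMATCH");
    return 0;
}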
void sha384Process(register sha384Param* sp)
{
#ifdef OPTIMIZE_SSE2
# if defined(_MSC_VER) || defined(__INTEL_COMPILER)
	/* byte-swap mask: selects the low byte of each 16-bit lane */
	static const __m64 MASK = { 0x00FF00FF00FF00FF };
# elif defined(__GNUC__)
	static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF };
# else
#  error
# endif
	__m64 a, b, c, d, e, f, g, h, temp;
	register __m64 *w;
	register const __m64 *k;
	register byte t;

	/* convert the 16 big-endian message words to host order: mask/shift
	 * split the even and odd bytes, pshufw(...,27) reverses the four
	 * 16-bit words; xor of the two halves byte-swaps each 64-bit word */
	w = (__m64*) sp->data;
	t = 16;
	while (t--)
	{
		temp = *w;
		*(w++) = _m_pxor(
			_mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8),
			_m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27));
	}

	/* message schedule expansion: w[16..79] */
	t = 64;
	while (t--)
	{
		temp = _mm_add_si64(
			_mm_add_si64(sig1(w[-2]), w[-7]),
			_mm_add_si64(sig0(w[-15]), w[-16]));
		*(w++) = temp;
	}

	w = (__m64*) sp->h;
	a = w[0]; b = w[1]; c = w[2]; d = w[3];
	e = w[4]; f = w[5]; g = w[6]; h = w[7];
	w = (__m64*) sp->data;
	k = (__m64*) SHA2_64BIT_K;
#else
	register uint64_t a, b, c, d, e, f, g, h, temp;
	register uint64_t *w;
	register const uint64_t *k;
	register byte t;

# if WORDS_BIGENDIAN
	w = sp->data + 16;
# else
	w = sp->data;
	t = 16;
	while (t--)
	{
		temp = swapu64(*w);
		*(w++) = temp;
	}
# endif

	/* message schedule expansion: w[16..79] */
	t = 64;
	while (t--)
	{
		temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16];
		*(w++) = temp;
	}

	w = sp->data;
	a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3];
	e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7];
	k = SHA2_64BIT_K;
#endif

	/* 80 rounds; the register rotation is encoded in the argument order */
	ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]);
	ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]);
	ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]);
	ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]);
	ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]);
	ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]);
	ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]);
	ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]);
	ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]);
	ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]);
	ROUND(g,h,a,b,c,d,e,f,w[10],k[10]);
	ROUND(f,g,h,a,b,c,d,e,w[11],k[11]);
	ROUND(e,f,g,h,a,b,c,d,w[12],k[12]);
	ROUND(d,e,f,g,h,a,b,c,w[13],k[13]);
	ROUND(c,d,e,f,g,h,a,b,w[14],k[14]);
	ROUND(b,c,d,e,f,g,h,a,w[15],k[15]);
	ROUND(a,b,c,d,e,f,g,h,w[16],k[16]);
	ROUND(h,a,b,c,d,e,f,g,w[17],k[17]);
	ROUND(g,h,a,b,c,d,e,f,w[18],k[18]);
	ROUND(f,g,h,a,b,c,d,e,w[19],k[19]);
	ROUND(e,f,g,h,a,b,c,d,w[20],k[20]);
	ROUND(d,e,f,g,h,a,b,c,w[21],k[21]);
	ROUND(c,d,e,f,g,h,a,b,w[22],k[22]);
	ROUND(b,c,d,e,f,g,h,a,w[23],k[23]);
	ROUND(a,b,c,d,e,f,g,h,w[24],k[24]);
	ROUND(h,a,b,c,d,e,f,g,w[25],k[25]);
	ROUND(g,h,a,b,c,d,e,f,w[26],k[26]);
	ROUND(f,g,h,a,b,c,d,e,w[27],k[27]);
	ROUND(e,f,g,h,a,b,c,d,w[28],k[28]);
	ROUND(d,e,f,g,h,a,b,c,w[29],k[29]);
	ROUND(c,d,e,f,g,h,a,b,w[30],k[30]);
	ROUND(b,c,d,e,f,g,h,a,w[31],k[31]);
	ROUND(a,b,c,d,e,f,g,h,w[32],k[32]);
	ROUND(h,a,b,c,d,e,f,g,w[33],k[33]);
	ROUND(g,h,a,b,c,d,e,f,w[34],k[34]);
	ROUND(f,g,h,a,b,c,d,e,w[35],k[35]);
	ROUND(e,f,g,h,a,b,c,d,w[36],k[36]);
	ROUND(d,e,f,g,h,a,b,c,w[37],k[37]);
	ROUND(c,d,e,f,g,h,a,b,w[38],k[38]);
	ROUND(b,c,d,e,f,g,h,a,w[39],k[39]);
	ROUND(a,b,c,d,e,f,g,h,w[40],k[40]);
	ROUND(h,a,b,c,d,e,f,g,w[41],k[41]);
	ROUND(g,h,a,b,c,d,e,f,w[42],k[42]);
	ROUND(f,g,h,a,b,c,d,e,w[43],k[43]);
	ROUND(e,f,g,h,a,b,c,d,w[44],k[44]);
	ROUND(d,e,f,g,h,a,b,c,w[45],k[45]);
	ROUND(c,d,e,f,g,h,a,b,w[46],k[46]);
	ROUND(b,c,d,e,f,g,h,a,w[47],k[47]);
	ROUND(a,b,c,d,e,f,g,h,w[48],k[48]);
	ROUND(h,a,b,c,d,e,f,g,w[49],k[49]);
	ROUND(g,h,a,b,c,d,e,f,w[50],k[50]);
	ROUND(f,g,h,a,b,c,d,e,w[51],k[51]);
	ROUND(e,f,g,h,a,b,c,d,w[52],k[52]);
	ROUND(d,e,f,g,h,a,b,c,w[53],k[53]);
	ROUND(c,d,e,f,g,h,a,b,w[54],k[54]);
	ROUND(b,c,d,e,f,g,h,a,w[55],k[55]);
	ROUND(a,b,c,d,e,f,g,h,w[56],k[56]);
	ROUND(h,a,b,c,d,e,f,g,w[57],k[57]);
	ROUND(g,h,a,b,c,d,e,f,w[58],k[58]);
	ROUND(f,g,h,a,b,c,d,e,w[59],k[59]);
	ROUND(e,f,g,h,a,b,c,d,w[60],k[60]);
	ROUND(d,e,f,g,h,a,b,c,w[61],k[61]);
	ROUND(c,d,e,f,g,h,a,b,w[62],k[62]);
	ROUND(b,c,d,e,f,g,h,a,w[63],k[63]);
	ROUND(a,b,c,d,e,f,g,h,w[64],k[64]);
	ROUND(h,a,b,c,d,e,f,g,w[65],k[65]);
	ROUND(g,h,a,b,c,d,e,f,w[66],k[66]);
	ROUND(f,g,h,a,b,c,d,e,w[67],k[67]);
	ROUND(e,f,g,h,a,b,c,d,w[68],k[68]);
	ROUND(d,e,f,g,h,a,b,c,w[69],k[69]);
	ROUND(c,d,e,f,g,h,a,b,w[70],k[70]);
	ROUND(b,c,d,e,f,g,h,a,w[71],k[71]);
	ROUND(a,b,c,d,e,f,g,h,w[72],k[72]);
	ROUND(h,a,b,c,d,e,f,g,w[73],k[73]);
	ROUND(g,h,a,b,c,d,e,f,w[74],k[74]);
	ROUND(f,g,h,a,b,c,d,e,w[75],k[75]);
	ROUND(e,f,g,h,a,b,c,d,w[76],k[76]);
	ROUND(d,e,f,g,h,a,b,c,w[77],k[77]);
	ROUND(c,d,e,f,g,h,a,b,w[78],k[78]);
	ROUND(b,c,d,e,f,g,h,a,w[79],k[79]);

#ifdef OPTIMIZE_SSE2
	w = (__m64*) sp->h;
	w[0] = _mm_add_si64(w[0], a);
	w[1] = _mm_add_si64(w[1], b);
	w[2] = _mm_add_si64(w[2], c);
	w[3] = _mm_add_si64(w[3], d);
	w[4] = _mm_add_si64(w[4], e);
	w[5] = _mm_add_si64(w[5], f);
	w[6] = _mm_add_si64(w[6], g);
	w[7] = _mm_add_si64(w[7], h);
	_mm_empty();
#else
	sp->h[0] += a;
	sp->h[1] += b;
	sp->h[2] += c;
	sp->h[3] += d;
	sp->h[4] += e;
	sp->h[5] += f;
	sp->h[6] += g;
	sp->h[7] += h;
#endif
}
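/*
 * The ROUND macro is not shown in the excerpt above. Below is a plausible
 * definition for the scalar (non-SSE2) path, assuming the standard
 * FIPS 180-2 SHA-512 round and helper functions; this is our
 * reconstruction, not beecrypt's actual source, and it must be visible
 * before sha384Process is compiled. Only d and h are written per round,
 * which is why each call site rotates the argument order instead of
 * shuffling eight variables.
 */
#define ROTR64(x,n) (((x) >> (n)) | ((x) << (64 - (n))))
#define Ch(x,y,z)   (((x) & (y)) ^ (~(x) & (z)))
#define Maj(x,y,z)  (((x) & (y)) ^ ((x) & (z)) ^ ((y) & (z)))
#define Sigma0(x)   (ROTR64((x),28) ^ ROTR64((x),34) ^ ROTR64((x),39))
#define Sigma1(x)   (ROTR64((x),14) ^ ROTR64((x),18) ^ ROTR64((x),41))
#define sig0(x)     (ROTR64((x), 1) ^ ROTR64((x), 8) ^ ((x) >> 7))
#define sig1(x)     (ROTR64((x),19) ^ ROTR64((x),61) ^ ((x) >> 6))

#define ROUND(a,b,c,d,e,f,g,h,w,k)                      \
	temp = (h) + Sigma1(e) + Ch(e,f,g) + (k) + (w);     \
	(h) = temp + Sigma0(a) + Maj(a,b,c);                \
	(d) += temp;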
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const uint8 *blkPtr,
                                    size_t blkCnt, size_t byteCntAdd)
{
	__m64 kw[12];                   /* key schedule words: chaining vars + tweak */
	__m64 X0,X1,X2,X3,X4,X5,X6,X7;  /* local copy of vars, for speed */
	__m64 w[8];                     /* local copy of input block */
	__m64 z1, z3, z5, z7;

	/* aliases into kw[]: kw[0..2] hold the tweak, kw[3..11] the key schedule */
#define ts (kw)
#define ks (kw + 3)

	ts[0] = ctx->T[0];
	ts[1] = ctx->T[1];

	do	{
		/* update the tweak's processed-byte count for this block */
		ts[0] = _mm_add_si64(ts[0], _mm_set_pi32(0, byteCntAdd));

		/* key schedule: ks[8] is the xor-parity of the chaining
		 * variables and the key schedule parity constant */
		z1 = SKEIN_KS_PARITY;
		ks[0] = ctx->X[0]; z1 = _mm_xor_si64(z1, ks[0]);
		ks[1] = ctx->X[1]; z1 = _mm_xor_si64(z1, ks[1]);
		ks[2] = ctx->X[2]; z1 = _mm_xor_si64(z1, ks[2]);
		ks[3] = ctx->X[3]; z1 = _mm_xor_si64(z1, ks[3]);
		ks[4] = ctx->X[4]; z1 = _mm_xor_si64(z1, ks[4]);
		ks[5] = ctx->X[5]; z1 = _mm_xor_si64(z1, ks[5]);
		ks[6] = ctx->X[6]; z1 = _mm_xor_si64(z1, ks[6]);
		ks[7] = ctx->X[7]; z1 = _mm_xor_si64(z1, ks[7]);
		ks[8] = z1;

		ts[2] = _mm_xor_si64(ts[0], ts[1]);

		/* load the input block, keeping a copy for the final feedforward */
		X0 = ((__m64 *) blkPtr)[0];
		X1 = ((__m64 *) blkPtr)[1];
		X2 = ((__m64 *) blkPtr)[2];
		X3 = ((__m64 *) blkPtr)[3];
		X4 = ((__m64 *) blkPtr)[4];
		X5 = ((__m64 *) blkPtr)[5];
		X6 = ((__m64 *) blkPtr)[6];
		X7 = ((__m64 *) blkPtr)[7];
		w[0] = X0; w[1] = X1; w[2] = X2; w[3] = X3;
		w[4] = X4; w[5] = X5; w[6] = X6; w[7] = X7;

		/* first key injection */
		X0 = _mm_add_si64(X0, ks[0]);
		X1 = _mm_add_si64(X1, ks[1]);
		X2 = _mm_add_si64(X2, ks[2]);
		X3 = _mm_add_si64(X3, ks[3]);
		X4 = _mm_add_si64(X4, ks[4]);
		X5 = _mm_add_si64(X5, _mm_add_si64(ks[5], ts[0]));
		X6 = _mm_add_si64(X6, _mm_add_si64(ks[6], ts[1]));
		X7 = _mm_add_si64(X7, ks[7]);

		blkPtr += 64;

/* one Threefish-512 mix layer: four adds, four rotate-xors; each rotate is
 * built from shift-left/shift-right/or since MMX has no 64-bit rotate.
 * rNum is unused in the body; it documents the round number at call sites. */
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      \
	X##p0 = _mm_add_si64(X##p0, X##p1);             \
	X##p2 = _mm_add_si64(X##p2, X##p3);             \
	X##p4 = _mm_add_si64(X##p4, X##p5);             \
	X##p6 = _mm_add_si64(X##p6, X##p7);             \
	z1 = X##p1;                                     \
	X##p1 = _m_psrlqi(X##p1, 64 - ROT##_0);         \
	z1 = _m_psllqi(z1, ROT##_0);                    \
	X##p1 = _mm_or_si64(X##p1, z1);                 \
	z3 = X##p3;                                     \
	X##p3 = _m_psrlqi(X##p3, 64 - ROT##_1);         \
	z3 = _m_psllqi(z3, ROT##_1);                    \
	X##p3 = _mm_or_si64(X##p3, z3);                 \
	z5 = X##p5;                                     \
	z5 = _m_psllqi(z5, ROT##_2);                    \
	X##p5 = _m_psrlqi(X##p5, 64 - ROT##_2);         \
	X##p5 = _mm_or_si64(X##p5, z5);                 \
	z7 = X##p7;                                     \
	X##p7 = _m_psrlqi(X##p7, 64 - ROT##_3);         \
	z7 = _m_psllqi(z7, ROT##_3);                    \
	X##p7 = _mm_or_si64(X##p7, z7);                 \
	X##p1 = _mm_xor_si64(X##p1, X##p0);             \
	X##p3 = _mm_xor_si64(X##p3, X##p2);             \
	X##p5 = _mm_xor_si64(X##p5, X##p4);             \
	X##p7 = _mm_xor_si64(X##p7, X##p6);

/* inject the key schedule value after every four rounds */
#define I512(R)                                                             \
	X0 = _mm_add_si64(X0, ks[((R)+1) % 9]);                                 \
	X1 = _mm_add_si64(X1, ks[((R)+2) % 9]);                                 \
	X2 = _mm_add_si64(X2, ks[((R)+3) % 9]);                                 \
	X3 = _mm_add_si64(X3, ks[((R)+4) % 9]);                                 \
	X4 = _mm_add_si64(X4, ks[((R)+5) % 9]);                                 \
	X5 = _mm_add_si64(X5, _mm_add_si64(ks[((R)+6) % 9], ts[((R)+1) % 3]));  \
	X6 = _mm_add_si64(X6, _mm_add_si64(ks[((R)+7) % 9], ts[((R)+2) % 3]));  \
	X7 = _mm_add_si64(X7, _mm_add_si64(ks[((R)+8) % 9], _mm_set_pi32(0,(R)+1)));

#define R512_8_rounds(R)                            \
	R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);         \
	R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);         \
	R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);         \
	R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);         \
	I512(2*(R));                                    \
	R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);         \
	R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);         \
	R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);         \
	R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);         \
	I512(2*(R)+1);

		/* 72 rounds total */
		R512_8_rounds(0);
		R512_8_rounds(1);
		R512_8_rounds(2);
		R512_8_rounds(3);
		R512_8_rounds(4);
		R512_8_rounds(5);
		R512_8_rounds(6);
		R512_8_rounds(7);
		R512_8_rounds(8);

		/* feedforward: chaining vars = final state xor input block */
		ctx->X[0] = _mm_xor_si64(X0, w[0]);
		ctx->X[1] = _mm_xor_si64(X1, w[1]);
		ctx->X[2] = _mm_xor_si64(X2, w[2]);
		ctx->X[3] = _mm_xor_si64(X3, w[3]);
		ctx->X[4] = _mm_xor_si64(X4, w[4]);
		ctx->X[5] = _mm_xor_si64(X5, w[5]);
		ctx->X[6] = _mm_xor_si64(X6, w[6]);
		ctx->X[7] = _mm_xor_si64(X7, w[7]);

		/* clear the "first block" flag in the tweak */
		ts[1] = _mm_and_si64(ts[1], _mm_set_pi32(~(((uint32) 64) << 24), ~0));
	}
	while (--blkCnt);

	ctx->T[0] = ts[0];
	ctx->T[1] = ts[1];
}
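/*
 * Constants assumed by the Skein-512 block function above but not shown in
 * the excerpt, reconstructed here from the Skein v1.3 reference parameters.
 * This is a sketch for completeness: verify against the version of the
 * reference code actually in use, since v1.1 and earlier used different
 * rotation constants and a different parity word. These definitions must
 * be visible before the function is compiled.
 */
#define SKEIN_KS_PARITY _mm_set_pi32(0x1BD11BDA, (int) 0xA9FC1A22)

/* Threefish-512 rotation counts: four MIX operations per round,
 * repeating with period 8 */
#define R_512_0_0 46
#define R_512_0_1 36
#define R_512_0_2 19
#define R_512_0_3 37
#define R_512_1_0 33
#define R_512_1_1 27
#define R_512_1_2 14
#define R_512_1_3 42
#define R_512_2_0 17
#define R_512_2_1 49
#define R_512_2_2 36
#define R_512_2_3 39
#define R_512_3_0 44
#define R_512_3_1  9
#define R_512_3_2 54
#define R_512_3_3 56
#define R_512_4_0 39
#define R_512_4_1 30
#define R_512_4_2 34
#define R_512_4_3 24
#define R_512_5_0 13
#define R_512_5_1 50
#define R_512_5_2 10
#define R_512_5_3 17
#define R_512_6_0 25
#define R_512_6_1 29
#define R_512_6_2 39
#define R_512_6_3 43
#define R_512_7_0  8
#define R_512_7_1 35
#define R_512_7_2 56
#define R_512_7_3 22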
__m64 test_mm_add_si64(__m64 a, __m64 b) {
  // CHECK-LABEL: test_mm_add_si64
  // CHECK: call x86_mmx @llvm.x86.mmx.padd.q(x86_mmx %{{.*}}, x86_mmx %{{.*}})
  return _mm_add_si64(a, b);
}