int signal_energy_nodc(int *input,unsigned int length) { int i; int temp,temp2; register __m64 mm0,mm1,mm2,mm3; __m64 *in = (__m64 *)input; #ifdef MAIN short *printb; #endif mm0 = _m_pxor(mm0,mm0); mm3 = _m_pxor(mm3,mm3); for (i=0;i<length>>1;i++) { mm1 = in[i]; mm2 = mm1; mm1 = _m_pmaddwd(mm1,mm1);// SIMD complex multiplication mm1 = _m_psradi(mm1,shift); mm0 = _m_paddd(mm0,mm1); // temp2 = mm0; // printf("%d %d\n",((int *)&in[i])[0],((int *)&in[i])[1]); // printb = (short *)&mm2; // printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); } /* #ifdef MAIN printb = (short *)&mm3; printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]); #endif */ mm1 = mm0; mm0 = _m_psrlqi(mm0,32); mm0 = _m_paddd(mm0,mm1); temp = _m_to_int(mm0); temp/=length; temp<<=shift; // this is the average of x^2 #ifdef MAIN printf("E x^2 = %d\n",temp); #endif _mm_empty(); _m_empty(); return((temp>0)?temp:1); }
/*
 * Small demonstration of _m_pxor: stores the bitwise XOR of `b` and `c`
 * into `a`, then prints the two operands followed by the result.
 *
 * `a`, `b`, `c` and the AS_QWORD format fragment (a string-literal macro that
 * is pasted into the printf format) are defined outside this function.
 */
int main(void)
{
    a = _m_pxor( b, c );
    printf( "m1=" AS_QWORD "\n"
            "m2=" AS_QWORD "\n"
            "mm=" AS_QWORD "\n",
            b, c, a );
    return 0;
}
void sha384Process(register sha384Param* sp) { #ifdef OPTIMIZE_SSE2 # if defined(_MSC_VER) || defined (__INTEL_COMPILER) static const __m64 MASK = { 0x00FF00FF00FF00FF00 }; # elif defined(__GNUC__) static const __m64 MASK = { 0x00FF00FF, 0x00FF00FF }; # else # error # endif __m64 a, b, c, d, e, f, g, h, temp; register __m64 *w; register const __m64 *k; register byte t; w = (__m64*) sp->data; t = 16; while (t--) { temp = *w; *(w++) = _m_pxor( _mm_slli_si64(_m_pshufw(_m_pand(temp, MASK), 27), 8), _m_pshufw(_m_pand(_mm_srli_si64(temp, 8), MASK), 27) ); } t = 64; while (t--) { temp = _mm_add_si64(_mm_add_si64(sig1(w[-2]), w[-7]), _mm_add_si64(sig0(w[-15]), w[-16])); *(w++) = temp; } w = (__m64*) sp->h; a = w[0]; b = w[1]; c = w[2]; d = w[3]; e = w[4]; f = w[5]; g = w[6]; h = w[7]; w = (__m64*) sp->data; k = (__m64*) SHA2_64BIT_K; #else register uint64_t a, b, c, d, e, f, g, h, temp; register uint64_t *w; register const uint64_t *k; register byte t; # if WORDS_BIGENDIAN w = sp->data + 16; # else w = sp->data; t = 16; while (t--) { temp = swapu64(*w); *(w++) = temp; } # endif t = 64; while (t--) { temp = sig1(w[-2]) + w[-7] + sig0(w[-15]) + w[-16]; *(w++) = temp; } w = sp->data; a = sp->h[0]; b = sp->h[1]; c = sp->h[2]; d = sp->h[3]; e = sp->h[4]; f = sp->h[5]; g = sp->h[6]; h = sp->h[7]; k = SHA2_64BIT_K; #endif ROUND(a,b,c,d,e,f,g,h,w[ 0],k[ 0]); ROUND(h,a,b,c,d,e,f,g,w[ 1],k[ 1]); ROUND(g,h,a,b,c,d,e,f,w[ 2],k[ 2]); ROUND(f,g,h,a,b,c,d,e,w[ 3],k[ 3]); ROUND(e,f,g,h,a,b,c,d,w[ 4],k[ 4]); ROUND(d,e,f,g,h,a,b,c,w[ 5],k[ 5]); ROUND(c,d,e,f,g,h,a,b,w[ 6],k[ 6]); ROUND(b,c,d,e,f,g,h,a,w[ 7],k[ 7]); ROUND(a,b,c,d,e,f,g,h,w[ 8],k[ 8]); ROUND(h,a,b,c,d,e,f,g,w[ 9],k[ 9]); ROUND(g,h,a,b,c,d,e,f,w[10],k[10]); ROUND(f,g,h,a,b,c,d,e,w[11],k[11]); ROUND(e,f,g,h,a,b,c,d,w[12],k[12]); ROUND(d,e,f,g,h,a,b,c,w[13],k[13]); ROUND(c,d,e,f,g,h,a,b,w[14],k[14]); ROUND(b,c,d,e,f,g,h,a,w[15],k[15]); ROUND(a,b,c,d,e,f,g,h,w[16],k[16]); ROUND(h,a,b,c,d,e,f,g,w[17],k[17]); 
ROUND(g,h,a,b,c,d,e,f,w[18],k[18]); ROUND(f,g,h,a,b,c,d,e,w[19],k[19]); ROUND(e,f,g,h,a,b,c,d,w[20],k[20]); ROUND(d,e,f,g,h,a,b,c,w[21],k[21]); ROUND(c,d,e,f,g,h,a,b,w[22],k[22]); ROUND(b,c,d,e,f,g,h,a,w[23],k[23]); ROUND(a,b,c,d,e,f,g,h,w[24],k[24]); ROUND(h,a,b,c,d,e,f,g,w[25],k[25]); ROUND(g,h,a,b,c,d,e,f,w[26],k[26]); ROUND(f,g,h,a,b,c,d,e,w[27],k[27]); ROUND(e,f,g,h,a,b,c,d,w[28],k[28]); ROUND(d,e,f,g,h,a,b,c,w[29],k[29]); ROUND(c,d,e,f,g,h,a,b,w[30],k[30]); ROUND(b,c,d,e,f,g,h,a,w[31],k[31]); ROUND(a,b,c,d,e,f,g,h,w[32],k[32]); ROUND(h,a,b,c,d,e,f,g,w[33],k[33]); ROUND(g,h,a,b,c,d,e,f,w[34],k[34]); ROUND(f,g,h,a,b,c,d,e,w[35],k[35]); ROUND(e,f,g,h,a,b,c,d,w[36],k[36]); ROUND(d,e,f,g,h,a,b,c,w[37],k[37]); ROUND(c,d,e,f,g,h,a,b,w[38],k[38]); ROUND(b,c,d,e,f,g,h,a,w[39],k[39]); ROUND(a,b,c,d,e,f,g,h,w[40],k[40]); ROUND(h,a,b,c,d,e,f,g,w[41],k[41]); ROUND(g,h,a,b,c,d,e,f,w[42],k[42]); ROUND(f,g,h,a,b,c,d,e,w[43],k[43]); ROUND(e,f,g,h,a,b,c,d,w[44],k[44]); ROUND(d,e,f,g,h,a,b,c,w[45],k[45]); ROUND(c,d,e,f,g,h,a,b,w[46],k[46]); ROUND(b,c,d,e,f,g,h,a,w[47],k[47]); ROUND(a,b,c,d,e,f,g,h,w[48],k[48]); ROUND(h,a,b,c,d,e,f,g,w[49],k[49]); ROUND(g,h,a,b,c,d,e,f,w[50],k[50]); ROUND(f,g,h,a,b,c,d,e,w[51],k[51]); ROUND(e,f,g,h,a,b,c,d,w[52],k[52]); ROUND(d,e,f,g,h,a,b,c,w[53],k[53]); ROUND(c,d,e,f,g,h,a,b,w[54],k[54]); ROUND(b,c,d,e,f,g,h,a,w[55],k[55]); ROUND(a,b,c,d,e,f,g,h,w[56],k[56]); ROUND(h,a,b,c,d,e,f,g,w[57],k[57]); ROUND(g,h,a,b,c,d,e,f,w[58],k[58]); ROUND(f,g,h,a,b,c,d,e,w[59],k[59]); ROUND(e,f,g,h,a,b,c,d,w[60],k[60]); ROUND(d,e,f,g,h,a,b,c,w[61],k[61]); ROUND(c,d,e,f,g,h,a,b,w[62],k[62]); ROUND(b,c,d,e,f,g,h,a,w[63],k[63]); ROUND(a,b,c,d,e,f,g,h,w[64],k[64]); ROUND(h,a,b,c,d,e,f,g,w[65],k[65]); ROUND(g,h,a,b,c,d,e,f,w[66],k[66]); ROUND(f,g,h,a,b,c,d,e,w[67],k[67]); ROUND(e,f,g,h,a,b,c,d,w[68],k[68]); ROUND(d,e,f,g,h,a,b,c,w[69],k[69]); ROUND(c,d,e,f,g,h,a,b,w[70],k[70]); ROUND(b,c,d,e,f,g,h,a,w[71],k[71]); ROUND(a,b,c,d,e,f,g,h,w[72],k[72]); 
ROUND(h,a,b,c,d,e,f,g,w[73],k[73]); ROUND(g,h,a,b,c,d,e,f,w[74],k[74]); ROUND(f,g,h,a,b,c,d,e,w[75],k[75]); ROUND(e,f,g,h,a,b,c,d,w[76],k[76]); ROUND(d,e,f,g,h,a,b,c,w[77],k[77]); ROUND(c,d,e,f,g,h,a,b,w[78],k[78]); ROUND(b,c,d,e,f,g,h,a,w[79],k[79]); #ifdef OPTIMIZE_SSE2 w = (__m64*) sp->h; w[0] = _mm_add_si64(w[0], a); w[1] = _mm_add_si64(w[1], b); w[2] = _mm_add_si64(w[2], c); w[3] = _mm_add_si64(w[3], d); w[4] = _mm_add_si64(w[4], e); w[5] = _mm_add_si64(w[5], f); w[6] = _mm_add_si64(w[6], g); w[7] = _mm_add_si64(w[7], h); _mm_empty(); #else sp->h[0] += a; sp->h[1] += b; sp->h[2] += c; sp->h[3] += d; sp->h[4] += e; sp->h[5] += f; sp->h[6] += g; sp->h[7] += h; #endif }
int signal_energy(int *input,unsigned int length) { int i; int temp,temp2; register __m64 mm0,mm1,mm2,mm3; __m64 *in = (__m64 *)input; #ifdef MAIN short *printb; #endif mm0 = _m_pxor(mm0,mm0); mm3 = _m_pxor(mm3,mm3); for (i=0;i<length>>1;i++) { mm1 = in[i]; mm2 = mm1; mm1 = _m_pmaddwd(mm1,mm1); mm1 = _m_psradi(mm1,shift);// shift any 32 bits blocs of the word by the value shift mm0 = _m_paddd(mm0,mm1);// add the two 64 bits words 4 bytes by 4 bytes // temp2 = mm0; // printf("%d %d\n",((int *)&temp2)[0],((int *)&temp2)[1]); // printb = (short *)&mm2; // printf("mm2 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); mm2 = _m_psrawi(mm2,shift_DC); mm3 = _m_paddw(mm3,mm2);// add the two 64 bits words 2 bytes by 2 bytes // printb = (short *)&mm3; // printf("mm3 %d : %d %d %d %d\n",i,printb[0],printb[1],printb[2],printb[3]); } /* #ifdef MAIN printb = (short *)&mm3; printf("%d %d %d %d\n",printb[0],printb[1],printb[2],printb[3]); #endif */ mm1 = mm0; mm0 = _m_psrlqi(mm0,32); mm0 = _m_paddd(mm0,mm1); temp = _m_to_int(mm0); temp/=length; temp<<=shift; // this is the average of x^2 // now remove the DC component mm2 = _m_psrlqi(mm3,32); mm2 = _m_paddw(mm2,mm3); mm2 = _m_pmaddwd(mm2,mm2); temp2 = _m_to_int(mm2); temp2/=(length*length); temp2<<=(2*shift_DC); #ifdef MAIN printf("E x^2 = %d\n",temp); #endif temp -= temp2; #ifdef MAIN printf("(E x)^2=%d\n",temp2); #endif _mm_empty(); _m_empty(); return((temp>0)?temp:1); }