// c2, Maybe<>, and IsAligned() come from the enclosing TMV template context (not shown here).
static std::complex<double> ZDot(int n, const double* A, const std::complex<double>* B)
{
    if (n) {
#ifdef __SSE2__
        std::complex<double> sum(0);
        while (n && !IsAligned(A)) {
            sum += *A * *B;
            ++A; Maybe<!c2>::increment(B); --n;
        }
        int n_2 = (n>>1);
        int nb = n-(n_2<<1);
        if (n_2) {
            union { __m128d xm; double xd[2]; } xsum;
            xsum.xm = _mm_set1_pd(0.);
            __m128d xsum2 = _mm_set1_pd(0.);
            const std::complex<double>* B1 = Maybe<!c2>::plus(B,1);
            assert(IsAligned(A));
            assert(IsAligned(B));
            do {
                const __m128d& xA = *(const __m128d*)(A);
                const __m128d& xB1 = *(const __m128d*)(B);
                const __m128d& xB2 = *(const __m128d*)(B1);
                A += 2; Maybe<!c2>::increment(B,2); Maybe<!c2>::increment(B1,2);
                __m128d xA1 = _mm_shuffle_pd(xA,xA,_MM_SHUFFLE2(0,0));
                __m128d xA2 = _mm_shuffle_pd(xA,xA,_MM_SHUFFLE2(1,1));
                __m128d x1 = _mm_mul_pd(xA1,xB1);
                __m128d x2 = _mm_mul_pd(xA2,xB2);
                xsum.xm = _mm_add_pd(xsum.xm,x1);
                xsum2 = _mm_add_pd(xsum2,x2);
            } while (--n_2);
            xsum.xm = _mm_add_pd(xsum.xm,xsum2);
            sum += std::complex<double>(xsum.xd[0],xsum.xd[1]);
        }
        if (nb) { sum += *A * *B; ++A; Maybe<!c2>::increment(B); }
        return Maybe<c2>::conj(sum);
#else
        std::complex<double> sum = 0.;
        do { sum += *A * *B; ++A; Maybe<!c2>::increment(B); } while (--n);
        return Maybe<c2>::conj(sum);
#endif
    } else {
        return 0.;
    }
}
void transpose_misaligned(double *a, double *b, int N1, int N2, double factor)
{
    int i,j,k,k1,it,jt,itt,jtt,it_bound,jt_bound,itt_bound,jtt_bound;
    int conflict,tmp,tmpN,offset,line_offset,setnum,set[8192/(4*sizeof(double))];
    double *pA, *pB;
    register __m128d x, y, z, w, t, t1, fac_vector;

    fac_vector = _mm_load_sd(&factor);
    fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector);

    itt_bound = (N1/tilesize)*tilesize;
    for (itt = 0; itt < itt_bound; itt = itt+5*tilesize) {
        jtt_bound = (N2/tilesize)*tilesize;
        for (jtt = 0; jtt < jtt_bound; jtt = jtt+5*tilesize) {
            it_bound = (itt+5*tilesize > itt_bound) ? itt_bound : itt+5*tilesize;
            for (it = itt; it < it_bound; it = it+tilesize) {
                jt_bound = (jtt+5*tilesize > jtt_bound) ? jtt_bound : jtt+5*tilesize;
                for (jt = jtt; jt < jt_bound; jt = jt+tilesize) {
                    k = 0;
                    for (j = jt; j < jt+tilesize; j = j+2) {
                        for (i = it; i < it+tilesize; i = i+2) {
                            pA = a+i*N2+j;
                            pB = b+j*N1+i;
                            x = _mm_loadu_pd(pA);
                            x = _mm_mul_pd(x,fac_vector);
                            y = _mm_loadu_pd(pA + N2);
                            y = _mm_mul_pd(y,fac_vector);
                            z = _mm_shuffle_pd(x, y, 0);
                            w = _mm_shuffle_pd(x, y, 3);
                            _mm_storeu_pd(pB,z);
                            _mm_storeu_pd(pB + N1,w);
                        }
                    }
                }
            }
        }
        for (i = itt; i < itt+5*tilesize && i < itt_bound; i++) {
            for (j = jtt_bound; j < N2; j++) {
                b[j*N1+i] = factor * a[i*N2+j];
            }
        }
    }
    for (i = itt_bound; i < N1; i++) {
        for (j = 0; j < N2; j++) {
            b[j*N1+i] = factor * a[i*N2+j];
        }
    }
}
dcomplex zdotc_( int* n, dcomplex* x, int* inc_x, dcomplex* z, int* inc_z ) { dcomplex* restrict x1; dcomplex* restrict z1; int i; v2df_t rho1v; v2df_t z11v, z12v; v2df_t x1v, x1rv; dcomplex rho; int n1 = *n; int incx = *inc_x; int incz = *inc_z; x1 = x; z1 = z; rho1v.v = _mm_setzero_pd(); { v2df_t bcac, adbd; for ( i = 0; i < n1; ++i ) { z11v.v = _mm_loaddup_pd( ( double* )&(z1->real) ); z12v.v = _mm_loaddup_pd( ( double* )&(z1->imag) ); x1v.v = _mm_load_pd( ( double* )x1 ); x1rv.v = _mm_shuffle_pd( x1v.v, x1v.v, _MM_SHUFFLE2 (0,1) ); bcac.v = x1rv.v * z11v.v; adbd.v = x1v.v * z12v.v; rho1v.v = rho1v.v + _mm_addsub_pd( bcac.v, adbd.v ); x1 += incx; z1 += incz; } rho1v.v = _mm_shuffle_pd( rho1v.v, rho1v.v, _MM_SHUFFLE2 (0,1) ); rho1v.d[1] = -rho1v.d[1]; } rho.real = rho1v.d[0]; rho.imag = rho1v.d[1]; return rho; }
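/* Hedged sketch: a scalar reference for the zdotc_ kernel above, handy for checking. The
 * dcomplex layout (struct { double real, imag; }) is inferred from the field accesses in
 * the SSE2 version; this helper is illustrative only, not part of the original source. */
dcomplex zdotc_ref(int n, const dcomplex* x, int inc_x, const dcomplex* z, int inc_z)
{
    dcomplex rho = { 0.0, 0.0 };
    int i;
    for (i = 0; i < n; ++i) {
        const dcomplex* xi = x + i * inc_x;
        const dcomplex* zi = z + i * inc_z;
        /* conj(x) * z : real = xr*zr + xi*zi, imag = xr*zi - xi*zr,
         * which is exactly what the addsub/shuffle/negate sequence above produces. */
        rho.real += xi->real * zi->real + xi->imag * zi->imag;
        rho.imag += xi->real * zi->imag - xi->imag * zi->real;
    }
    return rho;
}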
void transpose_aligned(double *a, double *b, int N1, int N2, double factor) { int i,j,k,k1,it,jt,itt,jtt,conflict,tmp,tmpN; double *pA, *pB; register __m128d x, y, z, w,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); for (it = 0; it < N1; it=it+tilesize) { for (jt = 0; jt < N2; jt=jt+tilesize) { k = 0; for (j = jt; j < jt+tilesize; j=j+2) { for (i = it; i < it+tilesize; i=i+2) { pA = a+i*N2+j; x = _mm_load_pd(pA); y = _mm_load_pd(pA + N2); x = _mm_mul_pd(x,fac_vector); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); k = (j-jt)*tilesize + (i-it); _mm_store_pd(buf + k,z); _mm_store_pd(buf + k + tilesize,w); } } k = 0; k1 = 0; for (j = jt; j < jt+tilesize; j++) { pB = b+j*N1+it; k = (j-jt)*tilesize; x = _mm_load_pd(&buf[k]); y = _mm_load_pd(&buf[k]+2); z = _mm_load_pd(&buf[k]+2*2); w = _mm_load_pd(&buf[k]+3*2); _mm_stream_pd(pB,x); _mm_stream_pd(pB+2,y); _mm_stream_pd(pB+2*2,z); _mm_stream_pd(pB+3*2,w); } } } }
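/* The transpose routines above (and the tce_sort_* routines further down) all reduce to the
 * same 2x2 building block: _mm_shuffle_pd with immediates 0 and 3 turns two row pairs into
 * two column pairs. A minimal, self-contained sketch of that block, with illustrative names
 * not taken from the original code (assumes <emmintrin.h>):
 *   rows: x = [a00 a01], y = [a10 a11]
 *   cols: _mm_shuffle_pd(x, y, 0) = [a00 a10], _mm_shuffle_pd(x, y, 3) = [a01 a11] */
static void transpose2x2_block(const double* src, int lda, double* dst, int ldb, double factor)
{
    __m128d fac = _mm_set1_pd(factor);
    __m128d x = _mm_mul_pd(_mm_loadu_pd(src),       fac); /* row 0: a00 a01 */
    __m128d y = _mm_mul_pd(_mm_loadu_pd(src + lda), fac); /* row 1: a10 a11 */
    _mm_storeu_pd(dst,       _mm_shuffle_pd(x, y, 0));    /* column 0: a00 a10 */
    _mm_storeu_pd(dst + ldb, _mm_shuffle_pd(x, y, 3));    /* column 1: a01 a11 */
}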
/* use compiler intrinsics for 2x parallel processing */ static inline double chi2_intrinsic_double(int n, const double* x, const double* y) { double result=0; const __m128d eps = _mm_set1_pd(DBL_MIN); const __m128d zero = _mm_setzero_pd(); __m128d chi2 = _mm_setzero_pd(); for ( ; n>1; n-=2) { const __m128d a = _mm_loadu_pd(x); const __m128d b = _mm_loadu_pd(y); x+=2; y+=2; const __m128d a_plus_b = _mm_add_pd(a,b); const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps); const __m128d a_minus_b = _mm_sub_pd(a,b); const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b); const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps); chi2 = _mm_add_pd(chi2, quotient); } const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1)); const __m128d sum = _mm_add_pd(chi2, shuffle); // with SSE3, we could use hadd_pd, but the difference is negligible _mm_store_sd(&result,sum); _mm_empty(); if (n) result += chi2_baseline_double(n, x, y); // remaining entries return result; }
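/* chi2_intrinsic_double above falls back to chi2_baseline_double for a leftover element.
 * That routine is not shown here; a scalar version consistent with the vector loop
 * (sum of (x-y)^2 / (x+y+DBL_MIN)) would look roughly like this. This is an assumption
 * about its body, not the original implementation (DBL_MIN needs <float.h>): */
static inline double chi2_baseline_double(int n, const double* x, const double* y)
{
    double result = 0.0;
    for (int i = 0; i < n; ++i) {
        const double d = x[i] - y[i];
        result += d * d / (x[i] + y[i] + DBL_MIN); /* DBL_MIN guards against 0/0 */
    }
    return result;
}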
/* xvm_dot: * Return the dot product of the two given vectors. */ double xvm_dot(const double x[], const double y[], uint64_t N) { double r = 0.0; #if defined(__SSE2__) && !defined(XVM_ANSI) assert(x != NULL && ((uintptr_t)x % 16) == 0); assert(y != NULL && ((uintptr_t)y % 16) == 0); uint64_t n, d = N % 4; __m128d s0 = _mm_setzero_pd(); __m128d s1 = _mm_setzero_pd(); for (n = 0; n < N - d; n += 4) { const __m128d x0 = _mm_load_pd(x + n ); const __m128d x1 = _mm_load_pd(x + n + 2); const __m128d y0 = _mm_load_pd(y + n ); const __m128d y1 = _mm_load_pd(y + n + 2); const __m128d r0 = _mm_mul_pd(x0, y0); const __m128d r1 = _mm_mul_pd(x1, y1); s0 = _mm_add_pd(s0, r0); s1 = _mm_add_pd(s1, r1); } s0 = _mm_add_pd(s0, s1); s1 = _mm_shuffle_pd(s0, s0, _MM_SHUFFLE2(1, 1)); s0 = _mm_add_pd(s0, s1); _mm_store_sd(&r, s0); for ( ; n < N; n++) r += x[n] * y[n]; #else for (uint64_t n = 0; n < N; n++) r += x[n] * y[n]; #endif return r; }
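/* Usage sketch for xvm_dot: the SSE2 path uses _mm_load_pd and asserts that both inputs are
 * 16-byte aligned, so callers must hand it aligned buffers. Illustrative only, not from the
 * original source: */
static double xvm_dot_example(void)
{
    static double x[6] __attribute__((aligned(16))) = { 1, 2, 3, 4, 5, 6 };
    static double y[6] __attribute__((aligned(16))) = { 6, 5, 4, 3, 2, 1 };
    /* indices 0..3 go through the unrolled SSE2 loop, indices 4..5 through the scalar tail */
    return xvm_dot(x, y, 6); /* = 1*6 + 2*5 + 3*4 + 4*3 + 5*2 + 6*1 = 56 */
}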
inline double operator[](int i) const { __m128d buf0; if (i < 4) { if (i < 2) { buf0 = _mm512_extractf64x2_pd(val, 0); } else { buf0 = _mm512_extractf64x2_pd(val, 1); } } else { if (i < 6) { buf0 = _mm512_extractf64x2_pd(val, 2); } else { buf0 = _mm512_extractf64x2_pd(val, 3); } } i &= 1; if (i == 0) { return _mm_cvtsd_f64(buf0); } buf0 = _mm_shuffle_pd(buf0, buf0, 1); return _mm_cvtsd_f64(buf0); }
__m128d test_mm_shuffle_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_shuffle_pd
  // DAG: shufflevector <2 x double> %{{.*}}, <2 x double> %{{.*}}, <2 x i32> <i32 1, i32 2>
  //
  // ASM-LABEL: test_mm_shuffle_pd
  // ASM: shufpd $1,
  return _mm_shuffle_pd(A, B, 1);
}
int fft4a_(double *a, double *b, double *w, int *l) { int j, j0, j1, j2, j3, j4, j5, j6, j7; /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */ __m128d t0, t1, t2, t3, t4, w1, w2, w3; for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j2 + (*l << 1); j4 = j << 3; j5 = j4 + 2; j6 = j5 + 2; j7 = j6 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); /* x0 = a[j0] + a[j2]; y0 = a[j0 + 1] + a[j2 + 1]; x1 = a[j0] - a[j2]; y1 = a[j0 + 1] - a[j2 + 1]; x2 = a[j1] + a[j3]; y2 = a[j1 + 1] + a[j3 + 1]; x3 = a[j1 + 1] - a[j3 + 1]; y3 = a[j3] - a[j1]; */ t0 = _mm_load_pd(&a[j0]); t2 = _mm_load_pd(&a[j2]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[j1]); t4 = _mm_load_pd(&a[j3]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* b[j4] = x0 + x2; b[j4 + 1] = y0 + y2; b[j6] = wr2 * (x0 - x2) - wi2 * (y0 - y2); b[j6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2); b[j5] = wr1 * (x1 + x3) - wi1 * (y1 + y3); b[j5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3); b[j7] = wr3 * (x1 - x3) - wi3 * (y1 - y3); b[j7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */ _mm_store_pd(&b[j4], _mm_add_pd(t0, t2)); _mm_store_pd(&b[j6], ZMUL(w2, _mm_sub_pd(t0, t2))); _mm_store_pd(&b[j5], ZMUL(w1, _mm_add_pd(t1, t3))); _mm_store_pd(&b[j7], ZMUL(w3, _mm_sub_pd(t1, t3))); } return 0; }
__inline __m128d Length(__m128d vec1, __m128d vec2)
{
    __m128d result1 = _mm_mul_pd(vec1, vec1);               /* [x*x, y*y] */
    __m128d result2 = _mm_mul_sd(vec2, vec2);               /* low: z*z */
    __m128d result3 = _mm_shuffle_pd(result1, result1, 1);  /* [y*y, x*x] */
    __m128d result4 = _mm_add_sd(result1, result2);         /* low: x*x + z*z */
    __m128d result5 = _mm_add_sd(result4, result3);         /* low: x*x + y*y + z*z */
    __m128d result6 = _mm_sqrt_sd(vec1, result5);           /* low: sqrt(...), high: vec1 high */
    return result6;
}
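/* Usage sketch for Length above: it treats vec1 as [x y] and the low lane of vec2 as z, and
 * returns sqrt(x*x + y*y + z*z) in the low lane of the result. Illustrative only: */
double length3(double x, double y, double z)
{
    __m128d xy = _mm_set_pd(y, x);   /* low = x, high = y */
    __m128d zz = _mm_set_sd(z);      /* low = z */
    return _mm_cvtsd_f64(Length(xy, zz));
}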
double Point::Cross(const Point &point) const { #ifdef __SSE3__ __m128d b = _mm_shuffle_pd(point.v, point.v, 0x01); b *= v; b = _mm_hsub_pd(b, b); return reinterpret_cast<double &>(b); #else return x * point.y - y * point.x; #endif }
static __inline __m128d ZMUL(__m128d a, __m128d b)
{
    __m128d ar, ai;
    ar = _mm_movedup_pd(a);        /* ar = [a.r a.r] */
    ar = _mm_mul_pd(ar, b);        /* ar = [a.r*b.r a.r*b.i] */
    ai = _mm_unpackhi_pd(a, a);    /* ai = [a.i a.i] */
    b  = _mm_shuffle_pd(b, b, 1);  /* b  = [b.i b.r] */
    ai = _mm_mul_pd(ai, b);        /* ai = [a.i*b.i a.i*b.r] */
    return _mm_addsub_pd(ar, ai);  /* [a.r*b.r-a.i*b.i  a.r*b.i+a.i*b.r] */
}
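/* Quick scalar cross-check for ZMUL: packing a complex number as [real imag] in a __m128d,
 * ZMUL(a, b) returns the complex product a*b. Illustrative test, not part of the original
 * source: */
void zmul_check(void)
{
    double out[2] __attribute__((aligned(16)));
    __m128d a = _mm_set_pd(2.0, 1.0);   /* a = 1 + 2i */
    __m128d b = _mm_set_pd(4.0, 3.0);   /* b = 3 + 4i */
    _mm_store_pd(out, ZMUL(a, b));
    /* (1+2i)*(3+4i) = -5 + 10i, so out[0] == -5.0 and out[1] == 10.0 */
}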
int fft3a_(double *a, double *b, double *w, int *l) { /* static double c31 = .86602540378443865; static double c32 = .5; */ static __m128d c31, c32; int j, j0, j1, j2, j3, j4, j5; /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */ __m128d t0, t1, t2, t3, w1, w2; c31 = _mm_set1_pd(0.86602540378443865); c32 = _mm_set1_pd(0.5); for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j * 6; j4 = j3 + 2; j5 = j4 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); /* x0 = a[j1] + a[j2]; y0 = a[j1 + 1] + a[j2 + 1]; x1 = a[j0] - c32 * x0; y1 = a[j0 + 1] - c32 * y0; x2 = c31 * (a[j1 + 1] - a[j2 + 1]); y2 = c31 * (a[j2] - a[j1]); */ t1 = _mm_load_pd(&a[j1]); t2 = _mm_load_pd(&a[j2]); t0 = _mm_add_pd(t1, t2); t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0)); t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1)); t3 = _mm_load_pd(&a[j0]); t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0)); /* b[j3] = a[j0] + x0; b[j3 + 1] = a[j0 + 1] + y0; b[j4] = wr1 * (x1 + x2) - wi1 * (y1 + y2); b[j4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2); b[j5] = wr2 * (x1 - x2) - wi2 * (y1 - y2); b[j5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */ _mm_store_pd(&b[j3], _mm_add_pd(t3, t0)); _mm_store_pd(&b[j4], ZMUL(w1, _mm_add_pd(t1, t2))); _mm_store_pd(&b[j5], ZMUL(w2, _mm_sub_pd(t1, t2))); } return 0; }
value complex_mul(value vab, value vcd) { CAMLparam2(vab, vcd); CAMLlocal1(vz); vz = caml_alloc(Double_array_tag, 2); __m128d ab, cd, ac_bd, ba, bc_ad; ab = _mm_loadu_pd((double const*) vab); cd = _mm_loadu_pd((double const*) vcd); ac_bd = _mm_mul_pd(ab, cd); ba = _mm_shuffle_pd(ab, ab, 1); bc_ad = _mm_mul_pd(ba, cd); _mm_storeu_pd((double*) vz, _mm_addsub_pd(ac_bd, bc_ad)); CAMLreturn(vz); }
void AES_192_Key_Expansion (const unsigned char *userkey, unsigned char *key) { __m128i temp1, temp2, temp3, temp4; __m128i *Key_Schedule = (__m128i*)key; temp1 = _mm_loadu_si128((__m128i*)userkey); temp3 = _mm_loadu_si128((__m128i*)(userkey+16)); Key_Schedule[0]=temp1; Key_Schedule[1]=temp3; temp2=_mm_aeskeygenassist_si128 (temp3,0x1); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[1] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[1], (__m128d)temp1,0); Key_Schedule[2] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); temp2=_mm_aeskeygenassist_si128 (temp3,0x2); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[3]=temp1; Key_Schedule[4]=temp3; temp2=_mm_aeskeygenassist_si128 (temp3,0x4); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[4] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[4], (__m128d)temp1,0); Key_Schedule[5] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); temp2=_mm_aeskeygenassist_si128 (temp3,0x8); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[6]=temp1; Key_Schedule[7]=temp3; temp2=_mm_aeskeygenassist_si128 (temp3,0x10); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[7] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[7], (__m128d)temp1,0); Key_Schedule[8] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); temp2=_mm_aeskeygenassist_si128 (temp3,0x20); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[9]=temp1; Key_Schedule[10]=temp3; temp2=_mm_aeskeygenassist_si128 (temp3,0x40); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[10] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule[10], (__m128d)temp1,0); Key_Schedule[11] = (__m128i)_mm_shuffle_pd((__m128d)temp1,(__m128d)temp3,1); temp2=_mm_aeskeygenassist_si128 (temp3,0x80); KEY_192_ASSIST(&temp1, &temp2, &temp3); Key_Schedule[12]=temp1; }
static inline void inner_product_gdouble_cubic_1_sse2 (gdouble * o, const gdouble * a, const gdouble * b, gint len, const gdouble * icoeff, gint bstride) { gint i; __m128d f[2], sum[4], t; const gdouble *c[4] = { (gdouble *) ((gint8 *) b + 0 * bstride), (gdouble *) ((gint8 *) b + 1 * bstride), (gdouble *) ((gint8 *) b + 2 * bstride), (gdouble *) ((gint8 *) b + 3 * bstride) }; f[0] = _mm_loadu_pd (icoeff + 0); f[1] = _mm_loadu_pd (icoeff + 2); sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_pd (); for (i = 0; i < len; i += 2) { t = _mm_loadu_pd (a + i + 0); sum[0] = _mm_add_pd (sum[0], _mm_mul_pd (t, _mm_load_pd (c[0] + i))); sum[1] = _mm_add_pd (sum[1], _mm_mul_pd (t, _mm_load_pd (c[1] + i))); sum[2] = _mm_add_pd (sum[2], _mm_mul_pd (t, _mm_load_pd (c[2] + i))); sum[3] = _mm_add_pd (sum[3], _mm_mul_pd (t, _mm_load_pd (c[3] + i))); } sum[0] = _mm_mul_pd (sum[0], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (0, 0))); sum[1] = _mm_mul_pd (sum[1], _mm_shuffle_pd (f[0], f[0], _MM_SHUFFLE2 (1, 1))); sum[2] = _mm_mul_pd (sum[2], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (0, 0))); sum[3] = _mm_mul_pd (sum[3], _mm_shuffle_pd (f[1], f[1], _MM_SHUFFLE2 (1, 1))); sum[0] = _mm_add_pd (sum[0], sum[1]); sum[2] = _mm_add_pd (sum[2], sum[3]); sum[0] = _mm_add_pd (sum[0], sum[2]); sum[0] = _mm_add_sd (sum[0], _mm_unpackhi_pd (sum[0], sum[0])); _mm_store_sd (o, sum[0]); }
// from Intel's sample intrin_double_sample.c void multiply_SSE3(double xr, double xi, double yr, double yi, complex_num *z) { __m128d num1, num2, num3; // Duplicates lower vector element into upper vector element. // num1: [x.real, x.real] num1 = _mm_loaddup_pd(&xr); // Move y elements into a vector // num2: [y.img, y.real] num2 = _mm_set_pd(yi, yr); // Multiplies vector elements // num3: [(x.real*y.img), (x.real*y.real)] num3 = _mm_mul_pd(num2, num1); // num1: [x.img, x.img] num1 = _mm_loaddup_pd(&xi); // Swaps the vector elements // num2: [y.real, y.img] num2 = _mm_shuffle_pd(num2, num2, 1); // num2: [(x.img*y.real), (x.img*y.img)] num2 = _mm_mul_pd(num2, num1); // Adds upper vector element while subtracting lower vector element // num3: [((x.real *y.img)+(x.img*y.real)), // ((x.real*y.real)-(x.img*y.img))] num3 = _mm_addsub_pd(num3, num2); // Stores the elements of num3 into z _mm_storeu_pd((double *)z, num3); }
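/* Usage sketch for Intel's multiply_SSE3 sample above. complex_num is assumed to be a struct
 * of two doubles (real first, then imaginary), matching the trailing _mm_storeu_pd; this
 * example is illustrative, not part of the original sample: */
void multiply_SSE3_example(void)
{
    complex_num z;
    multiply_SSE3(1.0, 2.0, 3.0, 4.0, &z);
    /* (1 + 2i) * (3 + 4i) = -5 + 10i, so z holds real = -5.0, imag = 10.0 */
}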
#include <stdio.h>
#include <emmintrin.h>

int main(){
    __m128d a, b, c;
    double res[2] __attribute__((aligned(16)));
    a = _mm_set_pd(1, 2);                           /* a = [2, 1] (low, high) */
    b = _mm_set_pd(3, 4);                           /* b = [4, 3] */
    c = _mm_shuffle_pd(a, b, _MM_SHUFFLE2(0, 1));   /* low = a[1], high = b[0] */
    _mm_store_pd(res, c);
    printf("%f %f\n", res[0], res[1]);              /* prints "1.000000 4.000000" */
    return 0;
}
// only compute the necessary indices of su2_i = subgroup( U*staple^\dagger ) void only_subgroup( GLU_complex *s0 , GLU_complex *s1 , double *scale , const GLU_complex U[ NCNC ] , const GLU_complex staple[ NCNC ] , const size_t su2_index ) { const __m128d *u = (const __m128d*)U ; const __m128d *s = (const __m128d*)staple ; register __m128d sm0 ; register __m128d sm1 ; #if NC == 3 switch( su2_index%3 ) { // I don't like this // rotation 1 // | s0 s1 0 | // | -s1* s0* 0 | // | 0 0 1 | case 0 : sm0 = _mm_add_pd( // temp0 _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) , SSE2_MUL_CONJ( *( u + 2 ) , *( s + 2 ) ) ) ) , // temp3^* _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 4 ) ) , SSE2_MULCONJ( *( u + 5 ) , *( s + 5 ) ) ) ) ) ; sm1 = _mm_sub_pd( // temp1 _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 1 ) , *( s + 4 ) ) , SSE2_MUL_CONJ( *( u + 2 ) , *( s + 5 ) ) ) ) , // temp2^* _mm_add_pd( SSE2_MULCONJ( *( u + 3 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 4 ) , *( s + 1 ) ) , SSE2_MULCONJ( *( u + 5 ) , *( s + 2 ) ) ) ) ) ; break ; case 1 : // rotation 2 // | 1 0 0 | // | 0 s0 s1 | // | 0 -s1* s0* | sm0 = _mm_add_pd( // temp0 _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 4 ) ) , SSE2_MUL_CONJ( *( u + 5 ) , *( s + 5 ) ) ) ) , // temp3^* _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 7 ) ) , SSE2_MULCONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ; sm1 = _mm_sub_pd( // temp1 _mm_add_pd( SSE2_MUL_CONJ( *( u + 3 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 4 ) , *( s + 7 ) ) , SSE2_MUL_CONJ( *( u + 5 ) , *( s + 8 ) ) ) ) , // temp2^* _mm_add_pd( SSE2_MULCONJ( *( u + 6 ) , *( s + 3 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 7 ) , *( s + 4 ) ) , SSE2_MULCONJ( *( u + 8 ) , *( s + 5 ) ) ) ) ) ; break ; case 2 : // rotation 3 // | s0* 0 -s1 | // | 0 1 0 | // | s1 0 s0 | sm0 = _mm_add_pd( // temp3^* _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 1 ) ) , SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) ) ) , // temp0 _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 7 ) ) , SSE2_MUL_CONJ( *( u + 8 ) , *( s + 8 ) ) ) ) ) ; sm1 = _mm_sub_pd( // temp1 _mm_add_pd( SSE2_MUL_CONJ( *( u + 6 ) , *( s + 0 ) ) , _mm_add_pd( SSE2_MUL_CONJ( *( u + 7 ) , *( s + 1 ) ) , SSE2_MUL_CONJ( *( u + 8 ) , *( s + 2 ) ) ) ) , // temp2^* _mm_add_pd( SSE2_MULCONJ( *( u + 0 ) , *( s + 6 ) ) , _mm_add_pd( SSE2_MULCONJ( *( u + 1 ) , *( s + 7 ) ) , SSE2_MULCONJ( *( u + 2 ) , *( s + 8 ) ) ) ) ) ; break ; } #elif NC == 2 sm0 = _mm_add_pd( // temp0 _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 0 ) ) , SSE2_MUL_CONJ( *( u + 1 ) , *( s + 1 ) ) ) , // temp3^* _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 2 ) ) , SSE2_MULCONJ( *( u + 3 ) , *( s + 3 ) ) ) ) ; sm1 = _mm_sub_pd( // temp1 _mm_add_pd( SSE2_MUL_CONJ( *( u + 0 ) , *( s + 2 ) ) , SSE2_MUL_CONJ( *( u + 1 ) , *( s + 3 ) ) ) , // temp2^* _mm_add_pd( SSE2_MULCONJ( *( u + 2 ) , *( s + 0 ) ) , SSE2_MULCONJ( *( u + 3 ) , *( s + 1 ) ) ) ) ; #else // su(N) version const size_t row_a = Latt.su2_data[ su2_index ].idx_a / NC ; const size_t col_b = Latt.su2_data[ su2_index ].idx_b % NC ; // prefetch the staple & link indices const __m128d *S1 = ( s + NC * row_a ) , *S2 = ( s + NC * col_b ) ; const __m128d *U1 = ( u + NC * row_a ) , *U2 = 
( u + NC * col_b ) ; // initialise to zero & perform multiplication sm0 = _mm_setzero_pd() ; sm1 = _mm_setzero_pd() ; size_t i ; for( i = 0 ; i < NC ; i++ ) { sm0 = _mm_add_pd( sm0 , _mm_add_pd( SSE2_MUL_CONJ( *U1 , *S1 ) , SSE2_MULCONJ( *U2 , *S2 ) ) ) ; sm1 = _mm_add_pd( sm1 , _mm_sub_pd( SSE2_MUL_CONJ( *U1 , *S2 ) , SSE2_MULCONJ( *U2 , *S1 ) ) ) ; // increment our pointers S1++ , S2++ , U1++ , U2++ ; } #endif // puts the norm in both parts register __m128d z = SSE2_FMA( sm0 , sm0 , _mm_mul_pd( sm1 , sm1 ) ) ; z = _mm_add_pd( z , _mm_shuffle_pd( z , z , 1 ) ) ; z = _mm_sqrt_pd( z ) ; z = _mm_div_pd( _mm_set1_pd( 1.0 ) , z ) ; sm0 = _mm_mul_pd( sm0 , z ) ; sm1 = _mm_mul_pd( sm1 , z ) ; // poke back into *s0 and *s1 and *scale _mm_store_pd( (void*)s0 , sm0 ) ; _mm_store_pd( (void*)s1 , sm1 ) ; _mm_store_sd( (void*)scale , z ) ; return ; }
BOOST_FORCEINLINE __m128d shuffle(__m128d const lower, __m128d const upper) { return _mm_shuffle_pd(lower, upper, _MM_SHUFFLE2(upper_i0, lower_i0)); }
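/* The shuffle() helper above reads lower_i0 and upper_i0 from an enclosing template that is
 * not shown. A self-contained sketch of the same idea, with the indices as explicit template
 * parameters (an assumed shape, not the original Boost code): element 0 of the result comes
 * from lower[lower_i0] and element 1 from upper[upper_i0], because _MM_SHUFFLE2(hi, lo)
 * builds the immediate as (hi << 1) | lo. */
template <int lower_i0, int upper_i0>
inline __m128d shuffle2(__m128d lower, __m128d upper)
{
    return _mm_shuffle_pd(lower, upper, _MM_SHUFFLE2(upper_i0, lower_i0));
}
/* e.g. shuffle2<1, 0>(a, b) yields [ a[1], b[0] ]. */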
void transpose_4321_loop_3241_( double *unsorted, double *sorted, int *p_dim1, int *p_dim2, int *p_dim3, int *p_dim4, double *p_factor ) { int dim1,dim2,dim3,dim4; int dim1mod,dim2mod,dim3mod,dim4mod; unsigned int old_offset,new_offset; unsigned int j1,j2,j3,j4; double factor = *p_factor; double *pA, *pB; register __m128d x, y, z, w, t, t1,fac_vector; unsigned int N1,N2; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); dim1 = *p_dim1; dim2 = *p_dim2; dim3 = *p_dim3; dim4 = *p_dim4; N1 = dim2*dim3*dim4; N2 = dim2*dim3*dim4; dim1mod = (int) floor( (float)dim1 / (float) 4); dim2mod = (int) floor( (float)dim2 / (float) 4); dim3mod = (int) floor( (float)dim3 / (float) 4); dim4mod = (int) floor( (float)dim4 / (float) 4); /* pluto start (dim1,dim2,dim3,dim4) */ #pragma ivdep #pragma parallel #pragma loop count min(10) max(80) avg(40) #pragma unroll for( j3 = 0; j3<dim3; j3++) { #pragma loop count min(10) max(80) avg(40) #pragma unroll for( j2 = 0; j2<dim2; j2++) { #pragma loop count min(10) max(80) avg(40) #pragma unroll #pragma vector always for( j4 = 0; j4<dim4; j4+=2) { #pragma loop count min(10) max(80) avg(40) #pragma unroll #pragma vector always for( j1 = 0; j1<dim1; j1+=2) { //sorted[j1+dim1*(j2+dim2*(j3+dim3*j4))] = unsorted[j4+dim4*(j3+dim3*(j2+dim2*j1))] * factor; pA = unsorted + j4+dim4*(j3+dim3*(j2+dim2*j1)); pB = sorted + j1+dim1*(j2+dim2*(j3+dim3*j4)); x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } /* pluto end */ return; }
int fft8b_(double *a, double *b, double *w, int *m, int *l) { /* static double c81 = .70710678118654752; */ static __m128d c81; int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, j, j0; /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5, x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6, wi7, wr1, wr2, wr3, wr4, wr5, wr6, wr7; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7; c81 = _mm_set1_pd(0.70710678118654752); for (i = 0; i < *m; i++) { i0 = i << 1; i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = i4 + (*m * *l << 1); i6 = i5 + (*m * *l << 1); i7 = i6 + (*m * *l << 1); i8 = i << 1; i9 = i8 + (*m << 1); i10 = i9 + (*m << 1); i11 = i10 + (*m << 1); i12 = i11 + (*m << 1); i13 = i12 + (*m << 1); i14 = i13 + (*m << 1); i15 = i14 + (*m << 1); /* x0 = a[i0] + a[i4]; y0 = a[i0 + 1] + a[i4 + 1]; x1 = a[i0] - a[i4]; y1 = a[i0 + 1] - a[i4 + 1]; x2 = a[i2] + a[i6]; y2 = a[i2 + 1] + a[i6 + 1]; x3 = a[i2 + 1] - a[i6 + 1]; y3 = a[i6] - a[i2]; */ t0 = _mm_load_pd(&a[i0]); t2 = _mm_load_pd(&a[i4]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i6]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */ u0 = _mm_add_pd(t0, t2); u1 = _mm_sub_pd(t0, t2); /* x4 = a[i1] + a[i5]; y4 = a[i1 + 1] + a[i5 + 1]; x5 = a[i1] - a[i5]; y5 = a[i1 + 1] - a[i5 + 1]; x6 = a[i3] + a[i7]; y6 = a[i3 + 1] + a[i7 + 1]; x7 = a[i3] - a[i7]; y7 = a[i3 + 1] - a[i7 + 1]; */ t4 = _mm_load_pd(&a[i1]); t6 = _mm_load_pd(&a[i5]); t5 = _mm_sub_pd(t4, t6); t4 = _mm_add_pd(t4, t6); t7 = _mm_load_pd(&a[i3]); t8 = _mm_load_pd(&a[i7]); t6 = _mm_add_pd(t7, t8); t7 = _mm_sub_pd(t7, t8); /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */ u2 = _mm_add_pd(t4, t6); u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); /* b[i8] = u0 + u2; b[i8 + 1] = v0 + v2; b[i12] = u0 - u2; b[i12 + 1] = v0 - v2; b[i10] = u1 + u3; b[i10 + 1] = v1 + v3; b[i14] = u1 - u3; b[i14 + 1] = v1 - v3; */ _mm_store_pd(&b[i8], _mm_add_pd(u0, u2)); _mm_store_pd(&b[i12], _mm_sub_pd(u0, u2)); _mm_store_pd(&b[i10], _mm_add_pd(u1, u3)); _mm_store_pd(&b[i14], _mm_sub_pd(u1, u3)); /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7); u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7); u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7); u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */ u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7)); u0 = _mm_add_pd(t1, u1); u1 = _mm_sub_pd(t1, u1); u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); u2 = _mm_add_pd(t3, u3); u3 = _mm_sub_pd(t3, u3); /* b[i9] = u0 + u2; b[i9 + 1] = v0 + v2; b[i13] = u1 + u3; b[i13 + 1] = v1 + v3; b[i11] = u1 - u3; b[i11 + 1] = v1 - v3; b[i15] = u0 - u2; b[i15 + 1] = v0 - v2; */ _mm_store_pd(&b[i9], _mm_add_pd(u0, u2)); _mm_store_pd(&b[i13], _mm_add_pd(u1, u3)); _mm_store_pd(&b[i11], _mm_sub_pd(u1, u3)); _mm_store_pd(&b[i15], _mm_sub_pd(u0, u2)); } for (j = 1; j < *l; j++) { j0 = j << 1; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; wr5 = wr2 * wr3 - wi2 * wi3; wi5 = wr2 * wi3 + wi2 * wr3; wr6 = wr3 * wr3 - wi3 * wi3; wi6 = wr3 * wi3 
+ wr3 * wi3; wr7 = wr3 * wr4 - wi3 * wi4; wi7 = wr3 * wi4 + wi3 * wr4; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); w5 = ZMUL(w2, w3); w6 = ZMUL(w3, w3); w7 = ZMUL(w3, w4); for (i = 0; i < *m; i++) { i0 = (i << 1) + (j * *m << 1); i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = i4 + (*m * *l << 1); i6 = i5 + (*m * *l << 1); i7 = i6 + (*m * *l << 1); i8 = (i << 1) + (j * *m << 4); i9 = i8 + (*m << 1); i10 = i9 + (*m << 1); i11 = i10 + (*m << 1); i12 = i11 + (*m << 1); i13 = i12 + (*m << 1); i14 = i13 + (*m << 1); i15 = i14 + (*m << 1); /* x0 = a[i0] + a[i4]; y0 = a[i0 + 1] + a[i4 + 1]; x1 = a[i0] - a[i4]; y1 = a[i0 + 1] - a[i4 + 1]; x2 = a[i2] + a[i6]; y2 = a[i2 + 1] + a[i6 + 1]; x3 = a[i2 + 1] - a[i6 + 1]; y3 = a[i6] - a[i2]; */ t0 = _mm_load_pd(&a[i0]); t2 = _mm_load_pd(&a[i4]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i6]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */ u0 = _mm_add_pd(t0, t2); u1 = _mm_sub_pd(t0, t2); /* x4 = a[i1] + a[i5]; y4 = a[i1 + 1] + a[i5 + 1]; x5 = a[i1] - a[i5]; y5 = a[i1 + 1] - a[i5 + 1]; x6 = a[i3] + a[i7]; y6 = a[i3 + 1] + a[i7 + 1]; x7 = a[i3] - a[i7]; y7 = a[i3 + 1] - a[i7 + 1]; */ t4 = _mm_load_pd(&a[i1]); t6 = _mm_load_pd(&a[i5]); t5 = _mm_sub_pd(t4, t6); t4 = _mm_add_pd(t4, t6); t7 = _mm_load_pd(&a[i3]); t8 = _mm_load_pd(&a[i7]); t6 = _mm_add_pd(t7, t8); t7 = _mm_sub_pd(t7, t8); /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */ u2 = _mm_add_pd(t4, t6); u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); /* b[i8] = u0 + u2; b[i8 + 1] = v0 + v2; b[i12] = wr4 * (u0 - u2) - wi4 * (v0 - v2); b[i12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2); b[i10] = wr2 * (u1 + u3) - wi2 * (v1 + v3); b[i10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3); b[i14] = wr6 * (u1 - u3) - wi6 * (v1 - v3); b[i14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */ _mm_store_pd(&b[i8], _mm_add_pd(u0, u2)); _mm_store_pd(&b[i12], ZMUL(w4, _mm_sub_pd(u0, u2))); _mm_store_pd(&b[i10], ZMUL(w2, _mm_add_pd(u1, u3))); _mm_store_pd(&b[i14], ZMUL(w6, _mm_sub_pd(u1, u3))); /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7); u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7); u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7); u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */ u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7)); u0 = _mm_add_pd(t1, u1); u1 = _mm_sub_pd(t1, u1); u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); u2 = _mm_add_pd(t3, u3); u3 = _mm_sub_pd(t3, u3); /* b[i9] = wr1 * (u0 + u2) - wi1 * (v0 + v2); b[i9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2); b[i13] = wr5 * (u1 + u3) - wi5 * (v1 + v3); b[i13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3); b[i11] = wr3 * (u1 - u3) - wi3 * (v1 - v3); b[i11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3); b[i15] = wr7 * (u0 - u2) - wi7 * (v0 - v2); b[i15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */ _mm_store_pd(&b[i9], ZMUL(w1, _mm_add_pd(u0, u2))); _mm_store_pd(&b[i13], ZMUL(w5, _mm_add_pd(u1, u3))); _mm_store_pd(&b[i11], ZMUL(w3, _mm_sub_pd(u1, u3))); _mm_store_pd(&b[i15], ZMUL(w7, _mm_sub_pd(u0, u2))); } } return 0; }
void tce_sort_6_simd(double* unsorted,double* sorted, int a, int b, int c, int d, int e, int f, int i, int j, int k, int l, int m, int n, double factor) { int id[6],jd[6],ia,ib,j1,j2,j3,j4,j5,j6; int l1,l2,l3,l4,l5,l6; int ia1,ia2,ia3,ia4,ia5,ia6; int ib1,ib2,ib3,ib4,ib5,ib6; int rangea1,rangea2,rangea3,rangea4,rangea5,rangea6; int rangeb1,rangeb2,rangeb3,rangeb4,rangeb5,rangeb6; int range[6],order[6],order_r[6]; int jj1,jj2,jj3,jj4,jj5,jj6; int jj1_bound,jj2_bound,jj3_bound,jj4_bound,jj5_bound,jj6_bound; int N1,N2; double *pA, *pB; register __m128d x, y, z, w, p, q,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d; jd[4] = e; jd[5] = f; // prefer writes range[0] = b*c*d*e*f; range[1] = c*d*e*f; range[2] = d*e*f; range[3] = e*f; range[4] = f; range[5] = 1; l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l]; l5 = jd[m]; l6 = jd[n]; rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k]; rangea4 = range[l]; rangea5 = range[m]; rangea6 = range[n]; rangeb1 = l2*l3*l4*l5*l6; rangeb2 = l3*l4*l5*l6; rangeb3 = l4*l5*l6; rangeb4 = l5*l6; rangeb5 = l6; rangeb6 = 1; // here vectorization can rely on the compiler if (n == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6++) { ia = ia5 + j6*rangea6; ib = ib5 + j6*rangeb6; sorted[ib] = unsorted[ia] * factor; } } } } } } } if (m == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5 += tilesize) { for (j6 = 0; j6 < l6; j6 += tilesize) { jj5_bound = (j5 + tilesize > l5)? l5 :j5+tilesize; for (jj5 = j5; jj5 < jj5_bound; jj5 += 2) { ia5 = ia4 + jj5*rangea5; ib5 = ib4 + jj5*rangeb5; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia5 + jj6*rangea6; ib = ib5 + jj6*rangeb6; N1 = rangeb5; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (l == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4 += tilesize) { for (j5 = 0; j5 < l5; j5++) { ia5 = ia3 + j5*rangea5; ib5 = ib3 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj4_bound = (j4 + tilesize > l4)? l4 :j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia4 = ia5 + jj4*rangea4; ib4 = ib5 + jj4*rangeb4; jj6_bound = (j6 + tilesize > l6)? 
l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia4 + jj6*rangea6; ib = ib4 + jj6*rangeb6; N1 = rangeb4; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (k == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3 += tilesize) { for (j4 = 0; j4 < l4; j4++) { ia4 = ia2 + j4*rangea4; ib4 = ib2 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize; for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) { ia3 = ia5 + jj3*rangea3; ib3 = ib5 + jj3*rangeb3; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia3 + jj6*rangea6; ib = ib3 + jj6*rangeb6; N1 = rangeb3; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (j == 5) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2 += tilesize) { for (j3 = 0; j3 < l3; j3++) { ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize; for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) { ia2 = ia5 + jj2*rangea2; ib2 = ib5 + jj2*rangeb2; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia2 + jj6*rangea6; ib = ib2 + jj6*rangeb6; N1 = rangeb2; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } if (i == 5) { for (j1 = 0; j1 < l1; j1 += tilesize) { for (j2 = 0; j2 < l2; j2++) { ia2 = j2*rangea2; ib2 = j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia4 = ia3 + j4*rangea4; ib4 = ib3 + j4*rangeb4; for (j5 = 0; j5 < l5; j5++) { ia5 = ia4 + j5*rangea5; ib5 = ib4 + j5*rangeb5; for (j6 = 0; j6 < l6; j6 += tilesize) { jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize; for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) { ia1 = ia5 + jj1*rangea1; ib1 = ib5 + jj1*rangeb1; jj6_bound = (j6 + tilesize > l6)? l6:j6+tilesize; for (jj6 = j6; jj6 < jj6_bound; jj6 += 2) { ia = ia1 + jj6*rangea6; ib = ib1 + jj6*rangeb6; N1 = rangeb1; N2 = rangea6; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } } } }
void ffts_transpose(uint64_t *in, uint64_t *out, int w, int h) { #ifdef HAVE_NEON #if 0 neon_transpose4(in, out, w, h); #else neon_transpose8(in, out, w, h); #endif #elif HAVE_SSE2 uint64_t FFTS_ALIGN(64) tmp[TSIZE*TSIZE]; int tx, ty; /* int x; */ int y; int tw = w / TSIZE; int th = h / TSIZE; for (ty = 0; ty < th; ty++) { for (tx = 0; tx < tw; tx++) { uint64_t *ip0 = in + w*TSIZE*ty + tx * TSIZE; uint64_t *op0 = tmp; /* out + h*TSIZE*tx + ty*TSIZE; */ /* copy/transpose to tmp */ for (y = 0; y < TSIZE; y += 2) { /* for (x=0;x<TSIZE;x+=2) { op[x*TSIZE] = ip[x]; */ __m128d q0 = _mm_load_pd((double*)(ip0 + 0*w)); __m128d q1 = _mm_load_pd((double*)(ip0 + 1*w)); __m128d q2 = _mm_load_pd((double*)(ip0 + 2*w)); __m128d q3 = _mm_load_pd((double*)(ip0 + 3*w)); __m128d q4 = _mm_load_pd((double*)(ip0 + 4*w)); __m128d q5 = _mm_load_pd((double*)(ip0 + 5*w)); __m128d q6 = _mm_load_pd((double*)(ip0 + 6*w)); __m128d q7 = _mm_load_pd((double*)(ip0 + 7*w)); __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); __m128d t2 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(0, 0)); __m128d t3 = _mm_shuffle_pd(q2, q3, _MM_SHUFFLE2(1, 1)); __m128d t4 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(0, 0)); __m128d t5 = _mm_shuffle_pd(q4, q5, _MM_SHUFFLE2(1, 1)); __m128d t6 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(0, 0)); __m128d t7 = _mm_shuffle_pd(q6, q7, _MM_SHUFFLE2(1, 1)); ip0 += 2; /* _mm_store_pd((double *)(op0 + y*h + x), t0); _mm_store_pd((double *)(op0 + y*h + x + h), t1); */ _mm_store_pd((double*)(op0 + 0 ), t0); _mm_store_pd((double*)(op0 + 0 + TSIZE), t1); _mm_store_pd((double*)(op0 + 2 ), t2); _mm_store_pd((double*)(op0 + 2 + TSIZE), t3); _mm_store_pd((double*)(op0 + 4 ), t4); _mm_store_pd((double*)(op0 + 4 + TSIZE), t5); _mm_store_pd((double*)(op0 + 6 ), t6); _mm_store_pd((double*)(op0 + 6 + TSIZE), t7); /* } */ op0 += 2*TSIZE; } op0 = out + h*tx*TSIZE + ty*TSIZE; ip0 = tmp; for (y = 0; y < TSIZE; y += 1) { /* memcpy(op0, ip0, TSIZE * sizeof(*ip0)); */ __m128d q0 = _mm_load_pd((double*)(ip0 + 0)); __m128d q1 = _mm_load_pd((double*)(ip0 + 2)); __m128d q2 = _mm_load_pd((double*)(ip0 + 4)); __m128d q3 = _mm_load_pd((double*)(ip0 + 6)); _mm_store_pd((double*)(op0 + 0), q0); _mm_store_pd((double*)(op0 + 2), q1); _mm_store_pd((double*)(op0 + 4), q2); _mm_store_pd((double*)(op0 + 6), q3); op0 += h; ip0 += TSIZE; } } } /* size_t i,j; for(i=0;i<w;i+=2) { for(j=0;j<h;j+=2) { // out[i*h + j] = in[j*w + i]; __m128d q0 = _mm_load_pd((double *)(in + j*w + i)); __m128d q1 = _mm_load_pd((double *)(in + j*w + i + w)); __m128d t0 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(0, 0)); __m128d t1 = _mm_shuffle_pd(q0, q1, _MM_SHUFFLE2(1, 1)); _mm_store_pd((double *)(out + i*h + j), t0); _mm_store_pd((double *)(out + i*h + j + h), t1); } } */ #else const int bw = 1; const int bh = 8; int i = 0, j = 0; for (; i <= h - bh; i += bh) { for (j = 0; j <= w - bw; j += bw) { uint64_t const *ib = &in[w*i + j]; uint64_t *ob = &out[h*j + i]; uint64_t s_0_0 = ib[0*w + 0]; uint64_t s_1_0 = ib[1*w + 0]; uint64_t s_2_0 = ib[2*w + 0]; uint64_t s_3_0 = ib[3*w + 0]; uint64_t s_4_0 = ib[4*w + 0]; uint64_t s_5_0 = ib[5*w + 0]; uint64_t s_6_0 = ib[6*w + 0]; uint64_t s_7_0 = ib[7*w + 0]; ob[0*h + 0] = s_0_0; ob[0*h + 1] = s_1_0; ob[0*h + 2] = s_2_0; ob[0*h + 3] = s_3_0; ob[0*h + 4] = s_4_0; ob[0*h + 5] = s_5_0; ob[0*h + 6] = s_6_0; ob[0*h + 7] = s_7_0; } } if (i < h) { int i1; for (i1 = 0; i1 < w; i1++) { for (j = i; j < h; j++) { out[i1*h + j] = in[j*w + i1]; } } } if (j < w) { int j1; for (i = j; i < w; i++) { for 
(j1 = 0; j1 < h; j1++) { out[i*h + j1] = in[j1*w + i]; } } } #endif }
void tce_sort_4_simd(double* unsorted,double* sorted, int a, int b, int c, int d, int i, int j, int k, int l, double factor) { int id[4],jd[4],ia,ib,j1,j2,j3,j4; int l1,l2,l3,l4; int ia1,ia2,ia3,ia4; int ib1,ib2,ib3,ib4; int rangea1,rangea2,rangea3,rangea4; int rangeb1,rangeb2,rangeb3,rangeb4; int range[4],order[4],order_r[4]; int jj1,jj2,jj3,jj4; int jj1_bound,jj2_bound,jj3_bound,jj4_bound; int count,ir,jr,kr,lr,N1,N2; double *pA, *pB; register __m128d x, y, z, w, t, t1,fac_vector; fac_vector = _mm_load_sd(&factor); fac_vector = _mm_unpacklo_pd(fac_vector,fac_vector); jd[0] = a; jd[1] = b; jd[2] = c; jd[3] = d; // prefer writes range[0] = b*c*d; range[1] = c*d; range[2] = d; range[3] = 1; l1 = jd[i]; l2 = jd[j]; l3 = jd[k]; l4 = jd[l]; rangea1 = range[i]; rangea2 = range[j]; rangea3 = range[k]; rangea4 = range[l]; rangeb1 = l2*l3*l4; rangeb2 = l3*l4; rangeb3 = l4; rangeb4 = 1; // here vectorization can rely on the compiler if (l == 3) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4++) { ia = ia3 + j4*rangea4; ib = ib3 + j4*rangeb4; sorted[ib] = unsorted[ia] * factor; } } } } } if (k == 3) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2++) { ia2 = ia1 + j2*rangea2; ib2 = ib1 + j2*rangeb2; for (j3 = 0; j3 < l3; j3 += tilesize) { for (j4 = 0; j4 < l4; j4 += tilesize) { jj3_bound = (j3 + tilesize > l3)? l3 :j3+tilesize; for (jj3 = j3; jj3 < jj3_bound; jj3 += 2) { ia3 = ia2 + jj3*rangea3; ib3 = ib2 + jj3*rangeb3; jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia = ia3 + jj4*rangea4; ib = ib3 + jj4*rangeb4; N1 = rangeb3; N2 = rangea4; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } if (j == 3) { for (j1 = 0; j1 < l1; j1++) { ia1 = j1*rangea1; ib1 = j1*rangeb1; for (j2 = 0; j2 < l2; j2 += tilesize) { for (j3 = 0; j3 < l3; j3++) { ia3 = ia1 + j3*rangea3; ib3 = ib1 + j3*rangeb3; for (j4 = 0; j4 < l4; j4 += tilesize) { jj2_bound = (j2 + tilesize > l2)? l2 :j2+tilesize; for (jj2 = j2; jj2 < jj2_bound; jj2 += 2) { ia2 = ia3 + jj2*rangea2; ib2 = ib3 + jj2*rangeb2; jj4_bound = (j4 + tilesize > l4)? l4:j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia = ia2 + jj4*rangea4; ib = ib2 + jj4*rangeb4; N1 = rangeb2; N2 = rangea4; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } if (i == 3) { for (j1 = 0; j1 < l1; j1 += tilesize) { for (j2 = 0; j2 < l2; j2++) { ia2 = j2*rangea2; ib2 = j2*rangeb2; for (j3 = 0; j3 < l3; j3++) { ia3 = ia2 + j3*rangea3; ib3 = ib2 + j3*rangeb3; for (j4 = 0; j4 < l4; j4 += tilesize) { jj1_bound = (j1 + tilesize > l1)? l1 :j1+tilesize; for (jj1 = j1; jj1 < jj1_bound; jj1 += 2) { ia1 = ia3 + jj1*rangea1; ib1 = ib3 + jj1*rangeb1; jj4_bound = (j4 + tilesize > l4)? 
l4:j4+tilesize; for (jj4 = j4; jj4 < jj4_bound; jj4 += 2) { ia = ia1 + jj4*rangea4; ib = ib1 + jj4*rangeb4; N1 = rangeb1; N2 = rangea4; pA = unsorted+ia; pB = sorted+ib; x = _mm_loadu_pd(pA); x = _mm_mul_pd(x,fac_vector); y = _mm_loadu_pd(pA + N2); y = _mm_mul_pd(y,fac_vector); z = _mm_shuffle_pd( x, y, 0); w = _mm_shuffle_pd( x, y, 3); _mm_storeu_pd(pB,z); _mm_storeu_pd(pB + N1,w); } } } } } } } }
void AES_Key_Expansion_PARA_3( const unsigned char *userkey1, const unsigned char *userkey2, const unsigned char *userkey3, unsigned char *key1, unsigned char *key2, unsigned char *key3) { __m128i temp1_1, temp2_1; __m128i temp1_2, temp2_2, temp3_2, temp4_2; __m128i temp1_3, temp2_3; __m128i *Key_Schedule1 = (__m128i*)key1; __m128i *Key_Schedule2 = (__m128i*)key2; __m128i *Key_Schedule3 = (__m128i*)key3; temp1_1 = _mm_loadu_si128((__m128i*)userkey1); temp1_2 = _mm_loadu_si128((__m128i*)userkey2); temp3_2 = _mm_loadu_si128((__m128i*)(userkey2+16)); temp1_3 = _mm_loadu_si128((__m128i*)userkey3); Key_Schedule1[0] = temp1_1; Key_Schedule2[0] = temp1_2; Key_Schedule2[1] = temp3_2; Key_Schedule3[0] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1 ,0x1); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2 ,0x1); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3 ,0x1); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[1] = temp1_1; Key_Schedule2[1] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[1], (__m128d)temp1_2,0); Key_Schedule2[2] = (__m128i)_mm_shuffle_pd((__m128d)temp1_2, (__m128d)temp3_2,1); Key_Schedule3[1] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x2); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x2); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x2); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[2] = temp1_1; Key_Schedule2[3] = temp1_2; Key_Schedule2[4] = temp3_2; Key_Schedule3[2] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x4); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x4); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x4); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[3] = temp1_1; Key_Schedule2[4] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[4], (__m128d)temp1_2,0); Key_Schedule2[5] = (__m128i)_mm_shuffle_pd((__m128d)temp1_2,(__m128d)temp3_2,1); Key_Schedule3[3] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x8); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x8); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x8); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[4] = temp1_1; Key_Schedule2[6] = temp1_2; Key_Schedule2[7] = temp3_2; Key_Schedule3[4] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x10); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x10); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x10); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[5] = temp1_1; Key_Schedule2[7] = (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[7], (__m128d)temp1_2,0); Key_Schedule2[8] = 
(__m128i)_mm_shuffle_pd((__m128d)temp1_2,(__m128d)temp3_2,1); Key_Schedule3[5] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x20); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x20); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x20); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[6] = temp1_1; Key_Schedule2[9] = temp1_2; Key_Schedule2[10]= temp3_2; Key_Schedule3[6] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x40); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x40); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x40); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[7] = temp1_1; Key_Schedule2[10]= (__m128i)_mm_shuffle_pd((__m128d)Key_Schedule2[10], (__m128d)temp1_2,0); Key_Schedule2[11]= (__m128i)_mm_shuffle_pd((__m128d)temp1_2,(__m128d)temp3_2,1); Key_Schedule3[7] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x80); temp2_2 = _mm_aeskeygenassist_si128 (temp3_2,0x80); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x80); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); KEY_192_ASSIST(&temp1_2, &temp2_2, &temp3_2); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); /*KEY_ASSIST_PARA_3( temp1_1, temp2_1, &temp1_1, &temp1_2, &temp2_2, &temp3_2, temp1_3, temp2_3, &temp1_3);*/ Key_Schedule1[8] = temp1_1; Key_Schedule2[12]= temp1_2; Key_Schedule3[8] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x1b); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x1b); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); Key_Schedule1[9] = temp1_1; Key_Schedule3[9] = temp1_3; temp2_1 = _mm_aeskeygenassist_si128 (temp1_1,0x36); temp2_3 = _mm_aeskeygenassist_si128 (temp1_3,0x36); temp1_1 = AES_128_ASSIST(temp1_1, temp2_1); temp1_3 = AES_128_ASSIST(temp1_3, temp2_3); Key_Schedule1[10] = temp1_1; Key_Schedule3[10] = temp1_3; }
static inline __m128d my_invrsq_pd(__m128d x) { const __m128d three = (const __m128d) {3.0f, 3.0f}; const __m128d half = (const __m128d) {0.5f, 0.5f}; __m128 t = _mm_rsqrt_ps(_mm_cvtpd_ps(x)); /* Convert to single precision and do _mm_rsqrt_ps() */ __m128d t1 = _mm_cvtps_pd(t); /* Convert back to double precision */ /* First Newton-Rapson step, accuracy is now 24 bits */ __m128d t2 = _mm_mul_pd(half,_mm_mul_pd(t1,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t1,t1))))); /* Return second Newton-Rapson step, accuracy 48 bits */ return (__m128d) _mm_mul_pd(half,_mm_mul_pd(t2,_mm_sub_pd(three,_mm_mul_pd(x,_mm_mul_pd(t2,t2))))); } /* to extract single integers from a __m128i datatype */ #define _mm_extract_epi64(x, imm) \ _mm_cvtsi128_si32(_mm_srli_si128((x), 4 * (imm))) void nb_kernel400_x86_64_sse2(int * p_nri, int * iinr, int * jindex, int * jjnr, int * shift, double * shiftvec, double * fshift, int * gid, double * pos, double * faction, double * charge, double * p_facel, double * p_krf, double * p_crf, double * Vc, int * type, int * p_ntype, double * vdwparam, double * Vvdw, double * p_tabscale, double * VFtab, double * invsqrta, double * dvda, double * p_gbtabscale, double * GBtab, int * p_nthreads, int * count, void * mtx, int * outeriter, int * inneriter, double * work) { int nri,ntype,nthreads,offset; int n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid; double facel,krf,crf,tabscl,gbtabscl,vct,vgbt; double shX,shY,shZ,isai_d,dva; gmx_gbdata_t *gbdata; float * gpol; __m128d ix,iy,iz,jx,jy,jz; __m128d dx,dy,dz,t1,t2,t3; __m128d fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2; __m128d q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj; __m128d Y,F,G,H,Fp,VV,FF,vgb,fijC,dvdatmp,dvdasum,vctot,vgbtot,n0d; __m128d xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8; __m128d fac,tabscale,gbtabscale; __m128i n0,nnn; const __m128d neg = {-1.0f,-1.0f}; const __m128d zero = {0.0f,0.0f}; const __m128d half = {0.5f,0.5f}; const __m128d two = {2.0f,2.0f}; const __m128d three = {3.0f,3.0f}; gbdata = (gmx_gbdata_t *)work; gpol = gbdata->gpol; nri = *p_nri; ntype = *p_ntype; nthreads = *p_nthreads; facel = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent)); krf = *p_krf; crf = *p_crf; tabscl = *p_tabscale; gbtabscl = *p_gbtabscale; nj1 = 0; /* Splat variables */ fac = _mm_load1_pd(&facel); tabscale = _mm_load1_pd(&tabscl); gbtabscale = _mm_load1_pd(&gbtabscl); /* Keep compiler happy */ dvdatmp = _mm_setzero_pd(); vgb = _mm_setzero_pd(); dvdaj = _mm_setzero_pd(); isaj = _mm_setzero_pd(); vcoul = _mm_setzero_pd(); t1 = _mm_setzero_pd(); t2 = _mm_setzero_pd(); t3 = _mm_setzero_pd(); jnr1=jnr2=0; j13=j23=0; for(n=0;n<nri;n++) { is3 = 3*shift[n]; shX = shiftvec[is3]; shY = shiftvec[is3+1]; shZ = shiftvec[is3+2]; nj0 = jindex[n]; nj1 = jindex[n+1]; offset = (nj1-nj0)%2; ii = iinr[n]; ii3 = ii*3; ix = _mm_set1_pd(shX+pos[ii3+0]); iy = _mm_set1_pd(shX+pos[ii3+1]); iz = _mm_set1_pd(shX+pos[ii3+2]); q = _mm_set1_pd(charge[ii]); iq = _mm_mul_pd(fac,q); isai_d = invsqrta[ii]; isai = _mm_load1_pd(&isai_d); fix = _mm_setzero_pd(); fiy = _mm_setzero_pd(); fiz = _mm_setzero_pd(); dvdasum = _mm_setzero_pd(); vctot = _mm_setzero_pd(); vgbtot = _mm_setzero_pd(); for(k=nj0;k<nj1-offset; k+=2) { jnr1 = jjnr[k]; jnr2 = jjnr[k+1]; j13 = jnr1 * 3; j23 = jnr2 * 3; /* Load coordinates */ xmm1 = _mm_loadu_pd(pos+j13); /* x1 y1 */ xmm2 = _mm_loadu_pd(pos+j23); /* x2 y2 */ xmm5 = _mm_load_sd(pos+j13+2); /* z1 - */ xmm6 = _mm_load_sd(pos+j23+2); /* z2 - */ /* transpose */ jx = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); jy = 
            _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
            jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0));

            /* distances */
            dx      = _mm_sub_pd(ix,jx);
            dy      = _mm_sub_pd(iy,jy);
            dz      = _mm_sub_pd(iz,jz);

            rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
            rinv    = my_invrsq_pd(rsq11);

            /* Load invsqrta */
            isaj    = _mm_loadl_pd(isaj,invsqrta+jnr1);
            isaj    = _mm_loadh_pd(isaj,invsqrta+jnr2);
            isaprod = _mm_mul_pd(isai,isaj);

            /* Load charges */
            q       = _mm_loadl_pd(q,charge+jnr1);
            q       = _mm_loadh_pd(q,charge+jnr2);
            qq      = _mm_mul_pd(iq,q);

            vcoul   = _mm_mul_pd(qq,rinv);
            fscal   = _mm_mul_pd(vcoul,rinv);
            qq      = _mm_mul_pd(isaprod,qq);
            qq      = _mm_mul_pd(qq,neg);
            gbscale = _mm_mul_pd(isaprod,gbtabscale);

            /* Load dvdaj */
            dvdaj   = _mm_loadl_pd(dvdaj, dvda+jnr1);
            dvdaj   = _mm_loadh_pd(dvdaj, dvda+jnr2);

            /* Generalized Born table index */
            r       = _mm_mul_pd(rsq11,rinv);
            rt      = _mm_mul_pd(r,gbscale);
            n0      = _mm_cvttpd_epi32(rt);
            n0d     = _mm_cvtepi32_pd(n0);
            eps     = _mm_sub_pd(rt,n0d);
            eps2    = _mm_mul_pd(eps,eps);
            nnn     = _mm_slli_epi64(n0,2);

            xmm1    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
            xmm2    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
            xmm3    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
            xmm4    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */

            Y       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
            F       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
            G       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
            H       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */

            /* cubic spline: VV = Y + eps*(F + G*eps + H*eps^2), FF = dVV/deps */
            G       = _mm_mul_pd(G,eps);
            H       = _mm_mul_pd(H,eps2);
            Fp      = _mm_add_pd(F,G);
            Fp      = _mm_add_pd(Fp,H);
            VV      = _mm_mul_pd(Fp,eps);
            VV      = _mm_add_pd(Y,VV);
            H       = _mm_mul_pd(two,H);
            FF      = _mm_add_pd(Fp,G);
            FF      = _mm_add_pd(FF,H);

            vgb     = _mm_mul_pd(qq,VV);
            fijC    = _mm_mul_pd(qq,FF);
            fijC    = _mm_mul_pd(fijC,gbscale);

            dvdatmp = _mm_mul_pd(fijC,r);
            dvdatmp = _mm_add_pd(vgb,dvdatmp);
            dvdatmp = _mm_mul_pd(dvdatmp,neg);
            dvdatmp = _mm_mul_pd(dvdatmp,half);
            dvdasum = _mm_add_pd(dvdasum,dvdatmp);

            xmm1    = _mm_mul_pd(dvdatmp,isaj);
            xmm1    = _mm_mul_pd(xmm1,isaj);
            dvdaj   = _mm_add_pd(dvdaj,xmm1);

            /* store dvda */
            _mm_storel_pd(dvda+jnr1,dvdaj);
            _mm_storeh_pd(dvda+jnr2,dvdaj);

            vctot   = _mm_add_pd(vctot,vcoul);
            vgbtot  = _mm_add_pd(vgbtot,vgb);

            fscal   = _mm_sub_pd(fijC,fscal);
            fscal   = _mm_mul_pd(fscal,neg);
            fscal   = _mm_mul_pd(fscal,rinv);

            /* calculate partial force terms */
            t1      = _mm_mul_pd(fscal,dx);
            t2      = _mm_mul_pd(fscal,dy);
            t3      = _mm_mul_pd(fscal,dz);

            /* update the i force */
            fix     = _mm_add_pd(fix,t1);
            fiy     = _mm_add_pd(fiy,t2);
            fiz     = _mm_add_pd(fiz,t3);

            /* accumulate forces from memory */
            xmm1    = _mm_loadu_pd(faction+j13);   /* fx1 fy1 */
            xmm2    = _mm_loadu_pd(faction+j23);   /* fx2 fy2 */
            xmm5    = _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
            xmm6    = _mm_load1_pd(faction+j23+2); /* fz2 fz2 */

            /* transpose */
            xmm7    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
            xmm5    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
            xmm6    = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */

            /* subtract partial forces */
            xmm5    = _mm_sub_pd(xmm5,t1);
            xmm6    = _mm_sub_pd(xmm6,t2);
            xmm7    = _mm_sub_pd(xmm7,t3);

            xmm1    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
            xmm2    = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */

            /* store fx and fy */
            _mm_storeu_pd(faction+j13,xmm1);
            _mm_storeu_pd(faction+j23,xmm2);

            /* .. then fz: low lane is fz1, high lane is fz2 */
            _mm_storel_pd(faction+j13+2,xmm7);
            _mm_storeh_pd(faction+j23+2,xmm7);
        }

        /* In double precision, offset can only be either 0 or 1 */
        if(offset!=0) {
            jnr1    = jjnr[k];
            j13     = jnr1*3;

            jx      = _mm_load_sd(pos+j13);
            jy      = _mm_load_sd(pos+j13+1);
            jz      = _mm_load_sd(pos+j13+2);

            isaj    = _mm_load_sd(invsqrta+jnr1);
            isaprod = _mm_mul_sd(isai,isaj);
            dvdaj   = _mm_load_sd(dvda+jnr1);
            q       = _mm_load_sd(charge+jnr1);
            qq      = _mm_mul_sd(iq,q);

            dx      = _mm_sub_sd(ix,jx);
            dy      = _mm_sub_sd(iy,jy);
            dz      = _mm_sub_sd(iz,jz);

            rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
            rinv    = my_invrsq_pd(rsq11);

            vcoul   = _mm_mul_sd(qq,rinv);
            fscal   = _mm_mul_sd(vcoul,rinv);
            qq      = _mm_mul_sd(isaprod,qq);
            qq      = _mm_mul_sd(qq,neg);
            gbscale = _mm_mul_sd(isaprod,gbtabscale);

            r       = _mm_mul_sd(rsq11,rinv);
            rt      = _mm_mul_sd(r,gbscale);
            n0      = _mm_cvttpd_epi32(rt);
            n0d     = _mm_cvtepi32_pd(n0);
            eps     = _mm_sub_sd(rt,n0d);
            eps2    = _mm_mul_sd(eps,eps);
            nnn     = _mm_slli_epi64(n0,2);

            xmm1    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));
            xmm2    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));
            xmm3    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2);
            xmm4    = _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2);

            Y       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
            F       = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
            G       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0));
            H       = _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1));

            G       = _mm_mul_sd(G,eps);
            H       = _mm_mul_sd(H,eps2);
            Fp      = _mm_add_sd(F,G);
            Fp      = _mm_add_sd(Fp,H);
            VV      = _mm_mul_sd(Fp,eps);
            VV      = _mm_add_sd(Y,VV);
            H       = _mm_mul_sd(two,H);
            FF      = _mm_add_sd(Fp,G);
            FF      = _mm_add_sd(FF,H);

            vgb     = _mm_mul_sd(qq,VV);
            fijC    = _mm_mul_sd(qq,FF);
            fijC    = _mm_mul_sd(fijC,gbscale);

            dvdatmp = _mm_mul_sd(fijC,r);
            dvdatmp = _mm_add_sd(vgb,dvdatmp);
            dvdatmp = _mm_mul_sd(dvdatmp,neg);
            dvdatmp = _mm_mul_sd(dvdatmp,half);
            dvdasum = _mm_add_sd(dvdasum,dvdatmp);

            xmm1    = _mm_mul_sd(dvdatmp,isaj);
            xmm1    = _mm_mul_sd(xmm1,isaj);
            dvdaj   = _mm_add_sd(dvdaj,xmm1);

            /* store dvda */
            _mm_storel_pd(dvda+jnr1,dvdaj);

            vctot   = _mm_add_sd(vctot,vcoul);
            vgbtot  = _mm_add_sd(vgbtot,vgb);

            fscal   = _mm_sub_sd(fijC,fscal);
            fscal   = _mm_mul_sd(fscal,neg);
            fscal   = _mm_mul_sd(fscal,rinv);

            /* calculate partial force terms */
            t1      = _mm_mul_sd(fscal,dx);
            t2      = _mm_mul_sd(fscal,dy);
            t3      = _mm_mul_sd(fscal,dz);

            /* update the i force */
            fix     = _mm_add_sd(fix,t1);
            fiy     = _mm_add_sd(fiy,t2);
            fiz     = _mm_add_sd(fiz,t3);

            /* accumulate forces from memory */
            xmm5    = _mm_load_sd(faction+j13);   /* fx */
            xmm6    = _mm_load_sd(faction+j13+1); /* fy */
            xmm7    = _mm_load_sd(faction+j13+2); /* fz */

            /* subtract partial forces */
            xmm5    = _mm_sub_sd(xmm5,t1);
            xmm6    = _mm_sub_sd(xmm6,t2);
            xmm7    = _mm_sub_sd(xmm7,t3);

            /* store forces */
            _mm_store_sd(faction+j13,xmm5);
            _mm_store_sd(faction+j13+1,xmm6);
            _mm_store_sd(faction+j13+2,xmm7);
        }

        /* fix/fiy/fiz each hold two partial force sums that should all be
         * added to the i particle force */
        t1      = _mm_unpacklo_pd(t1,fix);
        t2      = _mm_unpacklo_pd(t2,fiy);
        t3      = _mm_unpacklo_pd(t3,fiz);

        fix     = _mm_add_pd(fix,t1);
        fiy     = _mm_add_pd(fiy,t2);
        fiz     = _mm_add_pd(fiz,t3);

        /* the complete sums now sit in the high lanes */
        fix     = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
        fiy     = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
        fiz     = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));

        /* Load i forces from memory */
        xmm1    = _mm_load_sd(faction+ii3);
        xmm2    = _mm_load_sd(faction+ii3+1);
        xmm3    = _mm_load_sd(faction+ii3+2);

        /* Add to i force */
        fix     = _mm_add_sd(fix,xmm1);
        fiy     = _mm_add_sd(fiy,xmm2);
        fiz     = _mm_add_sd(fiz,xmm3);

        /* store i forces to memory */
        _mm_store_sd(faction+ii3,fix);
        _mm_store_sd(faction+ii3+1,fiy);
        _mm_store_sd(faction+ii3+2,fiz);

        /* now do dvda */
        dvdatmp = _mm_unpacklo_pd(dvdatmp,dvdasum);
        dvdasum = _mm_add_pd(dvdasum,dvdatmp);
        _mm_storeh_pd(&dva,dvdasum);
        dvda[ii] = dvda[ii] + dva*isai_d*isai_d;

        ggid    = gid[n];

        /* Coulomb potential */
        vcoul   = _mm_unpacklo_pd(vcoul,vctot);
        vctot   = _mm_add_pd(vctot,vcoul);
        _mm_storeh_pd(&vct,vctot);
        Vc[ggid] = Vc[ggid] + vct;

        /* GB potential */
        vgb     = _mm_unpacklo_pd(vgb,vgbtot);
        vgbtot  = _mm_add_pd(vgbtot,vgb);
        _mm_storeh_pd(&vgbt,vgbtot);
        gpol[ggid] = gpol[ggid] + vgbt;
    }

    *outeriter = nri;
    *inneriter = nj1;
}
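Two idioms in the kernel tail above are worth calling out. The table section evaluates a cubic spline, VV = Y + eps*(F + G*eps + H*eps^2), together with its derivative FF = F + 2*G*eps + 3*H*eps^2, using only packed multiplies and adds. The potential and dvda accumulators are then each reduced with the same unpacklo/add/storeh pattern. The helper below is a minimal stand-alone sketch of that horizontal reduction; the name hsum_pd is illustrative and not part of the kernel.

#include <emmintrin.h>

/* Horizontal sum of a two-lane double accumulator, mirroring the
 * unpacklo/add/storeh sequence used for vctot, vgbtot and dvdasum above. */
static double hsum_pd(__m128d acc)
{
    __m128d t = _mm_unpacklo_pd(_mm_setzero_pd(), acc); /* [0, acc0]         */
    __m128d s = _mm_add_pd(acc, t);                     /* [acc0, acc1+acc0] */
    double hi;
    _mm_storeh_pd(&hi, s);                              /* extract acc0+acc1 */
    return hi;
}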
/* Squared magnitude of a complex value packed as (re,im) in one __m128d:
 * returns re*re + im*im replicated in both lanes. */
inline static __m128d absfac( const __m128d a )
{
    register const __m128d z2 = _mm_shuffle_pd( a , a , 1 ); /* swap the two halves */
    return _mm_add_pd( _mm_mul_pd( a , a ) , _mm_mul_pd( z2 , z2 ) );
}
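In other words, absfac treats the two lanes of its argument as the real and imaginary parts of one complex number and broadcasts |z|^2 = re^2 + im^2 to both lanes; the self-shuffle with immediate 1 simply swaps the halves. A small usage sketch, assuming the absfac definition above is in scope (the main/printf scaffolding is illustrative only):

#include <stdio.h>
#include <emmintrin.h>

int main(void)
{
    __m128d z = _mm_set_pd(4.0, 3.0);  /* z = 3 + 4i: low lane = re, high lane = im */
    double out[2];
    _mm_storeu_pd(out, absfac(z));
    printf("%g %g\n", out[0], out[1]); /* prints 25 25 */
    return 0;
}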
inline void Cryptor::expandKey192(const unsigned char *key, unsigned char *schedule) {
    __m128i *keySchedule = (__m128i*) schedule;

    // Save the first 128 bits of the key as the first schedule entry.
    __m128i tmp = _mm_loadu_si128((const __m128i*) key);
    if (!bigEndian) {
        reverse_m128i(tmp); // swap byte-order => big-endian.
    }
    keySchedule[0] = tmp;

    // The remaining 64 bits of the 192-bit key go into the low half of the
    // second entry; zero-pad to 128 bits before loading.
    unsigned char buf[16];
    memset(buf, 0, sizeof(buf));
    memcpy(buf, key + 16, 8);
    __m128i tmp3 = _mm_loadu_si128((const __m128i*) buf);
    if (!bigEndian) {
        reverse_m128i(tmp3); // swap byte-order => big-endian.
    }
    keySchedule[1] = tmp3;

    __m128i tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x1);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(keySchedule[1]), _mm_castsi128_pd(tmp), 0));
    keySchedule[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp), _mm_castsi128_pd(tmp3), 1));

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x2);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[3] = tmp;
    keySchedule[4] = tmp3;

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x4);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(keySchedule[4]), _mm_castsi128_pd(tmp), 0));
    keySchedule[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp), _mm_castsi128_pd(tmp3), 1));

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x8);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[6] = tmp;
    keySchedule[7] = tmp3;

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x10);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(keySchedule[7]), _mm_castsi128_pd(tmp), 0));
    keySchedule[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp), _mm_castsi128_pd(tmp3), 1));

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x20);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[9]  = tmp;
    keySchedule[10] = tmp3;

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x40);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(keySchedule[10]), _mm_castsi128_pd(tmp), 0));
    keySchedule[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(tmp), _mm_castsi128_pd(tmp3), 1));

    tmp2 = _mm_aeskeygenassist_si128(tmp3, 0x80);
    assistKey192(&tmp, &tmp2, &tmp3);
    keySchedule[12] = tmp;
    keySchedule[13] = tmp3;
}
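expandKey192 relies on two helpers that are not part of this listing, reverse_m128i and assistKey192. The assist step is usually the standard AES-192 key-expansion folding from Intel's AES-NI white paper; the sketch below shows that conventional form under the assumption that it matches the pointer-based signature used above (an assumption, not the project's actual code).

#include <emmintrin.h>

/* Conventional AES-192 key-expansion assist: tmp2 carries the keygen-assist
 * result, tmp the low 128 bits of the rolling key state, tmp3 the next 64 bits.
 * Assumed shape of the assistKey192 helper called above. */
static inline void assistKey192(__m128i* tmp, __m128i* tmp2, __m128i* tmp3)
{
    __m128i t;
    *tmp2 = _mm_shuffle_epi32(*tmp2, 0x55);  /* broadcast word 1 */
    t     = _mm_slli_si128(*tmp, 0x4);
    *tmp  = _mm_xor_si128(*tmp, t);
    t     = _mm_slli_si128(t, 0x4);
    *tmp  = _mm_xor_si128(*tmp, t);
    t     = _mm_slli_si128(t, 0x4);
    *tmp  = _mm_xor_si128(*tmp, t);
    *tmp  = _mm_xor_si128(*tmp, *tmp2);      /* new low 128 bits */
    *tmp2 = _mm_shuffle_epi32(*tmp, 0xff);   /* broadcast word 3 */
    t     = _mm_slli_si128(*tmp3, 0x4);
    *tmp3 = _mm_xor_si128(*tmp3, t);
    *tmp3 = _mm_xor_si128(*tmp3, *tmp2);     /* new high 64 bits */
}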
/* Encryption key setup */
static void aes_key_setup_enc(__m128i rk[], const u8* cipherKey, int keylen)
{
    switch (keylen) {
    case 16: {
        /* 128 bit key setup */
        rk[0]  = _mm_loadu_si128((const __m128i*) cipherKey);
        rk[1]  = KEYEXP128(rk[0], 0x01);
        rk[2]  = KEYEXP128(rk[1], 0x02);
        rk[3]  = KEYEXP128(rk[2], 0x04);
        rk[4]  = KEYEXP128(rk[3], 0x08);
        rk[5]  = KEYEXP128(rk[4], 0x10);
        rk[6]  = KEYEXP128(rk[5], 0x20);
        rk[7]  = KEYEXP128(rk[6], 0x40);
        rk[8]  = KEYEXP128(rk[7], 0x80);
        rk[9]  = KEYEXP128(rk[8], 0x1B);
        rk[10] = KEYEXP128(rk[9], 0x36);
        break;
    }
    case 24: {
        /* 192 bit key setup */
        __m128i temp[2];
        rk[0] = _mm_loadu_si128((const __m128i*) cipherKey);
        rk[1] = _mm_loadu_si128((const __m128i*) (cipherKey+16));

        temp[0] = KEYEXP192(rk[0], rk[1], 0x01);
        temp[1] = KEYEXP192_2(temp[0], rk[1]);
        rk[1] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(rk[1]), _mm_castsi128_pd(temp[0]), 0));
        rk[2] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp[0]), _mm_castsi128_pd(temp[1]), 1));
        rk[3] = KEYEXP192(temp[0], temp[1], 0x02);
        rk[4] = KEYEXP192_2(rk[3], temp[1]);

        temp[0] = KEYEXP192(rk[3], rk[4], 0x04);
        temp[1] = KEYEXP192_2(temp[0], rk[4]);
        rk[4] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(rk[4]), _mm_castsi128_pd(temp[0]), 0));
        rk[5] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp[0]), _mm_castsi128_pd(temp[1]), 1));
        rk[6] = KEYEXP192(temp[0], temp[1], 0x08);
        rk[7] = KEYEXP192_2(rk[6], temp[1]);

        temp[0] = KEYEXP192(rk[6], rk[7], 0x10);
        temp[1] = KEYEXP192_2(temp[0], rk[7]);
        rk[7] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(rk[7]), _mm_castsi128_pd(temp[0]), 0));
        rk[8] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp[0]), _mm_castsi128_pd(temp[1]), 1));
        rk[9] = KEYEXP192(temp[0], temp[1], 0x20);
        rk[10] = KEYEXP192_2(rk[9], temp[1]);

        temp[0] = KEYEXP192(rk[9], rk[10], 0x40);
        temp[1] = KEYEXP192_2(temp[0], rk[10]);
        rk[10] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(rk[10]), _mm_castsi128_pd(temp[0]), 0));
        rk[11] = _mm_castpd_si128(_mm_shuffle_pd(_mm_castsi128_pd(temp[0]), _mm_castsi128_pd(temp[1]), 1));
        rk[12] = KEYEXP192(temp[0], temp[1], 0x80);
        break;
    }
    case 32: {
        /* 256 bit key setup */
        rk[0]  = _mm_loadu_si128((const __m128i*) cipherKey);
        rk[1]  = _mm_loadu_si128((const __m128i*) (cipherKey+16));
        rk[2]  = KEYEXP256(rk[0], rk[1], 0x01);
        rk[3]  = KEYEXP256_2(rk[1], rk[2]);
        rk[4]  = KEYEXP256(rk[2], rk[3], 0x02);
        rk[5]  = KEYEXP256_2(rk[3], rk[4]);
        rk[6]  = KEYEXP256(rk[4], rk[5], 0x04);
        rk[7]  = KEYEXP256_2(rk[5], rk[6]);
        rk[8]  = KEYEXP256(rk[6], rk[7], 0x08);
        rk[9]  = KEYEXP256_2(rk[7], rk[8]);
        rk[10] = KEYEXP256(rk[8], rk[9], 0x10);
        rk[11] = KEYEXP256_2(rk[9], rk[10]);
        rk[12] = KEYEXP256(rk[10], rk[11], 0x20);
        rk[13] = KEYEXP256_2(rk[11], rk[12]);
        rk[14] = KEYEXP256(rk[12], rk[13], 0x40);
        break;
    }
    }
}
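The KEYEXP128 / KEYEXP192* / KEYEXP256* macros are defined elsewhere in the source. For the 128-bit schedule they are conventionally a thin wrapper around _mm_aeskeygenassist_si128 plus the usual shift-and-xor folding of the previous round key; a plausible sketch of KEYEXP128 under that assumption follows (the helper name aes128_key_exp_step is mine, not from this source).

#include <emmintrin.h>
#include <wmmintrin.h>  /* _mm_aeskeygenassist_si128 (AES-NI) */

/* Fold the previous round key and combine it with the keygen-assist result. */
static inline __m128i aes128_key_exp_step(__m128i key, __m128i assist)
{
    assist = _mm_shuffle_epi32(assist, _MM_SHUFFLE(3, 3, 3, 3)); /* broadcast word 3 */
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    key = _mm_xor_si128(key, _mm_slli_si128(key, 4));
    return _mm_xor_si128(key, assist);
}

/* The round constant must stay a compile-time constant, hence a macro. */
#define KEYEXP128(K, RCON) \
    aes128_key_exp_step((K), _mm_aeskeygenassist_si128((K), (RCON)))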