__m128d test_mm_xor_pd(__m128d A, __m128d B) {
  // DAG-LABEL: test_mm_xor_pd
  // DAG: xor <4 x i32> %{{.*}}, %{{.*}}
  //
  // ASM-LABEL: test_mm_xor_pd
  // ASM: pxor
  return _mm_xor_pd(A, B);
}
int fft4a_(double *a, double *b, double *w, int *l) {
    int j, j0, j1, j2, j3, j4, j5, j6, j7;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */
    __m128d t0, t1, t2, t3, t4, w1, w2, w3;

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
        j1 = j0 + (*l << 1);
        j2 = j1 + (*l << 1);
        j3 = j2 + (*l << 1);
        j4 = j << 3;
        j5 = j4 + 2;
        j6 = j5 + 2;
        j7 = j6 + 2;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        /* x0 = a[j0] + a[j2]; y0 = a[j0 + 1] + a[j2 + 1];
           x1 = a[j0] - a[j2]; y1 = a[j0 + 1] - a[j2 + 1];
           x2 = a[j1] + a[j3]; y2 = a[j1 + 1] + a[j3 + 1];
           x3 = a[j1 + 1] - a[j3 + 1]; y3 = a[j3] - a[j1]; */
        t0 = _mm_load_pd(&a[j0]);
        t2 = _mm_load_pd(&a[j2]);
        t1 = _mm_sub_pd(t0, t2);
        t0 = _mm_add_pd(t0, t2);
        t3 = _mm_load_pd(&a[j1]);
        t4 = _mm_load_pd(&a[j3]);
        t2 = _mm_add_pd(t3, t4);
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
        t3 = _mm_shuffle_pd(t3, t3, 1);
        /* b[j4] = x0 + x2; b[j4 + 1] = y0 + y2;
           b[j6] = wr2 * (x0 - x2) - wi2 * (y0 - y2); b[j6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2);
           b[j5] = wr1 * (x1 + x3) - wi1 * (y1 + y3); b[j5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3);
           b[j7] = wr3 * (x1 - x3) - wi3 * (y1 - y3); b[j7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */
        _mm_store_pd(&b[j4], _mm_add_pd(t0, t2));
        _mm_store_pd(&b[j6], ZMUL(w2, _mm_sub_pd(t0, t2)));
        _mm_store_pd(&b[j5], ZMUL(w1, _mm_add_pd(t1, t3)));
        _mm_store_pd(&b[j7], ZMUL(w3, _mm_sub_pd(t1, t3)));
    }
    return 0;
}
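/*
 * NOTE: The FFT kernels in this collection all call a ZMUL() macro that is
 * not defined in any of the excerpts. Judging from the scalar recurrences in
 * the comments (wr2 = wr1*wr1 - wi1*wi1, wi2 = wr1*wi1 + wi1*wr1), it
 * multiplies two complex numbers held as (real, imag) in the two lanes of an
 * __m128d. The sketch below is a plausible SSE2 definition under that
 * assumption; the name zmul_sketch is hypothetical, not from the original
 * sources.
 */
#include <emmintrin.h>

static inline __m128d zmul_sketch(__m128d ab, __m128d cd) {
    __m128d aa = _mm_unpacklo_pd(ab, ab);     /* (a, a)                */
    __m128d bb = _mm_unpackhi_pd(ab, ab);     /* (b, b)                */
    __m128d dc = _mm_shuffle_pd(cd, cd, 1);   /* (d, c)                */
    __m128d t  = _mm_mul_pd(bb, dc);          /* (b*d, b*c)            */
    t = _mm_xor_pd(t, _mm_set_sd(-0.0));      /* negate low lane only  */
    return _mm_add_pd(_mm_mul_pd(aa, cd), t); /* (a*c - b*d, a*d + b*c) */
}
/* The low-lane negation reuses the same _mm_xor_pd(..., _mm_set_sd(-0.0))
 * idiom that appears throughout the kernels above and below. */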
__SIMDd _SIMD_neg_pd(__SIMDd a) {
#ifdef USE_SSE
    return _mm_xor_pd(a, _mm_set1_pd(-0.0));
#elif defined USE_AVX
    return _mm256_xor_pd(a, _mm256_set1_pd(-0.0));
#elif defined USE_IBM
    return vec_neg(a);
#endif
}
__SIMDd _SIMD_xor_pd(__SIMDd a, __SIMDd b) {
#ifdef USE_SSE
    return _mm_xor_pd(a, b);
#elif defined USE_AVX
    return _mm256_xor_pd(a, b);
#elif defined USE_IBM
    return vec_xor(a, b);
#endif
}
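/*
 * NOTE: Both wrappers above lean on IEEE-754 bit layout: XORing a lane with
 * -0.0 (only the sign bit set) negates it, and _mm_xor_pd is a plain bitwise
 * XOR of the two 64-bit lane patterns. A minimal self-contained check of the
 * negation idiom (values chosen arbitrarily for illustration):
 */
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
    __m128d v = _mm_set_pd(2.5, -1.0);            /* lanes: (-1.0, 2.5) */
    __m128d n = _mm_xor_pd(v, _mm_set1_pd(-0.0)); /* flip both sign bits */
    double out[2];
    _mm_storeu_pd(out, n);
    printf("%f %f\n", out[0], out[1]);            /* prints: 1.000000 -2.500000 */
    return 0;
}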
int fft3a_(double *a, double *b, double *w, int *l) {
    /* static double c31 = .86602540378443865;
       static double c32 = .5; */
    static __m128d c31, c32;
    int j, j0, j1, j2, j3, j4, j5;
    /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */
    __m128d t0, t1, t2, t3, w1, w2;

    c31 = _mm_set1_pd(0.86602540378443865);
    c32 = _mm_set1_pd(0.5);

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
        j1 = j0 + (*l << 1);
        j2 = j1 + (*l << 1);
        j3 = j * 6;
        j4 = j3 + 2;
        j5 = j4 + 2;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        /* x0 = a[j1] + a[j2]; y0 = a[j1 + 1] + a[j2 + 1];
           x1 = a[j0] - c32 * x0; y1 = a[j0 + 1] - c32 * y0;
           x2 = c31 * (a[j1 + 1] - a[j2 + 1]); y2 = c31 * (a[j2] - a[j1]); */
        t1 = _mm_load_pd(&a[j1]);
        t2 = _mm_load_pd(&a[j2]);
        t0 = _mm_add_pd(t1, t2);
        t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0));
        t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1));
        t3 = _mm_load_pd(&a[j0]);
        t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0));
        /* b[j3] = a[j0] + x0; b[j3 + 1] = a[j0 + 1] + y0;
           b[j4] = wr1 * (x1 + x2) - wi1 * (y1 + y2); b[j4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2);
           b[j5] = wr2 * (x1 - x2) - wi2 * (y1 - y2); b[j5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */
        _mm_store_pd(&b[j3], _mm_add_pd(t3, t0));
        _mm_store_pd(&b[j4], ZMUL(w1, _mm_add_pd(t1, t2)));
        _mm_store_pd(&b[j5], ZMUL(w2, _mm_sub_pd(t1, t2)));
    }
    return 0;
}
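/*
 * NOTE (assumption): the two constants in fft3a_ are the radix-3 butterfly
 * values c31 = sin(2*pi/3) = sqrt(3)/2 and c32 = -cos(2*pi/3) = 1/2. A quick
 * sanity check, assuming POSIX M_PI is available:
 */
#include <math.h>
#include <assert.h>

int main(void) {
    assert(fabs(sin(2.0 * M_PI / 3.0) - 0.86602540378443865) < 1e-15);
    assert(fabs(-cos(2.0 * M_PI / 3.0) - 0.5) < 1e-15);
    return 0;
}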
static SIMD_INLINE __m128d sin_vml_pd(__m128d x) {
  SIMD_CONST_SD(1_PI , 0.318309886183790671538);
  SIMD_CONST_SD(PI4_A, 0.78539816290140151978 * -4.0);
  SIMD_CONST_SD(PI4_B, 4.9604678871439933374e-10 * -4.0);
  SIMD_CONST_SD(PI4_C, 1.1258708853173288931e-18 * -4.0);
  SIMD_CONST_SD(PI4_D, 1.7607799325916000908e-27 * -4.0);

  SIMD_CONST_SD(sin_0,-7.97255955009037868891952e-18);
  SIMD_CONST_SD(sin_1, 2.81009972710863200091251e-15);
  SIMD_CONST_SD(sin_2,-7.64712219118158833288484e-13);
  SIMD_CONST_SD(sin_3, 1.60590430605664501629054e-10);
  SIMD_CONST_SD(sin_4,-2.50521083763502045810755e-08);
  SIMD_CONST_SD(sin_5, 2.75573192239198747630416e-06);
  SIMD_CONST_SD(sin_6,-1.98412698412696162806809e-04);
  SIMD_CONST_SD(sin_7, 8.33333333333332974823815e-03);
  SIMD_CONST_SD(sin_8,-1.66666666666666657414808e-01);
  SIMD_CONST_SD(magic, 6755399441055744.0);

  __m128d y = _mm_mul_pd(x, SIMD_GET_PD(1_PI));
  __m128d q = _mm_add_pd(y, SIMD_GET_PD(magic));
  __m128d i = _mm_castsi128_pd(_mm_slli_epi64(_mm_castpd_si128(q), 63));
  q = _mm_sub_pd(q, SIMD_GET_PD(magic));

  x = Simd128::mad(q, SIMD_GET_PD(PI4_A), x);
  x = Simd128::mad(q, SIMD_GET_PD(PI4_B), x);
  x = Simd128::mad(q, SIMD_GET_PD(PI4_C), x);
  x = Simd128::mad(q, SIMD_GET_PD(PI4_D), x);

  __m128d xx = _mm_mul_pd(x, x);
  x = _mm_xor_pd(x, i);

  __m128d u = SIMD_GET_PD(sin_0);
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_1));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_2));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_3));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_4));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_5));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_6));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_7));
  u = Simd128::mad(u, xx, SIMD_GET_PD(sin_8));

  u = Simd128::mad(xx, _mm_mul_pd(u, x), x);
  return u;
}
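/*
 * NOTE: `magic` above is 1.5 * 2^52. Adding it to a double of modest
 * magnitude forces rounding to the nearest integer, whose value lands in the
 * low mantissa bits; the quotient's parity bit can then be shifted into the
 * sign position (_mm_slli_epi64(..., 63)) to flip the sign of the result for
 * odd half-turns. A scalar illustration of the rounding half of the trick,
 * assuming strict IEEE arithmetic (no -ffast-math):
 */
#include <stdio.h>

int main(void) {
    const double magic = 6755399441055744.0; /* 1.5 * 2^52 */
    double y = 3.7;
    double q = (y + magic) - magic;          /* round-to-nearest */
    printf("%g -> %g\n", y, q);              /* prints: 3.7 -> 4 */
    return 0;
}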
BI_FORCE_INLINE inline const sse_double operator-(const sse_double& o) {
    sse_double res;
    res.packed = _mm_xor_pd(_mm_set1_pd(-0.0), o.packed);
    return res;
}
int fft8b_(double *a, double *b, double *w, int *m, int *l) {
    /* static double c81 = .70710678118654752; */
    static __m128d c81;
    int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, j, j0;
    /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5,
       x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6, wi7,
       wr1, wr2, wr3, wr4, wr5, wr6, wr7; */
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7;

    c81 = _mm_set1_pd(0.70710678118654752);

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
        i1 = i0 + (*m * *l << 1);
        i2 = i1 + (*m * *l << 1);
        i3 = i2 + (*m * *l << 1);
        i4 = i3 + (*m * *l << 1);
        i5 = i4 + (*m * *l << 1);
        i6 = i5 + (*m * *l << 1);
        i7 = i6 + (*m * *l << 1);
        i8 = i << 1;
        i9 = i8 + (*m << 1);
        i10 = i9 + (*m << 1);
        i11 = i10 + (*m << 1);
        i12 = i11 + (*m << 1);
        i13 = i12 + (*m << 1);
        i14 = i13 + (*m << 1);
        i15 = i14 + (*m << 1);
        /* x0 = a[i0] + a[i4]; y0 = a[i0 + 1] + a[i4 + 1];
           x1 = a[i0] - a[i4]; y1 = a[i0 + 1] - a[i4 + 1];
           x2 = a[i2] + a[i6]; y2 = a[i2 + 1] + a[i6 + 1];
           x3 = a[i2 + 1] - a[i6 + 1]; y3 = a[i6] - a[i2]; */
        t0 = _mm_load_pd(&a[i0]);
        t2 = _mm_load_pd(&a[i4]);
        t1 = _mm_sub_pd(t0, t2);
        t0 = _mm_add_pd(t0, t2);
        t3 = _mm_load_pd(&a[i2]);
        t4 = _mm_load_pd(&a[i6]);
        t2 = _mm_add_pd(t3, t4);
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
        t3 = _mm_shuffle_pd(t3, t3, 1);
        /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */
        u0 = _mm_add_pd(t0, t2);
        u1 = _mm_sub_pd(t0, t2);
        /* x4 = a[i1] + a[i5]; y4 = a[i1 + 1] + a[i5 + 1];
           x5 = a[i1] - a[i5]; y5 = a[i1 + 1] - a[i5 + 1];
           x6 = a[i3] + a[i7]; y6 = a[i3 + 1] + a[i7 + 1];
           x7 = a[i3] - a[i7]; y7 = a[i3 + 1] - a[i7 + 1]; */
        t4 = _mm_load_pd(&a[i1]);
        t6 = _mm_load_pd(&a[i5]);
        t5 = _mm_sub_pd(t4, t6);
        t4 = _mm_add_pd(t4, t6);
        t7 = _mm_load_pd(&a[i3]);
        t8 = _mm_load_pd(&a[i7]);
        t6 = _mm_add_pd(t7, t8);
        t7 = _mm_sub_pd(t7, t8);
        /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */
        u2 = _mm_add_pd(t4, t6);
        u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0));
        u3 = _mm_shuffle_pd(u3, u3, 1);
        /* b[i8] = u0 + u2; b[i8 + 1] = v0 + v2;
           b[i12] = u0 - u2; b[i12 + 1] = v0 - v2;
           b[i10] = u1 + u3; b[i10 + 1] = v1 + v3;
           b[i14] = u1 - u3; b[i14 + 1] = v1 - v3; */
        _mm_store_pd(&b[i8], _mm_add_pd(u0, u2));
        _mm_store_pd(&b[i12], _mm_sub_pd(u0, u2));
        _mm_store_pd(&b[i10], _mm_add_pd(u1, u3));
        _mm_store_pd(&b[i14], _mm_sub_pd(u1, u3));
        /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7);
           u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7);
           u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7);
           u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */
        u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
        u0 = _mm_add_pd(t1, u1);
        u1 = _mm_sub_pd(t1, u1);
        u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0));
        u3 = _mm_shuffle_pd(u3, u3, 1);
        u2 = _mm_add_pd(t3, u3);
        u3 = _mm_sub_pd(t3, u3);
        /* b[i9] = u0 + u2; b[i9 + 1] = v0 + v2;
           b[i13] = u1 + u3; b[i13 + 1] = v1 + v3;
           b[i11] = u1 - u3; b[i11 + 1] = v1 - v3;
           b[i15] = u0 - u2; b[i15 + 1] = v0 - v2; */
        _mm_store_pd(&b[i9], _mm_add_pd(u0, u2));
        _mm_store_pd(&b[i13], _mm_add_pd(u1, u3));
        _mm_store_pd(&b[i11], _mm_sub_pd(u1, u3));
        _mm_store_pd(&b[i15], _mm_sub_pd(u0, u2));
    }

    for (j = 1; j < *l; j++) {
        j0 = j << 1;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2;
           wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2;
           wr5 = wr2 * wr3 - wi2 * wi3; wi5 = wr2 * wi3 + wi2 * wr3;
           wr6 = wr3 * wr3 - wi3 * wi3; wi6 = wr3 * wi3 + wr3 * wi3;
           wr7 = wr3 * wr4 - wi3 * wi4; wi7 = wr3 * wi4 + wi3 * wr4; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);
        w5 = ZMUL(w2, w3);
        w6 = ZMUL(w3, w3);
        w7 = ZMUL(w3, w4);
        for (i = 0; i < *m; i++) {
            i0 = (i << 1) + (j * *m << 1);
            i1 = i0 + (*m * *l << 1);
            i2 = i1 + (*m * *l << 1);
            i3 = i2 + (*m * *l << 1);
            i4 = i3 + (*m * *l << 1);
            i5 = i4 + (*m * *l << 1);
            i6 = i5 + (*m * *l << 1);
            i7 = i6 + (*m * *l << 1);
            i8 = (i << 1) + (j * *m << 4);
            i9 = i8 + (*m << 1);
            i10 = i9 + (*m << 1);
            i11 = i10 + (*m << 1);
            i12 = i11 + (*m << 1);
            i13 = i12 + (*m << 1);
            i14 = i13 + (*m << 1);
            i15 = i14 + (*m << 1);
            /* x0 = a[i0] + a[i4]; y0 = a[i0 + 1] + a[i4 + 1];
               x1 = a[i0] - a[i4]; y1 = a[i0 + 1] - a[i4 + 1];
               x2 = a[i2] + a[i6]; y2 = a[i2 + 1] + a[i6 + 1];
               x3 = a[i2 + 1] - a[i6 + 1]; y3 = a[i6] - a[i2]; */
            t0 = _mm_load_pd(&a[i0]);
            t2 = _mm_load_pd(&a[i4]);
            t1 = _mm_sub_pd(t0, t2);
            t0 = _mm_add_pd(t0, t2);
            t3 = _mm_load_pd(&a[i2]);
            t4 = _mm_load_pd(&a[i6]);
            t2 = _mm_add_pd(t3, t4);
            t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
            t3 = _mm_shuffle_pd(t3, t3, 1);
            /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */
            u0 = _mm_add_pd(t0, t2);
            u1 = _mm_sub_pd(t0, t2);
            /* x4 = a[i1] + a[i5]; y4 = a[i1 + 1] + a[i5 + 1];
               x5 = a[i1] - a[i5]; y5 = a[i1 + 1] - a[i5 + 1];
               x6 = a[i3] + a[i7]; y6 = a[i3 + 1] + a[i7 + 1];
               x7 = a[i3] - a[i7]; y7 = a[i3 + 1] - a[i7 + 1]; */
            t4 = _mm_load_pd(&a[i1]);
            t6 = _mm_load_pd(&a[i5]);
            t5 = _mm_sub_pd(t4, t6);
            t4 = _mm_add_pd(t4, t6);
            t7 = _mm_load_pd(&a[i3]);
            t8 = _mm_load_pd(&a[i7]);
            t6 = _mm_add_pd(t7, t8);
            t7 = _mm_sub_pd(t7, t8);
            /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */
            u2 = _mm_add_pd(t4, t6);
            u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0));
            u3 = _mm_shuffle_pd(u3, u3, 1);
            /* b[i8] = u0 + u2; b[i8 + 1] = v0 + v2;
               b[i12] = wr4 * (u0 - u2) - wi4 * (v0 - v2); b[i12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2);
               b[i10] = wr2 * (u1 + u3) - wi2 * (v1 + v3); b[i10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3);
               b[i14] = wr6 * (u1 - u3) - wi6 * (v1 - v3); b[i14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */
            _mm_store_pd(&b[i8], _mm_add_pd(u0, u2));
            _mm_store_pd(&b[i12], ZMUL(w4, _mm_sub_pd(u0, u2)));
            _mm_store_pd(&b[i10], ZMUL(w2, _mm_add_pd(u1, u3)));
            _mm_store_pd(&b[i14], ZMUL(w6, _mm_sub_pd(u1, u3)));
            /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7);
               u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7);
               u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7);
               u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */
            u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
            u0 = _mm_add_pd(t1, u1);
            u1 = _mm_sub_pd(t1, u1);
            u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0));
            u3 = _mm_shuffle_pd(u3, u3, 1);
            u2 = _mm_add_pd(t3, u3);
            u3 = _mm_sub_pd(t3, u3);
            /* b[i9] = wr1 * (u0 + u2) - wi1 * (v0 + v2); b[i9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2);
               b[i13] = wr5 * (u1 + u3) - wi5 * (v1 + v3); b[i13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3);
               b[i11] = wr3 * (u1 - u3) - wi3 * (v1 - v3); b[i11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3);
               b[i15] = wr7 * (u0 - u2) - wi7 * (v0 - v2); b[i15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */
            _mm_store_pd(&b[i9], ZMUL(w1, _mm_add_pd(u0, u2)));
            _mm_store_pd(&b[i13], ZMUL(w5, _mm_add_pd(u1, u3)));
            _mm_store_pd(&b[i11], ZMUL(w3, _mm_sub_pd(u1, u3)));
            _mm_store_pd(&b[i15], ZMUL(w7, _mm_sub_pd(u0, u2)));
        }
    }
    return 0;
}
int fft8a_(double *a, double *b, double *w, int *l) {
    /* static double c81 = .70710678118654752; */
    static __m128d c81;
    int j, j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15;
    /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5,
       x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6, wi7,
       wr1, wr2, wr3, wr4, wr5, wr6, wr7; */
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7;

    c81 = _mm_set1_pd(0.70710678118654752);

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
        j1 = j0 + (*l << 1);
        j2 = j1 + (*l << 1);
        j3 = j2 + (*l << 1);
        j4 = j3 + (*l << 1);
        j5 = j4 + (*l << 1);
        j6 = j5 + (*l << 1);
        j7 = j6 + (*l << 1);
        j8 = j << 4;
        j9 = j8 + 2;
        j10 = j9 + 2;
        j11 = j10 + 2;
        j12 = j11 + 2;
        j13 = j12 + 2;
        j14 = j13 + 2;
        j15 = j14 + 2;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2;
           wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2;
           wr5 = wr2 * wr3 - wi2 * wi3; wi5 = wr2 * wi3 + wi2 * wr3;
           wr6 = wr3 * wr3 - wi3 * wi3; wi6 = wr3 * wi3 + wr3 * wi3;
           wr7 = wr3 * wr4 - wi3 * wi4; wi7 = wr3 * wi4 + wi3 * wr4; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);
        w5 = ZMUL(w2, w3);
        w6 = ZMUL(w3, w3);
        w7 = ZMUL(w3, w4);
        /* x0 = a[j0] + a[j4]; y0 = a[j0 + 1] + a[j4 + 1];
           x1 = a[j0] - a[j4]; y1 = a[j0 + 1] - a[j4 + 1];
           x2 = a[j2] + a[j6]; y2 = a[j2 + 1] + a[j6 + 1];
           x3 = a[j2 + 1] - a[j6 + 1]; y3 = a[j6] - a[j2]; */
        t0 = _mm_load_pd(&a[j0]);
        t2 = _mm_load_pd(&a[j4]);
        t1 = _mm_sub_pd(t0, t2);
        t0 = _mm_add_pd(t0, t2);
        t3 = _mm_load_pd(&a[j2]);
        t4 = _mm_load_pd(&a[j6]);
        t2 = _mm_add_pd(t3, t4);
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
        t3 = _mm_shuffle_pd(t3, t3, 1);
        /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */
        u0 = _mm_add_pd(t0, t2);
        u1 = _mm_sub_pd(t0, t2);
        /* x4 = a[j1] + a[j5]; y4 = a[j1 + 1] + a[j5 + 1];
           x5 = a[j1] - a[j5]; y5 = a[j1 + 1] - a[j5 + 1];
           x6 = a[j3] + a[j7]; y6 = a[j3 + 1] + a[j7 + 1];
           x7 = a[j3] - a[j7]; y7 = a[j3 + 1] - a[j7 + 1]; */
        t4 = _mm_load_pd(&a[j1]);
        t6 = _mm_load_pd(&a[j5]);
        t5 = _mm_sub_pd(t4, t6);
        t4 = _mm_add_pd(t4, t6);
        t7 = _mm_load_pd(&a[j3]);
        t8 = _mm_load_pd(&a[j7]);
        t6 = _mm_add_pd(t7, t8);
        t7 = _mm_sub_pd(t7, t8);
        /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */
        u2 = _mm_add_pd(t4, t6);
        u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0));
        u3 = _mm_shuffle_pd(u3, u3, 1);
        /* b[j8] = u0 + u2; b[j8 + 1] = v0 + v2;
           b[j12] = wr4 * (u0 - u2) - wi4 * (v0 - v2); b[j12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2);
           b[j10] = wr2 * (u1 + u3) - wi2 * (v1 + v3); b[j10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3);
           b[j14] = wr6 * (u1 - u3) - wi6 * (v1 - v3); b[j14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */
        _mm_store_pd(&b[j8], _mm_add_pd(u0, u2));
        _mm_store_pd(&b[j12], ZMUL(w4, _mm_sub_pd(u0, u2)));
        _mm_store_pd(&b[j10], ZMUL(w2, _mm_add_pd(u1, u3)));
        _mm_store_pd(&b[j14], ZMUL(w6, _mm_sub_pd(u1, u3)));
        /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7);
           u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7);
           u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7);
           u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */
        u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7));
        u0 = _mm_add_pd(t1, u1);
        u1 = _mm_sub_pd(t1, u1);
        u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0));
        u3 = _mm_shuffle_pd(u3, u3, 1);
        u2 = _mm_add_pd(t3, u3);
        u3 = _mm_sub_pd(t3, u3);
        /* b[j9] = wr1 * (u0 + u2) - wi1 * (v0 + v2); b[j9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2);
           b[j13] = wr5 * (u1 + u3) - wi5 * (v1 + v3); b[j13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3);
           b[j11] = wr3 * (u1 - u3) - wi3 * (v1 - v3); b[j11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3);
           b[j15] = wr7 * (u0 - u2) - wi7 * (v0 - v2); b[j15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */
        _mm_store_pd(&b[j9], ZMUL(w1, _mm_add_pd(u0, u2)));
        _mm_store_pd(&b[j13], ZMUL(w5, _mm_add_pd(u1, u3)));
        _mm_store_pd(&b[j11], ZMUL(w3, _mm_sub_pd(u1, u3)));
        _mm_store_pd(&b[j15], ZMUL(w7, _mm_sub_pd(u0, u2)));
    }
    return 0;
}
int fft5b_(double *a, double *b, double *w, int *m, int *l) {
    /* static double c51 = .95105651629515357;
       static double c52 = .61803398874989485;
       static double c53 = .55901699437494742;
       static double c54 = .25; */
    static __m128d c51, c52, c53, c54;
    int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, j, j0;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, x6, y6, x7, y7,
       x8, y8, x9, y9, x10, y10, wi1, wi2, wi3, wi4, wr1, wr2, wr3, wr4; */
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, w1, w2, w3, w4;

    c51 = _mm_set1_pd(0.95105651629515357);
    c52 = _mm_set1_pd(0.61803398874989485);
    c53 = _mm_set1_pd(0.55901699437494742);
    c54 = _mm_set1_pd(0.25);

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
        i1 = i0 + (*m * *l << 1);
        i2 = i1 + (*m * *l << 1);
        i3 = i2 + (*m * *l << 1);
        i4 = i3 + (*m * *l << 1);
        i5 = i << 1;
        i6 = i5 + (*m << 1);
        i7 = i6 + (*m << 1);
        i8 = i7 + (*m << 1);
        i9 = i8 + (*m << 1);
        /* x0 = a[i1] + a[i4]; y0 = a[i1 + 1] + a[i4 + 1];
           x1 = a[i2] + a[i3]; y1 = a[i2 + 1] + a[i3 + 1];
           x2 = c51 * (a[i1] - a[i4]); y2 = c51 * (a[i1 + 1] - a[i4 + 1]);
           x3 = c51 * (a[i2] - a[i3]); y3 = c51 * (a[i2 + 1] - a[i3 + 1]);
           x4 = x0 + x1; y4 = y0 + y1;
           x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1);
           x6 = a[i0] - c54 * x4; y6 = a[i0 + 1] - c54 * y4;
           x7 = x6 + x5; y7 = y6 + y5;
           x8 = x6 - x5; y8 = y6 - y5;
           x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3;
           x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */
        t1 = _mm_load_pd(&a[i1]);
        t4 = _mm_load_pd(&a[i4]);
        t0 = _mm_add_pd(t1, t4);
        t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
        t1 = _mm_load_pd(&a[i2]);
        t4 = _mm_load_pd(&a[i3]);
        t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
        t1 = _mm_add_pd(t1, t4);
        t4 = _mm_add_pd(t0, t1);
        t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1));
        t0 = _mm_load_pd(&a[i0]);
        t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4));
        t7 = _mm_add_pd(t6, t5);
        t8 = _mm_sub_pd(t6, t5);
        t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0));
        t9 = _mm_shuffle_pd(t9, t9, 1);
        t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2));
        t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0));
        /* b[i5] = a[i0] + x4; b[i5 + 1] = a[i0 + 1] + y4;
           b[i6] = x7 + x9; b[i6 + 1] = y7 + y9;
           b[i7] = x8 + x10; b[i7 + 1] = y8 + y10;
           b[i8] = x8 - x10; b[i8 + 1] = y8 - y10;
           b[i9] = x7 - x9; b[i9 + 1] = y7 - y9; */
        _mm_store_pd(&b[i5], _mm_add_pd(t0, t4));
        _mm_store_pd(&b[i6], _mm_add_pd(t7, t9));
        _mm_store_pd(&b[i7], _mm_add_pd(t8, t10));
        _mm_store_pd(&b[i8], _mm_sub_pd(t8, t10));
        _mm_store_pd(&b[i9], _mm_sub_pd(t7, t9));
    }

    for (j = 1; j < *l; j++) {
        j0 = j << 1;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2;
           wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);
        for (i = 0; i < *m; i++) {
            i0 = (i << 1) + (j * *m << 1);
            i1 = i0 + (*m * *l << 1);
            i2 = i1 + (*m * *l << 1);
            i3 = i2 + (*m * *l << 1);
            i4 = i3 + (*m * *l << 1);
            i5 = (i << 1) + (j * *m * 10);
            i6 = i5 + (*m << 1);
            i7 = i6 + (*m << 1);
            i8 = i7 + (*m << 1);
            i9 = i8 + (*m << 1);
            /* x0 = a[i1] + a[i4]; y0 = a[i1 + 1] + a[i4 + 1];
               x1 = a[i2] + a[i3]; y1 = a[i2 + 1] + a[i3 + 1];
               x2 = c51 * (a[i1] - a[i4]); y2 = c51 * (a[i1 + 1] - a[i4 + 1]);
               x3 = c51 * (a[i2] - a[i3]); y3 = c51 * (a[i2 + 1] - a[i3 + 1]);
               x4 = x0 + x1; y4 = y0 + y1;
               x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1);
               x6 = a[i0] - c54 * x4; y6 = a[i0 + 1] - c54 * y4;
               x7 = x6 + x5; y7 = y6 + y5;
               x8 = x6 - x5; y8 = y6 - y5;
               x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3;
               x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */
            t1 = _mm_load_pd(&a[i1]);
            t4 = _mm_load_pd(&a[i4]);
            t0 = _mm_add_pd(t1, t4);
            t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
            t1 = _mm_load_pd(&a[i2]);
            t4 = _mm_load_pd(&a[i3]);
            t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
            t1 = _mm_add_pd(t1, t4);
            t4 = _mm_add_pd(t0, t1);
            t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1));
            t0 = _mm_load_pd(&a[i0]);
            t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4));
            t7 = _mm_add_pd(t6, t5);
            t8 = _mm_sub_pd(t6, t5);
            t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0));
            t9 = _mm_shuffle_pd(t9, t9, 1);
            t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2));
            t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0));
            /* b[i5] = a[i0] + x4; b[i5 + 1] = a[i0 + 1] + y4;
               b[i6] = wr1 * (x7 + x9) - wi1 * (y7 + y9); b[i6 + 1] = wr1 * (y7 + y9) + wi1 * (x7 + x9);
               b[i7] = wr2 * (x8 + x10) - wi2 * (y8 + y10); b[i7 + 1] = wr2 * (y8 + y10) + wi2 * (x8 + x10);
               b[i8] = wr3 * (x8 - x10) - wi3 * (y8 - y10); b[i8 + 1] = wr3 * (y8 - y10) + wi3 * (x8 - x10);
               b[i9] = wr4 * (x7 - x9) - wi4 * (y7 - y9); b[i9 + 1] = wr4 * (y7 - y9) + wi4 * (x7 - x9); */
            _mm_store_pd(&b[i5], _mm_add_pd(t0, t4));
            _mm_store_pd(&b[i6], ZMUL(w1, _mm_add_pd(t7, t9)));
            _mm_store_pd(&b[i7], ZMUL(w2, _mm_add_pd(t8, t10)));
            _mm_store_pd(&b[i8], ZMUL(w3, _mm_sub_pd(t8, t10)));
            _mm_store_pd(&b[i9], ZMUL(w4, _mm_sub_pd(t7, t9)));
        }
    }
    return 0;
}
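/*
 * NOTE (assumption): the four constants shared by fft5b_ above and fft5a_
 * below are the standard radix-5 butterfly values c51 = sin(2*pi/5),
 * c52 = sin(4*pi/5) / sin(2*pi/5), c53 = sqrt(5)/4, and c54 = 1/4. A quick
 * check, assuming POSIX M_PI:
 */
#include <math.h>
#include <assert.h>

int main(void) {
    assert(fabs(sin(2.0 * M_PI / 5.0) - 0.95105651629515357) < 1e-15);
    assert(fabs(sin(4.0 * M_PI / 5.0) / sin(2.0 * M_PI / 5.0)
                - 0.61803398874989485) < 1e-15);
    assert(fabs(sqrt(5.0) / 4.0 - 0.55901699437494742) < 1e-15);
    return 0;
}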
int fft5a_(double *a, double *b, double *w, int *l) {
    /* static double c51 = .95105651629515357;
       static double c52 = .61803398874989485;
       static double c53 = .55901699437494742;
       static double c54 = .25; */
    static __m128d c51, c52, c53, c54;
    int j, j0, j1, j2, j3, j4, j5, j6, j7, j8, j9;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, x6, y6, x7, y7,
       x8, y8, x9, y9, x10, y10, wi1, wi2, wi3, wi4, wr1, wr2, wr3, wr4; */
    __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, w1, w2, w3, w4;

    c51 = _mm_set1_pd(0.95105651629515357);
    c52 = _mm_set1_pd(0.61803398874989485);
    c53 = _mm_set1_pd(0.55901699437494742);
    c54 = _mm_set1_pd(0.25);

    for (j = 0; j < *l; j++) {
        j0 = j << 1;
        j1 = j0 + (*l << 1);
        j2 = j1 + (*l << 1);
        j3 = j2 + (*l << 1);
        j4 = j3 + (*l << 1);
        j5 = j * 10;
        j6 = j5 + 2;
        j7 = j6 + 2;
        j8 = j7 + 2;
        j9 = j8 + 2;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2;
           wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        w4 = ZMUL(w2, w2);
        /* x0 = a[j1] + a[j4]; y0 = a[j1 + 1] + a[j4 + 1];
           x1 = a[j2] + a[j3]; y1 = a[j2 + 1] + a[j3 + 1];
           x2 = c51 * (a[j1] - a[j4]); y2 = c51 * (a[j1 + 1] - a[j4 + 1]);
           x3 = c51 * (a[j2] - a[j3]); y3 = c51 * (a[j2 + 1] - a[j3 + 1]);
           x4 = x0 + x1; y4 = y0 + y1;
           x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1);
           x6 = a[j0] - c54 * x4; y6 = a[j0 + 1] - c54 * y4;
           x7 = x6 + x5; y7 = y6 + y5;
           x8 = x6 - x5; y8 = y6 - y5;
           x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3;
           x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */
        t1 = _mm_load_pd(&a[j1]);
        t4 = _mm_load_pd(&a[j4]);
        t0 = _mm_add_pd(t1, t4);
        t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
        t1 = _mm_load_pd(&a[j2]);
        t4 = _mm_load_pd(&a[j3]);
        t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4));
        t1 = _mm_add_pd(t1, t4);
        t4 = _mm_add_pd(t0, t1);
        t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1));
        t0 = _mm_load_pd(&a[j0]);
        t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4));
        t7 = _mm_add_pd(t6, t5);
        t8 = _mm_sub_pd(t6, t5);
        t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0));
        t9 = _mm_shuffle_pd(t9, t9, 1);
        t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2));
        t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0));
        /* b[j5] = a[j0] + x4; b[j5 + 1] = a[j0 + 1] + y4;
           b[j6] = wr1 * (x7 + x9) - wi1 * (y7 + y9); b[j6 + 1] = wr1 * (y7 + y9) + wi1 * (x7 + x9);
           b[j7] = wr2 * (x8 + x10) - wi2 * (y8 + y10); b[j7 + 1] = wr2 * (y8 + y10) + wi2 * (x8 + x10);
           b[j8] = wr3 * (x8 - x10) - wi3 * (y8 - y10); b[j8 + 1] = wr3 * (y8 - y10) + wi3 * (x8 - x10);
           b[j9] = wr4 * (x7 - x9) - wi4 * (y7 - y9); b[j9 + 1] = wr4 * (y7 - y9) + wi4 * (x7 - x9); */
        _mm_store_pd(&b[j5], _mm_add_pd(t0, t4));
        _mm_store_pd(&b[j6], ZMUL(w1, _mm_add_pd(t7, t9)));
        _mm_store_pd(&b[j7], ZMUL(w2, _mm_add_pd(t8, t10)));
        _mm_store_pd(&b[j8], ZMUL(w3, _mm_sub_pd(t8, t10)));
        _mm_store_pd(&b[j9], ZMUL(w4, _mm_sub_pd(t7, t9)));
    }
    return 0;
}
int fft4b_(double *a, double *b, double *w, int *m, int *l) {
    int i, i0, i1, i2, i3, i4, i5, i6, i7, j, j0;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */
    __m128d t0, t1, t2, t3, t4, w1, w2, w3;

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
        i1 = i0 + (*m * *l << 1);
        i2 = i1 + (*m * *l << 1);
        i3 = i2 + (*m * *l << 1);
        i4 = i << 1;
        i5 = i4 + (*m << 1);
        i6 = i5 + (*m << 1);
        i7 = i6 + (*m << 1);
        /* x0 = a[i0] + a[i2]; y0 = a[i0 + 1] + a[i2 + 1];
           x1 = a[i0] - a[i2]; y1 = a[i0 + 1] - a[i2 + 1];
           x2 = a[i1] + a[i3]; y2 = a[i1 + 1] + a[i3 + 1];
           x3 = a[i1 + 1] - a[i3 + 1]; y3 = a[i3] - a[i1]; */
        t0 = _mm_load_pd(&a[i0]);
        t2 = _mm_load_pd(&a[i2]);
        t1 = _mm_sub_pd(t0, t2);
        t0 = _mm_add_pd(t0, t2);
        t3 = _mm_load_pd(&a[i1]);
        t4 = _mm_load_pd(&a[i3]);
        t2 = _mm_add_pd(t3, t4);
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
        t3 = _mm_shuffle_pd(t3, t3, 1);
        /* b[i4] = x0 + x2; b[i4 + 1] = y0 + y2;
           b[i6] = x0 - x2; b[i6 + 1] = y0 - y2;
           b[i5] = x1 + x3; b[i5 + 1] = y1 + y3;
           b[i7] = x1 - x3; b[i7 + 1] = y1 - y3; */
        _mm_store_pd(&b[i4], _mm_add_pd(t0, t2));
        _mm_store_pd(&b[i6], _mm_sub_pd(t0, t2));
        _mm_store_pd(&b[i5], _mm_add_pd(t1, t3));
        _mm_store_pd(&b[i7], _mm_sub_pd(t1, t3));
    }

    for (j = 1; j < *l; j++) {
        j0 = j << 1;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1;
           wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        for (i = 0; i < *m; i++) {
            i0 = (i << 1) + (j * *m << 1);
            i1 = i0 + (*m * *l << 1);
            i2 = i1 + (*m * *l << 1);
            i3 = i2 + (*m * *l << 1);
            i4 = (i << 1) + (j * *m << 3);
            i5 = i4 + (*m << 1);
            i6 = i5 + (*m << 1);
            i7 = i6 + (*m << 1);
            /* x0 = a[i0] + a[i2]; y0 = a[i0 + 1] + a[i2 + 1];
               x1 = a[i0] - a[i2]; y1 = a[i0 + 1] - a[i2 + 1];
               x2 = a[i1] + a[i3]; y2 = a[i1 + 1] + a[i3 + 1];
               x3 = a[i1 + 1] - a[i3 + 1]; y3 = a[i3] - a[i1]; */
            t0 = _mm_load_pd(&a[i0]);
            t2 = _mm_load_pd(&a[i2]);
            t1 = _mm_sub_pd(t0, t2);
            t0 = _mm_add_pd(t0, t2);
            t3 = _mm_load_pd(&a[i1]);
            t4 = _mm_load_pd(&a[i3]);
            t2 = _mm_add_pd(t3, t4);
            t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
            t3 = _mm_shuffle_pd(t3, t3, 1);
            /* b[i4] = x0 + x2; b[i4 + 1] = y0 + y2;
               b[i6] = wr2 * (x0 - x2) - wi2 * (y0 - y2); b[i6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2);
               b[i5] = wr1 * (x1 + x3) - wi1 * (y1 + y3); b[i5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3);
               b[i7] = wr3 * (x1 - x3) - wi3 * (y1 - y3); b[i7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */
            _mm_store_pd(&b[i4], _mm_add_pd(t0, t2));
            _mm_store_pd(&b[i6], ZMUL(w2, _mm_sub_pd(t0, t2)));
            _mm_store_pd(&b[i5], ZMUL(w1, _mm_add_pd(t1, t3)));
            _mm_store_pd(&b[i7], ZMUL(w3, _mm_sub_pd(t1, t3)));
        }
    }
    return 0;
}
int fft3b_(double *a, double *b, double *w, int *m, int *l) {
    /* static double c31 = .86602540378443865;
       static double c32 = .5; */
    static __m128d c31, c32;
    int i, i0, i1, i2, i3, i4, i5, j, j0;
    /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */
    __m128d t0, t1, t2, t3, w1, w2;

    c31 = _mm_set1_pd(0.86602540378443865);
    c32 = _mm_set1_pd(0.5);

    for (i = 0; i < *m; i++) {
        i0 = i << 1;
        i1 = i0 + (*m * *l << 1);
        i2 = i1 + (*m * *l << 1);
        i3 = i << 1;
        i4 = i3 + (*m << 1);
        i5 = i4 + (*m << 1);
        /* x0 = a[i1] + a[i2]; y0 = a[i1 + 1] + a[i2 + 1];
           x1 = a[i0] - c32 * x0; y1 = a[i0 + 1] - c32 * y0;
           x2 = c31 * (a[i1 + 1] - a[i2 + 1]); y2 = c31 * (a[i2] - a[i1]); */
        t1 = _mm_load_pd(&a[i1]);
        t2 = _mm_load_pd(&a[i2]);
        t0 = _mm_add_pd(t1, t2);
        t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0));
        t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1));
        t3 = _mm_load_pd(&a[i0]);
        t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0));
        /* b[i3] = a[i0] + x0; b[i3 + 1] = a[i0 + 1] + y0;
           b[i4] = x1 + x2; b[i4 + 1] = y1 + y2;
           b[i5] = x1 - x2; b[i5 + 1] = y1 - y2; */
        _mm_store_pd(&b[i3], _mm_add_pd(t3, t0));
        _mm_store_pd(&b[i4], _mm_add_pd(t1, t2));
        _mm_store_pd(&b[i5], _mm_sub_pd(t1, t2));
    }

    for (j = 1; j < *l; j++) {
        j0 = j << 1;
        /* wr1 = w[j0]; wi1 = w[j0 + 1];
           wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        for (i = 0; i < *m; i++) {
            i0 = (i << 1) + (j * *m << 1);
            i1 = i0 + (*m * *l << 1);
            i2 = i1 + (*m * *l << 1);
            i3 = (i << 1) + (j * *m * 6);
            i4 = i3 + (*m << 1);
            i5 = i4 + (*m << 1);
            /* x0 = a[i1] + a[i2]; y0 = a[i1 + 1] + a[i2 + 1];
               x1 = a[i0] - x0 * .5; y1 = a[i0 + 1] - y0 * .5;
               x2 = c31 * (a[i1 + 1] - a[i2 + 1]); y2 = c31 * (a[i2] - a[i1]); */
            t1 = _mm_load_pd(&a[i1]);
            t2 = _mm_load_pd(&a[i2]);
            t0 = _mm_add_pd(t1, t2);
            t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0));
            t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1));
            t3 = _mm_load_pd(&a[i0]);
            t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0));
            /* b[i3] = a[i0] + x0; b[i3 + 1] = a[i0 + 1] + y0;
               b[i4] = wr1 * (x1 + x2) - wi1 * (y1 + y2); b[i4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2);
               b[i5] = wr2 * (x1 - x2) - wi2 * (y1 - y2); b[i5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */
            _mm_store_pd(&b[i3], _mm_add_pd(t3, t0));
            _mm_store_pd(&b[i4], ZMUL(w1, _mm_add_pd(t1, t2)));
            _mm_store_pd(&b[i5], ZMUL(w2, _mm_sub_pd(t1, t2)));
        }
    }
    return 0;
}
// The input must be in domain [-1686629712, 1686629712].
//
// I tried to optimize the double-to-int conversion by using `magic`, but
// it was actually slower than using `_mm_cvttpd_epi32()` and it didn't
// offer a wider domain for `x`.
static SIMD_INLINE __m128d sin_cephes_pd(__m128d x) {
  SIMD_CONST_SQ(sign     , SIMD_UINT64_C(0x8000000000000000));
  SIMD_CONST_SQ(inv_sign , SIMD_UINT64_C(0x7FFFFFFFFFFFFFFF));
  SIMD_CONST_SI(int32_one, 1);
  SIMD_CONST_SD(4_DIV_PI , 1.27323954473516268615107010698);
  SIMD_CONST_SD(DP1      , 7.85398125648498535156e-1);
  SIMD_CONST_SD(DP2      , 3.77489470793079817668e-8);
  SIMD_CONST_SD(DP3      , 2.69515142907905952645e-15);

#define DEFINE_DATA(name, x0, x1, x2, x3, x4, x5, xm, xa, y0, y1, y2, y3, y4, y5, ym, ya) \
  SIMD_ALIGN_VAR(static const double, name[], 16) = { \
    x0, x0, x1, x1, x2, x2, x3, x3, x4, x4, x5, x5, xm, xm, xa, xa, \
    y0, x0, y1, x1, y2, x2, y3, x3, y4, x4, y5, x5, ym, xm, ya, xa, \
    x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, xm, ym, xa, ya, \
    y0, y0, y1, y1, y2, y2, y3, y3, y4, y4, y5, y5, ym, ym, ya, ya  \
  }

  DEFINE_DATA(sincos_coeff,
     1.58962301576546568060e-10,-2.50507477628578072866e-8 ,
     2.75573136213857245213e-6 ,-1.98412698295895385996e-4 ,
     8.33333333332211858878e-3 ,-1.66666666666666307295e-1 ,  1.0, 0.0,
    -1.13585365213876817300e-11, 2.08757008419747316778e-9 ,
    -2.75573141792967388112e-7 , 2.48015872888517045348e-5 ,
    -1.38888888888730564116e-3 , 4.16666666666665929218e-2 , -0.5, 1.0);

  __m128d y;
  __m128d sign = x;                              // Sign bit.

  x = _mm_and_pd(x, SIMD_GET_PD(inv_sign));      // Take the absolute value.
  y = _mm_mul_pd(x, SIMD_GET_PD(4_DIV_PI));      // Integer part of `x * 4 / PI`.

  __m128i ival = _mm_cvttpd_epi32(y);            // Extract the integer part of y.
  __m128i ione = SIMD_GET_PI(int32_one);

  ival = _mm_add_epi32(ival, ione);              // j += 1.
  ival = _mm_andnot_si128(ione, ival);           // j &= ~1.

  y = _mm_cvtepi32_pd(ival);
  ival = _mm_unpacklo_epi32(ival, ival);

  sign = _mm_xor_pd(sign,                        // Swap the sign bit if `j & 4`.
    _mm_castsi128_pd(_mm_slli_epi64(ival, 61)));
  sign = _mm_and_pd(sign, SIMD_GET_PD(sign));    // Keep only the sign bit.

  // Get the polynomial selection mask (j & 2):
  //   1. `0x0000000000000000` => `0    <= x <= PI/4`
  //   2. `0xFFFFFFFFFFFFFFFF` => `PI/4 <  x <= PI/2`
  ival = _mm_slli_epi32(ival, 30);
  ival = _mm_srai_epi32(ival, 31);

  // Extended precision modular arithmetic:
  //   x = ((x - y * DP1) - y * DP2) - y * DP3
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP1)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP2)));
  x = _mm_sub_pd(x, _mm_mul_pd(y, SIMD_GET_PD(DP3)));

  // Get the polynomial coefficients for each lane (sin/cos).
  __m128d poly_mask = _mm_castsi128_pd(ival);
  const __m128d* coeff = reinterpret_cast<const __m128d*>(sincos_coeff) +
    static_cast<uintptr_t>(_mm_movemask_pd(poly_mask)) * 8;

  __m128d xx = _mm_mul_pd(x, x);
  y = coeff[0];
  y = Simd128::mad(y, xx, coeff[1]);
  y = Simd128::mad(y, xx, coeff[2]);
  y = Simd128::mad(y, xx, coeff[3]);
  y = Simd128::mad(y, xx, coeff[4]);
  y = Simd128::mad(y, xx, coeff[5]);
  y = _mm_mul_pd(y, xx);

  __m128d x_or_xx = _mm_or_pd(
    _mm_and_pd(xx, poly_mask),
    _mm_andnot_pd(poly_mask, x));

  y = _mm_mul_pd(y, x_or_xx);
  y = _mm_add_pd(y, _mm_mul_pd(x_or_xx, coeff[6]));
  y = _mm_add_pd(y, coeff[7]);

  return _mm_xor_pd(y, sign);
}
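/*
 * NOTE: the `x_or_xx` expression above is the classic branchless lane select,
 * (mask & a) | (~mask & b); the kernel uses it to feed either x (sin
 * polynomial) or xx (cos polynomial) into the final Horner steps. Factored
 * into a standalone helper for clarity (the name select_pd is hypothetical):
 */
#include <emmintrin.h>

/* Per-lane select: where `mask` is all-ones pick `a`, else pick `b`. */
static inline __m128d select_pd(__m128d mask, __m128d a, __m128d b) {
    return _mm_or_pd(_mm_and_pd(mask, a), _mm_andnot_pd(mask, b));
}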
__m128d t4(void) {
    return _mm_xor_pd(magic_a, magic_b);
}
BOOST_FORCEINLINE __m128d __vectorcall operator ^ (__m128d const left, __m128d const right)
{
    return _mm_xor_pd(left, right);
}
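/*
 * NOTE: with the overload above in scope, the sign-flip idiom from the
 * earlier snippets can be written in operator form. A brief hedged usage
 * sketch (C++, assumes the operator is visible at this point):
 */
inline __m128d negate(__m128d v) {
    return v ^ _mm_set1_pd(-0.0); /* flip the sign bit of both lanes */
}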