/* Round a double to the nearest 32-bit integer.
 * SSE4 path uses an explicit round-to-nearest instruction; the SSE2 path
 * relies on cvtsd2si, which rounds per the current MXCSR mode
 * (round-to-nearest-even by default).
 * Result is undefined if val is outside the I32 range. */
I32 roundInt(F64 val) {
#ifdef USE_SSE4
    __m128d t = _mm_set_sd(val);
    t = _mm_round_sd(t, t, _MM_FROUND_TO_NEAREST_INT);
    I32 i = _mm_cvtsd_si32(t);
#elif defined(USE_SSE2)
    __m128d t = _mm_set_sd(val);
    I32 i = (I32)_mm_cvtsd_si32(t);
#else
    /* NOTE(review): floor(val + 0.5) rounds .5 ties upward, unlike the SSE
     * paths' default round-to-even — callers must tolerate a +/-1 difference
     * on exact .5 ties. */
    I32 i = (I32)core_floor(val + 0.5);
#endif
    return i;
}
/* Floor of a double as a 32-bit integer.
 * SSE4 uses the exact floor instruction. The SSE2 path approximates floor
 * with round-to-nearest(val - 0.5); NOTE(review): under the default
 * round-to-even mode this rounds exactly-integral inputs one too low
 * (e.g. 3.0 - 0.5 = 2.5 -> 2) — confirm callers tolerate this.
 * Result is undefined if val is outside the I32 range. */
I32 floorInt(F64 val) {
#ifdef USE_SSE4
    __m128d t = _mm_set_sd(val);
    t = _mm_floor_sd(t, t);
    I32 i = _mm_cvtsd_si32(t);
#elif defined(USE_SSE2)
    val += -0.5;
    __m128d t = _mm_set_sd(val);
    I32 i = (I32)_mm_cvtsd_si32(t);
#else
    I32 i = (I32)core_floor(val);
#endif
    return i;
}
/* Expand a little-endian array of 64-bit words A[0..AL) into FFT input:
 * each word is split into eight base-256 digits, one digit per __m128d
 * slot (stored in the low lane), least-significant digit first. The
 * remaining slots up to the FFT length 2^k are zero-filled.
 * Throws if 2^k cannot hold 8 digits per input word.
 * NOTE(review): _mm_set_sd writes both lanes, so no alignment hazard here,
 * but T itself must be 16-byte aligned for the stores. */
void int_to_fft(__m128d *T, int k, const unsigned __int64 *A, size_t AL) {
    /* (size_t)1 avoids shifting a 32-bit int: the original "1 << k" is
     * undefined for k >= 31 and truncates for k >= 32 on LP64 targets. */
    size_t fft_length = (size_t)1 << k;
    __m128d *Tstop = T + fft_length;
    if (fft_length < 8 * AL)
        throw "FFT length is too small.";
    for (size_t c = 0; c < AL; c++) {
        unsigned __int64 word = A[c];
        /* Low seven digits, masking 8 bits at a time... */
        for (int d = 0; d < 7; d++) {
            *T++ = _mm_set_sd((double)(word & 0xff));
            word >>= 8;
        }
        /* ...and the top byte, already shifted into place. */
        *T++ = _mm_set_sd((double)word);
    }
    /* Zero-pad the rest of the transform. */
    while (T < Tstop)
        *T++ = _mm_setzero_pd();
}
/* Ceiling of a double as a 32-bit integer.
 * SSE4 uses the exact ceil instruction. The SSE2 path approximates ceil
 * with round-to-nearest(val + 0.5); NOTE(review): under the default
 * round-to-even mode this rounds exactly-integral inputs one too high
 * (e.g. 3.0 + 0.5 = 3.5 -> 4) — confirm callers tolerate this.
 * Result is undefined if val is outside the I32 range. */
I32 ceilInt(F64 val) {
#ifdef USE_SSE4
    __m128d t = _mm_set_sd(val);
    t = _mm_ceil_sd(t, t);
    I32 i = _mm_cvtsd_si32(t);
#elif defined(USE_SSE2)
    val += 0.5;
    __m128d t = _mm_set_sd(val);
    I32 i = (I32)_mm_cvtsd_si32(t);
#else
    I32 i = (I32)core_ceil(val);
#endif
    return i;
}
/* Round a double to the nearest integral value, returned as a double.
 * The SSE2 path round-trips through a 32-bit integer, so it is only valid
 * for values within the int32 range (cvtsd2si saturates outside it).
 * NOTE(review): casts use 'U32'/'int32' while sibling functions use 'I32'
 * — confirm all three typedefs exist and agree. */
F64 round(F64 val) {
#ifdef USE_SSE4
    __m128d t = _mm_set_sd(val);
    t = _mm_round_sd(t, t, _MM_FROUND_TO_NEAREST_INT);
    _mm_store_sd(&val, t);
#elif defined(USE_SSE2)
    /* Convert to int (current rounding mode, default nearest-even) and back. */
    __m128d t = _mm_set_sd(val);
    U32 i = (U32)_mm_cvtsd_si32(t);
    t = _mm_cvtsi32_sd(t, (int32)i);
    _mm_store_sd(&val, t);
#else
    /* Ties round upward here, unlike the SSE2 path's round-to-even. */
    val = core_floor(val + 0.5);
#endif
    return val;
}
/* Ceiling of a double, returned as a double.
 * The SSE2 path adds 0.5 and round-trips through a 32-bit integer, so it is
 * only valid within the int32 range; NOTE(review): under round-to-even it
 * also overshoots exactly-integral inputs (3.0 + 0.5 = 3.5 -> 4.0) —
 * confirm callers tolerate this, matching ceilInt above. */
F64 ceil(F64 val) {
#ifdef USE_SSE4
    __m128d t = _mm_set_sd(val);
    t = _mm_ceil_sd(t, t);
    _mm_store_sd(&val, t);
#elif defined(USE_SSE2)
    val += 0.5;
    __m128d t = _mm_set_sd(val);
    U32 i = (U32)_mm_cvtsd_si32(t);
    t = _mm_cvtsi32_sd(t, (int32)i);
    _mm_store_sd(&val, t);
#else
    val = core_ceil(val);
#endif
    return val;
}
/** @brief Rounds floating-point number to the nearest integer

 @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the result
 is not defined.
 */
CV_INLINE int cvRound( double value )
{
#if ((defined _MSC_VER && defined _M_X64) || (defined __GNUC__ && defined __x86_64__ \
    && defined __SSE2__ && !defined __APPLE__) || CV_SSE2) && !defined(__CUDACC__)
    /* cvtsd2si rounds per the current MXCSR mode (nearest-even by default). */
    __m128d t = _mm_set_sd( value );
    return _mm_cvtsd_si32(t);
#elif defined _MSC_VER && defined _M_IX86
    /* 32-bit MSVC: x87 fistp, which also honours the current rounding mode. */
    int t;
    __asm { fld value; fistp t; }
    return t;
#elif ((defined _MSC_VER && defined _M_ARM) || defined CV_ICC || \
        defined __GNUC__) && defined HAVE_TEGRA_OPTIMIZATION
    /* Platform macro expands to a return statement. */
    TEGRA_ROUND_DBL(value);
#elif defined CV_ICC || defined __GNUC__
# if defined ARM_ROUND_DBL
    ARM_ROUND_DBL(value);
# else
    return (int)lrint(value);
# endif
#else
    /* it's ok if round does not comply with IEEE754 standard;
       the tests should allow +/-1 difference when the tested functions use round */
    return (int)(value + (value >= 0 ? 0.5 : -0.5));
#endif
}
/*
 * Radix-4 FFT butterfly, first-stage kernel (f2c-style signature: Fortran
 * pointer arguments). Transforms *l complex points from a[] into b[],
 * applying twiddle factors loaded from w[]; data layout is interleaved
 * complex doubles (re, im) and ZMUL is the complex-multiply helper macro.
 * The scalar equivalent of each SIMD section is kept in the comments below.
 * NOTE(review): _mm_load_pd/_mm_store_pd require a, b and w to be 16-byte
 * aligned — confirm callers guarantee this. Always returns 0.
 */
int fft4a_(double *a, double *b, double *w, int *l) { int j, j0, j1, j2, j3, j4, j5, j6, j7; /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */ __m128d t0, t1, t2, t3, t4, w1, w2, w3; for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j2 + (*l << 1); j4 = j << 3; j5 = j4 + 2; j6 = j5 + 2; j7 = j6 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); /* x0 = a[j0] + a[j2]; y0 = a[j0 + 1] + a[j2 + 1]; x1 = a[j0] - a[j2]; y1 = a[j0 + 1] - a[j2 + 1]; x2 = a[j1] + a[j3]; y2 = a[j1 + 1] + a[j3 + 1]; x3 = a[j1 + 1] - a[j3 + 1]; y3 = a[j3] - a[j1]; */ t0 = _mm_load_pd(&a[j0]); t2 = _mm_load_pd(&a[j2]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[j1]); t4 = _mm_load_pd(&a[j3]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* b[j4] = x0 + x2; b[j4 + 1] = y0 + y2; b[j6] = wr2 * (x0 - x2) - wi2 * (y0 - y2); b[j6 + 1] = wr2 * (y0 - y2) + wi2 * (x0 - x2); b[j5] = wr1 * (x1 + x3) - wi1 * (y1 + y3); b[j5 + 1] = wr1 * (y1 + y3) + wi1 * (x1 + x3); b[j7] = wr3 * (x1 - x3) - wi3 * (y1 - y3); b[j7 + 1] = wr3 * (y1 - y3) + wi3 * (x1 - x3); */ _mm_store_pd(&b[j4], _mm_add_pd(t0, t2)); _mm_store_pd(&b[j6], ZMUL(w2, _mm_sub_pd(t0, t2))); _mm_store_pd(&b[j5], ZMUL(w1, _mm_add_pd(t1, t3))); _mm_store_pd(&b[j7], ZMUL(w3, _mm_sub_pd(t1, t3))); } return 0; }
/* Scalar square root of val (falls back to core_sqrt without SSE2). */
F64 root(F64 val) {
#ifdef USE_SSE2
    __m128d x = _mm_set_sd(val);
    /* sqrtsd copies the upper lane from its first operand, so pass x for
     * both arguments. The original passed a separate uninitialized
     * "__m128d unused", i.e. a read of an indeterminate value. */
    x = _mm_sqrt_sd(x, x);
    _mm_store_sd(&val, x);
    return val;
#else
    return core_sqrt(val);
#endif
}
/*
 * Radix-3 FFT butterfly, first-stage kernel: transforms *l complex points
 * from a[] into b[] with twiddle factors from w[]; interleaved (re, im)
 * layout, ZMUL = complex multiply. Scalar equivalents are in the comments.
 * c31 = sin(pi/3), c32 = 1/2 (radix-3 rotation constants).
 * NOTE(review): the static __m128d constants are re-assigned on every call,
 * which is a data race if called concurrently — they could be plain const
 * locals; confirm single-threaded use. Requires 16-byte-aligned arrays.
 * Always returns 0.
 */
int fft3a_(double *a, double *b, double *w, int *l) { /* static double c31 = .86602540378443865; static double c32 = .5; */ static __m128d c31, c32; int j, j0, j1, j2, j3, j4, j5; /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */ __m128d t0, t1, t2, t3, w1, w2; c31 = _mm_set1_pd(0.86602540378443865); c32 = _mm_set1_pd(0.5); for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j * 6; j4 = j3 + 2; j5 = j4 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); /* x0 = a[j1] + a[j2]; y0 = a[j1 + 1] + a[j2 + 1]; x1 = a[j0] - c32 * x0; y1 = a[j0 + 1] - c32 * y0; x2 = c31 * (a[j1 + 1] - a[j2 + 1]); y2 = c31 * (a[j2] - a[j1]); */ t1 = _mm_load_pd(&a[j1]); t2 = _mm_load_pd(&a[j2]); t0 = _mm_add_pd(t1, t2); t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0)); t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1)); t3 = _mm_load_pd(&a[j0]); t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0)); /* b[j3] = a[j0] + x0; b[j3 + 1] = a[j0 + 1] + y0; b[j4] = wr1 * (x1 + x2) - wi1 * (y1 + y2); b[j4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2); b[j5] = wr2 * (x1 - x2) - wi2 * (y1 - y2); b[j5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */ _mm_store_pd(&b[j3], _mm_add_pd(t3, t0)); _mm_store_pd(&b[j4], ZMUL(w1, _mm_add_pd(t1, t2))); _mm_store_pd(&b[j5], ZMUL(w2, _mm_sub_pd(t1, t2))); } return 0; }
/** @brief Rounds floating-point number to the nearest integer not smaller than the original.

 The function computes an integer i such that:
 \f[i - 1 < \texttt{value} \le i\f]
 @param value floating-point number. If the value is outside of INT_MIN ... INT_MAX range, the result
 is not defined.
 */
CV_INLINE int cvCeil( double value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
    /* Round to nearest, then bump by one when the rounded result fell
     * below the original value (cmplt mask is -1, movemask yields 1). */
    __m128d t = _mm_set_sd( value );
    int i = _mm_cvtsd_si32(t);
    return i + _mm_movemask_pd(_mm_cmplt_sd(_mm_cvtsi32_sd(t,i), t));
#elif defined __GNUC__
    /* Truncate toward zero, then add one if truncation went below value. */
    int i = (int)value;
    return i + (i < value);
#else
    int i = cvRound(value);
    float diff = (float)(i - value);
    return i + (diff < 0);
#endif
}
/** Scalar double-precision square root, computed with the SSE2 sqrtsd
 *  instruction (both operands are the loaded value, so the upper lane is
 *  simply carried through). */
extern "C" YEP_PRIVATE_SYMBOL double sqrt(double x) {
    const __m128d operand = _mm_set_sd(x);
    const __m128d root = _mm_sqrt_sd(operand, operand);
    double result;
    _mm_store_sd(&result, root);
    return result;
}
/*
 * Radix-3 FFT butterfly, general stage: *m interleaved transforms of
 * length-*l stages, a[] -> b[], twiddles from w[]. The first loop handles
 * j = 0 (unit twiddle, no complex multiplies); the second applies w1/w2
 * via ZMUL. c31 = sin(pi/3), c32 = 1/2. Scalar equivalents in comments.
 * NOTE(review): static __m128d constants re-assigned each call (not
 * thread-safe); arrays must be 16-byte aligned. Always returns 0.
 */
int fft3b_(double *a, double *b, double *w, int *m, int *l) { /* static double c31 = .86602540378443865; static double c32 = .5; */ static __m128d c31, c32; int i, i0, i1, i2, i3, i4, i5, j, j0; /* double x0, y0, x1, y1, x2, y2, wi1, wi2, wr1, wr2; */ __m128d t0, t1, t2, t3, w1, w2; c31 = _mm_set1_pd(0.86602540378443865); c32 = _mm_set1_pd(0.5); for (i = 0; i < *m; i++) { i0 = i << 1; i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i << 1; i4 = i3 + (*m << 1); i5 = i4 + (*m << 1); /* x0 = a[i1] + a[i2]; y0 = a[i1 + 1] + a[i2 + 1]; x1 = a[i0] - c32 * x0; y1 = a[i0 + 1] - c32 * y0; x2 = c31 * (a[i1 + 1] - a[i2 + 1]); y2 = c31 * (a[i2] - a[i1]); */ t1 = _mm_load_pd(&a[i1]); t2 = _mm_load_pd(&a[i2]); t0 = _mm_add_pd(t1, t2); t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0)); t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1)); t3 = _mm_load_pd(&a[i0]); t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0)); /* b[i3] = a[i0] + x0; b[i3 + 1] = a[i0 + 1] + y0; b[i4] = x1 + x2; b[i4 + 1] = y1 + y2; b[i5] = x1 - x2; b[i5 + 1] = y1 - y2; */ _mm_store_pd(&b[i3], _mm_add_pd(t3, t0)); _mm_store_pd(&b[i4], _mm_add_pd(t1, t2)); _mm_store_pd(&b[i5], _mm_sub_pd(t1, t2)); } for (j = 1; j < *l; j++) { j0 = j << 1; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); for (i = 0; i < *m; i++) { i0 = (i << 1) + (j * *m << 1); i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = (i << 1) + (j * *m * 6); i4 = i3 + (*m << 1); i5 = i4 + (*m << 1); /* x0 = a[i1] + a[i2]; y0 = a[i1 + 1] + a[i2 + 1]; x1 = a[i0] - x0 * .5; y1 = a[i0 + 1] - y0 * .5; x2 = c31 * (a[i1 + 1] - a[i2 + 1]); y2 = c31 * (a[i2] - a[i1]); */ t1 = _mm_load_pd(&a[i1]); t2 = _mm_load_pd(&a[i2]); t0 = _mm_add_pd(t1, t2); t2 = _mm_xor_pd(_mm_sub_pd(t1, t2), _mm_set_sd(-0.0)); t2 = _mm_mul_pd(c31, _mm_shuffle_pd(t2, t2, 1)); t3 = _mm_load_pd(&a[i0]); t1 = _mm_sub_pd(t3, _mm_mul_pd(c32, t0)); /* b[i3] = a[i0] + x0; b[i3 + 1] = a[i0 + 1] + y0; 
b[i4] = wr1 * (x1 + x2) - wi1 * (y1 + y2); b[i4 + 1] = wr1 * (y1 + y2) + wi1 * (x1 + x2); b[i5] = wr2 * (x1 - x2) - wi2 * (y1 - y2); b[i5 + 1] = wr2 * (y1 - y2) + wi2 * (x1 - x2); */ _mm_store_pd(&b[i3], _mm_add_pd(t3, t0)); _mm_store_pd(&b[i4], ZMUL(w1, _mm_add_pd(t1, t2))); _mm_store_pd(&b[i5], ZMUL(w2, _mm_sub_pd(t1, t2))); } } return 0; }
/*
 * 3-D gradient noise evaluated with SSE2/SSE4.1 + AMD FMA4 intrinsics
 * (_mm_nmacc_pd / _mm_macc_sd are FMA4; _mm_blendv_pd and _mm_extract_epi32
 * are SSE4.1). For kNoiseGen_Perlin it delegates to SolidNoise and rescales;
 * otherwise it computes the lattice cell containing EPoint, smooths the
 * fractional offsets with the s-curve t*t*(3 - 2t), accumulates the eight
 * corner contributions via the project macros Hash2d / Hash1dRTableIndex /
 * INCRSUMP2 over RTable, then remaps and clamps the sum to [0, 1].
 * NOTE(review): the _mm_blendv_pd mask is the raw coordinate vector, i.e.
 * the sign bit selects the epsilon-adjusted value for negative coordinates
 * (the "JB fix" range correction) — confirm this matches the scalar Noise().
 * Returns the clamped noise value in [0, 1].
 */
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator) { DBL x, y, z; DBL *mp; int ix, iy, iz; int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash; DBL sum; // TODO FIXME - global statistics reference // Stats[Calls_To_Noise]++; if (noise_generator==kNoiseGen_Perlin) { // The 1.59 and 0.985 are to correct for some biasing problems with // the random # generator used to create the noise tables. Final // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean // value is 0.49 (ideally it would be 0.5). sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985); // Clamp final value to 0-1 range if (sum < 0.0) sum = 0.0; if (sum > 1.0) sum = 1.0; return sum; } x = EPoint[X]; y = EPoint[Y]; z = EPoint[Z]; /* its equivalent integer lattice point. */ /* ix = (int)x; iy = (int)y; iz = (long)z; */ /* JB fix for the range problem */ __m128d xy = _mm_setr_pd(x, y); __m128d zn = _mm_set_sd(z); __m128d epsy = _mm_set1_pd(1.0 - EPSILON); __m128d xy_e = _mm_sub_pd(xy, epsy); __m128d zn_e = _mm_sub_sd(zn, epsy); __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy)); __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn)); __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0); __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ); __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy)); __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn)); const __m128i fff = _mm_set1_epi32(0xfff); __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff); __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff); ix = _mm_extract_epi32(i_xy, 0); iy = _mm_extract_epi32(i_xy, 1); iz = _mm_extract_epi32(i_zn, 0); ixiy_hash = Hash2d(ix, iy); jxiy_hash = Hash2d(ix + 1, iy); ixjy_hash = Hash2d(ix, iy + 1); jxjy_hash = Hash2d(ix + 1, iy + 1); mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)]; DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)]; DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)]; DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz 
+ 1)]; DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)]; DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)]; DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)]; DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)]; const __m128d three = _mm_set1_pd(3.0); const __m128d two = _mm_set1_pd(2.0); const __m128d one = _mm_set1_pd(1.0); __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy); __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy); __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn); __m128d jx_mm = _mm_sub_pd(ix_mm, one); __m128d jy_mm = _mm_sub_pd(iy_mm, one); __m128d jz_mm = _mm_sub_pd(iz_mm, one); __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three)); __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three)); __m128d mm_tz = _mm_sub_pd(one, mm_sz); __m128d mm_txy = _mm_sub_pd(one, mm_sxy); __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy); __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy); __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy); __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm); __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm; __m128d int_sum1 = _mm_setzero_pd(); s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz); INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz); INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz); INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1); s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz); INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1); int_sum1 = _mm_hadd_pd(int_sum1, int_sum1); if(noise_generator==kNoiseGen_RangeCorrected) { /* details of range here: Min, max: -1.05242, 0.988997 Mean: -0.0191481, Median: -0.535493, Std Dev: 0.256828 We want to change it to as close to [0,1] as possible. 
*/ const __m128d r2 = _mm_set_sd(0.48985582); const __m128d r1r2 = _mm_set_sd(1.05242*0.48985582); int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2); } else { int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5)); } int_sum1 = _mm_min_sd(one, int_sum1); int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1); _mm_store_sd(&sum, int_sum1); return (sum); }
/*
 * Radix-8 FFT butterfly, general stage: *m interleaved transforms across a
 * length-*l stage, a[] -> b[], twiddles w1..w7 derived from w[] via ZMUL
 * (complex multiply). First loop is the j = 0 / unit-twiddle case; second
 * loop applies the twiddles. c81 = 1/sqrt(2). Scalar equivalents are kept
 * in the comments. NOTE(review): static __m128d constant re-assigned each
 * call (not thread-safe); arrays must be 16-byte aligned. Always returns 0.
 */
int fft8b_(double *a, double *b, double *w, int *m, int *l) { /* static double c81 = .70710678118654752; */ static __m128d c81; int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, i10, i11, i12, i13, i14, i15, j, j0; /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5, x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6, wi7, wr1, wr2, wr3, wr4, wr5, wr6, wr7; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7; c81 = _mm_set1_pd(0.70710678118654752); for (i = 0; i < *m; i++) { i0 = i << 1; i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = i4 + (*m * *l << 1); i6 = i5 + (*m * *l << 1); i7 = i6 + (*m * *l << 1); i8 = i << 1; i9 = i8 + (*m << 1); i10 = i9 + (*m << 1); i11 = i10 + (*m << 1); i12 = i11 + (*m << 1); i13 = i12 + (*m << 1); i14 = i13 + (*m << 1); i15 = i14 + (*m << 1); /* x0 = a[i0] + a[i4]; y0 = a[i0 + 1] + a[i4 + 1]; x1 = a[i0] - a[i4]; y1 = a[i0 + 1] - a[i4 + 1]; x2 = a[i2] + a[i6]; y2 = a[i2 + 1] + a[i6 + 1]; x3 = a[i2 + 1] - a[i6 + 1]; y3 = a[i6] - a[i2]; */ t0 = _mm_load_pd(&a[i0]); t2 = _mm_load_pd(&a[i4]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i6]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */ u0 = _mm_add_pd(t0, t2); u1 = _mm_sub_pd(t0, t2); /* x4 = a[i1] + a[i5]; y4 = a[i1 + 1] + a[i5 + 1]; x5 = a[i1] - a[i5]; y5 = a[i1 + 1] - a[i5 + 1]; x6 = a[i3] + a[i7]; y6 = a[i3 + 1] + a[i7 + 1]; x7 = a[i3] - a[i7]; y7 = a[i3 + 1] - a[i7 + 1]; */ t4 = _mm_load_pd(&a[i1]); t6 = _mm_load_pd(&a[i5]); t5 = _mm_sub_pd(t4, t6); t4 = _mm_add_pd(t4, t6); t7 = _mm_load_pd(&a[i3]); t8 = _mm_load_pd(&a[i7]); t6 = _mm_add_pd(t7, t8); t7 = _mm_sub_pd(t7, t8); /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */ u2 = _mm_add_pd(t4, t6); u3 = 
_mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); /* b[i8] = u0 + u2; b[i8 + 1] = v0 + v2; b[i12] = u0 - u2; b[i12 + 1] = v0 - v2; b[i10] = u1 + u3; b[i10 + 1] = v1 + v3; b[i14] = u1 - u3; b[i14 + 1] = v1 - v3; */ _mm_store_pd(&b[i8], _mm_add_pd(u0, u2)); _mm_store_pd(&b[i12], _mm_sub_pd(u0, u2)); _mm_store_pd(&b[i10], _mm_add_pd(u1, u3)); _mm_store_pd(&b[i14], _mm_sub_pd(u1, u3)); /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7); u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7); u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7); u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */ u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7)); u0 = _mm_add_pd(t1, u1); u1 = _mm_sub_pd(t1, u1); u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); u2 = _mm_add_pd(t3, u3); u3 = _mm_sub_pd(t3, u3); /* b[i9] = u0 + u2; b[i9 + 1] = v0 + v2; b[i13] = u1 + u3; b[i13 + 1] = v1 + v3; b[i11] = u1 - u3; b[i11 + 1] = v1 - v3; b[i15] = u0 - u2; b[i15 + 1] = v0 - v2; */ _mm_store_pd(&b[i9], _mm_add_pd(u0, u2)); _mm_store_pd(&b[i13], _mm_add_pd(u1, u3)); _mm_store_pd(&b[i11], _mm_sub_pd(u1, u3)); _mm_store_pd(&b[i15], _mm_sub_pd(u0, u2)); } for (j = 1; j < *l; j++) { j0 = j << 1; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; wr5 = wr2 * wr3 - wi2 * wi3; wi5 = wr2 * wi3 + wi2 * wr3; wr6 = wr3 * wr3 - wi3 * wi3; wi6 = wr3 * wi3 + wr3 * wi3; wr7 = wr3 * wr4 - wi3 * wi4; wi7 = wr3 * wi4 + wi3 * wr4; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); w5 = ZMUL(w2, w3); w6 = ZMUL(w3, w3); w7 = ZMUL(w3, w4); for (i = 0; i < *m; i++) { i0 = (i << 1) + (j * *m << 1); i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = i4 + (*m * *l << 1); i6 = i5 + (*m * *l << 1); 
i7 = i6 + (*m * *l << 1); i8 = (i << 1) + (j * *m << 4); i9 = i8 + (*m << 1); i10 = i9 + (*m << 1); i11 = i10 + (*m << 1); i12 = i11 + (*m << 1); i13 = i12 + (*m << 1); i14 = i13 + (*m << 1); i15 = i14 + (*m << 1); /* x0 = a[i0] + a[i4]; y0 = a[i0 + 1] + a[i4 + 1]; x1 = a[i0] - a[i4]; y1 = a[i0 + 1] - a[i4 + 1]; x2 = a[i2] + a[i6]; y2 = a[i2 + 1] + a[i6 + 1]; x3 = a[i2 + 1] - a[i6 + 1]; y3 = a[i6] - a[i2]; */ t0 = _mm_load_pd(&a[i0]); t2 = _mm_load_pd(&a[i4]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i6]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */ u0 = _mm_add_pd(t0, t2); u1 = _mm_sub_pd(t0, t2); /* x4 = a[i1] + a[i5]; y4 = a[i1 + 1] + a[i5 + 1]; x5 = a[i1] - a[i5]; y5 = a[i1 + 1] - a[i5 + 1]; x6 = a[i3] + a[i7]; y6 = a[i3 + 1] + a[i7 + 1]; x7 = a[i3] - a[i7]; y7 = a[i3 + 1] - a[i7 + 1]; */ t4 = _mm_load_pd(&a[i1]); t6 = _mm_load_pd(&a[i5]); t5 = _mm_sub_pd(t4, t6); t4 = _mm_add_pd(t4, t6); t7 = _mm_load_pd(&a[i3]); t8 = _mm_load_pd(&a[i7]); t6 = _mm_add_pd(t7, t8); t7 = _mm_sub_pd(t7, t8); /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */ u2 = _mm_add_pd(t4, t6); u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); /* b[i8] = u0 + u2; b[i8 + 1] = v0 + v2; b[i12] = wr4 * (u0 - u2) - wi4 * (v0 - v2); b[i12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2); b[i10] = wr2 * (u1 + u3) - wi2 * (v1 + v3); b[i10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3); b[i14] = wr6 * (u1 - u3) - wi6 * (v1 - v3); b[i14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */ _mm_store_pd(&b[i8], _mm_add_pd(u0, u2)); _mm_store_pd(&b[i12], ZMUL(w4, _mm_sub_pd(u0, u2))); _mm_store_pd(&b[i10], ZMUL(w2, _mm_add_pd(u1, u3))); _mm_store_pd(&b[i14], ZMUL(w6, _mm_sub_pd(u1, u3))); /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7); u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7); 
u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7); u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */ u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7)); u0 = _mm_add_pd(t1, u1); u1 = _mm_sub_pd(t1, u1); u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); u2 = _mm_add_pd(t3, u3); u3 = _mm_sub_pd(t3, u3); /* b[i9] = wr1 * (u0 + u2) - wi1 * (v0 + v2); b[i9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2); b[i13] = wr5 * (u1 + u3) - wi5 * (v1 + v3); b[i13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3); b[i11] = wr3 * (u1 - u3) - wi3 * (v1 - v3); b[i11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3); b[i15] = wr7 * (u0 - u2) - wi7 * (v0 - v2); b[i15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */ _mm_store_pd(&b[i9], ZMUL(w1, _mm_add_pd(u0, u2))); _mm_store_pd(&b[i13], ZMUL(w5, _mm_add_pd(u1, u3))); _mm_store_pd(&b[i11], ZMUL(w3, _mm_sub_pd(u1, u3))); _mm_store_pd(&b[i15], ZMUL(w7, _mm_sub_pd(u0, u2))); } } return 0; }
/*
 * Radix-8 FFT butterfly, first-stage kernel: *l complex points a[] -> b[],
 * twiddles w1..w7 built from w[] by repeated complex multiplication (ZMUL).
 * c81 = 1/sqrt(2). Interleaved (re, im) layout; scalar equivalents kept in
 * the comments. NOTE(review): static __m128d constant re-assigned each call
 * (not thread-safe); arrays must be 16-byte aligned. Always returns 0.
 */
int fft8a_(double *a, double *b, double *w, int *l) { /* static double c81 = .70710678118654752; */ static __m128d c81; int j, j0, j1, j2, j3, j4, j5, j6, j7, j8, j9, j10, j11, j12, j13, j14, j15; /* double u0, v0, u1, x0, y0, x1, y1, x2, y2, x3, y3, v1, x4, y4, x5, y5, x6, y6, x7, y7, u2, v2, u3, v3, wi1, wi2, wi3, wi4, wi5, wi6, wi7, wr1, wr2, wr3, wr4, wr5, wr6, wr7; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, u0, u1, u2, u3, w1, w2, w3, w4, w5, w6, w7; c81 = _mm_set1_pd(0.70710678118654752); for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j2 + (*l << 1); j4 = j3 + (*l << 1); j5 = j4 + (*l << 1); j6 = j5 + (*l << 1); j7 = j6 + (*l << 1); j8 = j << 4; j9 = j8 + 2; j10 = j9 + 2; j11 = j10 + 2; j12 = j11 + 2; j13 = j12 + 2; j14 = j13 + 2; j15 = j14 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; wr5 = wr2 * wr3 - wi2 * wi3; wi5 = wr2 * wi3 + wi2 * wr3; wr6 = wr3 * wr3 - wi3 * wi3; wi6 = wr3 * wi3 + wr3 * wi3; wr7 = wr3 * wr4 - wi3 * wi4; wi7 = wr3 * wi4 + wi3 * wr4; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); w5 = ZMUL(w2, w3); w6 = ZMUL(w3, w3); w7 = ZMUL(w3, w4); /* x0 = a[j0] + a[j4]; y0 = a[j0 + 1] + a[j4 + 1]; x1 = a[j0] - a[j4]; y1 = a[j0 + 1] - a[j4 + 1]; x2 = a[j2] + a[j6]; y2 = a[j2 + 1] + a[j6 + 1]; x3 = a[j2 + 1] - a[j6 + 1]; y3 = a[j6] - a[j2]; */ t0 = _mm_load_pd(&a[j0]); t2 = _mm_load_pd(&a[j4]); t1 = _mm_sub_pd(t0, t2); t0 = _mm_add_pd(t0, t2); t3 = _mm_load_pd(&a[j2]); t4 = _mm_load_pd(&a[j6]); t2 = _mm_add_pd(t3, t4); t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0)); t3 = _mm_shuffle_pd(t3, t3, 1); /* u0 = x0 + x2; v0 = y0 + y2; u1 = x0 - x2; v1 = y0 - y2; */ u0 = _mm_add_pd(t0, t2); u1 = _mm_sub_pd(t0, t2); /* x4 = a[j1] + a[j5]; y4 = a[j1 + 1] + a[j5 + 1]; x5 = a[j1] - a[j5]; y5 = a[j1 + 1] - a[j5 
+ 1]; x6 = a[j3] + a[j7]; y6 = a[j3 + 1] + a[j7 + 1]; x7 = a[j3] - a[j7]; y7 = a[j3 + 1] - a[j7 + 1]; */ t4 = _mm_load_pd(&a[j1]); t6 = _mm_load_pd(&a[j5]); t5 = _mm_sub_pd(t4, t6); t4 = _mm_add_pd(t4, t6); t7 = _mm_load_pd(&a[j3]); t8 = _mm_load_pd(&a[j7]); t6 = _mm_add_pd(t7, t8); t7 = _mm_sub_pd(t7, t8); /* u2 = x4 + x6; v2 = y4 + y6; u3 = y4 - y6; v3 = x6 - x4; */ u2 = _mm_add_pd(t4, t6); u3 = _mm_xor_pd(_mm_sub_pd(t4, t6), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); /* b[j8] = u0 + u2; b[j8 + 1] = v0 + v2; b[j12] = wr4 * (u0 - u2) - wi4 * (v0 - v2); b[j12 + 1] = wr4 * (v0 - v2) + wi4 * (u0 - u2); b[j10] = wr2 * (u1 + u3) - wi2 * (v1 + v3); b[j10 + 1] = wr2 * (v1 + v3) + wi2 * (u1 + u3); b[j14] = wr6 * (u1 - u3) - wi6 * (v1 - v3); b[j14 + 1] = wr6 * (v1 - v3) + wi6 * (u1 - u3); */ _mm_store_pd(&b[j8], _mm_add_pd(u0, u2)); _mm_store_pd(&b[j12], ZMUL(w4, _mm_sub_pd(u0, u2))); _mm_store_pd(&b[j10], ZMUL(w2, _mm_add_pd(u1, u3))); _mm_store_pd(&b[j14], ZMUL(w6, _mm_sub_pd(u1, u3))); /* u0 = x1 + c81 * (x5 - x7); v0 = y1 + c81 * (y5 - y7); u1 = x1 - c81 * (x5 - x7); v1 = y1 - c81 * (y5 - y7); u2 = x3 + c81 * (y5 + y7); v2 = y3 - c81 * (x5 + x7); u3 = x3 - c81 * (y5 + y7); v3 = y3 + c81 * (x5 + x7); */ u1 = _mm_mul_pd(c81, _mm_sub_pd(t5, t7)); u0 = _mm_add_pd(t1, u1); u1 = _mm_sub_pd(t1, u1); u3 = _mm_xor_pd(_mm_mul_pd(c81, _mm_add_pd(t5, t7)), _mm_set_sd(-0.0)); u3 = _mm_shuffle_pd(u3, u3, 1); u2 = _mm_add_pd(t3, u3); u3 = _mm_sub_pd(t3, u3); /* b[j9] = wr1 * (u0 + u2) - wi1 * (v0 + v2); b[j9 + 1] = wr1 * (v0 + v2) + wi1 * (u0 + u2); b[j13] = wr5 * (u1 + u3) - wi5 * (v1 + v3); b[j13 + 1] = wr5 * (v1 + v3) + wi5 * (u1 + u3); b[j11] = wr3 * (u1 - u3) - wi3 * (v1 - v3); b[j11 + 1] = wr3 * (v1 - v3) + wi3 * (u1 - u3); b[j15] = wr7 * (u0 - u2) - wi7 * (v0 - v2); b[j15 + 1] = wr7 * (v0 - v2) + wi7 * (u0 - u2); */ _mm_store_pd(&b[j9], ZMUL(w1, _mm_add_pd(u0, u2))); _mm_store_pd(&b[j13], ZMUL(w5, _mm_add_pd(u1, u3))); _mm_store_pd(&b[j11], ZMUL(w3, _mm_sub_pd(u1, 
u3))); _mm_store_pd(&b[j15], ZMUL(w7, _mm_sub_pd(u0, u2))); } return 0; }
/*
 * Radix-5 FFT butterfly, general stage: *m interleaved transforms across a
 * length-*l stage, a[] -> b[], twiddles w1..w4 from w[] via ZMUL. First
 * loop is the unit-twiddle (j = 0) case. Rotation constants:
 * c51 = sin(2*pi/5), c52 = sin(pi/5)/sin(2*pi/5), c53 = sqrt(5)/4, c54 = 1/4.
 * Scalar equivalents kept in the comments. NOTE(review): static __m128d
 * constants re-assigned each call (not thread-safe); arrays must be 16-byte
 * aligned — confirm callers. Always returns 0.
 */
int fft5b_(double *a, double *b, double *w, int *m, int *l) { /* static double c51 = .95105651629515357; static double c52 = .61803398874989485; static double c53 = .55901699437494742; static double c54 = .25; */ static __m128d c51, c52, c53, c54; int i, i0, i1, i2, i3, i4, i5, i6, i7, i8, i9, j, j0; /* double x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, x6, y6, x7, y7, x8, y8, x9, y9, x10, y10, wi1, wi2, wi3, wi4, wr1, wr2, wr3, wr4; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, w1, w2, w3, w4; c51 = _mm_set1_pd(0.95105651629515357); c52 = _mm_set1_pd(0.61803398874989485); c53 = _mm_set1_pd(0.55901699437494742); c54 = _mm_set1_pd(0.25); for (i = 0; i < *m; i++) { i0 = i << 1; i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = i << 1; i6 = i5 + (*m << 1); i7 = i6 + (*m << 1); i8 = i7 + (*m << 1); i9 = i8 + (*m << 1); /* x0 = a[i1] + a[i4]; y0 = a[i1 + 1] + a[i4 + 1]; x1 = a[i2] + a[i3]; y1 = a[i2 + 1] + a[i3 + 1]; x2 = c51 * (a[i1] - a[i4]); y2 = c51 * (a[i1 + 1] - a[i4 + 1]); x3 = c51 * (a[i2] - a[i3]); y3 = c51 * (a[i2 + 1] - a[i3 + 1]); x4 = x0 + x1; y4 = y0 + y1; x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1); x6 = a[i0] - c54 * x4; y6 = a[i0 + 1] - c54 * y4; x7 = x6 + x5; y7 = y6 + y5; x8 = x6 - x5; y8 = y6 - y5; x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3; x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */ t1 = _mm_load_pd(&a[i1]); t4 = _mm_load_pd(&a[i4]); t0 = _mm_add_pd(t1, t4); t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i3]); t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_add_pd(t1, t4); t4 = _mm_add_pd(t0, t1); t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1)); t0 = _mm_load_pd(&a[i0]); t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4)); t7 = _mm_add_pd(t6, t5); t8 = _mm_sub_pd(t6, t5); t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0)); t9 = _mm_shuffle_pd(t9, t9, 1); t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2)); t10 = _mm_xor_pd(_mm_shuffle_pd(t10, 
t10, 1), _mm_set_sd(-0.0)); /* b[i5] = a[i0] + x4; b[i5 + 1] = a[i0 + 1] + y4; b[i6] = x7 + x9; b[i6 + 1] = y7 + y9; b[i7] = x8 + x10; b[i7 + 1] = y8 + y10; b[i8] = x8 - x10; b[i8 + 1] = y8 - y10; b[i9] = x7 - x9; b[i9 + 1] = y7 - y9; */ _mm_store_pd(&b[i5], _mm_add_pd(t0, t4)); _mm_store_pd(&b[i6], _mm_add_pd(t7, t9)); _mm_store_pd(&b[i7], _mm_add_pd(t8, t10)); _mm_store_pd(&b[i8], _mm_sub_pd(t8, t10)); _mm_store_pd(&b[i9], _mm_sub_pd(t7, t9)); } for (j = 1; j < *l; j++) { j0 = j << 1; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); for (i = 0; i < *m; i++) { i0 = (i << 1) + (j * *m << 1); i1 = i0 + (*m * *l << 1); i2 = i1 + (*m * *l << 1); i3 = i2 + (*m * *l << 1); i4 = i3 + (*m * *l << 1); i5 = (i << 1) + (j * *m * 10); i6 = i5 + (*m << 1); i7 = i6 + (*m << 1); i8 = i7 + (*m << 1); i9 = i8 + (*m << 1); /* x0 = a[i1] + a[i4]; y0 = a[i1 + 1] + a[i4 + 1]; x1 = a[i2] + a[i3]; y1 = a[i2 + 1] + a[i3 + 1]; x2 = c51 * (a[i1] - a[i4]); y2 = c51 * (a[i1 + 1] - a[i4 + 1]); x3 = c51 * (a[i2] - a[i3]); y3 = c51 * (a[i2 + 1] - a[i3 + 1]); x4 = x0 + x1; y4 = y0 + y1; x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1); x6 = a[i0] - c54 * x4; y6 = a[i0 + 1] - c54 * y4; x7 = x6 + x5; y7 = y6 + y5; x8 = x6 - x5; y8 = y6 - y5; x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3; x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */ t1 = _mm_load_pd(&a[i1]); t4 = _mm_load_pd(&a[i4]); t0 = _mm_add_pd(t1, t4); t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_load_pd(&a[i2]); t4 = _mm_load_pd(&a[i3]); t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_add_pd(t1, t4); t4 = _mm_add_pd(t0, t1); t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1)); t0 = _mm_load_pd(&a[i0]); t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4)); t7 = _mm_add_pd(t6, t5); t8 = _mm_sub_pd(t6, t5); t9 = 
_mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0)); t9 = _mm_shuffle_pd(t9, t9, 1); t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2)); t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0)); /* b[i5] = a[i0] + x4; b[i5 + 1] = a[i0 + 1] + y4; b[i6] = wr1 * (x7 + x9) - wi1 * (y7 + y9); b[i6 + 1] = wr1 * (y7 + y9) + wi1 * (x7 + x9); b[i7] = wr2 * (x8 + x10) - wi2 * (y8 + y10); b[i7 + 1] = wr2 * (y8 + y10) + wi2 * (x8 + x10); b[i8] = wr3 * (x8 - x10) - wi3 * (y8 - y10); b[i8 + 1] = wr3 * (y8 - y10) + wi3 * (x8 - x10); b[i9] = wr4 * (x7 - x9) - wi4 * (y7 - y9); b[i9 + 1] = wr4 * (y7 - y9) + wi4 * (x7 - x9); */ _mm_store_pd(&b[i5], _mm_add_pd(t0, t4)); _mm_store_pd(&b[i6], ZMUL(w1, _mm_add_pd(t7, t9))); _mm_store_pd(&b[i7], ZMUL(w2, _mm_add_pd(t8, t10))); _mm_store_pd(&b[i8], ZMUL(w3, _mm_sub_pd(t8, t10))); _mm_store_pd(&b[i9], ZMUL(w4, _mm_sub_pd(t7, t9))); } } return 0; }
/*
 * Radix-5 FFT butterfly, first-stage kernel: *l complex points a[] -> b[],
 * twiddles w1..w4 built from w[] via ZMUL (complex multiply). Rotation
 * constants as in fft5b_: c51 = sin(2*pi/5), c52 = sin(pi/5)/sin(2*pi/5),
 * c53 = sqrt(5)/4, c54 = 1/4. Scalar equivalents kept in the comments.
 * NOTE(review): static __m128d constants re-assigned each call (not
 * thread-safe); arrays must be 16-byte aligned. Always returns 0.
 */
int fft5a_(double *a, double *b, double *w, int *l) { /* static double c51 = .95105651629515357; static double c52 = .61803398874989485; static double c53 = .55901699437494742; static double c54 = .25; */ static __m128d c51, c52, c53, c54; int j, j0, j1, j2, j3, j4, j5, j6, j7, j8, j9; /* double x0, y0, x1, y1, x2, y2, x3, y3, x4, y4, x5, y5, x6, y6, x7, y7, x8, y8, x9, y9, x10, y10, wi1, wi2, wi3, wi4, wr1, wr2, wr3, wr4; */ __m128d t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10, w1, w2, w3, w4; c51 = _mm_set1_pd(0.95105651629515357); c52 = _mm_set1_pd(0.61803398874989485); c53 = _mm_set1_pd(0.55901699437494742); c54 = _mm_set1_pd(0.25); for (j = 0; j < *l; j++) { j0 = j << 1; j1 = j0 + (*l << 1); j2 = j1 + (*l << 1); j3 = j2 + (*l << 1); j4 = j3 + (*l << 1); j5 = j * 10; j6 = j5 + 2; j7 = j6 + 2; j8 = j7 + 2; j9 = j8 + 2; /* wr1 = w[j0]; wi1 = w[j0 + 1]; wr2 = wr1 * wr1 - wi1 * wi1; wi2 = wr1 * wi1 + wr1 * wi1; wr3 = wr1 * wr2 - wi1 * wi2; wi3 = wr1 * wi2 + wi1 * wr2; wr4 = wr2 * wr2 - wi2 * wi2; wi4 = wr2 * wi2 + wr2 * wi2; */ w1 = _mm_load_pd(&w[j0]); w2 = ZMUL(w1, w1); w3 = ZMUL(w1, w2); w4 = ZMUL(w2, w2); /* x0 = a[j1] + a[j4]; y0 = a[j1 + 1] + a[j4 + 1]; x1 = a[j2] + a[j3]; y1 = a[j2 + 1] + a[j3 + 1]; x2 = c51 * (a[j1] - a[j4]); y2 = c51 * (a[j1 + 1] - a[j4 + 1]); x3 = c51 * (a[j2] - a[j3]); y3 = c51 * (a[j2 + 1] - a[j3 + 1]); x4 = x0 + x1; y4 = y0 + y1; x5 = c53 * (x0 - x1); y5 = c53 * (y0 - y1); x6 = a[j0] - c54 * x4; y6 = a[j0 + 1] - c54 * y4; x7 = x6 + x5; y7 = y6 + y5; x8 = x6 - x5; y8 = y6 - y5; x9 = y2 + c52 * y3; y9 = -x2 - c52 * x3; x10 = c52 * y2 - y3; y10 = x3 - c52 * x2; */ t1 = _mm_load_pd(&a[j1]); t4 = _mm_load_pd(&a[j4]); t0 = _mm_add_pd(t1, t4); t2 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_load_pd(&a[j2]); t4 = _mm_load_pd(&a[j3]); t3 = _mm_mul_pd(c51, _mm_sub_pd(t1, t4)); t1 = _mm_add_pd(t1, t4); t4 = _mm_add_pd(t0, t1); t5 = _mm_mul_pd(c53, _mm_sub_pd(t0, t1)); t0 = _mm_load_pd(&a[j0]); t6 = _mm_sub_pd(t0, _mm_mul_pd(c54, t4)); t7 = 
_mm_add_pd(t6, t5); t8 = _mm_sub_pd(t6, t5); t9 = _mm_xor_pd(_mm_add_pd(t2, _mm_mul_pd(c52, t3)), _mm_set_sd(-0.0)); t9 = _mm_shuffle_pd(t9, t9, 1); t10 = _mm_sub_pd(t3, _mm_mul_pd(c52, t2)); t10 = _mm_xor_pd(_mm_shuffle_pd(t10, t10, 1), _mm_set_sd(-0.0)); /* b[j5] = a[j0] + x4; b[j5 + 1] = a[j0 + 1] + y4; b[j6] = wr1 * (x7 + x9) - wi1 * (y7 + y9); b[j6 + 1] = wr1 * (y7 + y9) + wi1 * (x7 + x9); b[j7] = wr2 * (x8 + x10) - wi2 * (y8 + y10); b[j7 + 1] = wr2 * (y8 + y10) + wi2 * (x8 + x10); b[j8] = wr3 * (x8 - x10) - wi3 * (y8 - y10); b[j8 + 1] = wr3 * (y8 - y10) + wi3 * (x8 - x10); b[j9] = wr4 * (x7 - x9) - wi4 * (y7 - y9); b[j9 + 1] = wr4 * (y7 - y9) + wi4 * (x7 - x9); */ _mm_store_pd(&b[j5], _mm_add_pd(t0, t4)); _mm_store_pd(&b[j6], ZMUL(w1, _mm_add_pd(t7, t9))); _mm_store_pd(&b[j7], ZMUL(w2, _mm_add_pd(t8, t10))); _mm_store_pd(&b[j8], ZMUL(w3, _mm_sub_pd(t8, t10))); _mm_store_pd(&b[j9], ZMUL(w4, _mm_sub_pd(t7, t9))); } return 0; }
/* One radix-4 pass of a complex FFT (SSE2), Fortran-style interface:
 * a = input of interleaved complex (re,im) doubles, b = output,
 * w = twiddle table of interleaved (re,im) pairs, m = inner stride count,
 * l = number of twiddle groups.  Returns 0 always (f2c convention).
 * ZMUL(x, y) is the file-local packed complex-multiply macro.
 * NOTE: t0..t4 are deliberately reused mid-sequence; the statement order
 * below is load-bearing. */
int fft4b_(double *a, double *b, double *w, int *m, int *l)
{
    int i, i0, i1, i2, i3, i4, i5, i6, i7, j, j0;
    /* double x0, y0, x1, y1, x2, y2, x3, y3, wi1, wi2, wi3, wr1, wr2, wr3; */
    __m128d t0, t1, t2, t3, t4, w1, w2, w3;

    /* First twiddle group (j == 0): twiddles are all 1, so no ZMUL needed. */
    for (i = 0; i < *m; i++) {
        /* Inputs are (*m * *l) complex elements apart; outputs *m apart. */
        i0 = i << 1;
        i1 = i0 + (*m * *l << 1);
        i2 = i1 + (*m * *l << 1);
        i3 = i2 + (*m * *l << 1);
        i4 = i << 1;
        i5 = i4 + (*m << 1);
        i6 = i5 + (*m << 1);
        i7 = i6 + (*m << 1);
        /* x0 = a[i0] + a[i2]; y0 = a[i0+1] + a[i2+1];
           x1 = a[i0] - a[i2]; y1 = a[i0+1] - a[i2+1];
           x2 = a[i1] + a[i3]; y2 = a[i1+1] + a[i3+1];
           x3 = a[i1+1] - a[i3+1]; y3 = a[i3] - a[i1];   (x3,y3) = -i*(a1-a3) */
        t0 = _mm_load_pd(&a[i0]);
        t2 = _mm_load_pd(&a[i2]);
        t1 = _mm_sub_pd(t0, t2);
        t0 = _mm_add_pd(t0, t2);
        t3 = _mm_load_pd(&a[i1]);
        t4 = _mm_load_pd(&a[i3]);
        t2 = _mm_add_pd(t3, t4);
        /* Multiply by -i: negate the (low) real lane, then swap re/im. */
        t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
        t3 = _mm_shuffle_pd(t3, t3, 1);
        /* b[i4] = (x0+x2, y0+y2); b[i6] = (x0-x2, y0-y2);
           b[i5] = (x1+x3, y1+y3); b[i7] = (x1-x3, y1-y3) */
        _mm_store_pd(&b[i4], _mm_add_pd(t0, t2));
        _mm_store_pd(&b[i6], _mm_sub_pd(t0, t2));
        _mm_store_pd(&b[i5], _mm_add_pd(t1, t3));
        _mm_store_pd(&b[i7], _mm_sub_pd(t1, t3));
    }

    /* Remaining twiddle groups: same butterfly followed by complex
     * multiplication with w1 = w[j], w2 = w1^2, w3 = w1^3. */
    for (j = 1; j < *l; j++) {
        j0 = j << 1;
        /* wr1 = w[j0]; wi1 = w[j0+1]; w2 = w1*w1; w3 = w1*w2; */
        w1 = _mm_load_pd(&w[j0]);
        w2 = ZMUL(w1, w1);
        w3 = ZMUL(w1, w2);
        for (i = 0; i < *m; i++) {
            /* Output index stride differs (<< 3): radix-4 digit-reversal layout. */
            i0 = (i << 1) + (j * *m << 1);
            i1 = i0 + (*m * *l << 1);
            i2 = i1 + (*m * *l << 1);
            i3 = i2 + (*m * *l << 1);
            i4 = (i << 1) + (j * *m << 3);
            i5 = i4 + (*m << 1);
            i6 = i5 + (*m << 1);
            i7 = i6 + (*m << 1);
            /* Same butterfly as the j == 0 loop above. */
            t0 = _mm_load_pd(&a[i0]);
            t2 = _mm_load_pd(&a[i2]);
            t1 = _mm_sub_pd(t0, t2);
            t0 = _mm_add_pd(t0, t2);
            t3 = _mm_load_pd(&a[i1]);
            t4 = _mm_load_pd(&a[i3]);
            t2 = _mm_add_pd(t3, t4);
            t3 = _mm_xor_pd(_mm_sub_pd(t3, t4), _mm_set_sd(-0.0));
            t3 = _mm_shuffle_pd(t3, t3, 1);
            /* b[i4] = x0+x2; b[i6] = w2*(x0-x2); b[i5] = w1*(x1+x3); b[i7] = w3*(x1-x3) */
            _mm_store_pd(&b[i4], _mm_add_pd(t0, t2));
            _mm_store_pd(&b[i6], ZMUL(w2, _mm_sub_pd(t0, t2)));
            _mm_store_pd(&b[i5], ZMUL(w1, _mm_add_pd(t1, t3)));
            _mm_store_pd(&b[i7], ZMUL(w3, _mm_sub_pd(t1, t3)));
        }
    }
    return 0;
}
// Vector-valued gradient noise (DNoise) evaluated at EPoint, SSE4.1 + FMA4
// path (_mm_blendv_pd, _mm_nmacc_pd).  Writes the 3-component result into
// `result`.  Depends on file/project externals: RTable, Hash2d,
// Hash1dRTableIndex, INCRSUMP2, NOISE_MINX/Y/Z, EPSILON, DBL, Vector3d.
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /*ix = (int)x; iy = (int)y; iz = (int)z;
    x_ix = x - ix; y_iy = y - iy; z_iz = z - iz;*/
    /* JB fix for the range problem */
    // Floor-to-int via truncation: _mm_blendv_pd selects by the SIGN bit of
    // the third operand, so for negative coordinates the biased value
    // (coord - (1 - EPSILON)) is truncated instead, giving floor semantics.
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    // Fractional offsets within the lattice cell: coord - floor(coord).
    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    // Lattice coordinates rebased to the noise table origin, wrapped to 12 bits.
    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    // Hashes for the four (x,y) corner columns of the cell.
    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    // Gradient-table rows for the 8 cell corners (mp1..mp4 at iz, mp5..mp8 at iz+1).
    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    // Broadcast fractional offsets; j* are the offsets to the far corners (t - 1).
    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // Smoothstep interpolation weights s = t^2 * (3 - 2t); t-weight = 1 - s.
    // _mm_nmacc_pd(a, b, c) computes c - a*b (FMA4 negative multiply-add).
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);

    // Precompute the 8 trilinear corner weights tx*ty*tz ... sx*sy*sz,
    // packed two per register: {tx*ty, tx*sy} and {sx*ty, sx*sy}, then * tz / sz.
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);
    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    // Scratch registers used inside the INCRSUMP2 macro.
    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    // Accumulate the X and Y components: one INCRSUMP2 per corner, each with
    // its scalar weight broadcast to both lanes.  Table offsets +8 select the
    // Y-component gradient row (presumably; layout defined by RTable — confirm).
    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);
    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);
    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);
    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);
    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);
    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);
    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);
    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    // Accumulate the Z component (+16 selects the Z gradient row), processing
    // two corners per INCRSUMP2 by packing both weights in one register.
    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    // Fold the two partial Z sums into one scalar.
    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    // Store X,Y (unaligned) and Z into the result vector.
    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
int main() { #ifndef __EMSCRIPTEN__ _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); #endif printf ("{ \"workload\": %u, \"results\": [\n", N); assert(N%2 == 0); // Don't care about the tail for now. double *src = get_src_d();//(float*)aligned_alloc(16, N*sizeof(float)); for(int i = 0; i < N; ++i) src[i] = (double)rand() / RAND_MAX; double *src2 = get_src2_d();//(float*)aligned_alloc(16, N*sizeof(float)); for(int i = 0; i < N; ++i) src2[i] = (double)rand() / RAND_MAX; double *dst = get_dst_d();//(float*)aligned_alloc(16, N*sizeof(float)); float scalarTime; SETCHART("load"); START(); for(int i = 0; i < N; ++i) dst[i] = src[i]; ENDSCALAR(checksum_dst(dst), "scalar"); LS_TEST("_mm_load_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2); LS_TEST("_mm_load_pd1", _mm_load_pd1, 1, _mm_store_pd, double*, 0, 2); LS_TEST("_mm_load_sd", _mm_load_sd, 1, _mm_store_pd, double*, 0, 2); // _mm_load_si128 LS_TEST("_mm_load1_pd", _mm_load1_pd, 1, _mm_store_pd, double*, 0, 2); __m128d tempReg = _mm_set_pd(1.0, 2.0); LSH_TEST("_mm_loadh_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2); // _mm_loadl_epi64 LSH_TEST("_mm_loadl_pd", tempReg, _mm_loadh_pd, double*, 1, _mm_store_pd, double*, 0, 2); LS_TEST("_mm_loadr_pd", _mm_loadr_pd, 0, _mm_store_pd, double*, 0, 2); LS_TEST("_mm_loadu_pd", _mm_loadu_pd, 1, _mm_store_pd, double*, 0, 2); // _mm_loadu_si128 SETCHART("set"); /* _mm_set_epi16 _mm_set_epi32 _mm_set_epi64 _mm_set_epi64x _mm_set_epi8 */ SS_TEST_D("_mm_set_pd", _mm_set_pd(src[i+2], src[i+0])); //SS_TEST_D("_mm_set_pd1", _mm_set_pd1(src[i])); SS_TEST_D("_mm_set_sd", _mm_set_sd(src[i])); /* _mm_set1_epi16 _mm_set1_epi32 _mm_set1_epi64 _mm_set1_epi64x _mm_set1_epi8 */ SS_TEST_D("_mm_set1_pd", _mm_set1_pd(src[i])); /* _mm_setr_epi16 _mm_setr_epi32 _mm_setr_epi64 _mm_setr_epi8 */ SS_TEST_D("_mm_setr_pd", _mm_set_pd(src[i+2], src[i+0])); SS_TEST_D("_mm_setzero_pd", _mm_setzero_pd()); // _mm_setzero_si128 SETCHART("move"); // _mm_move_epi64 SS_TEST_D("_mm_move_sd", 
_mm_move_sd(_mm_load_pd(src+i), _mm_load_pd(src2+i))); SETCHART("store"); // _mm_maskmoveu_si128 LS_TEST("_mm_store_pd", _mm_load_pd, 0, _mm_store_pd, double*, 0, 2); // LS_TEST("_mm_store_pd1", _mm_load_pd, 0, _mm_store_pd1, double*, 0); LS_TEST("_mm_store_sd", _mm_load_pd, 0, _mm_store_sd, double*, 1, 2); // _mm_store_si128 // _mm_store1_pd LS64_TEST("_mm_storeh_pi", _mm_load_pd, 0, _mm_storeh_pi, 1, 2); // _mm_storel_epi64 LS64_TEST("_mm_storel_pi", _mm_load_pd, 0, _mm_storel_pi, 1, 2); LS_TEST("_mm_storer_pd", _mm_load_pd, 0, _mm_storer_pd, double*, 0, 2); LS_TEST("_mm_storeu_pd", _mm_load_pd, 0, _mm_storeu_pd, double*, 1, 2); // _mm_storeu_si128 LS_TEST("_mm_stream_pd", _mm_load_pd, 0, _mm_stream_pd, double*, 0, 2); // _mm_stream_si128 // _mm_stream_si32 // _mm_stream_si64 SETCHART("arithmetic"); // _mm_add_epi16 // _mm_add_epi32 // _mm_add_epi64 // _mm_add_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] += src2[0]; dst[1] += src2[1]; dst[2] += src2[2]; dst[3] += src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar add"); BINARYOP_TEST_D("_mm_add_pd", _mm_add_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_add_sd", _mm_add_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_adds_epi16 // _mm_adds_epi8 // _mm_adds_epu16 // _mm_adds_epu8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] /= src2[0]; dst[1] /= src2[1]; dst[2] /= src2[2]; dst[3] /= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar div"); BINARYOP_TEST_D("_mm_div_pd", _mm_div_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_div_sd", _mm_div_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_madd_epi16 // _mm_mul_epu32 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] *= src2[0]; dst[1] *= src2[1]; dst[2] *= src2[2]; dst[3] *= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar mul"); 
BINARYOP_TEST_D("_mm_mul_pd", _mm_mul_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_mul_sd", _mm_mul_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_mulhi_epi16 // _mm_mulhi_epu16 // _mm_mullo_epi16 // _mm_sad_epu8 // _mm_sub_epi16 // _mm_sub_epi32 // _mm_sub_epi64 // _mm_sub_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] -= src2[0]; dst[1] -= src2[1]; dst[2] -= src2[2]; dst[3] -= src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar sub"); BINARYOP_TEST_D("_mm_sub_pd", _mm_sub_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_sub_sd", _mm_sub_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_subs_epi16 // _mm_subs_epi8 // _mm_subs_epu16 // _mm_subs_epu8 SETCHART("roots"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = sqrt(dst[0]); dst[1] = sqrt(dst[1]); dst[2] = sqrt(dst[2]); dst[3] = sqrt(dst[3]); } ENDSCALAR(checksum_dst(dst), "scalar sqrt"); UNARYOP_TEST_D("_mm_sqrt_pd", _mm_sqrt_pd, _mm_load_pd(src)); // UNARYOP_TEST_D("_mm_sqrt_sd", _mm_sqrt_sd, _mm_load_pd(src)); SETCHART("logical"); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) & dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) & dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) & dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar and"); BINARYOP_TEST_D("_mm_and_pd", _mm_and_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_and_si128 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd((~dcastu(dst[0])) & dcastu(src2[0])); dst[1] = ucastd((~dcastu(dst[1])) & dcastu(src2[1])); dst[2] = ucastd((~dcastu(dst[2])) & dcastu(src2[2])); dst[3] = ucastd((~dcastu(dst[3])) & dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar andnot"); 
BINARYOP_TEST_D("_mm_andnot_pd", _mm_andnot_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_andnot_si128 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) | dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) | dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) | dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) | dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar or"); BINARYOP_TEST_D("_mm_or_pd", _mm_or_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_or_si128 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = ucastd(dcastu(dst[0]) ^ dcastu(src2[0])); dst[1] = ucastd(dcastu(dst[1]) ^ dcastu(src2[1])); dst[2] = ucastd(dcastu(dst[2]) ^ dcastu(src2[2])); dst[3] = ucastd(dcastu(dst[3]) ^ dcastu(src2[3])); } ENDSCALAR(checksum_dst(dst), "scalar xor"); BINARYOP_TEST_D("_mm_xor_pd", _mm_xor_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_xor_si128 SETCHART("cmp"); // _mm_cmpeq_epi16 // _mm_cmpeq_epi32 // _mm_cmpeq_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] == src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] == src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] == src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] == src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp=="); BINARYOP_TEST_D("_mm_cmpeq_pd", _mm_cmpeq_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpeq_sd", _mm_cmpeq_sd, _mm_load_pd(src), _mm_load_pd(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] >= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] >= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] >= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] >= src2[3]) ? 
ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>="); BINARYOP_TEST_D("_mm_cmpge_pd", _mm_cmpge_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpge_sd", _mm_cmpge_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_cmpgt_epi16 // _mm_cmpgt_epi32 // _mm_cmpgt_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] > src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] > src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] > src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] > src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp>"); BINARYOP_TEST_D("_mm_cmpgt_pd", _mm_cmpgt_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpgt_sd", _mm_cmpgt_sd, _mm_load_pd(src), _mm_load_pd(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] <= src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] <= src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] <= src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] <= src2[3]) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<="); BINARYOP_TEST_D("_mm_cmple_pd", _mm_cmple_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmple_sd", _mm_cmple_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_cmplt_epi16 // _mm_cmplt_epi32 // _mm_cmplt_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (dst[0] < src2[0]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (dst[1] < src2[1]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (dst[2] < src2[2]) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (dst[3] < src2[3]) ? 
ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmp<"); BINARYOP_TEST_D("_mm_cmplt_pd", _mm_cmplt_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmplt_sd", _mm_cmplt_sd, _mm_load_pd(src), _mm_load_pd(src2)); /*_mm_cmpneq_pd _mm_cmpneq_sd _mm_cmpnge_pd _mm_cmpnge_sd _mm_cmpngt_pd _mm_cmpngt_sd _mm_cmpnle_pd _mm_cmpnle_sd _mm_cmpnlt_pd _mm_cmpnlt_sd*/ START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (!Isnan(dst[0]) && !Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (!Isnan(dst[1]) && !Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (!Isnan(dst[2]) && !Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (!Isnan(dst[3]) && !Isnan(src2[3])) ? ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpord"); BINARYOP_TEST_D("_mm_cmpord_pd", _mm_cmpord_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpord_sd", _mm_cmpord_sd, _mm_load_pd(src), _mm_load_pd(src2)); START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = (Isnan(dst[0]) || Isnan(src2[0])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[1] = (Isnan(dst[1]) || Isnan(src2[1])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[2] = (Isnan(dst[2]) || Isnan(src2[2])) ? ucastd(0xFFFFFFFFU) : 0.f; dst[3] = (Isnan(dst[3]) || Isnan(src2[3])) ? 
ucastd(0xFFFFFFFFU) : 0.f; } ENDSCALAR(checksum_dst(dst), "scalar cmpunord"); BINARYOP_TEST_D("_mm_cmpunord_pd", _mm_cmpunord_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_cmpunord_sd", _mm_cmpunord_sd, _mm_load_pd(src), _mm_load_pd(src2)); SETCHART("max"); // _mm_max_epi16 // _mm_max_epu8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Max(dst[0], src2[0]); dst[1] = Max(dst[1], src2[1]); dst[2] = Max(dst[2], src2[2]); dst[3] = Max(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar max"); BINARYOP_TEST_D("_mm_max_pd", _mm_max_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_max_sd", _mm_max_sd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_min_epi16 // _mm_min_epu8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = Min(dst[0], src2[0]); dst[1] = Min(dst[1], src2[1]); dst[2] = Min(dst[2], src2[2]); dst[3] = Min(dst[3], src2[3]); } ENDSCALAR(checksum_dst(dst), "scalar min"); BINARYOP_TEST_D("_mm_min_pd", _mm_min_pd, _mm_load_pd(src), _mm_load_pd(src2)); BINARYOP_TEST_D("_mm_min_sd", _mm_min_sd, _mm_load_pd(src), _mm_load_pd(src2)); SETCHART("shuffle"); // _mm_extract_epi16 // _mm_insert_epi16 // _mm_shuffle_epi32 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[3] = dst[1]; dst[2] = dst[0]; dst[1] = src2[3]; dst[0] = src2[2]; } ENDSCALAR(checksum_dst(dst), "scalar shuffle"); // BINARYOP_TEST_D("_mm_shuffle_pd", _mm_shuffle_pd, _mm_load_pd(src), _mm_load_pd(src2)); START(); __m128 o0 = _mm_load_pd(src); __m128 o1 = _mm_load_pd(src2); for(int i = 0; i < N; i += 4) o0 = _mm_shuffle_pd(o0, o1, _MM_SHUFFLE(1, 0, 3, 2)); _mm_store_pd(dst, o0); END(checksum_dst(dst), "_mm_shuffle_pd"); // _mm_shufflehi_epi16 // _mm_shufflelo_epi16 // _mm_unpackhi_epi16 // _mm_unpackhi_epi32 // _mm_unpackhi_epi64 // _mm_unpackhi_epi8 START(); dst[0] = src[0]; dst[1] = 
src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[0] = dst[2]; dst[1] = src2[2]; dst[2] = dst[3]; dst[3] = src2[3]; } ENDSCALAR(checksum_dst(dst), "scalar unpackhi_pd"); BINARYOP_TEST_D("_mm_unpackhi_pd", _mm_unpackhi_pd, _mm_load_pd(src), _mm_load_pd(src2)); // _mm_unpacklo_epi16 // _mm_unpacklo_epi32 // _mm_unpacklo_epi64 // _mm_unpacklo_epi8 START(); dst[0] = src[0]; dst[1] = src[1]; dst[2] = src[2]; dst[3] = src[3]; for(int i = 0; i < N; ++i) { dst[2] = dst[1]; dst[1] = dst[0]; dst[0] = src2[0]; dst[3] = src2[1]; } ENDSCALAR(checksum_dst(dst), "scalar unpacklo_pd"); BINARYOP_TEST_D("_mm_unpacklo_pd", _mm_unpacklo_pd, _mm_load_pd(src), _mm_load_pd(src2)); printf("]}\n"); /* printf("Finished!\n"); printf("Total time spent in scalar intrinsics: %f msecs.\n", (double)scalarTotalTicks * 1000.0 / ticks_per_sec()); printf("Total time spent in SSE1 intrinsics: %f msecs.\n", (double)simdTotalTicks * 1000.0 / ticks_per_sec()); if (scalarTotalTicks > simdTotalTicks) printf("SSE1 was %.3fx faster than scalar!\n", (double)scalarTotalTicks / simdTotalTicks); else printf("SSE1 was %.3fx slower than scalar!\n", (double)simdTotalTicks / scalarTotalTicks); */ #ifdef __EMSCRIPTEN__ fprintf(stderr,"User Agent: %s\n", emscripten_run_script_string("navigator.userAgent")); printf("/*Test finished! Now please close Firefox to continue with benchmark_sse2.py.*/\n"); #endif exit(0); }