/* Check _mm256_cvtpd_epi32 on four packed doubles.
 *
 * The reference values are built by adding 0.5 and truncating toward zero.
 * For the positive, non-tie inputs used here that equals rounding to the
 * nearest integer.  A non-zero result from check_union128i_d (declared
 * elsewhere) aborts the test.
 */
void static
avx_test (void)
{
  union256d src;
  union128i_d converted;
  int expected[4];
  int lane;

  /* _mm256_set_pd takes its arguments from the highest lane down. */
  src.x = _mm256_set_pd (2.78, 7777768.82, 23.67, 536.46);
  converted.x = _mm256_cvtpd_epi32 (src.x);

  /* Scalar reference, one lane at a time. */
  for (lane = 0; lane != 4; ++lane)
    {
      expected[lane] = (int) (src.a[lane] + 0.5);
    }

  if (check_union128i_d (converted, expected))
    {
      abort ();
    }
}
// Process audio effects for 8 channels simultaneously: void processEffects(const vec8_i32 &inpSamples, vec8_i32 &outSamples, const long n) { // Extract int samples and convert to doubles: const vec4_d64 ds0 = _mm256_div_pd( _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 0)), _mm256_set1_pd((double)INT_MAX) ); const vec4_d64 ds1 = _mm256_div_pd( _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 1)), _mm256_set1_pd((double)INT_MAX) ); // Monitor input levels: fx.fi_monitor.levels[n + 0] = scalar_to_dBFS(ds0); fx.fi_monitor.levels[n + 1] = scalar_to_dBFS(ds1); vec4_d64 s0, s1; // f0_gain: { s0 = _mm256_mul_pd(ds0, fx.f0_gain.calc.gain[n + 0]); s1 = _mm256_mul_pd(ds1, fx.f0_gain.calc.gain[n + 1]); } // Monitor levels: fx.f0_output.levels[n + 0] = scalar_to_dBFS(s0); fx.f0_output.levels[n + 1] = scalar_to_dBFS(s1); // f1_compressor: { const vec4_dBFS l0 = scalar_to_dBFS_offs(s0); const vec4_dBFS l1 = scalar_to_dBFS_offs(s1); // over = s - thresh vec4_dB over0 = _mm256_sub_pd(l0, fx.f1_compressor.input.threshold[n + 0]); vec4_dB over1 = _mm256_sub_pd(l1, fx.f1_compressor.input.threshold[n + 1]); // over = if over < 0.0 then 0.0 else over; over0 = mm256_if_then_else(_mm256_cmp_pd(over0, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over0); over1 = mm256_if_then_else(_mm256_cmp_pd(over1, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over1); // over += DC_OFFSET over0 = _mm256_add_pd(over0, DC_OFFSET); over1 = _mm256_add_pd(over1, DC_OFFSET); // env = over + coef * ( env - over ) const vec4_dB attack_env0 = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0))); const vec4_dB attack_env1 = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1))); const vec4_dB release_env0 = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 0], 
_mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0))); const vec4_dB release_env1 = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1))); // env = if over > env then attack_env else release_env fx.f1_compressor.state.env[n + 0] = mm256_if_then_else(_mm256_cmp_pd(over0, fx.f1_compressor.state.env[n + 0], _CMP_GT_OQ), attack_env0, release_env0); fx.f1_compressor.state.env[n + 1] = mm256_if_then_else(_mm256_cmp_pd(over1, fx.f1_compressor.state.env[n + 1], _CMP_GT_OQ), attack_env1, release_env1); // over = env - DC_OFFSET over0 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], DC_OFFSET); over1 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], DC_OFFSET); // grdB = ( over * ( ratio - 1.0 ) ) vec4_dB gr0dB = _mm256_mul_pd(over0, fx.f1_compressor.calc.ratio_min_1[n + 0]); vec4_dB gr1dB = _mm256_mul_pd(over1, fx.f1_compressor.calc.ratio_min_1[n + 1]); // gr = dB_to_scalar(grdB) fx.f1_compressor.monitor.gain_reduction[n + 0] = dB_to_scalar(gr0dB); fx.f1_compressor.monitor.gain_reduction[n + 1] = dB_to_scalar(gr1dB); // Apply gain reduction to inputs: s0 = _mm256_mul_pd(s0, fx.f1_compressor.monitor.gain_reduction[n + 0]); s1 = _mm256_mul_pd(s1, fx.f1_compressor.monitor.gain_reduction[n + 1]); // Apply make-up gain: s0 = _mm256_mul_pd(s0, fx.f1_compressor.calc.gain[n + 0]); s1 = _mm256_mul_pd(s1, fx.f1_compressor.calc.gain[n + 1]); } // Monitor output levels: fx.fo_monitor.levels[n + 0] = scalar_to_dBFS(s0); fx.fo_monitor.levels[n + 1] = scalar_to_dBFS(s1); // TODO(jsd): Better limiter implementation! 
// Limit final samples: s0 = _mm256_max_pd(_mm256_min_pd(s0, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0)); s1 = _mm256_max_pd(_mm256_min_pd(s1, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0)); // Convert doubles back to 32-bit ints: s0 = _mm256_mul_pd(s0, _mm256_set1_pd((double)INT_MAX)); s1 = _mm256_mul_pd(s1, _mm256_set1_pd((double)INT_MAX)); const vec8_i32 os = _mm256_setr_m128i(_mm256_cvtpd_epi32(s0), _mm256_cvtpd_epi32(s1)); // Write outputs: _mm256_stream_si256(&outSamples, os); }
/* Compute 2^x for four packed doubles.
 *
 * x is split into its nearest integer and a remainder.  2^integer is built
 * directly by placing the biased integer in the exponent field of a double;
 * 2^remainder comes from a rational approximation
 *     1 + 2 * z*P(z^2) / (Q(z^2) - z*P(z^2)).
 *
 * Lanes where |x| <= 1022 does not hold (including NaN) have the
 * power-of-two scale factor masked to 0.0 before the final multiply.  This
 * applies to large positive arguments as well as large negative ones.
 *
 * gmx_mm256_abs_pd and gmx_mm256_inv_pd are defined elsewhere; they are
 * presumably an absolute value and a reciprocal -- verify against their
 * definitions.
 */
static inline __m256d gmx_mm256_exp2_pd(__m256d x)
{
    /* Largest |x| for which the scale factor is kept. */
    const __m256d arglimit = _mm256_set1_pd(1022.0);
    /* Value added to the integer part before it is placed in the exponent field. */
    const __m128i expbase  = _mm_set1_epi32(1023);

    /* Numerator coefficients. */
    const __m256d p2 = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d p1 = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d p0 = _mm256_set1_pd(1.51390680115615096133e3);
    /* Denominator coefficients; the leading one (Q2) is 1.0 and therefore implicit. */
    const __m256d q1 = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d q0 = _mm256_set1_pd(4.36821166879210612817e3);

    const __m256d one = _mm256_set1_pd(1.0);
    const __m256d two = _mm256_set1_pd(2.0);

    /* Integer part as 4 x int32, with the exponent bias added. */
    const __m128i biased  = _mm_add_epi32(_mm256_cvtpd_epi32(x), expbase);
    /* The same integer part, kept in floating point. */
    const __m256d nearest = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* The biased exponents must be shifted 52 bits left inside 64-bit lanes,
     * so the four 32-bit values are spread over two 128-bit registers:
     * elements 0/1 go to exp_lo and elements 2/3 to exp_hi.  Each value is
     * duplicated into both halves of its 64-bit lane; the copy in the upper
     * half is shifted out, so its content does not matter.
     */
    const __m128i exp_hi = _mm_slli_epi64(
            _mm_shuffle_epi32(biased, _MM_SHUFFLE(3, 3, 2, 2)), 52);
    const __m128i exp_lo = _mm_slli_epi64(
            _mm_shuffle_epi32(biased, _MM_SHUFFLE(1, 1, 0, 0)), 52);

    /* Merge the two halves back into one 256-bit register. */
    const __m256i exp_bits = _mm256_insertf128_si256(
            _mm256_castsi128_si256(exp_lo), exp_hi, 0x1);

    /* Keep the scale factor only where arglimit >= |x|; other lanes become 0.0. */
    const __m256d in_range = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    const __m256d scale    = _mm256_and_pd(in_range, _mm256_castsi256_pd(exp_bits));

    /* Remainder after removing the integer part, and its square. */
    const __m256d frac  = _mm256_sub_pd(x, nearest);
    const __m256d frac2 = _mm256_mul_pd(frac, frac);

    /* num = frac * ((p2*frac2 + p1)*frac2 + p0) */
    __m256d num = _mm256_add_pd(_mm256_mul_pd(p2, frac2), p1);
    num = _mm256_add_pd(_mm256_mul_pd(num, frac2), p0);
    num = _mm256_mul_pd(num, frac);

    /* den = (frac2 + q1)*frac2 + q0 */
    __m256d den = _mm256_add_pd(frac2, q1);
    den = _mm256_add_pd(_mm256_mul_pd(den, frac2), q0);

    /* 2^frac = 1 + 2 * num / (den - num) */
    __m256d result = _mm256_mul_pd(num, gmx_mm256_inv_pd(_mm256_sub_pd(den, num)));
    result = _mm256_add_pd(one, _mm256_mul_pd(two, result));

    /* Multiply by 2^integer (or by 0.0 for masked lanes). */
    return _mm256_mul_pd(result, scale);
}