Exemple #1
0
void static
avx_test (void)
{
    int i;
    union256d s1;
    union128i_d u;
    int e [4];

    s1.x = _mm256_set_pd (2.78, 7777768.82, 23.67, 536.46);
    u.x = _mm256_cvtpd_epi32 (s1.x);

    for (i = 0; i < 4; i++)
        e[i] = (int)(s1.a[i] + 0.5);

    if (check_union128i_d (u, e))
        abort ();
}
Exemple #2
0
// Process audio effects for 8 channels simultaneously:
void processEffects(const vec8_i32 &inpSamples, vec8_i32 &outSamples, const long n)
{
    // Extract int samples and convert to doubles:
    const vec4_d64 ds0 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 0)),
        _mm256_set1_pd((double)INT_MAX)
        );
    const vec4_d64 ds1 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 1)),
        _mm256_set1_pd((double)INT_MAX)
        );

    // Monitor input levels:
    fx.fi_monitor.levels[n + 0] = scalar_to_dBFS(ds0);
    fx.fi_monitor.levels[n + 1] = scalar_to_dBFS(ds1);

    vec4_d64 s0, s1;

    // f0_gain:
    {
        s0 = _mm256_mul_pd(ds0, fx.f0_gain.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(ds1, fx.f0_gain.calc.gain[n + 1]);
    }

    // Monitor levels:
    fx.f0_output.levels[n + 0] = scalar_to_dBFS(s0);
    fx.f0_output.levels[n + 1] = scalar_to_dBFS(s1);

    // f1_compressor:
    {
        const vec4_dBFS l0 = scalar_to_dBFS_offs(s0);
        const vec4_dBFS l1 = scalar_to_dBFS_offs(s1);

        // over = s - thresh
        vec4_dB over0 = _mm256_sub_pd(l0, fx.f1_compressor.input.threshold[n + 0]);
        vec4_dB over1 = _mm256_sub_pd(l1, fx.f1_compressor.input.threshold[n + 1]);

        // over = if over < 0.0 then 0.0 else over;
        over0 = mm256_if_then_else(_mm256_cmp_pd(over0, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over0);
        over1 = mm256_if_then_else(_mm256_cmp_pd(over1, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over1);

        // over += DC_OFFSET
        over0 = _mm256_add_pd(over0, DC_OFFSET);
        over1 = _mm256_add_pd(over1, DC_OFFSET);

        // env = over + coef * ( env - over )
        const vec4_dB attack_env0  = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB attack_env1  = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));
        const vec4_dB release_env0  = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB release_env1  = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));

        // env = if over > env then attack_env else release_env
        fx.f1_compressor.state.env[n + 0] = mm256_if_then_else(_mm256_cmp_pd(over0, fx.f1_compressor.state.env[n + 0], _CMP_GT_OQ), attack_env0, release_env0);
        fx.f1_compressor.state.env[n + 1] = mm256_if_then_else(_mm256_cmp_pd(over1, fx.f1_compressor.state.env[n + 1], _CMP_GT_OQ), attack_env1, release_env1);

        // over = env - DC_OFFSET
        over0 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], DC_OFFSET);
        over1 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], DC_OFFSET);

        // grdB = ( over * ( ratio - 1.0 ) )
        vec4_dB gr0dB = _mm256_mul_pd(over0, fx.f1_compressor.calc.ratio_min_1[n + 0]);
        vec4_dB gr1dB = _mm256_mul_pd(over1, fx.f1_compressor.calc.ratio_min_1[n + 1]);

        // gr = dB_to_scalar(grdB)
        fx.f1_compressor.monitor.gain_reduction[n + 0] = dB_to_scalar(gr0dB);
        fx.f1_compressor.monitor.gain_reduction[n + 1] = dB_to_scalar(gr1dB);

        // Apply gain reduction to inputs:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.monitor.gain_reduction[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.monitor.gain_reduction[n + 1]);

        // Apply make-up gain:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.calc.gain[n + 1]);
    }

    // Monitor output levels:
    fx.fo_monitor.levels[n + 0] = scalar_to_dBFS(s0);
    fx.fo_monitor.levels[n + 1] = scalar_to_dBFS(s1);

    // TODO(jsd): Better limiter implementation!
    // Limit final samples:
    s0 = _mm256_max_pd(_mm256_min_pd(s0, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0));
    s1 = _mm256_max_pd(_mm256_min_pd(s1, _mm256_set1_pd((double)1.0)), _mm256_set1_pd((double)-1.0));

    // Convert doubles back to 32-bit ints:
    s0 = _mm256_mul_pd(s0, _mm256_set1_pd((double)INT_MAX));
    s1 = _mm256_mul_pd(s1, _mm256_set1_pd((double)INT_MAX));
    const vec8_i32 os = _mm256_setr_m128i(_mm256_cvtpd_epi32(s0), _mm256_cvtpd_epi32(s1));

    // Write outputs:
    _mm256_stream_si256(&outSamples, os);
}
static inline __m256d gmx_mm256_exp2_pd(__m256d x)
{
    /* Lower bound: We do not allow numbers that would lead to an IEEE fp representation exponent smaller than -126. */
    const __m256d arglimit = _mm256_set1_pd(1022.0);
    const __m128i expbase  = _mm_set1_epi32(1023);

    const __m256d P2       = _mm256_set1_pd(2.30933477057345225087e-2);
    const __m256d P1       = _mm256_set1_pd(2.02020656693165307700e1);
    const __m256d P0       = _mm256_set1_pd(1.51390680115615096133e3);
    /* Q2 == 1.0 */
    const __m256d Q1       = _mm256_set1_pd(2.33184211722314911771e2);
    const __m256d Q0       = _mm256_set1_pd(4.36821166879210612817e3);
    const __m256d one      = _mm256_set1_pd(1.0);
    const __m256d two      = _mm256_set1_pd(2.0);

    __m256d       valuemask;
    __m256i       iexppart;
    __m128i       iexppart128a, iexppart128b;
    __m256d       fexppart;
    __m256d       intpart;
    __m256d       z, z2;
    __m256d       PolyP, PolyQ;

    iexppart128a  = _mm256_cvtpd_epi32(x);
    intpart       = _mm256_round_pd(x, _MM_FROUND_TO_NEAREST_INT);

    /* Add exponent bias */
    iexppart128a   = _mm_add_epi32(iexppart128a, expbase);

    /* We now want to shift the exponent 52 positions left, but to achieve this we need
     * to separate the 128-bit register data into two registers (4x64-bit > 128bit)
     * shift them, and then merge into a single __m256d.
     * Elements 0/1 should end up in iexppart128a, and 2/3 in iexppart128b.
     * It doesnt matter what we put in the 2nd/4th position, since that data will be
     * shifted out and replaced with zeros.
     */
    iexppart128b   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(3, 3, 2, 2));
    iexppart128a   = _mm_shuffle_epi32(iexppart128a, _MM_SHUFFLE(1, 1, 0, 0));

    iexppart128b   = _mm_slli_epi64(iexppart128b, 52);
    iexppart128a   = _mm_slli_epi64(iexppart128a, 52);

    iexppart  = _mm256_castsi128_si256(iexppart128a);
    iexppart  = _mm256_insertf128_si256(iexppart, iexppart128b, 0x1);

    valuemask = _mm256_cmp_pd(arglimit, gmx_mm256_abs_pd(x), _CMP_GE_OQ);
    fexppart  = _mm256_and_pd(valuemask, _mm256_castsi256_pd(iexppart));

    z         = _mm256_sub_pd(x, intpart);

    z2        = _mm256_mul_pd(z, z);

    PolyP     = _mm256_mul_pd(P2, z2);
    PolyP     = _mm256_add_pd(PolyP, P1);
    PolyQ     = _mm256_add_pd(z2, Q1);
    PolyP     = _mm256_mul_pd(PolyP, z2);
    PolyQ     = _mm256_mul_pd(PolyQ, z2);
    PolyP     = _mm256_add_pd(PolyP, P0);
    PolyQ     = _mm256_add_pd(PolyQ, Q0);
    PolyP     = _mm256_mul_pd(PolyP, z);

    z         = _mm256_mul_pd(PolyP, gmx_mm256_inv_pd(_mm256_sub_pd(PolyQ, PolyP)));
    z         = _mm256_add_pd(one, _mm256_mul_pd(two, z));

    z         = _mm256_mul_pd(z, fexppart);

    return z;
}