/******************************************************************
 *
 * SPLIT RADIX PRECOMPUTED AND VECTORIZED FFT MULTIPLICATION
 *
 ******************************************************************/
void sr_vector_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  fft_vector_forward(&vctr_x, x);
  fft_vector_forward(&vctr_y, y);

  __m256d real_x, imag_x, real_y, imag_y, imag_temp, real_temp;

  for (int i = 0; i < CPLXDIM; i += 4)
  {
    real_x = _mm256_load_pd(vctr_x.real + i);
    imag_x = _mm256_load_pd(vctr_x.imag + i);
    real_y = _mm256_load_pd(vctr_y.real + i);
    imag_y = _mm256_load_pd(vctr_y.imag + i);

    // (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    real_temp = _mm256_mul_pd(imag_x, imag_y);              // bd
    imag_temp = _mm256_mul_pd(real_x, imag_y);              // ad
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);    // ac - bd
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);    // bc + ad

    // Scale by 1/CPLXDIM to fold in the inverse-transform normalization.
    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x, real_y);
    imag_x = _mm256_div_pd(imag_x, real_y);

    _mm256_store_pd(vctr_res.real + i, real_x);
    _mm256_store_pd(vctr_res.imag + i, imag_x);
  }
  fft_vector_backward(&vctr_res, r);
}
/******************************************************************
 *
 * NEGACYCLIC FFT LOOK-UP TABLE
 *
 ******************************************************************/
void negacyc_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  phi_forward(&vector_x, x);
  phi_forward(&vector_y, y);

  __m256d real_x, imag_x, real_y, imag_y, imag_temp, real_temp, dim;
  dim = _mm256_set1_pd(CPLXDIM);

  for (int i = 0; i < CPLXDIM; i += 4)
  {
    real_x = _mm256_load_pd(vector_x.real + i);
    imag_x = _mm256_load_pd(vector_x.imag + i);
    real_y = _mm256_load_pd(vector_y.real + i);
    imag_y = _mm256_load_pd(vector_y.imag + i);

    // (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    real_temp = _mm256_mul_pd(imag_x, imag_y);              // bd
    imag_temp = _mm256_mul_pd(real_x, imag_y);              // ad
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);    // ac - bd
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);    // bc + ad

    real_x = _mm256_div_pd(real_x, dim);
    imag_x = _mm256_div_pd(imag_x, dim);

    _mm256_store_pd(vector_res.real + i, real_x);
    _mm256_store_pd(vector_res.imag + i, imag_x);
  }
  phi_backward(&vector_res, r);
}
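/*
 * Both FFT kernels above apply the same pointwise product
 * (a + ib)(c + id) = (ac - bd) + i(ad + bc), scaled by 1/CPLXDIM to fold in
 * the inverse-transform normalization. A scalar reference for cross-checking
 * one lane; the split real[]/imag[] layout mirrors vctr_x/vector_x above, and
 * the function name is ours, not part of the original source.
 */
static void cplx_pointwise_mul_ref(double *r_re, double *r_im,
                                   const double *x_re, const double *x_im,
                                   const double *y_re, const double *y_im,
                                   int i)
{
    double a = x_re[i], b = x_im[i];
    double c = y_re[i], d = y_im[i];
    r_re[i] = (a * c - b * d) / CPLXDIM;   // ac - bd
    r_im[i] = (a * d + b * c) / CPLXDIM;   // ad + bc
}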
double compute_pi(size_t dt)
{
    double pi = 0.0;
    double delta = 1.0 / dt;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_set1_pd(1.0);
    ymm1 = _mm256_set1_pd(delta);
    ymm2 = _mm256_set_pd(delta * 3, delta * 2, delta * 1, 0.0);
    ymm4 = _mm256_setzero_pd();

    // i + 4 <= dt avoids the size_t underflow that `i <= dt - 4` hits when dt < 4.
    for (size_t i = 0; i + 4 <= dt; i += 4)
    {
        ymm3 = _mm256_set1_pd(i * delta);
        ymm3 = _mm256_add_pd(ymm3, ymm2);      // x = (i..i+3) * delta
        ymm3 = _mm256_mul_pd(ymm3, ymm3);      // x^2
        ymm3 = _mm256_add_pd(ymm0, ymm3);      // 1 + x^2
        ymm3 = _mm256_div_pd(ymm1, ymm3);      // delta / (1 + x^2)
        ymm4 = _mm256_add_pd(ymm4, ymm3);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm4);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
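/*
 * The loop in compute_pi() is a left-endpoint Riemann sum of
 * integral_0^1 1/(1+x^2) dx = pi/4 with step delta = 1/dt. A plain scalar
 * version of the same sum (our addition, for checking the vectorized result):
 */
double compute_pi_scalar_ref(size_t dt)
{
    double delta = 1.0 / dt;
    double sum = 0.0;
    for (size_t i = 0; i < dt; i++) {
        double x = i * delta;
        sum += delta / (1.0 + x * x);
    }
    return sum * 4.0;   // 4 * atan(1) = pi
}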
void THDoubleVector_divs_AVX(double *y, const double *x, const double c, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256d YMM15 = _mm256_set1_pd(c);
    __m256d YMM0, YMM1;
    for (i = 0; i <= n - 8; i += 8) {
        YMM0 = _mm256_loadu_pd(x + i);
        YMM1 = _mm256_loadu_pd(x + i + 4);
        YMM0 = _mm256_div_pd(YMM0, YMM15);
        YMM1 = _mm256_div_pd(YMM1, YMM15);
        _mm256_storeu_pd(y + i, YMM0);
        _mm256_storeu_pd(y + i + 4, YMM1);
    }
    // Scalar tail for the remaining n % 8 elements.
    for (; i < n; i++) {
        y[i] = x[i] / c;
    }
}
inline float64x4_t inverse(const float64x4_t ymm)
{
    float64x4_t conj0 = conjugate(ymm);                 // conj(z)
    float64x4_t dot0  = simd_geometric::dot(ymm, ymm);  // |z|^2
    float64x4_t div0  = _mm256_div_pd(conj0, dot0);     // 1/z = conj(z) / |z|^2
    return div0;
}
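/*
 * inverse() relies on conjugate() and simd_geometric::dot(), which are not
 * shown in this listing. A plausible sketch under the usual interleaved
 * [re0, im0, re1, im1] layout -- these definitions are assumptions about the
 * helpers, not the library's actual code:
 */
static inline float64x4_t conjugate_sketch(const float64x4_t ymm)
{
    // Flip the sign bit of the imaginary lanes (lanes 1 and 3) only.
    const __m256d sign = _mm256_set_pd(-0.0, 0.0, -0.0, 0.0);
    return _mm256_xor_pd(ymm, sign);
}

static inline float64x4_t dot_sketch(const float64x4_t a, const float64x4_t b)
{
    // Per-complex re*re + im*im, duplicated into both lanes of each pair.
    __m256d prod = _mm256_mul_pd(a, b);
    return _mm256_hadd_pd(prod, prod);  // [|z0|^2, |z0|^2, |z1|^2, |z1|^2]
}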
double calcPi_simd(void)
{
    int i;
    double width = 1. / (double)num_rects;
    __m256d __width = _mm256_set1_pd(width);
    __m256d __four  = _mm256_set1_pd(4.0);
    __m256d __one   = _mm256_set1_pd(1.0);
    __m256d __sum   = _mm256_set1_pd(0.0);
    __m256d __i     = _mm256_set_pd(0.5, 1.5, 2.5, 3.5);  // midpoints of the first 4 rectangles

    for (i = 0; i < num_rects; i += 4) {
        // GCC/Clang vector extensions allow arithmetic operators on __m256d.
        __m256d __x = __i * __width;
        __m256d __y = _mm256_div_pd(__one, __one + __x * __x);
        __sum += __four * __y;
        __i = __i + __four;
    }

    double sum;
    sum  = ((double *)&__sum)[0];
    sum += ((double *)&__sum)[1];
    sum += ((double *)&__sum)[2];
    sum += ((double *)&__sum)[3];

    return width * sum;
}
void gvrotg_fma(double *c, double *s, double *r, double a, double b)
{
#if defined(__FMA__)
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;

    if (b == 0.0) {
        *c = 1.0; *s = 0.0; *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0; *s = 1.0; *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|;  x[2], x[3]: |b| > |a|
    one = _mm256_set1_pd(1.0);
    x0  = _mm256_set_pd(1.0, a, b, 1.0);    // x0 = {1, a, b, 1}
    x1  = _mm256_set_pd(1.0, b, a, 1.0);    // x1 = {1, b, a, 1}
    t0  = _mm256_div_pd(x0, x1);            // t0 = {1, a/b, b/a, 1}
    t2  = _mm256_fmadd_pd(t0, t0, one);     // t2 = {2, 1+(a/b)^2, (b/a)^2+1, 2}
    u0  = _mm256_sqrt_pd(t2);               // u0 = {., sqrt(1+(a/b)^2), sqrt((b/a)^2+1), .}
    u1  = _mm256_div_pd(one, u0);
    b0  = _mm256_blend_pd(u0, u1, 0x9);     // b0 = {1/u(a), u(a), u(b), 1/u(b)}
    b0  = _mm256_mul_pd(b0, x1);            // b0 = {1/u(a), b*u(a), a*u(b), 1/u(b)}
    b1  = _mm256_mul_pd(t0, u1);            // b1 = {., t*u(a), t*u(b), .}

    if (fabs(b) > fabs(a)) {
        *s = b0[3];
        *r = b0[2];
        *c = b1[2];
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = b0[0];
        *r = b0[1];
        *s = b1[1];
    }
#endif
}
void gvrotg_avx(double *c, double *s, double *r, double a, double b)
{
    register __m256d x0, x1, t0, t2, u0, u1, one, b0, b1;

    if (b == 0.0) {
        *c = 1.0; *s = 0.0; *r = a;
        return;
    }
    if (a == 0.0) {
        *c = 0.0; *s = 1.0; *r = b;
        return;
    }

    // set_pd() order: [3, 2, 1, 0]
    // x[0], x[1]: |a| > |b|;  x[2], x[3]: |b| > |a|
    x0 = _mm256_set_pd(1.0, a, b, 1.0);     // x0 = {1, a, b, 1}
    x1 = _mm256_set_pd(1.0, b, a, 1.0);     // x1 = {1, b, a, 1}
    t0 = _mm256_div_pd(x0, x1);             // t0 = {1, a/b, b/a, 1}
    x0 = _mm256_mul_pd(t0, t0);             // x0 = {1, (a/b)^2, (b/a)^2, 1}
    t2 = _mm256_hadd_pd(x0, x0);            // t2 = {1+(a/b)^2, ., (b/a)^2+1, .}
    u0 = _mm256_sqrt_pd(t2);                // u0 = {sqrt(1+(a/b)^2), ., sqrt((b/a)^2+1), .}
    one = _mm256_set1_pd(1.0);
    u1 = _mm256_div_pd(one, u0);
    b0 = _mm256_blend_pd(u0, u1, 0x9);      // b0 = {1/u(b), u(b), u(a), 1/u(a)}
    b0 = _mm256_mul_pd(b0, x1);             // b0 = {1/u(b), b*u(b), a*u(a), 1/u(a)}
    b1 = _mm256_mul_pd(t0, u1);             // b1 = {1/u(b), t*u(b), t*u(a), 1/u(a)}

    if (fabs(b) > fabs(a)) {
        *s = b0[3];     // = 1/u(b)
        *r = b0[2];     // = b*u(b)
        *c = b1[2];     // = t*u(b)
        if (signbit(b)) {
            *s = -(*s);
            *c = -(*c);
            *r = -(*r);
        }
    } else {
        *c = b0[0];
        *r = b0[1];
        *s = b1[1];
    }
}
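/*
 * Both gvrotg variants can be validated against the straightforward scalar
 * Givens rotation (c, s, r) with c*a + s*b = r and -s*a + c*b = 0. A minimal
 * reference (our addition; needs <math.h> for fabs/sqrt/signbit):
 */
void gvrotg_ref(double *c, double *s, double *r, double a, double b)
{
    if (b == 0.0) { *c = 1.0; *s = 0.0; *r = a; return; }
    if (a == 0.0) { *c = 0.0; *s = 1.0; *r = b; return; }
    if (fabs(b) > fabs(a)) {
        double t = a / b;                 // |t| < 1
        double u = sqrt(1.0 + t * t);
        if (signbit(b)) u = -u;
        *s = 1.0 / u;
        *c = (*s) * t;
        *r = b * u;
    } else {
        double t = b / a;                 // |t| <= 1
        double u = sqrt(1.0 + t * t);
        *c = 1.0 / u;
        *s = (*c) * t;
        *r = a * u;
    }
}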
void THDoubleVector_cdiv_AVX(double *z, const double *x, const double *y, const ptrdiff_t n)
{
    ptrdiff_t i;
    __m256d YMM0, YMM1, YMM2, YMM3;
    for (i = 0; i <= n - 8; i += 8) {
        YMM0 = _mm256_loadu_pd(x + i);
        YMM1 = _mm256_loadu_pd(x + i + 4);
        YMM2 = _mm256_loadu_pd(y + i);
        YMM3 = _mm256_loadu_pd(y + i + 4);
        YMM2 = _mm256_div_pd(YMM0, YMM2);
        YMM3 = _mm256_div_pd(YMM1, YMM3);
        _mm256_storeu_pd(z + i, YMM2);
        _mm256_storeu_pd(z + i + 4, YMM3);
    }
    // Scalar tail for the remaining n % 8 elements.
    for (; i < n; i++) {
        z[i] = x[i] / y[i];
    }
}
double compute_pi_leibniz_avx_opt(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4, ymm5, ymm6, ymm7, ymm8;
    register __m256d ymm9, ymm10, ymm11, ymm12, ymm13;

    ymm0 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);      // alternating signs
    ymm1 = _mm256_set_pd(1.0, 3.0, 5.0, 7.0);        // odd denominators, 4 per register
    ymm2 = _mm256_set_pd(9.0, 11.0, 13.0, 15.0);
    ymm3 = _mm256_set_pd(17.0, 19.0, 21.0, 23.0);
    ymm4 = _mm256_set_pd(25.0, 27.0, 29.0, 31.0);
    ymm13 = _mm256_set1_pd(32.0);                    // stride: 16 terms * 2
    ymm5 = _mm256_setzero_pd();
    ymm6 = _mm256_setzero_pd();
    ymm7 = _mm256_setzero_pd();
    ymm8 = _mm256_setzero_pd();

    // i + 16 <= n avoids the size_t underflow of `i <= n - 16` for small n.
    for (size_t i = 0; i + 16 <= n; i += 16)
    {
        // Four independent accumulators hide the latency of vdivpd.
        ymm9 = _mm256_div_pd(ymm0, ymm1);
        ymm1 = _mm256_add_pd(ymm1, ymm13);
        ymm10 = _mm256_div_pd(ymm0, ymm2);
        ymm2 = _mm256_add_pd(ymm2, ymm13);
        ymm11 = _mm256_div_pd(ymm0, ymm3);
        ymm3 = _mm256_add_pd(ymm3, ymm13);
        ymm12 = _mm256_div_pd(ymm0, ymm4);
        ymm4 = _mm256_add_pd(ymm4, ymm13);

        ymm5 = _mm256_add_pd(ymm5, ymm9);
        ymm6 = _mm256_add_pd(ymm6, ymm10);
        ymm7 = _mm256_add_pd(ymm7, ymm11);
        ymm8 = _mm256_add_pd(ymm8, ymm12);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm5);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm6);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm7);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];
    _mm256_store_pd(tmp, ymm8);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
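/*
 * The four accumulators above exist only to hide vdivpd latency; the result
 * must match the plain scalar Leibniz sum pi/4 = sum_k (-1)^k / (2k+1).
 * Scalar reference (our addition):
 */
double compute_pi_leibniz_ref(size_t n)
{
    double sum = 0.0;
    double sign = 1.0;
    for (size_t k = 0; k < n; k++) {
        sum += sign / (2.0 * k + 1.0);
        sign = -sign;
    }
    return sum * 4.0;
}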
/******************************************************************
 *
 * SPLIT RADIX PRECOMPUTED AND VECTORIZED NON-RECURSIVE FFT MULTIPLICATION
 *
 ******************************************************************/
void sr_vector_nonrec_mul(ring_t *r, const ring_t *x, const ring_t *y)
{
  fft_vector_nonrec_forward(&vec_x, x);
  fft_vector_nonrec_forward(&vec_y, y);

  __m256d real_x, imag_x, real_y, imag_y, imag_temp, real_temp;

  for (int i = 0; i < CPLXDIM; i += 4)
  {
    real_x = _mm256_load_pd(vec_x.real + i);
    imag_x = _mm256_load_pd(vec_x.imag + i);
    real_y = _mm256_load_pd(vec_y.real + i);
    imag_y = _mm256_load_pd(vec_y.imag + i);

    // (a + ib) * (c + id) = (ac - bd) + i(ad + bc)
    real_temp = _mm256_mul_pd(imag_x, imag_y);   // bd
    imag_temp = _mm256_mul_pd(real_x, imag_y);   // ad

    // Non-FMA fallback, kept for reference:
    //   real_x = _mm256_mul_pd(real_x, real_y);        // ac
    //   imag_x = _mm256_mul_pd(imag_x, real_y);        // bc
    //   real_x = _mm256_sub_pd(real_x, real_temp);     // ac - bd
    //   imag_x = _mm256_add_pd(imag_x, imag_temp);     // ad + bc

    // FMA versions of the sequence above; they differ from it only in the
    // rounding of the fused intermediate product.
    real_x = _mm256_fmsub_pd(real_x, real_y, real_temp);  // ac - bd
    imag_x = _mm256_fmadd_pd(imag_x, real_y, imag_temp);  // bc + ad

    real_y = _mm256_set1_pd(CPLXDIM);
    real_x = _mm256_div_pd(real_x, real_y);
    imag_x = _mm256_div_pd(imag_x, real_y);

    _mm256_store_pd(vec_res.real + i, real_x);
    _mm256_store_pd(vec_res.imag + i, imag_x);
  }
  fft_vector_nonrec_backward(&vec_res, r);
}
/*!
 * \brief Divide two packed complex doubles:
 * (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2)
 */
template <typename T>
ETL_STATIC_INLINE(avx_simd_complex_double<T>)  // template/return-type wrapper assumed from the surrounding ETL source
div(avx_simd_complex_double<T> lhs, avx_simd_complex_double<T> rhs) {
    //lhs = [x1.real, x1.imag, x2.real, x2.imag]
    //rhs = [y1.real, y1.imag, y2.real, y2.imag]

    //ymm0 = [y1.real, y1.real, y2.real, y2.real]
    __m256d ymm0 = _mm256_movedup_pd(rhs.value);

    //ymm1 = [y1.imag, y1.imag, y2.imag, y2.imag]
    __m256d ymm1 = _mm256_permute_pd(rhs.value, 0b1111);

    //ymm2 = [x1.imag, x1.real, x2.imag, x2.real]
    __m256d ymm2 = _mm256_permute_pd(lhs.value, 0b0101);

    //ymm4 = [x.imag * y.imag, x.real * y.imag]
    __m256d ymm4 = _mm256_mul_pd(ymm2, ymm1);

    //ymm5 = subadd((lhs * ymm0), ymm4)
    //     = [x.real*y.real + x.imag*y.imag, x.imag*y.real - x.real*y.imag]
#ifdef __FMA__
    __m256d ymm5 = _mm256_fmsubadd_pd(lhs.value, ymm0, ymm4);
#else
    __m256d t1 = _mm256_mul_pd(lhs.value, ymm0);
    __m256d t2 = _mm256_sub_pd(_mm256_set1_pd(0.0), ymm4);
    __m256d ymm5 = _mm256_addsub_pd(t1, t2);
#endif

    //ymm3 = [y.imag^2, y.imag^2]
    __m256d ymm3 = _mm256_mul_pd(ymm1, ymm1);

    //ymm0 = ymm0 * ymm0 + ymm3 = y.real^2 + y.imag^2
#ifdef __FMA__
    ymm0 = _mm256_fmadd_pd(ymm0, ymm0, ymm3);
#else
    __m256d t3 = _mm256_mul_pd(ymm0, ymm0);
    ymm0 = _mm256_add_pd(t3, ymm3);
#endif

    //result = ymm5 / ymm0
    return _mm256_div_pd(ymm5, ymm0);
}
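/*
 * The permute/fmsubadd sequence above implements the textbook identity
 * (a+bi)/(c+di) = ((ac+bd) + (bc-ad)i) / (c^2+d^2) on two packed complex
 * doubles at once. A scalar cross-check for one quotient (our addition; like
 * the SIMD path, it does not guard against overflow in c^2 + d^2):
 */
static void cdiv_ref(double a, double b, double c, double d,
                     double *re, double *im)
{
    double denom = c * c + d * d;
    *re = (a * c + b * d) / denom;
    *im = (b * c - a * d) / denom;
}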
double compute_pi_euler_avx(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(1.0);
    ymm2 = _mm256_set1_pd(6.0);

    // Start at 1, not 0: the first lane would otherwise divide by 0^2.
    for (size_t i = 1; i + 4 <= n; i += 4)
    {
        ymm3 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
        ymm3 = _mm256_mul_pd(ymm3, ymm3);   // k^2
        ymm3 = _mm256_div_pd(ymm1, ymm3);   // 1 / k^2
        ymm0 = _mm256_add_pd(ymm0, ymm3);
    }
    // Basel sum: sum(1/k^2) = pi^2 / 6, so pi = sqrt(6 * sum).
    ymm3 = _mm256_mul_pd(ymm2, ymm0);

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm3);   // store the scaled sum, not the raw accumulator
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return sqrt(pi);
}
double compute_pi_leibniz_fma(size_t n)
{
    double pi = 0.0;
    register __m256d ymm0, ymm1, ymm2, ymm3, ymm4;

    ymm0 = _mm256_setzero_pd();
    ymm1 = _mm256_set1_pd(2.0);
    ymm2 = _mm256_set1_pd(1.0);
    ymm3 = _mm256_set_pd(1.0, -1.0, 1.0, -1.0);   // alternating signs

    for (size_t i = 0; i + 4 <= n; i += 4)
    {
        ymm4 = _mm256_set_pd(i, i + 1.0, i + 2.0, i + 3.0);
        ymm4 = _mm256_fmadd_pd(ymm1, ymm4, ymm2);  // 2k + 1
        ymm4 = _mm256_div_pd(ymm3, ymm4);          // ±1 / (2k + 1)
        ymm0 = _mm256_add_pd(ymm0, ymm4);
    }

    double tmp[4] __attribute__((aligned(32)));
    _mm256_store_pd(tmp, ymm0);
    pi += tmp[0] + tmp[1] + tmp[2] + tmp[3];

    return pi * 4.0;
}
inline vector4d operator/(const vector4d& lhs, const vector4d& rhs)
{
    return _mm256_div_pd(lhs, rhs);
}
void core::Vector3::normalize(void)
{
#if defined(VTX_USE_AVX)
    ALIGNED_32 platform::F64_t vector[] = {this->x, this->y, this->z, 0};
    ALIGNED_32 platform::F64_t reciprocalVector[] = {1.0, 1.0, 1.0, 1.0};

    __m256d simdvector;
    __m256d result;
    __m256d recp;

    simdvector = _mm256_load_pd(vector);
    recp = _mm256_load_pd(reciprocalVector);

    // (X^2, Y^2, Z^2, 0^2)
    result = _mm256_mul_pd(simdvector, simdvector);

    // _mm256_hadd_pd only adds within each 128-bit lane, so a cross-lane swap
    // is needed before the horizontal add to get X^2 + Y^2 + Z^2 + 0^2 into
    // every lane. (Two hadds in a row would double the per-lane pairs instead.)
    result = _mm256_add_pd(result, _mm256_permute2f128_pd(result, result, 0x1));
    result = _mm256_hadd_pd(result, result);

    // sqrt(X^2 + Y^2 + Z^2 + 0^2)
    result = _mm256_sqrt_pd(result);
    // 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2)
    result = _mm256_div_pd(recp, result);

    simdvector = _mm256_mul_pd(simdvector, result);
    _mm256_store_pd(vector, simdvector);

    this->x = vector[0];
    this->y = vector[1];
    this->z = vector[2];
#elif defined(VTX_USE_SSE)
    // Must pad with a trailing 0, to store in 128-bit register
    ALIGNED_16 core::F32_t vector[] = {this->x, this->y, this->z, 0};

    __m128 simdvector;
    __m128 result;

    simdvector = _mm_load_ps(vector);

    // (X^2, Y^2, Z^2, 0^2)
    result = _mm_mul_ps(simdvector, simdvector);

    // Add all elements together, giving us (X^2 + Y^2 + Z^2 + 0^2)
    result = _mm_hadd_ps(result, result);
    result = _mm_hadd_ps(result, result);

    // Calculate square root, giving us sqrt(X^2 + Y^2 + Z^2 + 0^2)
    result = _mm_sqrt_ps(result);

    // Calculate reciprocal, giving us 1 / sqrt(X^2 + Y^2 + Z^2 + 0^2).
    // Note: _mm_rcp_ps is only an ~12-bit approximation.
    result = _mm_rcp_ps(result);

    // Finally, multiply the result with our original vector.
    simdvector = _mm_mul_ps(simdvector, result);

    _mm_store_ps(vector, simdvector);

    this->x = vector[0];
    this->y = vector[1];
    this->z = vector[2];
#else
    core::F64_t num = 1.0 / std::sqrt(std::pow(this->x, 2) + std::pow(this->y, 2) + std::pow(this->z, 2));
    this->x *= num;
    this->y *= num;
    this->z *= num;
#endif
}
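/*
 * Caveat on the SSE path above: _mm_rcp_ps is only an ~12-bit reciprocal
 * approximation, so the normalized vector can be visibly off in float. A
 * full-precision alternative for that step is to divide by the length
 * directly (a sketch under the same register layout, where `len` holds
 * sqrt(X^2 + Y^2 + Z^2) broadcast to all four lanes):
 */
static inline __m128 normalize_exact(__m128 v, __m128 len)
{
    return _mm_div_ps(v, len);   // v / |v|, correctly rounded
}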
BI_FORCE_INLINE inline avx_double operator/(const avx_double& o1, const double& o2)
{
    avx_double res;
    res.packed = _mm256_div_pd(o1.packed, _mm256_set1_pd(o2));
    return res;
}
BI_FORCE_INLINE inline avx_double& operator/=(avx_double& o1, const avx_double& o2)
{
    o1.packed = _mm256_div_pd(o1.packed, o2.packed);
    return o1;
}
inline float64x4_t normalize(const float64x4_t ymm)
{
    float64x4_t len0 = simd_geometric::length(ymm);
    float64x4_t div0 = _mm256_div_pd(ymm, len0);   // v / |v|
    return div0;
}
/*!
 * \brief Divide the two given vectors
 */
ETL_STATIC_INLINE(avx_simd_double) div(avx_simd_double lhs, avx_simd_double rhs) {
    return _mm256_div_pd(lhs.value, rhs.value);
}
// Process audio effects for 8 channels simultaneously:
void processEffects(const vec8_i32 &inpSamples, vec8_i32 &outSamples, const long n)
{
    // Extract int samples and convert to doubles in [-1.0, 1.0]:
    const vec4_d64 ds0 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 0)),
        _mm256_set1_pd((double)INT_MAX)
    );
    const vec4_d64 ds1 = _mm256_div_pd(
        _mm256_cvtepi32_pd(_mm256_extractf128_si256(inpSamples, 1)),
        _mm256_set1_pd((double)INT_MAX)
    );

    // Monitor input levels:
    fx.fi_monitor.levels[n + 0] = scalar_to_dBFS(ds0);
    fx.fi_monitor.levels[n + 1] = scalar_to_dBFS(ds1);

    vec4_d64 s0, s1;

    // f0_gain:
    {
        s0 = _mm256_mul_pd(ds0, fx.f0_gain.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(ds1, fx.f0_gain.calc.gain[n + 1]);
    }

    // Monitor levels:
    fx.f0_output.levels[n + 0] = scalar_to_dBFS(s0);
    fx.f0_output.levels[n + 1] = scalar_to_dBFS(s1);

    // f1_compressor:
    {
        const vec4_dBFS l0 = scalar_to_dBFS_offs(s0);
        const vec4_dBFS l1 = scalar_to_dBFS_offs(s1);

        // over = s - thresh
        vec4_dB over0 = _mm256_sub_pd(l0, fx.f1_compressor.input.threshold[n + 0]);
        vec4_dB over1 = _mm256_sub_pd(l1, fx.f1_compressor.input.threshold[n + 1]);

        // over = if over < 0.0 then 0.0 else over
        over0 = mm256_if_then_else(_mm256_cmp_pd(over0, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over0);
        over1 = mm256_if_then_else(_mm256_cmp_pd(over1, _mm256_set1_pd(0.0), _CMP_LT_OQ), _mm256_set1_pd(0.0), over1);

        // over += DC_OFFSET
        over0 = _mm256_add_pd(over0, DC_OFFSET);
        over1 = _mm256_add_pd(over1, DC_OFFSET);

        // env = over + coef * ( env - over )
        const vec4_dB attack_env0 = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB attack_env1 = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.attack_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));
        const vec4_dB release_env0 = _mm256_add_pd(over0, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 0], _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], over0)));
        const vec4_dB release_env1 = _mm256_add_pd(over1, _mm256_mul_pd(fx.f1_compressor.calc.release_coef[n + 1], _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], over1)));

        // env = if over > env then attack_env else release_env
        fx.f1_compressor.state.env[n + 0] = mm256_if_then_else(_mm256_cmp_pd(over0, fx.f1_compressor.state.env[n + 0], _CMP_GT_OQ), attack_env0, release_env0);
        fx.f1_compressor.state.env[n + 1] = mm256_if_then_else(_mm256_cmp_pd(over1, fx.f1_compressor.state.env[n + 1], _CMP_GT_OQ), attack_env1, release_env1);

        // over = env - DC_OFFSET
        over0 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 0], DC_OFFSET);
        over1 = _mm256_sub_pd(fx.f1_compressor.state.env[n + 1], DC_OFFSET);

        // grdB = over * ( ratio - 1.0 )
        vec4_dB gr0dB = _mm256_mul_pd(over0, fx.f1_compressor.calc.ratio_min_1[n + 0]);
        vec4_dB gr1dB = _mm256_mul_pd(over1, fx.f1_compressor.calc.ratio_min_1[n + 1]);

        // gr = dB_to_scalar(grdB)
        fx.f1_compressor.monitor.gain_reduction[n + 0] = dB_to_scalar(gr0dB);
        fx.f1_compressor.monitor.gain_reduction[n + 1] = dB_to_scalar(gr1dB);

        // Apply gain reduction to inputs:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.monitor.gain_reduction[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.monitor.gain_reduction[n + 1]);

        // Apply make-up gain:
        s0 = _mm256_mul_pd(s0, fx.f1_compressor.calc.gain[n + 0]);
        s1 = _mm256_mul_pd(s1, fx.f1_compressor.calc.gain[n + 1]);
    }

    // Monitor output levels:
    fx.fo_monitor.levels[n + 0] = scalar_to_dBFS(s0);
    fx.fo_monitor.levels[n + 1] = scalar_to_dBFS(s1);

    // TODO(jsd): Better limiter implementation!
    // Limit final samples to [-1.0, 1.0]:
    s0 = _mm256_max_pd(_mm256_min_pd(s0, _mm256_set1_pd(1.0)), _mm256_set1_pd(-1.0));
    s1 = _mm256_max_pd(_mm256_min_pd(s1, _mm256_set1_pd(1.0)), _mm256_set1_pd(-1.0));

    // Convert doubles back to 32-bit ints:
    s0 = _mm256_mul_pd(s0, _mm256_set1_pd((double)INT_MAX));
    s1 = _mm256_mul_pd(s1, _mm256_set1_pd((double)INT_MAX));
    const vec8_i32 os = _mm256_setr_m128i(_mm256_cvtpd_epi32(s0), _mm256_cvtpd_epi32(s1));

    // Write outputs with a non-temporal store:
    _mm256_stream_si256(&outSamples, os);
}
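/*
 * processEffects() uses an mm256_if_then_else() helper that is not shown in
 * this listing. The natural AVX definition is a blendv on the comparison mask
 * produced by _mm256_cmp_pd -- a sketch of the assumed helper:
 */
static inline __m256d mm256_if_then_else(__m256d mask, __m256d then_v, __m256d else_v)
{
    // Each mask lane is all-ones (true) or all-zeros (false);
    // blendv picks then_v where the mask sign bit is set.
    return _mm256_blendv_pd(else_v, then_v, mask);
}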