static void apply_window_mp3(float *in, float *win, int *unused, float *out, int incr) { LOCAL_ALIGNED_16(float, suma, [17]); LOCAL_ALIGNED_16(float, sumb, [17]); LOCAL_ALIGNED_16(float, sumc, [17]); LOCAL_ALIGNED_16(float, sumd, [17]); float sum; int j; float *out2 = out + 32 * incr; /* copy to avoid wrap */ memcpy(in + 512, in, 32 * sizeof(*in)); apply_window(in + 16, win , win + 512, suma, sumc, 16); apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); SUM8(MLSS, suma[0], win + 32, in + 48); sumc[ 0] = 0; sumb[16] = 0; sumd[16] = 0; out[0 ] = suma[ 0]; out += incr; out2 -= incr; for(j=1; j<16; j++) { *out = suma[ j] - sumd[16-j]; *out2 = -sumb[16-j] - sumc[ j]; out += incr; out2 -= incr; } sum = 0; SUM8(MLSS, sum, win + 16 + 32, in + 32); *out = sum; }
// Compute-bound kernel: runs eight independent chains of nested
// multiply-add evaluations, all seeded from s[0], for iSize iterations and
// writes the combined result back to s[0].
//
// s     - in/out array; only s[0] is read and written.
// iSize - number of loop iterations; a value <= 0 skips the loop entirely.
//
// MULADD and SUM8 are macros supplied elsewhere; presumably MULADD(a, b, c)
// evaluates a * b + c and SUM8 adds its eight arguments - confirm against
// their definitions.
void ComputePower(double *s, int iSize)
{
    // The iteration count must follow the argument on every call. It used to
    // be a 'static' local, which is initialised only once (and is not even a
    // valid initialiser in C), so later calls reused the first call's count.
    const int N = iSize;

    // Polynomial coefficients. The 'f' suffix is kept on purpose: the values
    // are float-rounded constants widened to double, and dropping the suffix
    // would change the numeric results.
    const double a0 = 0.01f;
    const double a1 = 0.035f;
    const double a2 = 0.001f;
    const double a3 = 0.15f;
    const double b0 = 0.012f;
    const double b1 = 0.067f;
    const double b2 = 0.02f;
    const double b3 = 0.21f;

    // 8 independent streams of computation; this helps fill the processing
    // pipeline. Every stream starts from the same input value.
    double x0 = s[0];
    double y0 = s[0];
    double z0 = s[0];
    double w0 = s[0];
    double r0 = s[0];
    double t0 = s[0];
    double u0 = s[0];
    double v0 = s[0];

    // Each iteration applies four chained MULADD stages to each of the eight
    // streams (Horner-style evaluation): 4 * 8 * 2 * N FLOP in total.
    for (int i = 0; i < N; i++) {
        // stage 0
        double gx0 = MULADD(b0, x0, a0);
        double gy0 = MULADD(b0, y0, a0);
        double gz0 = MULADD(b0, z0, a0);
        double gw0 = MULADD(b0, w0, a0);
        double gr0 = MULADD(b0, r0, a0);
        double gt0 = MULADD(b0, t0, a0);
        double gu0 = MULADD(b0, u0, a0);
        double gv0 = MULADD(b0, v0, a0);

        // stage 1
        double tx0 = MULADD(b1, gx0, a1);
        double ty0 = MULADD(b1, gy0, a1);
        double tz0 = MULADD(b1, gz0, a1);
        double tw0 = MULADD(b1, gw0, a1);
        double tr0 = MULADD(b1, gr0, a1);
        double tt0 = MULADD(b1, gt0, a1);
        double tu0 = MULADD(b1, gu0, a1);
        double tv0 = MULADD(b1, gv0, a1);

        // stage 2
        double ux0 = MULADD(b2, tx0, a2);
        double uy0 = MULADD(b2, ty0, a2);
        double uz0 = MULADD(b2, tz0, a2);
        double uw0 = MULADD(b2, tw0, a2);
        double ur0 = MULADD(b2, tr0, a2);
        double ut0 = MULADD(b2, tt0, a2);
        double uu0 = MULADD(b2, tu0, a2);
        double uv0 = MULADD(b2, tv0, a2);

        // stage 3: feed the result back into the stream state
        x0 = MULADD(b3, ux0, a3);
        y0 = MULADD(b3, uy0, a3);
        z0 = MULADD(b3, uz0, a3);
        w0 = MULADD(b3, uw0, a3);
        r0 = MULADD(b3, ur0, a3);
        t0 = MULADD(b3, ut0, a3);
        u0 = MULADD(b3, uu0, a3);
        v0 = MULADD(b3, uv0, a3);
    }

    // Combine the eight streams and store the result in the output array.
    // Without this step the compiler might discard the computation as unused.
    // 1 data write, 7 FLOP (irrelevant if N is large enough).
    s[0] = SUM8(x0, y0, z0, w0, r0, t0, u0, v0);
}