static void apply_window_mp3(float *in, float *win, int *unused, float *out,
                             int incr)
{
	LOCAL_ALIGNED_16(float, suma, [17]);
	LOCAL_ALIGNED_16(float, sumb, [17]);
	LOCAL_ALIGNED_16(float, sumc, [17]);
	LOCAL_ALIGNED_16(float, sumd, [17]);

	float sum;
	int j;
	float *out2 = out + 32 * incr;

	/* copy to avoid wrap */
	memcpy(in + 512, in, 32 * sizeof(*in));

	apply_window(in + 16, win     , win + 512, suma, sumc, 16);
	apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);

	SUM8(MLSS, suma[0], win + 32, in + 48);

	sumc[ 0] = 0;
	sumb[16] = 0;
	sumd[16] = 0;

	out[0  ]  = suma[   0];
	out += incr;
	out2 -= incr;
	for(j=1; j<16; j++)
	{
		*out  =  suma[   j] - sumd[16-j];
		*out2 = -sumb[16-j] - sumc[   j];
		out  += incr;
		out2 -= incr;
	}

	sum = 0;
	SUM8(MLSS, sum, win + 16 + 32, in + 32);
	*out = sum;
}
示例#2
0
/**
 * Arithmetic-throughput kernel: runs eight independent chains of four
 * MULADD steps each, iSize times, then combines the eight results and
 * stores them in s[0].
 *
 * All eight chains are seeded from the same value, s[0]; no other element
 * of s is read or written.
 *
 * NOTE(review): MULADD and the eight-argument SUM8 are defined elsewhere;
 * presumably MULADD(m, x, a) evaluates m * x + a and SUM8 adds its
 * arguments - confirm against their definitions.
 *
 * @param s      in/out buffer; only s[0] is accessed
 * @param iSize  number of loop iterations; the loop body does not run
 *               when iSize <= 0
 */
void ComputePower(double * s, int iSize)
{
	// Must be an automatic variable: a `static` local would be initialised
	// only once (C++) or be rejected outright for its non-constant
	// initialiser (C), so later calls would ignore their own iSize.
	const int N = iSize;

	// store coefficients in registers
	// (the `f` suffix makes these float-precision values widened to
	// double; kept as-is so numeric results do not change)
	const double a0 = 0.01f;
	const double a1 = 0.035f;
	const double a2 = 0.001f;
	const double a3 = 0.15f;

	const double b0 = 0.012f;
	const double b1 = 0.067f;
	const double b2 = 0.02f;
	const double b3 = 0.21f;

	// 8 independent streams of computations
	// this helps filling the processing pipeline
	// 8 data reads (all of the same element, s[0])
	double x0 = s[0];
	double y0 = s[0];
	double z0 = s[0];
	double w0 = s[0];
	double r0 = s[0];
	double t0 = s[0];
	double u0 = s[0];
	double v0 = s[0];

	// compute independently 8 polynomial evaluations with horner's scheme of degree 4N
	// 4*8*2*N FLOP
	for (int i = 0; i < N; i++)
	{
		// step 1 of 4: coefficients (b0, a0)
		double gx0 = MULADD(b0, x0, a0);
		double gy0 = MULADD(b0, y0, a0);
		double gz0 = MULADD(b0, z0, a0);
		double gw0 = MULADD(b0, w0, a0);
		double gr0 = MULADD(b0, r0, a0);
		double gt0 = MULADD(b0, t0, a0);
		double gu0 = MULADD(b0, u0, a0);
		double gv0 = MULADD(b0, v0, a0);

		// step 2 of 4: coefficients (b1, a1)
		double tx0 = MULADD(b1, gx0, a1);
		double ty0 = MULADD(b1, gy0, a1);
		double tz0 = MULADD(b1, gz0, a1);
		double tw0 = MULADD(b1, gw0, a1);
		double tr0 = MULADD(b1, gr0, a1);
		double tt0 = MULADD(b1, gt0, a1);
		double tu0 = MULADD(b1, gu0, a1);
		double tv0 = MULADD(b1, gv0, a1);

		// step 3 of 4: coefficients (b2, a2)
		double ux0 = MULADD(b2, tx0, a2);
		double uy0 = MULADD(b2, ty0, a2);
		double uz0 = MULADD(b2, tz0, a2);
		double uw0 = MULADD(b2, tw0, a2);
		double ur0 = MULADD(b2, tr0, a2);
		double ut0 = MULADD(b2, tt0, a2);
		double uu0 = MULADD(b2, tu0, a2);
		double uv0 = MULADD(b2, tv0, a2);

		// step 4 of 4: coefficients (b3, a3); feeds the next iteration
		x0 = MULADD(b3, ux0, a3);
		y0 = MULADD(b3, uy0, a3);
		z0 = MULADD(b3, uz0, a3);
		w0 = MULADD(b3, uw0, a3);
		r0 = MULADD(b3, ur0, a3);
		t0 = MULADD(b3, ut0, a3);
		u0 = MULADD(b3, uu0, a3);
		v0 = MULADD(b3, uv0, a3);
	}

	// sum the results of the 8 polynomial evaluations
	// and store in the output array
	// without this step, the compiler might simplify the code by discarding unused computation and data
	// 1 data write, 7 FLOP (irrelevant if N is large enough)
	s[0] = SUM8(x0, y0, z0, w0, r0, t0, u0, v0);
}