コード例 #1
0
ファイル: mdct_sse.c プロジェクト: 9060/wavtoac3encoder
/** 32 point butterfly (in place, 4 register) */
static void
mdct_butterfly_32_sse(FLOAT *x) {
    static _MM_ALIGN16 const float PFV0[4] = { -AFT_PI3_8, -AFT_PI1_8,
                                               -AFT_PI2_8, -AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV1[4] = { -AFT_PI1_8,  AFT_PI3_8,
                                               -AFT_PI2_8,  AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV2[4] = { -AFT_PI1_8, -AFT_PI3_8,
                                                     -1.f,        1.f };
    static _MM_ALIGN16 const float PFV3[4] = { -AFT_PI3_8,  AFT_PI1_8,
                                                      0.f,        0.f };
    static _MM_ALIGN16 const float PFV4[4] = {  AFT_PI3_8,  AFT_PI3_8,
                                                AFT_PI2_8,  AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV5[4] = { -AFT_PI1_8,  AFT_PI1_8,
                                               -AFT_PI2_8,  AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV6[4] = {  AFT_PI1_8,  AFT_PI3_8,
                                                      1.f,        1.f };
    static _MM_ALIGN16 const float PFV7[4] = { -AFT_PI3_8,  AFT_PI1_8,
                                                      0.f,        0.f };
    __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7;

    XMM0     = _mm_load_ps(x+16);
    XMM1     = _mm_load_ps(x+20);
    XMM2     = _mm_load_ps(x+24);
    XMM3     = _mm_load_ps(x+28);
    XMM4     = XMM0;
    XMM5     = XMM1;
    XMM6     = XMM2;
    XMM7     = XMM3;

    XMM0     = _mm_sub_ps(XMM0, PM128(x   ));
    XMM1     = _mm_sub_ps(XMM1, PM128(x+ 4));
    XMM2     = _mm_sub_ps(XMM2, PM128(x+ 8));
    XMM3     = _mm_sub_ps(XMM3, PM128(x+12));
    XMM4     = _mm_add_ps(XMM4, PM128(x   ));
    XMM5     = _mm_add_ps(XMM5, PM128(x+ 4));
    XMM6     = _mm_add_ps(XMM6, PM128(x+ 8));
    XMM7     = _mm_add_ps(XMM7, PM128(x+12));
    _mm_store_ps(x+16, XMM4);
    _mm_store_ps(x+20, XMM5);
    _mm_store_ps(x+24, XMM6);
    _mm_store_ps(x+28, XMM7);

    XMM4     = XMM0;
    XMM5     = XMM1;
    XMM6     = XMM2;
    XMM7     = XMM3;

    XMM0     = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(3,3,1,1));
    XMM4     = _mm_shuffle_ps(XMM4, XMM4, _MM_SHUFFLE(2,2,0,0));

    XMM1     = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(2,3,1,1));
    XMM5     = _mm_shuffle_ps(XMM5, XMM5, _MM_SHUFFLE(2,3,0,0));
    XMM2     = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,2,1,0));
    XMM6     = _mm_shuffle_ps(XMM6, XMM6, _MM_SHUFFLE(3,3,0,1));
    XMM3     = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(3,2,0,0));
    XMM7     = _mm_shuffle_ps(XMM7, XMM7, _MM_SHUFFLE(3,2,1,1));
    XMM0     = _mm_mul_ps(XMM0, PM128(PFV0));
    XMM4     = _mm_mul_ps(XMM4, PM128(PFV1));
    XMM1     = _mm_mul_ps(XMM1, PM128(PFV2));
    XMM5     = _mm_mul_ps(XMM5, PM128(PFV3));
    XMM2     = _mm_mul_ps(XMM2, PM128(PFV4));
    XMM6     = _mm_mul_ps(XMM6, PM128(PFV5));
    XMM3     = _mm_mul_ps(XMM3, PM128(PFV6));
    XMM7     = _mm_mul_ps(XMM7, PM128(PFV7));
    XMM0     = _mm_add_ps(XMM0, XMM4);
    XMM1     = _mm_add_ps(XMM1, XMM5);
    XMM2     = _mm_add_ps(XMM2, XMM6);
    XMM3     = _mm_add_ps(XMM3, XMM7);
    _mm_store_ps(x   , XMM0);
    _mm_store_ps(x+ 4, XMM1);
    _mm_store_ps(x+ 8, XMM2);
    _mm_store_ps(x+12, XMM3);

    mdct_butterfly_16_sse(x);
    mdct_butterfly_16_sse(x+16);
}
コード例 #2
0
ファイル: PhaseVocoderDSP.cpp プロジェクト: krkrz/krkrz
//---------------------------------------------------------------------------
void tRisaPhaseVocoderDSP::ProcessCore_sse(int ch)
{
	unsigned int framesize_d2 = FrameSize / 2;
	float * analwork = AnalWork[ch];
	float * synthwork = SynthWork[ch];

	// 丸めモードを設定
	SetRoundingModeToNearest_SSE();

	// FFT を実行する
	rdft(FrameSize, 1, analwork, FFTWorkIp, FFTWorkW); // Real DFT
	analwork[1] = 0.0; // analwork[1] = nyquist freq. power (どっちみち使えないので0に)

	__m128 exact_time_scale = _mm_load1_ps(&ExactTimeScale);
	__m128 over_sampling_radian_v = _mm_load1_ps(&OverSamplingRadian);

	if(FrequencyScale != 1.0)
	{
		// ここでは 4 複素数 (8実数) ごとに処理を行う。
		__m128 over_sampling_radian_recp = _mm_load1_ps(&OverSamplingRadianRecp);
		__m128 frequency_per_filter_band = _mm_load1_ps(&FrequencyPerFilterBand);
		__m128 frequency_per_filter_band_recp = _mm_load1_ps(&FrequencyPerFilterBandRecp);

		for(unsigned int i = 0; i < framesize_d2; i += 4)
		{
			// インターリーブ解除 +  直交座標系→極座標系
			__m128 aw3120 = *(__m128*)(analwork + i*2    );
			__m128 aw7654 = *(__m128*)(analwork + i*2 + 4);

			__m128 re3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(2,0,2,0));
			__m128 im3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(3,1,3,1));

			__m128 mag = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(re3210,re3210), _mm_mul_ps(im3210,im3210)));
			__m128 ang = VFast_arctan2_F4_SSE(im3210, re3210);

			// 前回の位相との差をとる
			__m128 lastp = *(__m128*)(LastAnalPhase[ch] + i);
			*(__m128*)(LastAnalPhase[ch] + i) = ang;
			ang = _mm_sub_ps(lastp, ang);

			// over sampling の影響を考慮する
			__m128 i_3210;
			i_3210 = _mm_cvtsi32_ss(i_3210, i);
			i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0));
			i_3210 = _mm_add_ps( i_3210, PM128(PFV_INIT) );

			__m128 phase_shift = _mm_mul_ps(i_3210, over_sampling_radian_v);
			ang = _mm_sub_ps( ang, phase_shift );

			// unwrapping をする
			ang = Wrap_Pi_F4_SSE(ang);

			// -M_PI~+M_PIを-1.0~+1.0の変位に変換
			ang = _mm_mul_ps( ang, over_sampling_radian_recp );

			// tmp をフィルタバンド中央からの周波数の変位に変換し、
			// それにフィルタバンドの中央周波数を加算する
			__m128 freq = _mm_mul_ps( _mm_add_ps(ang, i_3210), frequency_per_filter_band );

			// analwork に値を格納する
			re3210 = mag;
			im3210 = freq;
			__m128 im10re10 = _mm_movelh_ps(re3210, im3210);
			__m128 im32re32 = _mm_movehl_ps(im3210, re3210);
			__m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0));
			__m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0));
			*(__m128*)(analwork + i*2    ) = im1re1im0re0;
			*(__m128*)(analwork + i*2 + 4) = im3re3im2re2;
		}


		//------------------------------------------------
		// 変換
		//------------------------------------------------
		// 周波数軸方向のリサンプリングを行う
		float FrequencyScale_rcp = 1.0f / FrequencyScale;
		for(unsigned int i = 0; i < framesize_d2; i ++)
		{
			// i に対応するインデックスを得る
			float fi = i * FrequencyScale_rcp;

			// floor(x) と floor(x) + 1 の間でバイリニア補間を行う
			unsigned int index = static_cast<unsigned int>(fi); // floor
			float frac = fi - index;

			if(index + 1 < framesize_d2)
			{
				synthwork[i*2  ] =
					analwork[index*2  ] +
					frac * (analwork[index*2+2]-analwork[index*2  ]);
				synthwork[i*2+1] =
					FrequencyScale * (
					analwork[index*2+1] +
					frac * (analwork[index*2+3]-analwork[index*2+1]) );
			}
			else if(index < framesize_d2)
			{
				synthwork[i*2  ] = analwork[index*2  ];
				synthwork[i*2+1] = analwork[index*2+1] * FrequencyScale;
			}
			else
			{
				synthwork[i*2  ] = 0.0;
				synthwork[i*2+1] = 0.0;
			}
		}

		//------------------------------------------------
		// 合成
		//------------------------------------------------

		// 各フィルタバンドごとに変換
		// 基本的には解析の逆変換である
		for(unsigned int i = 0; i < framesize_d2; i += 4)
		{
			// インターリーブ解除
			__m128 sw3120 = *(__m128*)(synthwork + i*2    );
			__m128 sw7654 = *(__m128*)(synthwork + i*2 + 4);

			__m128 mag  = _mm_shuffle_ps(sw3120, sw7654, _MM_SHUFFLE(2,0,2,0));
			__m128 freq = _mm_shuffle_ps(sw3120, sw7654, _MM_SHUFFLE(3,1,3,1));

			// i+3 i+2 i+1 i+0 を準備
			__m128 i_3210;
			i_3210 = _mm_cvtsi32_ss(i_3210, i);
			i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0));
			i_3210 = _mm_add_ps(i_3210, PM128(PFV_INIT));

			// 周波数から各フィルタバンドの中央周波数を減算し、
			// フィルタバンドの中央周波数からの-1.0~+1.0の変位
			// に変換する
			__m128 ang = _mm_sub_ps(_mm_mul_ps(freq, frequency_per_filter_band_recp), i_3210);

			// -1.0~+1.0の変位を-M_PI~+M_PIの位相に変換
			ang = _mm_mul_ps( ang, over_sampling_radian_v );

			// OverSampling による位相の補正
			ang = _mm_add_ps( ang, _mm_mul_ps( i_3210, over_sampling_radian_v ) );

			// TimeScale による位相の補正
			ang = _mm_mul_ps( ang, exact_time_scale );

			// 前回の位相と加算する
			// ここでも虚数部の符号が逆になるので注意
			ang = _mm_sub_ps( *(__m128*)(LastSynthPhase[ch] + i), ang );
			*(__m128*)(LastSynthPhase[ch] + i) = ang;

			// 極座標系→直交座標系
			__m128 sin, cos;
			VFast_sincos_F4_SSE(ang, sin, cos);
			__m128 re3210 = _mm_mul_ps( mag, cos );
			__m128 im3210 = _mm_mul_ps( mag, sin );

			// インターリーブ
			__m128 im10re10 = _mm_movelh_ps(re3210, im3210);
			__m128 im32re32 = _mm_movehl_ps(im3210, re3210);
			__m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0));
			__m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0));
			*(__m128*)(synthwork + i*2    ) = im1re1im0re0;
			*(__m128*)(synthwork + i*2 + 4) = im3re3im2re2;
		}
	}
	else
	{
		// 周波数軸方向にシフトがない場合
		// ここでも 4 複素数 (8実数) ごとに処理を行う。
		for(unsigned int i = 0; i < framesize_d2; i += 4)
		{
			// インターリーブ解除 +  直交座標系→極座標系
			__m128 aw3120 = *(__m128*)(analwork + i*2    );
			__m128 aw7654 = *(__m128*)(analwork + i*2 + 4);

			__m128 re3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(2,0,2,0));
			__m128 im3210 = _mm_shuffle_ps(aw3120, aw7654, _MM_SHUFFLE(3,1,3,1));

			__m128 mag = _mm_sqrt_ps( _mm_add_ps(_mm_mul_ps(re3210,re3210), _mm_mul_ps(im3210,im3210)) );
			__m128 ang = VFast_arctan2_F4_SSE(im3210, re3210);

			// 前回の位相との差をとる
			__m128 lastp = *(__m128*)(LastAnalPhase[ch] + i);
			*(__m128*)(LastAnalPhase[ch] + i) = ang;
			ang = _mm_sub_ps( lastp, ang );

			// over sampling の影響を考慮する
			__m128 i_3210;
			i_3210 = _mm_cvtsi32_ss(i_3210, i);
			i_3210 = _mm_shuffle_ps(i_3210, i_3210, _MM_SHUFFLE(0,0,0,0));
			i_3210 = _mm_add_ps( i_3210, PM128(PFV_INIT) );

			__m128 phase_shift = _mm_mul_ps( i_3210, over_sampling_radian_v );
			ang = _mm_sub_ps( ang, phase_shift );

			// unwrapping をする
			ang = Wrap_Pi_F4_SSE(ang);

			// OverSampling による位相の補正
			ang = _mm_add_ps( ang, phase_shift );

			// TimeScale による位相の補正
			ang = _mm_mul_ps( ang, exact_time_scale );

			// 前回の位相と加算する
			// ここでも虚数部の符号が逆になるので注意
			ang = _mm_sub_ps( *(__m128*)(LastSynthPhase[ch] + i), ang );
			*(__m128*)(LastSynthPhase[ch] + i) = ang;

			// 極座標系→直交座標系
			__m128 sin, cos;
			VFast_sincos_F4_SSE(ang, sin, cos);
			re3210 = _mm_mul_ps( mag, cos );
			im3210 = _mm_mul_ps( mag, sin );

			// インターリーブ
			__m128 im10re10 = _mm_movelh_ps(re3210, im3210);
			__m128 im32re32 = _mm_movehl_ps(im3210, re3210);
			__m128 im1re1im0re0 = _mm_shuffle_ps(im10re10, im10re10, _MM_SHUFFLE(3,1,2,0));
			__m128 im3re3im2re2 = _mm_shuffle_ps(im32re32, im32re32, _MM_SHUFFLE(3,1,2,0));
			*(__m128*)(synthwork + i*2    ) = im1re1im0re0;
			*(__m128*)(synthwork + i*2 + 4) = im3re3im2re2;
		}
	}

	// FFT を実行する
	synthwork[1] = 0.0; // synthwork[1] = nyquist freq. power (どっちみち使えないので0に)
	rdft_sse(FrameSize, -1, synthwork, FFTWorkIp, FFTWorkW); // Inverse Real DFT
}