// Sse version of RevbinPermuteInv function. static void RevbinPermuteInvSse(const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT j; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; const OMX_F32 *tw; const OMX_F32 *pi; const OMX_F32 *pj; VC v_i; VC v_j; VC v_big_a; VC v_big_b; VC v_temp; VC v_tw; for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { pi = in + (i << 1); pj = in + (j << 1); VC_LOAD_INTERLEAVE(&v_i, pi); v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]); v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]); // A[k] = (X[k] + X'[N/2 - k]) VC_ADD_SUB(&v_big_a, &v_i, &v_j); // temp = (X[k] - X'[N/2 - k]) VC_SUB_ADD(&v_temp, &v_i, &v_j); // W[k] tw = twiddle + i; VC_LOAD_SPLIT(&v_tw, tw, n); // B[k] = (X[k] - X'[N/2 - k]) * W[k] VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw); // Convert split format to interleaved format. // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) // The scaling of 1/2 will be merged into to the scaling in // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2); VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2); } // The n_by_2 complex point out[n_by_4] = 2.0f * in[n_by_2]; out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; // The first complex point out[0] = in[0] + in[n]; out[n_by_2] = in[0] - in[n]; }
// First-stage radix-4 pass of the inverse complex FFT (SSE, split format).
// The first stage uses RADIX4_BUTTERFLY_FS, which takes no twiddle
// arguments, so no twiddle loads are needed here.  Four complex points are
// processed per iteration.
void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    OMX_INT n) {
  OMX_INT idx;
  OMX_INT quarter = n >> 2;
  OMX_F32 *dst0 = out;

  for (idx = 0; idx < quarter; idx += 4) {
    VC a0;
    VC a1;
    VC a2;
    VC a3;
    VC b0;
    VC b1;
    VC b2;
    VC b3;

    // The four butterfly inputs sit a quarter of the transform apart.
    const OMX_F32 *src0 = in + idx;
    const OMX_F32 *src1 = src0 + quarter;
    const OMX_F32 *src2 = src1 + quarter;
    const OMX_F32 *src3 = src2 + quarter;

    // Output pointers are re-derived from dst0, which advances by four
    // points per iteration.
    OMX_F32 *dst1 = dst0 + quarter;
    OMX_F32 *dst2 = dst1 + quarter;
    OMX_F32 *dst3 = dst2 + quarter;

    VC_LOAD_SPLIT(&a0, src0, n);
    VC_LOAD_SPLIT(&a1, src1, n);
    VC_LOAD_SPLIT(&a2, src2, n);
    VC_LOAD_SPLIT(&a3, src3, n);

    RADIX4_BUTTERFLY_FS(&b0, &b1, &b2, &b3,
                        &a0, &a1, &a2, &a3);

    RADIX4_INV_BUTTERFLY_STORE(dst0, dst1, dst2, dst3,
                               &b0, &b1, &b2, &b3, n);

    dst0 += 4;
  }
}
// Sse radix-4 middle-stage (ms) pass of the inverse complex FFT, operating
// on split-format data.
//
//   in/out   - split-format complex buffers (VC_LOAD_SPLIT derives the
//              imaginary-part offset from n).
//   twiddle  - twiddle table; imaginary parts are stored n_mul_2 floats
//              after the corresponding real parts (see the _mm_load1_ps
//              pairs below).
//   sub_size - number of twiddle groups processed in this stage (grp loop).
//   sub_num  - stride between butterfly inputs within a group; presumably
//              the number of sub-FFTs at this stage — confirm with the
//              radix-4 driver that calls this.
void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n,
    OMX_INT sub_size,
    OMX_INT sub_num) {
  OMX_INT set;
  OMX_INT grp;
  OMX_INT step = sub_num >> 1;
  OMX_INT set_count = sub_num >> 2;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;
  OMX_F32 *out0 = out;

  // Too few sets for the 4-wide loops below; delegate to the dedicated
  // 2x-unrolled implementation and stop.
  if (set_count == 2) {
    InternalUnroll2Inv(in, out, twiddle, n);
    return;
  }

  // grp == 0
  // The first group needs no twiddle multiplies (RADIX4_BUTTERFLY_FS takes
  // no twiddle arguments), so it is peeled out of the main loop.
  for (set = 0; set < set_count; set += 4) {
    const OMX_F32 *in0 = in + set;
    const OMX_F32 *in1 = in0 + set_count;
    const OMX_F32 *in2 = in1 + set_count;
    const OMX_F32 *in3 = in2 + set_count;
    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

    VC_LOAD_SPLIT(&v_t0, in0, n);
    VC_LOAD_SPLIT(&v_t1, in1, n);
    VC_LOAD_SPLIT(&v_t2, in2, n);
    VC_LOAD_SPLIT(&v_t3, in3, n);

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
                        &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }

  for (grp = 1; grp < sub_size; ++grp) {
    const OMX_F32 *tw1 = twiddle + grp * step;
    const OMX_F32 *tw2 = tw1 + grp * step;
    const OMX_F32 *tw3 = tw2 + grp * step;
    VC v_tw1;
    VC v_tw2;
    VC v_tw3;

    // Broadcast the three twiddles for this group into all four lanes;
    // the twiddles are constant across the inner set loop, so the loads
    // are hoisted out of it.  Imaginary parts live n_mul_2 floats after
    // the real parts in the table.
    v_tw1.real = _mm_load1_ps(tw1);
    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
    v_tw2.real = _mm_load1_ps(tw2);
    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
    v_tw3.real = _mm_load1_ps(tw3);
    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);

    for (set = 0; set < set_count; set += 4) {
      const OMX_F32 *in0 = in + set + grp * sub_num;
      const OMX_F32 *in1 = in0 + set_count;
      const OMX_F32 *in2 = in1 + set_count;
      const OMX_F32 *in3 = in2 + set_count;
      VC v_t0;
      VC v_t1;
      VC v_t2;
      VC v_t3;
      VC v_t4;
      VC v_t5;
      VC v_t6;
      VC v_t7;

      VC_LOAD_SPLIT(&v_t0, in0, n);
      VC_LOAD_SPLIT(&v_t1, in1, n);
      VC_LOAD_SPLIT(&v_t2, in2, n);
      VC_LOAD_SPLIT(&v_t3, in3, n);

      OMX_F32 *out1 = out0 + n_by_4;
      OMX_F32 *out2 = out1 + n_by_4;
      OMX_F32 *out3 = out2 + n_by_4;

      RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                           &v_tw1, &v_tw2, &v_tw3,
                           &v_t0, &v_t1, &v_t2, &v_t3);

      RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                                 &v_t4, &v_t5, &v_t6, &v_t7, n);

      out0 += 4;
    }
  }
}
// Sse version of RevbinPermuteFwd function. static void RevbinPermuteFwdSse( const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT j; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; VC v_i; VC v_j; VC v_big_a; VC v_big_b; VC v_temp; VC v_x0; VC v_x1; VC v_tw; __m128 factor = _mm_set1_ps(0.5f); for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { VC_LOAD_SPLIT(&v_i, (in + i), n_by_2); VC_LOADU_SPLIT(&v_j, (in + j), n_by_2); VC_REVERSE(&v_j); // A[k] = (Z[k] + Z'[N/2 - k]) VC_ADD_SUB(&v_big_a, &v_j, &v_i); // B[k] = -j * (Z[k] - Z'[N/2 - k]) VC_SUB_ADD(&v_big_b, &v_j, &v_i); // W[k] VC_LOAD_SPLIT(&v_tw, (twiddle + i), n); // temp = B[k] * W[k] VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw); VC_SUB_X(&v_x0, &v_big_a, &v_temp); VC_ADD_X(&v_x1, &v_big_a, &v_temp); VC_MUL_F(&v_x0, &v_x0, factor); VC_MUL_F(&v_x1, &v_x1, factor); // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1) VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0); // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) VC_REVERSE(&v_x1); VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1); } out[n_by_2] = in[n_by_4]; out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; out[0] = in[0] + in[n_by_2]; out[1] = 0; out[n] = in[0] - in[n_by_2]; out[n + 1] = 0; }
// Computes the inverse FFT of a conjugate-symmetric (CCS) spectrum,
// producing a real result scaled by 1/N.
//
//   pSrc     - CCS input; must be non-NULL and 32-byte aligned.
//   pDst     - real output; must be non-NULL and 32-byte aligned.
//   pFFTSpec - specification created for this transform size; actually an
//              X86FFTSpec_R_FC32 providing N, the twiddle table and two
//              scratch buffers.
//
// Returns OMX_Sts_BadArgErr on a NULL or misaligned pointer, otherwise
// OMX_Sts_NoErr.
//
// Changes from the previous revision: the input pointer is kept const
// (both RevbinPermuteInv and RevbinPermuteInvSse take const OMX_F32 *, so
// the old cast-away-const was unnecessary), and the meaningless `const`
// qualifier in the alignment-check cast was dropped.
OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc,
                                      OMX_F32 *pDst,
                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
  OMX_INT n;
  OMX_INT n_by_2;
  OMX_INT n_by_4;
  OMX_INT i;
  const OMX_F32 *twiddle;
  OMX_F32 *buf;
  // The permute helpers only read the input, so no const is cast away.
  const OMX_F32 *in = pSrc;

  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;

  // Input and output must be 32 byte aligned for the SSE loads/stores.
  if (!pSrc || !pDst || (uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31)
    return OMX_Sts_BadArgErr;

  n = pFFTStruct->N;

  // This is to handle the case of order == 1: a two-point transform is
  // computed directly.
  if (n == 2) {
    pDst[0] = (pSrc[0] + pSrc[2]) / 2;
    pDst[1] = (pSrc[0] - pSrc[2]) / 2;
    return OMX_Sts_NoErr;
  }

  n_by_2 = n >> 1;
  n_by_4 = n >> 2;
  buf = pFFTStruct->pBuf1;
  twiddle = pFFTStruct->pTwiddle;

  // Fold the CCS input into the N/2-point complex sequence consumed by the
  // complex inverse FFT kernels; small sizes use the scalar path.
  if (n < 8)
    RevbinPermuteInv(in, buf, twiddle, n);
  else
    RevbinPermuteInvSse(in, buf, twiddle, n);

  // Run the N/2-point complex inverse FFT; the kernel returns whichever of
  // the two ping-pong buffers holds the final result.
  if (n_by_2 < 16) {
    buf = x86SP_F32_radix2_kernel_OutOfPlace(
        buf,
        pFFTStruct->pBuf2,
        buf,
        twiddle,
        n_by_2,
        0);
  } else {
    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
        buf,
        pFFTStruct->pBuf2,
        buf,
        twiddle,
        n_by_2,
        0);
  }

  // Scale the result by 1/n.
  // It contains a scaling factor of 1/2 in
  // RevbinPermuteInv/RevbinPermuteInvSse.
  OMX_F32 factor = 1.0f / n;

  if (n < 8) {
    // Scalar path: scale and convert split format to interleaved.
    for (i = 0; i < n_by_2; i++) {
      pDst[i << 1] = buf[i] * factor;
      pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
    }
  } else {
    OMX_F32 *base;
    OMX_F32 *dst;
    VC temp0;
    VC temp1;
    __m128 mFactor = _mm_load1_ps(&factor);

    // Two things are done in this loop:
    // 1 Get the result scaled; 2 Change the format from split to interleaved.
    for (i = 0; i < n_by_2; i += 4) {
      base = buf + i;
      dst = pDst + (i << 1);
      VC_LOAD_SPLIT(&temp0, base, n_by_2);
      VC_MUL_F(&temp1, &temp0, mFactor);
      VC_STORE_INTERLEAVE(dst, &temp1);
    }
  }

  return OMX_Sts_NoErr;
}