// Sse version of RevbinPermuteFwd function. static void RevbinPermuteFwdSse( const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT j; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; VC v_i; VC v_j; VC v_big_a; VC v_big_b; VC v_temp; VC v_x0; VC v_x1; VC v_tw; __m128 factor = _mm_set1_ps(0.5f); for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { VC_LOAD_SPLIT(&v_i, (in + i), n_by_2); VC_LOADU_SPLIT(&v_j, (in + j), n_by_2); VC_REVERSE(&v_j); // A[k] = (Z[k] + Z'[N/2 - k]) VC_ADD_SUB(&v_big_a, &v_j, &v_i); // B[k] = -j * (Z[k] - Z'[N/2 - k]) VC_SUB_ADD(&v_big_b, &v_j, &v_i); // W[k] VC_LOAD_SPLIT(&v_tw, (twiddle + i), n); // temp = B[k] * W[k] VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw); VC_SUB_X(&v_x0, &v_big_a, &v_temp); VC_ADD_X(&v_x1, &v_big_a, &v_temp); VC_MUL_F(&v_x0, &v_x0, factor); VC_MUL_F(&v_x1, &v_x1, factor); // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1) VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0); // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) VC_REVERSE(&v_x1); VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1); } out[n_by_2] = in[n_by_4]; out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; out[0] = in[0] + in[n_by_2]; out[1] = 0; out[n] = in[0] - in[n_by_2]; out[n + 1] = 0; }
// Inverse FFT entry point: transforms the spectrum in |pSrc| into |pDst|,
// scaling the result by 1/N where N is taken from |pFFTSpec|.
//
// pSrc:     input data; must be non-NULL and 32-byte aligned.
// pDst:     output buffer; must be non-NULL and 32-byte aligned.
// pFFTSpec: FFT specification, read here as an X86FFTSpec_R_FC32 (supplies
//           N, pTwiddle, pBuf1 and pBuf2); must be non-NULL.
//
// Returns OMX_Sts_BadArgErr if any pointer is NULL or if pSrc/pDst is not
// 32-byte aligned, and OMX_Sts_NoErr otherwise.
OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc,
                                      OMX_F32 *pDst,
                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
  OMX_INT n;
  OMX_INT n_by_2;
  OMX_INT i;
  const OMX_F32 *twiddle;
  OMX_F32 *buf;
  // NOTE(review): this cast drops const. It is kept because the signatures of
  // RevbinPermuteInv/RevbinPermuteInvSse are not visible from here.
  OMX_F32 *in = (OMX_F32*) pSrc;
  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;

  // All three pointers must be valid (the spec is dereferenced just below),
  // and the input and output must be 32 byte aligned.
  if (!pSrc || !pDst || !pFFTSpec ||
      ((uintptr_t)pSrc & 31) || ((uintptr_t)pDst & 31)) {
    return OMX_Sts_BadArgErr;
  }

  n = pFFTStruct->N;

  // This is to handle the case of order == 1.
  if (n == 2) {
    pDst[0] = (pSrc[0] + pSrc[2]) / 2;
    pDst[1] = (pSrc[0] - pSrc[2]) / 2;
    return OMX_Sts_NoErr;
  }

  n_by_2 = n >> 1;
  buf = pFFTStruct->pBuf1;
  twiddle = pFFTStruct->pTwiddle;

  // Pre-process the input into buf; the SSE variant is used only for n >= 8.
  if (n < 8) {
    RevbinPermuteInv(in, buf, twiddle, n);
  } else {
    RevbinPermuteInvSse(in, buf, twiddle, n);
  }

  // Run the half-length transform. The kernel returns the buffer that holds
  // its result, which is read back in the scaling step below.
  if (n_by_2 < 16) {
    buf = x86SP_F32_radix2_kernel_OutOfPlace(
        buf,
        pFFTStruct->pBuf2,
        buf,
        twiddle,
        n_by_2,
        0);
  } else {
    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
        buf,
        pFFTStruct->pBuf2,
        buf,
        twiddle,
        n_by_2,
        0);
  }

  // Scale the result by 1/n.
  // It contains a scaling factor of 1/2 in
  // RevbinPermuteInv/RevbinPermuteInvSse.
  OMX_F32 factor = 1.0f / n;

  if (n < 8) {
    // Scalar path: buf[i] and buf[i + n_by_2] become the adjacent outputs
    // pDst[2 * i] and pDst[2 * i + 1].
    for (i = 0; i < n_by_2; i++) {
      pDst[i << 1] = buf[i] * factor;
      pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
    }
  } else {
    OMX_F32 *base;
    OMX_F32 *dst;
    VC temp0;
    VC temp1;
    __m128 mFactor = _mm_load1_ps(&factor);

    // Two things are done in this loop:
    // 1 Get the result scaled; 2 Change the format from split to interleaved.
    for (i = 0; i < n_by_2; i += 4) {
      base = buf + i;
      dst = pDst + (i << 1);
      VC_LOAD_SPLIT(&temp0, base, n_by_2);
      VC_MUL_F(&temp1, &temp0, mFactor);
      VC_STORE_INTERLEAVE(dst, &temp1);
    }
  }

  return OMX_Sts_NoErr;
}