// Sse version of RevbinPermuteInv function. static void RevbinPermuteInvSse(const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT j; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; const OMX_F32 *tw; const OMX_F32 *pi; const OMX_F32 *pj; VC v_i; VC v_j; VC v_big_a; VC v_big_b; VC v_temp; VC v_tw; for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { pi = in + (i << 1); pj = in + (j << 1); VC_LOAD_INTERLEAVE(&v_i, pi); v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]); v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]); // A[k] = (X[k] + X'[N/2 - k]) VC_ADD_SUB(&v_big_a, &v_i, &v_j); // temp = (X[k] - X'[N/2 - k]) VC_SUB_ADD(&v_temp, &v_i, &v_j); // W[k] tw = twiddle + i; VC_LOAD_SPLIT(&v_tw, tw, n); // B[k] = (X[k] - X'[N/2 - k]) * W[k] VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw); // Convert split format to interleaved format. // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) // The scaling of 1/2 will be merged into to the scaling in // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2); VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2); } // The n_by_2 complex point out[n_by_4] = 2.0f * in[n_by_2]; out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; // The first complex point out[0] = in[0] + in[n]; out[n_by_2] = in[0] - in[n]; }
// Sse version of RevbinPermuteFwd function. static void RevbinPermuteFwdSse( const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT j; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; VC v_i; VC v_j; VC v_big_a; VC v_big_b; VC v_temp; VC v_x0; VC v_x1; VC v_tw; __m128 factor = _mm_set1_ps(0.5f); for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { VC_LOAD_SPLIT(&v_i, (in + i), n_by_2); VC_LOADU_SPLIT(&v_j, (in + j), n_by_2); VC_REVERSE(&v_j); // A[k] = (Z[k] + Z'[N/2 - k]) VC_ADD_SUB(&v_big_a, &v_j, &v_i); // B[k] = -j * (Z[k] - Z'[N/2 - k]) VC_SUB_ADD(&v_big_b, &v_j, &v_i); // W[k] VC_LOAD_SPLIT(&v_tw, (twiddle + i), n); // temp = B[k] * W[k] VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw); VC_SUB_X(&v_x0, &v_big_a, &v_temp); VC_ADD_X(&v_x1, &v_big_a, &v_temp); VC_MUL_F(&v_x0, &v_x0, factor); VC_MUL_F(&v_x1, &v_x1, factor); // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1) VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0); // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1) VC_REVERSE(&v_x1); VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1); } out[n_by_2] = in[n_by_4]; out[n_by_2 + 1] = -in[n_by_4 + n_by_2]; out[0] = in[0] + in[n_by_2]; out[1] = 0; out[n] = in[0] - in[n_by_2]; out[n + 1] = 0; }