// SSE version of the RevbinPermuteInv function.
static void RevbinPermuteInvSse(const OMX_F32 *in,
                                OMX_F32 *out,
                                const OMX_F32 *twiddle,
                                OMX_INT n) {
  OMX_INT i;
  OMX_INT j;
  OMX_INT n_by_2 = n >> 1;
  OMX_INT n_by_4 = n >> 2;
  const OMX_F32 *tw;
  const OMX_F32 *pi;
  const OMX_F32 *pj;

  VC v_i;
  VC v_j;
  VC v_big_a;
  VC v_big_b;
  VC v_temp;
  VC v_tw;

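  // Each iteration pairs four complex points: i walks the lower half of the
  // input forward while j walks the upper half backward, combining X[k] with
  // X'[N/2 - k].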
  for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
    pi = in + (i << 1);
    pj = in + (j << 1);
    VC_LOAD_INTERLEAVE(&v_i, pi);

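    // Load the four j-side points in reversed order so that each SSE lane of
    // v_j lines up with its partner lane in v_i.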
    v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]);
    v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]);

    // A[k] = (X[k] + X'[N/2 - k])
    VC_ADD_SUB(&v_big_a, &v_i, &v_j);

    // temp = (X[k] - X'[N/2 - k])
    VC_SUB_ADD(&v_temp, &v_i, &v_j);

    // W[k]
    tw = twiddle + i;
    VC_LOAD_SPLIT(&v_tw, tw, n);

    // B[k] = (X[k] - X'[N/2 - k]) * W[k]
    VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw);

    // Convert from interleaved format to split format.
    // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1)
    // The scaling of 1/2 is deferred here and merged into the 1/n scaling in
    // the last step of omxSP_FFTInv_CCSToR_F32_Sfs.
    VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2);

    VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2);
  }

  // The middle complex point: Z[N/4] = 2 * X'[N/4].
  out[n_by_4] = 2.0f * in[n_by_2];
  out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1];

  // The first complex point: Z[0], built from X[0] and the Nyquist term X[N/2].
  out[0] = in[0] + in[n];
  out[n_by_2] = in[0] - in[n];
}
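
// SSE version of the first radix-4 stage of the inverse complex FFT kernel.
// No twiddle multiplies are needed here, since every first-stage twiddle
// factor is 1.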
void x86SP_FFT_CToC_FC32_Inv_Radix4_fs_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    OMX_INT n) {
  OMX_INT i;
  OMX_INT n_by_4 = n >> 2;
  OMX_F32 *out0 = out;

  for (i = 0; i < n_by_4; i += 4) {
    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

    const OMX_F32 *in0 = in + i;
    const OMX_F32 *in1 = in0 + n_by_4;
    const OMX_F32 *in2 = in1 + n_by_4;
    const OMX_F32 *in3 = in2 + n_by_4;

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    VC_LOAD_SPLIT(&v_t0, in0, n);
    VC_LOAD_SPLIT(&v_t1, in1, n);
    VC_LOAD_SPLIT(&v_t2, in2, n);
    VC_LOAD_SPLIT(&v_t3, in3, n);

    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
                        &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }
}
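
// SSE version of the remaining radix-4 stages of the inverse complex FFT
// kernel. Within each stage, every group of butterflies shares one triple of
// twiddle factors (tw1, tw2, tw3).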
void x86SP_FFT_CToC_FC32_Inv_Radix4_ms_sse(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n,
    OMX_INT sub_size,
    OMX_INT sub_num) {
  OMX_INT set;
  OMX_INT grp;
  OMX_INT step = sub_num >> 1;
  OMX_INT set_count = sub_num >> 2;
  OMX_INT n_by_4 = n >> 2;
  OMX_INT n_mul_2 = n << 1;

  OMX_F32 *out0 = out;

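  // set_count == 2 is too small for the 4-wide set loops below, so this size
  // is handled by a dedicated unrolled path.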
  if (set_count == 2) {
    InternalUnroll2Inv(in, out, twiddle, n);
    return;
  }

  // grp == 0: all twiddle factors are 1, so use the twiddle-free butterfly.
  for (set = 0; set < set_count; set += 4) {
    const OMX_F32 *in0 = in + set;
    const OMX_F32 *in1 = in0 + set_count;
    const OMX_F32 *in2 = in1 + set_count;
    const OMX_F32 *in3 = in2 + set_count;

    VC v_t0;
    VC v_t1;
    VC v_t2;
    VC v_t3;
    VC v_t4;
    VC v_t5;
    VC v_t6;
    VC v_t7;

    VC_LOAD_SPLIT(&v_t0, in0, n);
    VC_LOAD_SPLIT(&v_t1, in1, n);
    VC_LOAD_SPLIT(&v_t2, in2, n);
    VC_LOAD_SPLIT(&v_t3, in3, n);

    OMX_F32 *out1 = out0 + n_by_4;
    OMX_F32 *out2 = out1 + n_by_4;
    OMX_F32 *out3 = out2 + n_by_4;

    RADIX4_BUTTERFLY_FS(&v_t4, &v_t5, &v_t6, &v_t7,
                        &v_t0, &v_t1, &v_t2, &v_t3);

    RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                               &v_t4, &v_t5, &v_t6, &v_t7, n);

    out0 += 4;
  }

  for (grp = 1; grp < sub_size; ++grp) {
    const OMX_F32 *tw1 = twiddle + grp * step;
    const OMX_F32 *tw2 = tw1 + grp * step;
    const OMX_F32 *tw3 = tw2 + grp * step;

    VC v_tw1;
    VC v_tw2;
    VC v_tw3;

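    // Broadcast each group's twiddle factors to all four lanes; the twiddle
    // table stores the imaginary parts n_mul_2 floats after the real parts.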
    v_tw1.real = _mm_load1_ps(tw1);
    v_tw1.imag = _mm_load1_ps(tw1 + n_mul_2);
    v_tw2.real = _mm_load1_ps(tw2);
    v_tw2.imag = _mm_load1_ps(tw2 + n_mul_2);
    v_tw3.real = _mm_load1_ps(tw3);
    v_tw3.imag = _mm_load1_ps(tw3 + n_mul_2);

    for (set = 0; set < set_count; set += 4) {
      const OMX_F32 *in0 = in + set + grp * sub_num;
      const OMX_F32 *in1 = in0 + set_count;
      const OMX_F32 *in2 = in1 + set_count;
      const OMX_F32 *in3 = in2 + set_count;

      VC v_t0;
      VC v_t1;
      VC v_t2;
      VC v_t3;
      VC v_t4;
      VC v_t5;
      VC v_t6;
      VC v_t7;

      VC_LOAD_SPLIT(&v_t0, in0, n);
      VC_LOAD_SPLIT(&v_t1, in1, n);
      VC_LOAD_SPLIT(&v_t2, in2, n);
      VC_LOAD_SPLIT(&v_t3, in3, n);

      OMX_F32 *out1 = out0 + n_by_4;
      OMX_F32 *out2 = out1 + n_by_4;
      OMX_F32 *out3 = out2 + n_by_4;

      RADIX4_INV_BUTTERFLY(&v_t4, &v_t5, &v_t6, &v_t7,
                           &v_tw1, &v_tw2, &v_tw3,
                           &v_t0, &v_t1, &v_t2, &v_t3);

      RADIX4_INV_BUTTERFLY_STORE(out0, out1, out2, out3,
                                 &v_t4, &v_t5, &v_t6, &v_t7, n);

      out0 += 4;
    }
  }
}

// SSE version of the RevbinPermuteFwd function.
static void RevbinPermuteFwdSse(
    const OMX_F32 *in,
    OMX_F32 *out,
    const OMX_F32 *twiddle,
    OMX_INT n) {
  OMX_INT i;
  OMX_INT j;
  OMX_INT n_by_2 = n >> 1;
  OMX_INT n_by_4 = n >> 2;

  VC v_i;
  VC v_j;
  VC v_big_a;
  VC v_big_b;
  VC v_temp;
  VC v_x0;
  VC v_x1;
  VC v_tw;

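  // Unlike the inverse path, the forward path applies the 1/2 scaling
  // directly in this function.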
  __m128 factor = _mm_set1_ps(0.5f);

  for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) {
    VC_LOAD_SPLIT(&v_i, (in + i), n_by_2);

    VC_LOADU_SPLIT(&v_j, (in + j), n_by_2);
    VC_REVERSE(&v_j);

    // A[k] = (Z[k] + Z'[N/2 - k])
    VC_ADD_SUB(&v_big_a, &v_j, &v_i);

    // B[k] = -j * (Z[k] - Z'[N/2 - k])
    VC_SUB_ADD(&v_big_b, &v_j, &v_i);

    // W[k]
    VC_LOAD_SPLIT(&v_tw, (twiddle + i), n);

    // temp = B[k] * W[k]
    VC_CONJ_MUL(&v_temp, &v_big_b, &v_tw);

    VC_SUB_X(&v_x0, &v_big_a, &v_temp);
    VC_ADD_X(&v_x1, &v_big_a, &v_temp);

    VC_MUL_F(&v_x0, &v_x0, factor);
    VC_MUL_F(&v_x1, &v_x1, factor);

    // X[k] = A[k] + B[k] * W[k] (k = 0, ..., N/2 - 1)
    VC_STORE_INTERLEAVE((out + (i << 1)), &v_x0);

    // X[k] = X'[N - k] (k = N/2 + 1, ..., N - 1)
    VC_REVERSE(&v_x1);
    VC_STOREU_INTERLEAVE((out + (j << 1)), &v_x1);
  }

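  // The middle output point X[N/4] is the conjugate of Z[N/4].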
  out[n_by_2] = in[n_by_4];
  out[n_by_2 + 1] = -in[n_by_4 + n_by_2];

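  // The first (DC) and last (Nyquist) output points are purely real:
  // X[0] = Re(Z[0]) + Im(Z[0]), X[N/2] = Re(Z[0]) - Im(Z[0]).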
  out[0] = in[0] + in[n_by_2];
  out[1] = 0;
  out[n] = in[0] - in[n_by_2];
  out[n + 1] = 0;
}
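
// Inverse FFT of a CCS (complex conjugate-symmetric) input of n + 2 floats,
// producing n real samples scaled by 1/n.
//
// A minimal usage sketch, assuming the standard OpenMAX DL setup calls
// (omxSP_FFTGetBufSize_R_F32 / omxSP_FFTInit_R_F32); aligned_alloc32 below
// stands for any 32-byte-aligned allocator:
//
//   OMX_INT size;
//   omxSP_FFTGetBufSize_R_F32(order, &size);
//   OMXFFTSpec_R_F32 *spec = (OMXFFTSpec_R_F32*) aligned_alloc32(size);
//   omxSP_FFTInit_R_F32(spec, order);
//   omxSP_FFTInv_CCSToR_F32_Sfs(src, dst, spec);  // src: n + 2 floats
//                                                 // dst: n floats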
OMXResult omxSP_FFTInv_CCSToR_F32_Sfs(const OMX_F32 *pSrc, OMX_F32 *pDst,
                                      const OMXFFTSpec_R_F32 *pFFTSpec) {
  OMX_INT n;
  OMX_INT n_by_2;
  OMX_INT n_by_4;
  OMX_INT i;
  const OMX_F32 *twiddle;
  OMX_F32 *buf;
  const OMX_F32 *in = pSrc;

  const X86FFTSpec_R_FC32 *pFFTStruct = (const X86FFTSpec_R_FC32*) pFFTSpec;

  // Input and output must be 32-byte aligned.
  if (!pSrc || !pDst || (uintptr_t)pSrc & 31 || (uintptr_t)pDst & 31)
    return OMX_Sts_BadArgErr;

  n = pFFTStruct->N;

  // This is to handle the case of order == 1.
  if (n == 2) {
    pDst[0] = (pSrc[0] + pSrc[2]) / 2;
    pDst[1] = (pSrc[0] - pSrc[2]) / 2;
    return OMX_Sts_NoErr;
  }

  n_by_2 = n >> 1;
  n_by_4 = n >> 2;
  buf = pFFTStruct->pBuf1;

  twiddle = pFFTStruct->pTwiddle;

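  // The length-n inverse real FFT is computed as a length-n/2 complex inverse
  // FFT: the permute step packs the CCS input into a half-length complex
  // sequence, which the kernels below transform in split format.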
  if (n < 8)
    RevbinPermuteInv(in, buf, twiddle, n);
  else
    RevbinPermuteInvSse(in, buf, twiddle, n);

  if (n_by_2 < 16) {
    buf = x86SP_F32_radix2_kernel_OutOfPlace(
        buf,
        pFFTStruct->pBuf2,
        buf,
        twiddle,
        n_by_2,
        0);
  } else {
    buf = x86SP_F32_radix4_kernel_OutOfPlace_sse(
        buf,
        pFFTStruct->pBuf2,
        buf,
        twiddle,
        n_by_2,
        0);
  }

  // Scale the result by 1/n. This single factor also absorbs the 1/2
  // scaling deferred in RevbinPermuteInv/RevbinPermuteInvSse.
  OMX_F32 factor = 1.0f / n;

  if (n < 8) {
    for (i = 0; i < n_by_2; i++) {
      pDst[i << 1] = buf[i] * factor;
      pDst[(i << 1) + 1] = buf[i + n_by_2] * factor;
    }
  } else {
    OMX_F32 *base;
    OMX_F32 *dst;
    VC temp0;
    VC temp1;
    __m128 mFactor = _mm_load1_ps(&factor);

    // This loop does two things: it scales the result and converts the data
    // from split format to interleaved format.
    for (i = 0; i < n_by_2; i += 4) {
      base = buf + i;
      dst = pDst + (i << 1);
      VC_LOAD_SPLIT(&temp0, base, n_by_2);
      VC_MUL_F(&temp1, &temp0, mFactor);
      VC_STORE_INTERLEAVE(dst, &temp1);
    }
  }

  return OMX_Sts_NoErr;
}