Example #1
#include <arm_neon.h>

/* Compile-only smoke test: checks that vnegq_f32 accepts and returns
   float32x4_t; the argument is deliberately left uninitialized. */
void test_vnegQf32 (void)
{
  float32x4_t out_float32x4_t;
  float32x4_t arg0_float32x4_t;

  out_float32x4_t = vnegq_f32 (arg0_float32x4_t);
}
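The test above never runs its result, so as a minimal runnable sketch (the main harness and input values are illustrative, not part of the original test): vnegq_f32 simply negates each of the four float32 lanes.

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    float in[4] = { 1.0f, -2.5f, 0.0f, 3.75f };
    float out[4];

    float32x4_t v = vld1q_f32(in);   /* load four lanes */
    v = vnegq_f32(v);                /* negate every lane */
    vst1q_f32(out, v);               /* store back to memory */

    for (int i = 0; i < 4; ++i)
        printf("%g\n", out[i]);      /* -1, 2.5, -0, -3.75 */
    return 0;
}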
Example #2
        template <bool align> SIMD_INLINE void HogDirectionHistograms(const float32x4_t & dx, const float32x4_t & dy, Buffer & buffer, size_t col)
        {
            float32x4_t bestDot = vdupq_n_f32(0);
            int32x4_t bestIndex = vdupq_n_s32(0);
            for (int i = 0; i < buffer.size; ++i)
            {
                // dot product of the gradient (dx, dy) with the i-th bin direction
                float32x4_t dot = vaddq_f32(vmulq_f32(dx, buffer.cos[i]), vmulq_f32(dy, buffer.sin[i]));
                // per-lane compare/select: keep the larger dot and its bin index
                uint32x4_t mask = vcgtq_f32(dot, bestDot);
                bestDot = vmaxq_f32(dot, bestDot);
                bestIndex = vbslq_s32(mask, buffer.pos[i], bestIndex);

                // negating the dot tests the opposite direction with the same table entry
                dot = vnegq_f32(dot);
                mask = vcgtq_f32(dot, bestDot);
                bestDot = vmaxq_f32(dot, bestDot);
                bestIndex = vbslq_s32(mask, buffer.neg[i], bestIndex);
            }
            }
            Store<align>(buffer.index + col, bestIndex);
            Store<align>(buffer.value + col, Sqrt<SIMD_NEON_RCP_ITER>(vaddq_f32(vmulq_f32(dx, dx), vmulq_f32(dy, dy))));
        }
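The loop above is a vectorized argmax: vcgtq_f32 produces an all-ones mask in each lane where the new dot product wins, vmaxq_f32 updates the running maximum, and vbslq_s32 bit-selects the winning bin index under that mask. A self-contained sketch of the same compare/select idiom (the data and indices here are made up for illustration):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    /* per-lane running maximum and the index of the row that produced it */
    float rows[2][4] = { { 1.f, 5.f, 2.f, 8.f }, { 4.f, 3.f, 9.f, 6.f } };
    float32x4_t best = vdupq_n_f32(-1e30f);
    int32x4_t bestIdx = vdupq_n_s32(-1);

    for (int i = 0; i < 2; ++i)
    {
        float32x4_t v = vld1q_f32(rows[i]);
        uint32x4_t mask = vcgtq_f32(v, best);               /* lanes where v wins */
        best = vmaxq_f32(v, best);
        bestIdx = vbslq_s32(mask, vdupq_n_s32(i), bestIdx); /* keep winner's index */
    }

    int32_t idx[4];
    vst1q_s32(idx, bestIdx);
    printf("%d %d %d %d\n", idx[0], idx[1], idx[2], idx[3]); /* 1 0 1 0 */
    return 0;
}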
Example #3
void fft_real_neon(
        CkFftContext* context, 
        const float* input, 
        CkFftComplex* output, 
        int count)
{
    int countDiv2 = count/2;

    fft_neon(context, (const CkFftComplex*) input, output, countDiv2, false, 1, context->fwdExpTable, context->maxCount / countDiv2);

    // duplicate bin 0 at bin countDiv2 so the mirrored pointer p1 can read it
    output[countDiv2] = output[0];

    int expTableStride = context->maxCount/count;
    const CkFftComplex* exp0 = context->fwdExpTable;
    const CkFftComplex* exp1 = context->fwdExpTable + countDiv2 * expTableStride;

    CkFftComplex* p0 = output;
    CkFftComplex* p1 = output + countDiv2 - 3;
    const CkFftComplex* pEnd = p0 + count/4;
    while (p0 < pEnd)
    {
        float32x4x2_t z0_v = vld2q_f32((const float32_t*) p0);
        float32x4x2_t z1_v = vld2q_f32((const float32_t*) p1);

        float32x2_t hi, lo;

        // reverse z1 real
        z1_v.val[0] = vrev64q_f32(z1_v.val[0]);
        hi = vget_high_f32(z1_v.val[0]);
        lo = vget_low_f32(z1_v.val[0]);
        z1_v.val[0] = vcombine_f32(hi, lo);

        // reverse z1 imaginary
        z1_v.val[1] = vrev64q_f32(z1_v.val[1]);
        hi = vget_high_f32(z1_v.val[1]);
        lo = vget_low_f32(z1_v.val[1]);
        z1_v.val[1] = vcombine_f32(hi, lo);

        float32x4x2_t sum_v;
        sum_v.val[0] = vaddq_f32(z0_v.val[0], z1_v.val[0]);
        sum_v.val[1] = vsubq_f32(z0_v.val[1], z1_v.val[1]);

        float32x4x2_t diff_v;
        diff_v.val[0] = vsubq_f32(z0_v.val[0], z1_v.val[0]);
        diff_v.val[1] = vaddq_f32(z0_v.val[1], z1_v.val[1]);

        float32x4x2_t exp_v;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 0);
        exp0 += expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 1);
        exp0 += expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 2);
        exp0 += expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 3);
        exp0 += expTableStride;

        float32x4x2_t f_v;
        f_v.val[0] = vnegq_f32(exp_v.val[1]);
        f_v.val[1] = exp_v.val[0];

        float32x4x2_t c_v;
        multiply(f_v, diff_v, c_v);
        subtract(sum_v, c_v, z0_v);
        vst2q_f32((float32_t*) p0, z0_v);

        diff_v.val[0] = vnegq_f32(diff_v.val[0]);
        sum_v.val[1] = vnegq_f32(sum_v.val[1]);

        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 0);
        exp1 -= expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 1);
        exp1 -= expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 2);
        exp1 -= expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 3);
        exp1 -= expTableStride;

        f_v.val[0] = vnegq_f32(exp_v.val[1]);
        f_v.val[1] = exp_v.val[0];

        multiply(f_v, diff_v, c_v);
        subtract(sum_v, c_v, z1_v);

        // reverse z1 real
        z1_v.val[0] = vrev64q_f32(z1_v.val[0]);
        hi = vget_high_f32(z1_v.val[0]);
        lo = vget_low_f32(z1_v.val[0]);
        z1_v.val[0] = vcombine_f32(hi, lo);

        // reverse z1 imaginary
        z1_v.val[1] = vrev64q_f32(z1_v.val[1]);
        hi = vget_high_f32(z1_v.val[1]);
        lo = vget_low_f32(z1_v.val[1]);
        z1_v.val[1] = vcombine_f32(hi, lo);

        vst2q_f32((float32_t*) p1, z1_v);

        p0 += 4;
        p1 -= 4;
    }

    if (count > 8)
    {
        // middle element (count/4): here z1 == z0 and the combine
        // collapses to z0' = 2 * conj(z0)
        p0->real = p0->real * 2.0f;
        p0->imag = -p0->imag * 2.0f;
    }
}
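fft_real_neon computes a real forward FFT by running a half-size complex FFT (the fft_neon call) and then combining each bin with its mirrored, conjugated partner. Because vrev64q_f32 only reverses lanes within each 64-bit half, every "reverse z1" block above (and the matching blocks in the Ne10 code below) needs a second step that swaps the halves. The idiom, as a standalone helper (a sketch; the original inlines it):

#include <arm_neon.h>

/* Reverse all four lanes: {a,b,c,d} -> {d,c,b,a}. */
static inline float32x4_t reverse_f32x4(float32x4_t v)
{
    v = vrev64q_f32(v);                   /* {a,b,c,d} -> {b,a,d,c} */
    return vcombine_f32(vget_high_f32(v), /* then swap the halves:  */
                        vget_low_f32(v)); /* {b,a,d,c} -> {d,c,b,a} */
}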
Example #4
void fft_real_inverse_neon(
        CkFftContext* context, 
        const CkFftComplex* input, 
        float* output, 
        int count,
        CkFftComplex* tmpBuf)
{
    int countDiv2 = count/2;

    int expTableStride = context->maxCount/count;
    const CkFftComplex* exp0 = context->invExpTable;
    const CkFftComplex* exp1 = context->invExpTable + countDiv2 * expTableStride;

    const CkFftComplex* p0 = input;
    const CkFftComplex* p1 = input + countDiv2 - 3;
    CkFftComplex* tmp0 = tmpBuf;
    CkFftComplex* tmp1 = tmpBuf + countDiv2 - 3;
    const CkFftComplex* pEnd = p0 + count/4;
    while (p0 < pEnd)
    {
        float32x4x2_t z0_v = vld2q_f32((const float32_t*) p0);
        float32x4x2_t z1_v = vld2q_f32((const float32_t*) p1);

        float32x2_t hi, lo;

        // reverse z1 real
        z1_v.val[0] = vrev64q_f32(z1_v.val[0]);
        hi = vget_high_f32(z1_v.val[0]);
        lo = vget_low_f32(z1_v.val[0]);
        z1_v.val[0] = vcombine_f32(hi, lo);

        // reverse z1 imaginary
        z1_v.val[1] = vrev64q_f32(z1_v.val[1]);
        hi = vget_high_f32(z1_v.val[1]);
        lo = vget_low_f32(z1_v.val[1]);
        z1_v.val[1] = vcombine_f32(hi, lo);

        float32x4x2_t sum_v;
        sum_v.val[0] = vaddq_f32(z0_v.val[0], z1_v.val[0]);
        sum_v.val[1] = vsubq_f32(z0_v.val[1], z1_v.val[1]);

        float32x4x2_t diff_v;
        diff_v.val[0] = vsubq_f32(z0_v.val[0], z1_v.val[0]);
        diff_v.val[1] = vaddq_f32(z0_v.val[1], z1_v.val[1]);

        float32x4x2_t exp_v;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 0);
        exp0 += expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 1);
        exp0 += expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 2);
        exp0 += expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp0, exp_v, 3);
        exp0 += expTableStride;

        float32x4x2_t f_v;
        f_v.val[0] = vnegq_f32(exp_v.val[1]);
        f_v.val[1] = exp_v.val[0];

        float32x4x2_t c_v;
        multiply(f_v, diff_v, c_v);
        add(sum_v, c_v, z0_v);
        vst2q_f32((float32_t*) tmp0, z0_v);

        diff_v.val[0] = vnegq_f32(diff_v.val[0]);
        sum_v.val[1] = vnegq_f32(sum_v.val[1]);

        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 0);
        exp1 -= expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 1);
        exp1 -= expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 2);
        exp1 -= expTableStride;
        exp_v = vld2q_lane_f32((const float32_t*) exp1, exp_v, 3);
        exp1 -= expTableStride;

        f_v.val[0] = vnegq_f32(exp_v.val[1]);
        f_v.val[1] = exp_v.val[0];

        multiply(f_v, diff_v, c_v);
        add(sum_v, c_v, z1_v);

        // reverse z1 real
        z1_v.val[0] = vrev64q_f32(z1_v.val[0]);
        hi = vget_high_f32(z1_v.val[0]);
        lo = vget_low_f32(z1_v.val[0]);
        z1_v.val[0] = vcombine_f32(hi, lo);

        // reverse z1 imaginary
        z1_v.val[1] = vrev64q_f32(z1_v.val[1]);
        hi = vget_high_f32(z1_v.val[1]);
        lo = vget_low_f32(z1_v.val[1]);
        z1_v.val[1] = vcombine_f32(hi, lo);

        vst2q_f32((float32_t*) tmp1, z1_v);

        p0 += 4;
        tmp0 += 4;
        p1 -= 4;
        tmp1 -= 4;
    }

    // middle element (count/4): as in the forward case, the combine
    // collapses to 2 * conj(z)
    tmp0->real = p0->real * 2.0f;
    tmp0->imag = -p0->imag * 2.0f;

    fft_neon(context, tmpBuf, (CkFftComplex*) output, countDiv2, true, 1, context->invExpTable, context->maxCount / countDiv2);
}
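Both CkFft loops compute the same combine and differ only in a sign: the forward path stores sum - f*diff, the inverse path sum + f*diff, where f = i*exp is the rotated twiddle. A scalar rendering of one forward element, assuming multiply() and subtract() (not shown in this excerpt) are the usual complex multiply and componentwise subtract:

/* Scalar model of one element of the forward post-processing loop
   (a sketch for exposition; the real code is the NEON loop above). */
typedef struct { float real, imag; } Cpx;

static Cpx combine_fwd(Cpx z0, Cpx z1, Cpx ex)
{
    /* z1 is the mirrored element output[countDiv2 - k], already reversed */
    Cpx sum  = { z0.real + z1.real, z0.imag - z1.imag };
    Cpx diff = { z0.real - z1.real, z0.imag + z1.imag };
    Cpx f    = { -ex.imag, ex.real };                    /* f = i * exp */
    Cpx c    = { f.real * diff.real - f.imag * diff.imag,
                 f.real * diff.imag + f.imag * diff.real };
    Cpx out  = { sum.real - c.real, sum.imag - c.imag }; /* inverse: sum + c */
    return out;
}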
Example #5
static void ne10_fft_split_c2r_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        ne10_fft_cpx_float32_t *twiddles,
        ne10_int32_t ncfft)
{

    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_float32_t fk, fnkc, fek, fok, tmp;
    float32x4x2_t q2_fk, q2_fnkc, q2_tw, q2_dst, q2_dst2;
    float32x4_t q_fnkc_r, q_fnkc_i;
    float32x4_t q_fek_r, q_fek_i, q_fok_r, q_fok_i;
    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
    float32x4_t q_dst2_r, q_dst2_i;
    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    dst[0].r = (src[0].r + src[ncfft].r) * 0.5f;
    dst[0].i = (src[0].r - src[ncfft].r) * 0.5f;

    if (count >= 4)
    {
        for (k = 1; k <= count ; k += 4)
        {
            p_src  = (float32_t*) (& (src[k]));
            p_src2  = (float32_t*) (& (src[ncfft - k - 3]));
            p_twiddles  = (float32_t*) (& (twiddles[k - 1]));
            p_dst  = (float32_t*) (& (dst[k]));
            p_dst2  = (float32_t*) (& (dst[ncfft - k - 3]));

            q2_fk  = vld2q_f32 (p_src);
            q2_fnkc = vld2q_f32 (p_src2);
            q2_tw = vld2q_f32 (p_twiddles);
            // reverse the mirrored block and conjugate it: fnkc = conj(src[ncfft - k])
            q2_fnkc.val[0] = vrev64q_f32 (q2_fnkc.val[0]);
            q2_fnkc.val[1] = vrev64q_f32 (q2_fnkc.val[1]);
            q_fnkc_r = vcombine_f32 (vget_high_f32 (q2_fnkc.val[0]), vget_low_f32 (q2_fnkc.val[0]));
            q_fnkc_i = vcombine_f32 (vget_high_f32 (q2_fnkc.val[1]), vget_low_f32 (q2_fnkc.val[1]));
            q_fnkc_i = vnegq_f32 (q_fnkc_i);

            q_fek_r = vaddq_f32 (q2_fk.val[0], q_fnkc_r);
            q_fek_i = vaddq_f32 (q2_fk.val[1], q_fnkc_i);

            q_tmp0 = vsubq_f32 (q2_fk.val[0], q_fnkc_r);
            q_tmp1 = vsubq_f32 (q2_fk.val[1], q_fnkc_i);

            // fok = (fk - fnkc) * conj(twiddle)
            q_fok_r = vmulq_f32 (q_tmp0, q2_tw.val[0]);
            q_fok_i = vmulq_f32 (q_tmp1, q2_tw.val[0]);
            q_tmp2 = vmulq_f32 (q_tmp1, q2_tw.val[1]);
            q_tmp3 = vmulq_f32 (q_tmp0, q2_tw.val[1]);
            q_fok_r = vaddq_f32 (q_fok_r, q_tmp2);
            q_fok_i = vsubq_f32 (q_fok_i, q_tmp3);

            q_val = vdupq_n_f32 (0.5f);
            q_dst2_r = vsubq_f32 (q_fek_r, q_fok_r);
            q_dst2_i = vsubq_f32 (q_fok_i, q_fek_i);
            q2_dst.val[0] = vaddq_f32 (q_fek_r, q_fok_r);
            q2_dst.val[1] = vaddq_f32 (q_fek_i, q_fok_i);
            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
            q2_dst.val[0] = vmulq_f32 (q2_dst.val[0], q_val);
            q2_dst.val[1] = vmulq_f32 (q2_dst.val[1], q_val);
            q_dst2_r = vrev64q_f32 (q_dst2_r);
            q_dst2_i = vrev64q_f32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
            vst2q_f32 (p_dst, q2_dst);
            vst2q_f32 (p_dst2, q2_dst2);

        }
    }
    else
    {
        for (k = 1; k <= count ; k++)
        {
            fk = src[k];
            fnkc.r = src[ncfft - k].r;
            fnkc.i = -src[ncfft - k].i;

            fek.r = fk.r + fnkc.r;
            fek.i = fk.i + fnkc.i;

            tmp.r = fk.r - fnkc.r;
            tmp.i = fk.i - fnkc.i;

            fok.r = tmp.r * twiddles[k - 1].r + tmp.i * twiddles[k - 1].i;
            fok.i = tmp.i * twiddles[k - 1].r - tmp.r * twiddles[k - 1].i;

            dst[k].r = (fek.r + fok.r) * 0.5f;
            dst[k].i = (fek.i + fok.i) * 0.5f;

            dst[ncfft - k].r = (fek.r - fok.r) * 0.5f;
            dst[ncfft - k].i = (fok.i - fek.i) * 0.5f;
        }
    }
}
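In the scalar fallback just above, the four multiplies that produce fok are exactly a complex multiply by the conjugated twiddle. One k of the c2r split, restated with C99 complex arithmetic to make that visible (a sketch of the fallback, not Ne10 API):

#include <complex.h>

static void split_c2r_one(float complex fk, float complex src_nk,
                          float complex w,
                          float complex* dk, float complex* dnk)
{
    float complex fnkc = conjf(src_nk);        /* fnkc.i = -src[ncfft - k].i */
    float complex fek = fk + fnkc;
    float complex fok = (fk - fnkc) * conjf(w);
    *dk  = (fek + fok) * 0.5f;                 /* dst[k] */
    *dnk = conjf(fek - fok) * 0.5f;            /* dst[ncfft - k] */
}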
Example #6
static void ne10_fft_split_r2c_1d_float32_neon (ne10_fft_cpx_float32_t *dst,
        const ne10_fft_cpx_float32_t *src,
        ne10_fft_cpx_float32_t *twiddles,
        ne10_int32_t ncfft)
{
    ne10_int32_t k;
    ne10_int32_t count = ncfft / 2;
    ne10_fft_cpx_float32_t fpnk, fpk, f1k, f2k, tw, tdc;
    float32x4x2_t q2_fpk, q2_fpnk, q2_tw, q2_dst, q2_dst2;
    float32x4_t q_fpnk_r, q_fpnk_i;
    float32x4_t q_f1k_r, q_f1k_i, q_f2k_r, q_f2k_i;
    float32x4_t q_tw_r, q_tw_i;
    float32x4_t q_tmp0, q_tmp1, q_tmp2, q_tmp3, q_val;
    float32x4_t q_dst_r, q_dst_i, q_dst2_r, q_dst2_i;
    float32_t *p_src, *p_src2, *p_dst, *p_dst2, *p_twiddles;

    tdc.r = src[0].r;
    tdc.i = src[0].i;

    dst[0].r = tdc.r + tdc.i;
    dst[ncfft].r = tdc.r - tdc.i;
    dst[ncfft].i = dst[0].i = 0;

    if (count >= 4)
    {
        for (k = 1; k <= count ; k += 4)
        {
            p_src  = (float32_t*) (& (src[k]));
            p_src2  = (float32_t*) (& (src[ncfft - k - 3]));
            p_twiddles  = (float32_t*) (& (twiddles[k - 1]));
            p_dst  = (float32_t*) (& (dst[k]));
            p_dst2  = (float32_t*) (& (dst[ncfft - k - 3]));

            q2_fpk  = vld2q_f32 (p_src);
            q2_fpnk = vld2q_f32 (p_src2);
            q2_tw = vld2q_f32 (p_twiddles);
            // reverse the mirrored block and conjugate it: fpnk = conj(src[ncfft - k])
            q2_fpnk.val[0] = vrev64q_f32 (q2_fpnk.val[0]);
            q2_fpnk.val[1] = vrev64q_f32 (q2_fpnk.val[1]);
            q_fpnk_r = vcombine_f32 (vget_high_f32 (q2_fpnk.val[0]), vget_low_f32 (q2_fpnk.val[0]));
            q_fpnk_i = vcombine_f32 (vget_high_f32 (q2_fpnk.val[1]), vget_low_f32 (q2_fpnk.val[1]));
            q_fpnk_i = vnegq_f32 (q_fpnk_i);

            q_f1k_r = vaddq_f32 (q2_fpk.val[0], q_fpnk_r);
            q_f1k_i = vaddq_f32 (q2_fpk.val[1], q_fpnk_i);

            q_f2k_r = vsubq_f32 (q2_fpk.val[0], q_fpnk_r);
            q_f2k_i = vsubq_f32 (q2_fpk.val[1], q_fpnk_i);

            // tw = f2k * twiddle (plain complex multiply, no conjugate here)
            q_tmp0 = vmulq_f32 (q_f2k_r, q2_tw.val[0]);
            q_tmp1 = vmulq_f32 (q_f2k_i, q2_tw.val[1]);
            q_tmp2 = vmulq_f32 (q_f2k_r, q2_tw.val[1]);
            q_tmp3 = vmulq_f32 (q_f2k_i, q2_tw.val[0]);
            q_tw_r = vsubq_f32 (q_tmp0, q_tmp1);
            q_tw_i = vaddq_f32 (q_tmp2, q_tmp3);

            q_val = vdupq_n_f32 (0.5f);
            q_dst2_r = vsubq_f32 (q_f1k_r, q_tw_r);
            q_dst2_i = vsubq_f32 (q_tw_i, q_f1k_i);
            q_dst_r = vaddq_f32 (q_f1k_r, q_tw_r);
            q_dst_i = vaddq_f32 (q_f1k_i, q_tw_i);
            q_dst2_r = vmulq_f32 (q_dst2_r, q_val);
            q_dst2_i = vmulq_f32 (q_dst2_i, q_val);
            q2_dst.val[0] = vmulq_f32 (q_dst_r, q_val);
            q2_dst.val[1] = vmulq_f32 (q_dst_i, q_val);
            q_dst2_r = vrev64q_f32 (q_dst2_r);
            q_dst2_i = vrev64q_f32 (q_dst2_i);
            q2_dst2.val[0] = vcombine_f32 (vget_high_f32 (q_dst2_r), vget_low_f32 (q_dst2_r));
            q2_dst2.val[1] = vcombine_f32 (vget_high_f32 (q_dst2_i), vget_low_f32 (q_dst2_i));
            vst2q_f32 (p_dst, q2_dst);
            vst2q_f32 (p_dst2, q2_dst2);

        }
    }
    else
    {
        for (k = 1; k <= count ; k++)
        {
            fpk    = src[k];
            fpnk.r =   src[ncfft - k].r;
            fpnk.i = - src[ncfft - k].i;

            f1k.r = fpk.r + fpnk.r;
            f1k.i = fpk.i + fpnk.i;

            f2k.r = fpk.r - fpnk.r;
            f2k.i = fpk.i - fpnk.i;

            tw.r = f2k.r * (twiddles[k - 1]).r - f2k.i * (twiddles[k - 1]).i;
            tw.i = f2k.r * (twiddles[k - 1]).i + f2k.i * (twiddles[k - 1]).r;

            dst[k].r = (f1k.r + tw.r) * 0.5f;
            dst[k].i = (f1k.i + tw.i) * 0.5f;
            dst[ncfft - k].r = (f1k.r - tw.r) * 0.5f;
            dst[ncfft - k].i = (tw.i - f1k.i) * 0.5f;
        }
    }
}
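The forward split mirrors the inverse one: here the twiddle multiply is a plain complex product (tw = f2k * w, no conjugate), and the mirrored output bin is stored conjugated, which is just the Hermitian symmetry of a real signal's spectrum. One k of the scalar fallback, restated with C99 complex arithmetic (a sketch, not Ne10 API):

#include <complex.h>

static void split_r2c_one(float complex fpk, float complex src_nk,
                          float complex w,
                          float complex* dk, float complex* dnk)
{
    float complex fpnk = conjf(src_nk);  /* fpnk.i = -src[ncfft - k].i */
    float complex f1k = fpk + fpnk;
    float complex f2k = fpk - fpnk;
    float complex tw  = f2k * w;         /* plain multiply this time */
    *dk  = (f1k + tw) * 0.5f;            /* dst[k] */
    *dnk = conjf(f1k - tw) * 0.5f;       /* dst[ncfft - k] */
}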