static inline void PreShiftW32toW16Neon(int32_t* inre, int32_t* inim, int16_t* outre, int16_t* outim, int32_t sh) { int k; int32x4_t sh32x4 = vdupq_n_s32(sh); for (k = 0; k < FRAMESAMPLES/2; k += 16) { int32x4x4_t inre32x4x4 = vld4q_s32(inre); int32x4x4_t inim32x4x4 = vld4q_s32(inim); inre += 16; inim += 16; inre32x4x4.val[0] = vrshlq_s32(inre32x4x4.val[0], sh32x4); inre32x4x4.val[1] = vrshlq_s32(inre32x4x4.val[1], sh32x4); inre32x4x4.val[2] = vrshlq_s32(inre32x4x4.val[2], sh32x4); inre32x4x4.val[3] = vrshlq_s32(inre32x4x4.val[3], sh32x4); inim32x4x4.val[0] = vrshlq_s32(inim32x4x4.val[0], sh32x4); inim32x4x4.val[1] = vrshlq_s32(inim32x4x4.val[1], sh32x4); inim32x4x4.val[2] = vrshlq_s32(inim32x4x4.val[2], sh32x4); inim32x4x4.val[3] = vrshlq_s32(inim32x4x4.val[3], sh32x4); int16x4x4_t outre16x4x4; int16x4x4_t outim16x4x4; outre16x4x4.val[0] = vmovn_s32(inre32x4x4.val[0]); outre16x4x4.val[1] = vmovn_s32(inre32x4x4.val[1]); outre16x4x4.val[2] = vmovn_s32(inre32x4x4.val[2]); outre16x4x4.val[3] = vmovn_s32(inre32x4x4.val[3]); outim16x4x4.val[0] = vmovn_s32(inim32x4x4.val[0]); outim16x4x4.val[1] = vmovn_s32(inim32x4x4.val[1]); outim16x4x4.val[2] = vmovn_s32(inim32x4x4.val[2]); outim16x4x4.val[3] = vmovn_s32(inim32x4x4.val[3]); vst4_s16(outre, outre16x4x4); vst4_s16(outim, outim16x4x4); outre += 16; outim += 16; } }
// Overload-style wrapper that forwards to vst4_s16: stores the four int16x4_t
// vectors held in `v` to memory starting at `ptr` (4-way interleaved store).
inline void vst4(s16 * ptr, const int16x4x4_t & v)
{
    vst4_s16(ptr, v);
}