void test_vmovns32 (void) { int16x4_t out_int16x4_t; int32x4_t arg0_int32x4_t; out_int16x4_t = vmovn_s32 (arg0_int32x4_t); }
void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); int32x4_t c0 = vld1q_s32(input); int32x4_t c1 = vld1q_s32(input + 4); int32x4_t c2 = vld1q_s32(input + 8); int32x4_t c3 = vld1q_s32(input + 12); int16x8_t a0, a1; if (bd == 8) { const int16x4_t cospis = vld1_s16(kCospi); // Rows a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); idct4x4_16_kernel_bd8(cospis, &a0, &a1); // Columns a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); idct4x4_16_kernel_bd8(cospis, &a0, &a1); a0 = vrshrq_n_s16(a0, 4); a1 = vrshrq_n_s16(a1, 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); } else { idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); } a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); } highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); }
static inline void PreShiftW32toW16Neon(int32_t* inre, int32_t* inim, int16_t* outre, int16_t* outim, int32_t sh) { int k; int32x4_t sh32x4 = vdupq_n_s32(sh); for (k = 0; k < FRAMESAMPLES/2; k += 16) { int32x4x4_t inre32x4x4 = vld4q_s32(inre); int32x4x4_t inim32x4x4 = vld4q_s32(inim); inre += 16; inim += 16; inre32x4x4.val[0] = vrshlq_s32(inre32x4x4.val[0], sh32x4); inre32x4x4.val[1] = vrshlq_s32(inre32x4x4.val[1], sh32x4); inre32x4x4.val[2] = vrshlq_s32(inre32x4x4.val[2], sh32x4); inre32x4x4.val[3] = vrshlq_s32(inre32x4x4.val[3], sh32x4); inim32x4x4.val[0] = vrshlq_s32(inim32x4x4.val[0], sh32x4); inim32x4x4.val[1] = vrshlq_s32(inim32x4x4.val[1], sh32x4); inim32x4x4.val[2] = vrshlq_s32(inim32x4x4.val[2], sh32x4); inim32x4x4.val[3] = vrshlq_s32(inim32x4x4.val[3], sh32x4); int16x4x4_t outre16x4x4; int16x4x4_t outim16x4x4; outre16x4x4.val[0] = vmovn_s32(inre32x4x4.val[0]); outre16x4x4.val[1] = vmovn_s32(inre32x4x4.val[1]); outre16x4x4.val[2] = vmovn_s32(inre32x4x4.val[2]); outre16x4x4.val[3] = vmovn_s32(inre32x4x4.val[3]); outim16x4x4.val[0] = vmovn_s32(inim32x4x4.val[0]); outim16x4x4.val[1] = vmovn_s32(inim32x4x4.val[1]); outim16x4x4.val[2] = vmovn_s32(inim32x4x4.val[2]); outim16x4x4.val[3] = vmovn_s32(inim32x4x4.val[3]); vst4_s16(outre, outre16x4x4); vst4_s16(outim, outim16x4x4); outre += 16; outim += 16; } }
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width) { const int *S0 = src[0], *S1 = src[1]; int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567; int32x4_t qT_0123, qT_4567; int16x4_t dT_0123, dT_4567; uint16x8_t qT_01234567; uint8x8_t dT_01234567, dDst_01234567; int32x2_t dBeta; dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0); dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1); int32x4_t qDelta, qMin, qMax; qDelta = vdupq_n_s32 (DELTA); qMin = vdupq_n_s32 (0); qMax = vdupq_n_s32 (255); int x = 0; for (; x <= width - 8; x += 8) { qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); vst1_u8 (&dst[x], dT_01234567); } if (x < width) { uint8x8_t dMask; dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)])); dDst_01234567 = vld1_u8 (&dst[x]); qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567); vst1_u8 (&dst[x], dMask); } }
void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int32x4_t a[16]; int16x8_t c[8]; a[0] = vld1q_s32(input); a[1] = vld1q_s32(input + 4); a[2] = vld1q_s32(input + 8); a[3] = vld1q_s32(input + 12); a[4] = vld1q_s32(input + 16); a[5] = vld1q_s32(input + 20); a[6] = vld1q_s32(input + 24); a[7] = vld1q_s32(input + 28); a[8] = vld1q_s32(input + 32); a[9] = vld1q_s32(input + 36); a[10] = vld1q_s32(input + 40); a[11] = vld1q_s32(input + 44); a[12] = vld1q_s32(input + 48); a[13] = vld1q_s32(input + 52); a[14] = vld1q_s32(input + 56); a[15] = vld1q_s32(input + 60); if (bd == 8) { c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); switch (tx_type) { case DCT_DCT: { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 idct8x8_64_1d_bd8(cospis0, cospis1, c); idct8x8_64_1d_bd8(cospis0, cospis1, c); break; } case ADST_DCT: { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 idct8x8_64_1d_bd8(cospis0, cospis1, c); transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); break; } case DCT_ADST: { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); idct8x8_64_1d_bd8(cospis0, cospis1, c); break; } default: { transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); break; } } c[0] = vrshrq_n_s16(c[0], 5); c[1] = vrshrq_n_s16(c[1], 5); c[2] = vrshrq_n_s16(c[2], 5); c[3] = vrshrq_n_s16(c[3], 5); c[4] = vrshrq_n_s16(c[4], 5); c[5] = vrshrq_n_s16(c[5], 5); c[6] = vrshrq_n_s16(c[6], 5); c[7] = vrshrq_n_s16(c[7], 5); } else { switch (tx_type) { case DCT_DCT: { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } case ADST_DCT: { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } case DCT_ADST: { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } default: { assert(tx_type == ADST_ADST); if (bd == 10) { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } } c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } highbd_add8x8(c, dest, stride, bd); }
inline int16x4_t vmovn(const int32x4_t & v) { return vmovn_s32(v); }
static inline void PostShiftAndSeparateNeon(int16_t* inre, int16_t* inim, int16_t* outre, int16_t* outim, int32_t sh) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int16_t* outre1 = outre; int16_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int16_t* outim1 = outim; int16_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 -4]; // By vshl, we effectively did "<< (-sh - 23)", instead of "<< (-sh)", // ">> 14" and then ">> 9" as in the C code. int32x4_t shift = vdupq_n_s32(-sh - 23); for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int16x4_t xr = vqadd_s16(inre_0, inre_1); int16x4_t xi = vqsub_s16(inim_0, inim_1); int16x4_t yr = vqadd_s16(inim_0, inim_1); int16x4_t yi = vqsub_s16(inre_1, inre_0); int32x4_t outr0 = vmull_s16(tmpr, xr); int32x4_t outi0 = vmull_s16(tmpi, xr); int32x4_t outr1 = vmull_s16(tmpi, yr); int32x4_t outi1 = vmull_s16(tmpi, yi); outr0 = vmlsl_s16(outr0, tmpi, xi); outi0 = vmlal_s16(outi0, tmpr, xi); outr1 = vmlal_s16(outr1, tmpr, yi); outi1 = vmlsl_s16(outi1, tmpr, yr); outr0 = vshlq_s32(outr0, shift); outi0 = vshlq_s32(outi0, shift); outr1 = vshlq_s32(outr1, shift); outi1 = vshlq_s32(outi1, shift); outr1 = vnegq_s32(outr1); int16x4_t outre_0 = vmovn_s32(outr0); int16x4_t outim_0 = vmovn_s32(outi0); int16x4_t outre_1 = vmovn_s32(outr1); int16x4_t outim_1 = vmovn_s32(outi1); outre_1 = vrev64_s16(outre_1); outim_1 = vrev64_s16(outim_1); vst1_s16(outre1, outre_0); outre1 += 4; vst1_s16(outim1, outim_0); outim1 += 4; vst1_s16(outre2, outre_1); outre2 -= 4; vst1_s16(outim2, outim_1); outim2 -= 4; } }
int rotate_cpx_vector(int16_t *x, int16_t *alpha, int16_t *y, uint32_t N, uint16_t output_shift) { // Multiply elementwise two complex vectors of N elements // x - input 1 in the format |Re0 Im0 |,......,|Re(N-1) Im(N-1)| // We assume x1 with a dynamic of 15 bit maximum // // alpha - input 2 in the format |Re0 Im0| // We assume x2 with a dynamic of 15 bit maximum // // y - output in the format |Re0 Im0|,......,|Re(N-1) Im(N-1)| // // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; // // log2_amp - increase the output amplitude by a factor 2^log2_amp (default is 0) // WARNING: log2_amp>0 can cause overflow!! uint32_t i; // loop counter simd_q15_t *y_128,alpha_128; int32_t *xd=(int32_t *)x; #if defined(__x86_64__) || defined(__i386__) __m128i shift = _mm_cvtsi32_si128(output_shift); register simd_q15_t m0,m1,m2,m3; ((int16_t *)&alpha_128)[0] = alpha[0]; ((int16_t *)&alpha_128)[1] = -alpha[1]; ((int16_t *)&alpha_128)[2] = alpha[1]; ((int16_t *)&alpha_128)[3] = alpha[0]; ((int16_t *)&alpha_128)[4] = alpha[0]; ((int16_t *)&alpha_128)[5] = -alpha[1]; ((int16_t *)&alpha_128)[6] = alpha[1]; ((int16_t *)&alpha_128)[7] = alpha[0]; #elif defined(__arm__) int32x4_t shift; int32x4_t ab_re0,ab_re1,ab_im0,ab_im1,re32,im32; int16_t reflip[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; int32x4x2_t xtmp; ((int16_t *)&alpha_128)[0] = alpha[0]; ((int16_t *)&alpha_128)[1] = alpha[1]; ((int16_t *)&alpha_128)[2] = alpha[0]; ((int16_t *)&alpha_128)[3] = alpha[1]; ((int16_t *)&alpha_128)[4] = alpha[0]; ((int16_t *)&alpha_128)[5] = alpha[1]; ((int16_t *)&alpha_128)[6] = alpha[0]; ((int16_t *)&alpha_128)[7] = alpha[1]; int16x8_t bflip = vrev32q_s16(alpha_128); int16x8_t bconj = vmulq_s16(alpha_128,*(int16x8_t *)reflip); shift = vdupq_n_s32(-output_shift); #endif y_128 = (simd_q15_t *) y; for(i=0; i<N>>2; i++) { #if defined(__x86_64__) || defined(__i386__) m0 = _mm_setr_epi32(xd[0],xd[0],xd[1],xd[1]); m1 = _mm_setr_epi32(xd[2],xd[2],xd[3],xd[3]); m2 = _mm_madd_epi16(m0,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] m3 = _mm_madd_epi16(m1,alpha_128); //complex multiply. result is 32bit [Re Im Re Im] m2 = _mm_sra_epi32(m2,shift); // shift right by shift in order to compensate for the input amplitude m3 = _mm_sra_epi32(m3,shift); // shift right by shift in order to compensate for the input amplitude y_128[0] = _mm_packs_epi32(m2,m3); // pack in 16bit integers with saturation [re im re im re im re im] #elif defined(__arm__) ab_re0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bconj)[0]); ab_re1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bconj)[1]); ab_im0 = vmull_s16(((int16x4_t*)xd)[0],((int16x4_t*)&bflip)[0]); ab_im1 = vmull_s16(((int16x4_t*)xd)[1],((int16x4_t*)&bflip)[1]); re32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_re0)[0],((int32x2_t*)&ab_re0)[1]), vpadd_s32(((int32x2_t*)&ab_re1)[0],((int32x2_t*)&ab_re1)[1])), shift); im32 = vshlq_s32(vcombine_s32(vpadd_s32(((int32x2_t*)&ab_im0)[0],((int32x2_t*)&ab_im0)[1]), vpadd_s32(((int32x2_t*)&ab_im1)[0],((int32x2_t*)&ab_im1)[1])), shift); xtmp = vzipq_s32(re32,im32); y_128[0] = vcombine_s16(vmovn_s32(xtmp.val[0]),vmovn_s32(xtmp.val[1])); #endif xd+=4; y_128+=1; } _mm_empty(); _m_empty(); return(0); }
int mult_cpx_conj_vector(int16_t *x1, int16_t *x2, int16_t *y, uint32_t N, int output_shift, int madd) { // Multiply elementwise the complex conjugate of x1 with x2. // x1 - input 1 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // We assume x1 with a dinamic of 15 bit maximum // // x2 - input 2 in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // We assume x2 with a dinamic of 14 bit maximum /// // y - output in the format |Re0 Im0 Re1 Im1|,......,|Re(N-2) Im(N-2) Re(N-1) Im(N-1)| // // N - the size f the vectors (this function does N cpx mpy. WARNING: N>=4; // // output_shift - shift to be applied to generate output // // madd - add the output to y uint32_t i; // loop counter simd_q15_t *x1_128; simd_q15_t *x2_128; simd_q15_t *y_128; #if defined(__x86_64__) || defined(__i386__) simd_q15_t tmp_re,tmp_im; simd_q15_t tmpy0,tmpy1; #elif defined(__arm__) int32x4_t tmp_re,tmp_im; int32x4_t tmp_re1,tmp_im1; int16x4x2_t tmpy; int32x4_t shift = vdupq_n_s32(-output_shift); #endif x1_128 = (simd_q15_t *)&x1[0]; x2_128 = (simd_q15_t *)&x2[0]; y_128 = (simd_q15_t *)&y[0]; // we compute 4 cpx multiply for each loop for(i=0; i<(N>>2); i++) { #if defined(__x86_64__) || defined(__i386__) tmp_re = _mm_madd_epi16(*x1_128,*x2_128); tmp_im = _mm_shufflelo_epi16(*x1_128,_MM_SHUFFLE(2,3,0,1)); tmp_im = _mm_shufflehi_epi16(tmp_im,_MM_SHUFFLE(2,3,0,1)); tmp_im = _mm_sign_epi16(tmp_im,*(__m128i*)&conjug[0]); tmp_im = _mm_madd_epi16(tmp_im,*x2_128); tmp_re = _mm_srai_epi32(tmp_re,output_shift); tmp_im = _mm_srai_epi32(tmp_im,output_shift); tmpy0 = _mm_unpacklo_epi32(tmp_re,tmp_im); tmpy1 = _mm_unpackhi_epi32(tmp_re,tmp_im); if (madd==0) *y_128 = _mm_packs_epi32(tmpy0,tmpy1); else *y_128 += _mm_packs_epi32(tmpy0,tmpy1); #elif defined(__arm__) tmp_re = vmull_s16(((simdshort_q15_t *)x1_128)[0], ((simdshort_q15_t*)x2_128)[0]); //tmp_re = [Re(x1[0])Re(x2[0]) Im(x1[0])Im(x2[0]) Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1])] tmp_re1 = vmull_s16(((simdshort_q15_t *)x1_128)[1], ((simdshort_q15_t*)x2_128)[1]); //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] tmp_re = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)), vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1))); //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] tmp_im = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[0],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[0]); //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])] tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(((simdshort_q15_t*)x2_128)[1],*(simdshort_q15_t*)conjug)), ((simdshort_q15_t*)x1_128)[1]); //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])] tmp_im = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)), vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1))); //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])] tmp_re = vqshlq_s32(tmp_re,shift); tmp_im = vqshlq_s32(tmp_im,shift); tmpy = vzip_s16(vmovn_s32(tmp_re),vmovn_s32(tmp_im)); if (madd==0) *y_128 = vcombine_s16(tmpy.val[0],tmpy.val[1]); else *y_128 += vcombine_s16(tmpy.val[0],tmpy.val[1]); #endif x1_128++; x2_128++; y_128++; } _mm_empty(); _m_empty(); return(0); }