static inline void char_to_float_vectors(const unsigned char *sourcep,
                                         float32x4_t *mp0, float32x4_t *mp1)
{
    uint8x8_t rawpixels;    /* source pixels as {[YUYV]0 [YUYV]1} */
    int16x8_t widerpixels;  /* rawpixels promoted to shorts per component */
    int16x4_t high16, low16;
    int32x4_t high32, low32;
    const int16x8_t uvbias = { 0, 128, 0, 128, 0, 128, 0, 128 };

    rawpixels = vld1_u8(sourcep);
    widerpixels = vreinterpretq_s16_u16(vmovl_u8(rawpixels));

    /* subtract uvbias from widerpixels */
    widerpixels = vsubq_s16(widerpixels, uvbias);

    /* now take widerpixels apart into (low16, high16) and
       then expand those into (low32, high32) */
    low16 = vget_low_s16(widerpixels);
    high16 = vget_high_s16(widerpixels);
    high32 = vmovl_s16(high16);
    low32 = vmovl_s16(low16);

    /* now convert low32 and high32 into floats and store them in *mp0, *mp1 */
    *mp0 = vcvtq_f32_s32(low32);
    *mp1 = vcvtq_f32_s32(high32);
}
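A minimal caller for the routine above; the YUYV sample values and the main() harness are illustrative, not from the original source:

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    /* two packed YUYV macropixels: Y0 U0 Y1 V0  Y2 U1 Y3 V1 */
    const unsigned char yuyv[8] = { 16, 128, 32, 128, 48, 120, 64, 136 };
    float out[8];
    float32x4_t v0, v1;

    char_to_float_vectors(yuyv, &v0, &v1);
    vst1q_f32(out, v0);
    vst1q_f32(out + 4, v1);

    /* Y samples pass through unchanged; U/V come out re-centered around 0 */
    for (int i = 0; i < 8; ++i)
        printf("%.1f ", out[i]);   /* 16.0 0.0 32.0 0.0 48.0 -8.0 64.0 8.0 */
    printf("\n");
    return 0;
}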
void test_vmovls16 (void)
{
  int32x4_t out_int32x4_t;
  int16x4_t arg0_int16x4_t;

  /* compile-only intrinsic test: the argument is deliberately left
     uninitialized; only the generated vmovl.s16 instruction matters */
  out_int32x4_t = vmovl_s16 (arg0_int16x4_t);
}
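A behavioral counterpart to the compile test, with the input initialized so the sign extension can be observed (the main() harness is illustrative):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  const int16_t in[4] = { -32768, -1, 0, 32767 };
  int32_t out[4];

  /* vmovl_s16 sign-extends each 16-bit lane to 32 bits */
  vst1q_s32(out, vmovl_s16(vld1_s16(in)));

  for (int i = 0; i < 4; ++i)
    printf("%d -> %d\n", (int)in[i], out[i]);
  return 0;
}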
/* One 1-D stage of the 4x4 inverse ADST; d3s16/d4s16/d5s16 and q3s16 carry
   the sinpi_*_9 transform constants set up by the caller, and sinpi_3_9
   comes from the codec's transform constant tables. */
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
}
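The vqrshrn_n_s32(x, 14) calls at the end implement the usual transform rounding, saturate16((x + (1 << 13)) >> 14), in a single instruction. A standalone check of that equivalence (the harness is illustrative):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
  const int32_t in[4] = { 16384, 16383, -16385, 1 << 30 };
  int16_t out[4];

  /* vqrshrn_n_s32(x, 14) == saturate16((x + (1 << 13)) >> 14) */
  vst1_s16(out, vqrshrn_n_s32(vld1q_s32(in), 14));

  for (int i = 0; i < 4; ++i)
    printf("%d -> %d\n", in[i], (int)out[i]);  /* 1, 1, -1, 32767 (saturated) */
  return 0;
}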
/* Apparently macro-expanded from libwebp's NEON upsampler (upsampling_neon.c),
   split into helpers here. 'coef' is a file-scope int16_t[4] of fixed-point
   YUV->RGB coefficients defined elsewhere in that file. */

/* Bilinear 8 -> 16 upsample of one chroma block: reads 9 samples from each of
   two adjacent chroma rows r1/r2 and stores 16 interleaved samples per row
   (top row at out[0..15], bottom row at out[32..47]). */
static void Upsample16Pixels(const uint8_t *r1, const uint8_t *r2,
                             uint8_t *out) {
  uint8x8_t a = vld1_u8(r1);
  uint8x8_t b = vld1_u8(r1 + 1);
  uint8x8_t c = vld1_u8(r2);
  uint8x8_t d = vld1_u8(r2 + 1);
  uint16x8_t al = vshll_n_u8(a, 1);
  uint16x8_t bl = vshll_n_u8(b, 1);
  uint16x8_t cl = vshll_n_u8(c, 1);
  uint16x8_t dl = vshll_n_u8(d, 1);
  uint8x8_t diag1, diag2;
  uint16x8_t sl;

  /* a + b + c + d */
  sl = vaddl_u8(a, b);
  sl = vaddw_u8(sl, c);
  sl = vaddw_u8(sl, d);

  al = vaddq_u16(sl, al);  /* 3a + b + c + d */
  bl = vaddq_u16(sl, bl);  /* a + 3b + c + d */
  al = vaddq_u16(al, dl);  /* 3a + b + c + 3d */
  bl = vaddq_u16(bl, cl);  /* a + 3b + 3c + d */

  diag2 = vshrn_n_u16(al, 3);
  diag1 = vshrn_n_u16(bl, 3);

  a = vrhadd_u8(a, diag1);
  b = vrhadd_u8(b, diag2);
  c = vrhadd_u8(c, diag2);
  d = vrhadd_u8(d, diag1);

  {
    const uint8x8x2_t a_b = {{ a, b }};
    const uint8x8x2_t c_d = {{ c, d }};
    vst2_u8(out, a_b);
    vst2_u8(out + 32, c_d);
  }
}

/* Convert 16 luma samples starting at y_src[pos], plus the cached chroma at
   uv (u at uv[i], v at uv[i + 16]), to RGBA at dst (4 bytes per pixel). */
static void ConvertRgbaBlock(const uint8_t *y_src, const uint8_t *uv,
                             uint8_t *dst, int pos,
                             int16x4_t cf16, int32x2_t cf32,
                             uint8x8_t u16, uint8x8_t u128) {
  int i;
  for (i = 0; i < 16; i += 8) {
    const int off = (pos + i) * 4;
    uint8x8_t y = vld1_u8(y_src + pos + i);
    uint8x8_t u = vld1_u8(uv + i);
    uint8x8_t v = vld1_u8(uv + i + 16);
    int16x8_t yy = vreinterpretq_s16_u16(vsubl_u8(y, u16));
    int16x8_t uu = vreinterpretq_s16_u16(vsubl_u8(u, u128));
    int16x8_t vv = vreinterpretq_s16_u16(vsubl_u8(v, u128));
    int16x8_t ud = vshlq_n_s16(uu, 1);
    int16x8_t vd = vshlq_n_s16(vv, 1);
    int32x4_t vrl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(vv), 1),
                                     vget_low_s16(vd), cf16, 0);
    int32x4_t vrh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(vv), 1),
                                     vget_high_s16(vd), cf16, 0);
    int16x8_t vr = vcombine_s16(vrshrn_n_s32(vrl, 16), vrshrn_n_s32(vrh, 16));
    int32x4_t vl = vmovl_s16(vget_low_s16(vv));
    int32x4_t vh = vmovl_s16(vget_high_s16(vv));
    int32x4_t ugl = vmlal_lane_s16(vl, vget_low_s16(uu), cf16, 1);
    int32x4_t ugh = vmlal_lane_s16(vh, vget_high_s16(uu), cf16, 1);
    int32x4_t gcl = vqdmlal_lane_s16(ugl, vget_low_s16(vv), cf16, 2);
    int32x4_t gch = vqdmlal_lane_s16(ugh, vget_high_s16(vv), cf16, 2);
    int16x8_t gc = vcombine_s16(vrshrn_n_s32(gcl, 16), vrshrn_n_s32(gch, 16));
    int32x4_t ubl = vqdmlal_lane_s16(vshll_n_s16(vget_low_s16(uu), 1),
                                     vget_low_s16(ud), cf16, 3);
    int32x4_t ubh = vqdmlal_lane_s16(vshll_n_s16(vget_high_s16(uu), 1),
                                     vget_high_s16(ud), cf16, 3);
    int16x8_t ub = vcombine_s16(vrshrn_n_s32(ubl, 16), vrshrn_n_s32(ubh, 16));
    int32x4_t rl = vaddl_s16(vget_low_s16(yy), vget_low_s16(vr));
    int32x4_t rh = vaddl_s16(vget_high_s16(yy), vget_high_s16(vr));
    int32x4_t gl = vsubl_s16(vget_low_s16(yy), vget_low_s16(gc));
    int32x4_t gh = vsubl_s16(vget_high_s16(yy), vget_high_s16(gc));
    int32x4_t bl = vaddl_s16(vget_low_s16(yy), vget_low_s16(ub));
    int32x4_t bh = vaddl_s16(vget_high_s16(yy), vget_high_s16(ub));
    rl = vmulq_lane_s32(rl, cf32, 0);
    rh = vmulq_lane_s32(rh, cf32, 0);
    gl = vmulq_lane_s32(gl, cf32, 0);
    gh = vmulq_lane_s32(gh, cf32, 0);
    bl = vmulq_lane_s32(bl, cf32, 0);
    bh = vmulq_lane_s32(bh, cf32, 0);
    y = vqmovun_s16(vcombine_s16(vrshrn_n_s32(rl, 16), vrshrn_n_s32(rh, 16)));
    u = vqmovun_s16(vcombine_s16(vrshrn_n_s32(gl, 16), vrshrn_n_s32(gh, 16)));
    v = vqmovun_s16(vcombine_s16(vrshrn_n_s32(bl, 16), vrshrn_n_s32(bh, 16)));
    {
      const uint8x8x4_t r_g_b_v255 = {{ y, u, v, vmov_n_u8(255) }};
      vst4_u8(dst + off, r_g_b_v255);
    }
  }
}

void UpsampleRgbaLinePairNEON(const uint8_t *top_y, const uint8_t *bottom_y,
                              const uint8_t *top_u, const uint8_t *top_v,
                              const uint8_t *cur_u, const uint8_t *cur_v,
                              uint8_t *top_dst, uint8_t *bottom_dst, int len) {
  int block;
  /* 16-byte-aligned scratch caching the reconstructed u and v rows */
  uint8_t uv_buf[2 * 32 + 15];
  uint8_t *const r_uv = (uint8_t*)((uintptr_t)(uv_buf + 15) & ~15);
  const int uv_len = (len + 1) >> 1;
  /* 9 chroma samples must be readable for each 8-sample block */
  const int num_blocks = (uv_len - 1) >> 3;
  /* leftover/last_pos/u_diag/v_diag feed the first- and last-pixel handling,
     which apparently falls outside this excerpt */
  const int leftover = uv_len - num_blocks * 8;
  const int last_pos = 1 + 16 * num_blocks;
  const int u_diag = ((top_u[0] + cur_u[0]) >> 1) + 1;
  const int v_diag = ((top_v[0] + cur_v[0]) >> 1) + 1;
  const int16x4_t cf16 = vld1_s16(coef);     /* fixed-point YUV->RGB coefficients */
  const int32x2_t cf32 = vmov_n_s32(76283);  /* luma scale, 1.164 in Q16 */
  const uint8x8_t u16 = vmov_n_u8(16);
  const uint8x8_t u128 = vmov_n_u8(128);

  for (block = 0; block < num_blocks; ++block) {
    /* each block consumes the next 8 chroma samples per plane */
    Upsample16Pixels(top_u + 8 * block, cur_u + 8 * block, r_uv);
    Upsample16Pixels(top_v + 8 * block, cur_v + 8 * block, r_uv + 16);
    if (top_y) {
      ConvertRgbaBlock(top_y, r_uv, top_dst, 16 * block + 1,
                       cf16, cf32, u16, u128);
    }
    if (bottom_y) {
      ConvertRgbaBlock(bottom_y, r_uv + 32, bottom_dst, 16 * block + 1,
                       cf16, cf32, u16, u128);
    }
  }
}
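The chroma contributions above all go through vqdmlal_lane_s16 (a saturating multiply-accumulate with a doubled product) followed by a rounding 16-bit narrow, i.e. roughly (acc + 2*a*coef + 2^15) >> 16. A small numeric check of that pattern; the coefficient value here is illustrative, not libwebp's:

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    const int16_t coef_demo[4] = { 22469, 0, 0, 0 };  /* hypothetical constant */
    const int16x4_t cf = vld1_s16(coef_demo);
    const int16x4_t a = vdup_n_s16(100);

    /* acc + 2 * a * cf[0], widened to 32 bits with saturation */
    int32x4_t acc = vqdmlal_lane_s16(vdupq_n_s32(0), a, cf, 0);
    int16x4_t r = vrshrn_n_s32(acc, 16);  /* rounding >> 16, as in the upsampler */

    /* scalar reference: (2 * 100 * 22469 + 32768) >> 16 = 69 */
    printf("%d\n", (int)vget_lane_s16(r, 0));
    return 0;
}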
inline int32x4_t vmovl(const int16x4_t & v) { return vmovl_s16(v); }
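This overload usually ships with siblings for the other element types so templated code can widen lanes through a single name. A sketch of such use; the widen() helper and the unsigned overload shown here are illustrative:

#include <arm_neon.h>

inline uint32x4_t vmovl(const uint16x4_t & v) { return vmovl_u16(v); }

// generic code picks vmovl_s16 or vmovl_u16 from the argument type
template <typename V>
auto widen(const V & v) -> decltype(vmovl(v)) { return vmovl(v); }

// usage: int32x4_t  ws = widen(vdup_n_s16(-3));
//        uint32x4_t wu = widen(vdup_n_u16(7));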
void phase(const Size2D &size,
           const s16 * src0Base, ptrdiff_t src0Stride,
           const s16 * src1Base, ptrdiff_t src1Stride,
           u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
    FASTATAN2CONST(256.0f / 360.0f)
    size_t roiw16 = size.width >= 15 ? size.width - 15 : 0;
    size_t roiw8 = size.width >= 7 ? size.width - 7 : 0;

    float32x4_t v_05 = vdupq_n_f32(0.5f);

    for (size_t i = 0; i < size.height; ++i)
    {
        const s16 * src0 = internal::getRowPtr(src0Base, src0Stride, i);
        const s16 * src1 = internal::getRowPtr(src1Base, src1Stride, i);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, i);
        size_t j = 0;

        for (; j < roiw16; j += 16)
        {
            internal::prefetch(src0 + j);
            internal::prefetch(src1 + j);

            int16x8_t v_src00 = vld1q_s16(src0 + j), v_src01 = vld1q_s16(src0 + j + 8);
            int16x8_t v_src10 = vld1q_s16(src1 + j), v_src11 = vld1q_s16(src1 + j + 8);

            // 0
            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src00)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src10)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src00)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src10)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s0 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            // 1
            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src01)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src11)));
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst16s1 = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                                vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1q_u8(dst + j, vcombine_u8(vmovn_u16(v_dst16s0),
                                          vmovn_u16(v_dst16s1)));
        }

        for (; j < roiw8; j += 8)
        {
            int16x8_t v_src0 = vld1q_s16(src0 + j);
            int16x8_t v_src1 = vld1q_s16(src1 + j);

            float32x4_t v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src0)));
            float32x4_t v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_low_s16(v_src1)));
            float32x4_t v_dst32f0;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f0)

            v_src0_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src0)));
            v_src1_p = vcvtq_f32_s32(vmovl_s16(vget_high_s16(v_src1)));
            float32x4_t v_dst32f1;
            FASTATAN2VECTOR(v_src1_p, v_src0_p, v_dst32f1)

            uint16x8_t v_dst = vcombine_u16(vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f0, v_05))),
                                            vmovn_u32(vcvtq_u32_f32(vaddq_f32(v_dst32f1, v_05))));

            vst1_u8(dst + j, vmovn_u16(v_dst));
        }

        for (; j < size.width; j++)
        {
            f32 x = src0[j], y = src1[j];
            f32 a;
            FASTATAN2SCALAR(y, x, a)
            dst[j] = (u8)(s32)floor(a + 0.5f);
        }
    }
#else
    (void)size;
    (void)src0Base;
    (void)src0Stride;
    (void)src1Base;
    (void)src1Stride;
    (void)dstBase;
    (void)dstStride;
#endif
}
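FASTATAN2SCALAR/FASTATAN2VECTOR are internal atan2 approximations; with the 256/360 constant above, phase() stores each angle so that a full turn wraps at 256. A scalar reference of that mapping, using exact atan2f in place of the approximation and assuming the angle is wrapped into [0, 360); the names here are illustrative:

#include <math.h>
#include <stdint.h>
#include <stdio.h>

/* reference semantics: angle in degrees * 256/360, rounded, stored as one byte */
static uint8_t phase_ref(int16_t x, int16_t y)
{
    float a = atan2f((float)y, (float)x) * 57.29577951f;  /* radians -> degrees */
    if (a < 0.0f) a += 360.0f;                            /* wrap into [0, 360) */
    return (uint8_t)(int32_t)floorf(a * (256.0f / 360.0f) + 0.5f);
}

int main(void)
{
    printf("%u %u %u\n", phase_ref(1, 0), phase_ref(0, 1), phase_ref(-1, 0));
    /* expected: 0 64 128 */
    return 0;
}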
inline v_int32x4 v_load_expand_q(const schar* ptr)
{
    int8x8_t v0 = vcreate_s8(*(unsigned*)ptr);  // load 4 bytes into the low lanes
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));  // s8 -> s16
    return v_int32x4(vmovl_s16(v1));            // s16 -> s32
}
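The same load-and-widen chain without the wrapper type, in raw NEON, with the 4-byte load made alias-safe via memcpy and a scalar check (the harness is illustrative):

#include <arm_neon.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    const signed char src[4] = { -128, -1, 0, 127 };
    unsigned bits;
    memcpy(&bits, src, 4);                      /* the *(unsigned*)ptr load */

    int8x8_t  v0 = vcreate_s8(bits);            /* 4 meaningful lanes in the low half */
    int16x4_t v1 = vget_low_s16(vmovl_s8(v0));  /* s8 -> s16 */
    int32x4_t v2 = vmovl_s16(v1);               /* s16 -> s32 */

    int32_t out[4];
    vst1q_s32(out, v2);
    for (int i = 0; i < 4; ++i)
        printf("%d ", out[i]);                  /* -128 -1 0 127 */
    printf("\n");
    return 0;
}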
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
                                                       int16_t* inim,
                                                       int32_t* outre1,
                                                       int32_t* outre2,
                                                       int32_t sh) {
  int k;
  int16_t* p_inre = inre;
  int16_t* p_inim = inim;
  int32_t* p_outre1 = outre1;
  int32_t* p_outre2 = outre2;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  int32x4_t shift = vdupq_n_s32(-sh - 16);
  // Divide through by the normalizing constant:
  // scale all values with 1/240, i.e. with 273 in Q16.
  // 273/65536 ~= 0.0041656
  // 1/240 ~= 0.0041666
  int16x8_t scale = vdupq_n_s16(273);
  // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
  int factQ19 = 31727 << 16;
  int32x4_t fact = vdupq_n_s32(factQ19);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t inre16x8 = vld1q_s16(p_inre);
    int16x8_t inim16x8 = vld1q_s16(p_inim);
    p_inre += 8;
    p_inim += 8;
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    kCosTab += 8;
    kSinTab += 8;

    // By vshl and vmull, we effectively did "<< (-sh - 16)",
    // instead of "<< (-sh)" and ">> 16" as in the C code.
    int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
#else
    int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8), vget_high_s16(scale));
    int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8), vget_high_s16(scale));
#endif

    outre1_0 = vshlq_s32(outre1_0, shift);
    outre1_1 = vshlq_s32(outre1_1, shift);
    outre2_0 = vshlq_s32(outre2_0, shift);
    outre2_1 = vshlq_s32(outre2_1, shift);

    // Demodulate and separate.
    int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
#else
    int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
#endif

    int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));

#if defined(WEBRTC_ARCH_ARM64)
    int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
#else
    int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
#endif

    outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10));
    outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10));
    outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10));
    outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10));

    outre1_0 = vqdmulhq_s32(outre1_0, fact);
    outre2_0 = vqdmulhq_s32(outre2_0, fact);
    outre1_1 = vqdmulhq_s32(outre1_1, fact);
    outre2_1 = vqdmulhq_s32(outre2_1, fact);

    vst1q_s32(p_outre1, outre1_0);
    p_outre1 += 4;
    vst1q_s32(p_outre1, outre1_1);
    p_outre1 += 4;
    vst1q_s32(p_outre2, outre2_0);
    p_outre2 += 4;
    vst1q_s32(p_outre2, outre2_1);
    p_outre2 += 4;
  }
}
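The final scaling leans on vqdmulhq_s32, which keeps only the high half of a doubled 64-bit product, so multiplying by factQ19 applies the Sqrt(240)-derived factor while discarding the fractional bits. A check of the intrinsic's semantics against 64-bit scalar math (the harness is illustrative):

#include <arm_neon.h>
#include <stdio.h>

int main(void)
{
    /* vqdmulhq_s32: result = (int32_t)(((int64_t)x * y * 2) >> 32), saturated */
    const int32_t x = 1 << 20;
    const int32_t fact = 31727 << 16;  /* Sqrt(240) in Q11, shifted up 16 bits */
    int32x4_t r = vqdmulhq_s32(vdupq_n_s32(x), vdupq_n_s32(fact));

    long long ref = ((long long)x * fact * 2) >> 32;
    printf("neon=%d ref=%lld\n", vgetq_lane_s32(r, 0), ref);  /* both 1015264 */
    return 0;
}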