OD_SIMD_INLINE void od_store4(od_coeff *x, int xstride, int32x4_t t0, int32x4_t t1, int32x4_t t2, int32x4_t t3) { vst1q_s32((int *)(x + 0*xstride), t0); vst1q_s32((int *)(x + 1*xstride), t1); vst1q_s32((int *)(x + 2*xstride), t2); vst1q_s32((int *)(x + 3*xstride), t3); }
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm, const uint16_t* far_spectrum, int32_t* echo_est) { assert((uintptr_t)echo_est % 32 == 0); assert((uintptr_t)(aecm->channelStored) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); // This is the C code corresponding to the optimized code below. // During startup we store the channel every block. // memcpy(aecm->channelStored, // aecm->channelAdapt16, // sizeof(int16_t) * PART_LEN1); // Recalculate echo estimate // for (i = 0; i < PART_LEN; i += 4) { // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); // echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], // far_spectrum[i + 1]); // echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], // far_spectrum[i + 2]); // echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], // far_spectrum[i + 3]); // } // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); const uint16_t* far_spectrum_p = far_spectrum; int16_t* start_adapt_p = aecm->channelAdapt16; int16_t* start_stored_p = aecm->channelStored; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; int32_t* echo_est_p = echo_est; uint16x8_t far_spectrum_v; int16x8_t adapt_v; uint32x4_t echo_est_v_low, echo_est_v_high; while (start_stored_p < end_stored_p) { far_spectrum_v = vld1q_u16(far_spectrum_p); adapt_v = vld1q_s16(start_adapt_p); vst1q_s16(start_stored_p, adapt_v); echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v), vget_low_u16(vreinterpretq_u16_s16(adapt_v))); echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v), vget_high_u16(vreinterpretq_u16_s16(adapt_v))); vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); far_spectrum_p += 8; start_adapt_p += 8; start_stored_p += 8; echo_est_p += 8; } aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN]; echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); }
void * scaled_sumi_thread_NEON(void * argument) { jsize i = 0; struct scaled_sumfneon_thread_data * data = (struct scaled_sumfneon_thread_data *) argument; int32_t * r = (int32_t *)data->r; const int32_t * x = (const int32_t *)data->x; const int32_t * y = (const int32_t *)data->y; const int32_t a = (const int32_t)data->a; const jsize size = data->size; int32x4_t rx4, xx4, yx4, ax4; ax4 = vdupq_n_s32(a); /* r[i] = x[i] + a * y[i], four lanes per iteration (size is assumed to be a multiple of 4). */ for (; i < size; i += 4) { xx4 = vld1q_s32(&(x[i])); yx4 = vld1q_s32(&(y[i])); rx4 = vmlaq_s32(xx4, ax4, yx4); vst1q_s32(&(r[i]), rx4); } return NULL; }
/* Compile-only intrinsics test: the arguments are deliberately left uninitialized, since only the types accepted by vst1q_s32 are being exercised. */ void test_vst1Qs32 (void) { int32_t *arg0_int32_t; int32x4_t arg1_int32x4_t; vst1q_s32 (arg0_int32_t, arg1_int32x4_t); }
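// A minimal executable counterpart to the compile test above (illustrative sketch, not from the same test suite; assumes <arm_neon.h> and <assert.h>): store a vector with vst1q_s32 and verify the four lanes land in memory in order.
void test_vst1q_s32_roundtrip (void) { const int32_t in[4] = { 1, 2, 3, 4 }; int32_t out[4]; int k; vst1q_s32 (out, vld1q_s32 (in)); for (k = 0; k < 4; k++) assert (in[k] == out[k]); }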
void test(JNIEnv * env, jobject jRoot, jobject jObj) { int *testSet1 = (int*)malloc(sizeof(int)*DATA_SIZE); for(uint32_t i = 0; i<DATA_SIZE; i++) { testSet1[i] = i; } clock_t begin = clock(); for (uint32_t i=0; i<DATA_SIZE/4/2; i++) { int32_t *src = testSet1+i*4; int32_t *dest = testSet1+DATA_SIZE - 4*(i+1); // Load the full 4-lane blocks (vld1q_dup_s32 would only replicate a single element). int32x4_t tmp = vld1q_s32(src); int32x4_t destData = vld1q_s32(dest); // Full lane reversal: vrev64q_s32 only swaps within each 64-bit half, so the halves must be swapped afterwards. int32x4_t rSrcData = vrev64q_s32(tmp); rSrcData = vcombine_s32(vget_high_s32(rSrcData), vget_low_s32(rSrcData)); int32x4_t rDestData = vrev64q_s32(destData); rDestData = vcombine_s32(vget_high_s32(rDestData), vget_low_s32(rDestData)); vst1q_s32(src, rDestData); vst1q_s32(dest, rSrcData); } clock_t end = clock(); for (uint32_t i = 0; i<DATA_SIZE/2; i++) { int t = testSet1[i]; int d = testSet1[DATA_SIZE-1-i]; testSet1[i] = d; testSet1[DATA_SIZE-1-i] = t; } clock_t end2 = clock(); clock_t cost1 = end-begin; clock_t cost2 = end2-end; __android_log_print(ANDROID_LOG_DEBUG, "NEON", "last number is %d, acc=%.1fx", testSet1[DATA_SIZE-1], 1.f*cost2/cost1); free(testSet1); jclass clasz = env->FindClass("com/tencent/helloneon/BenchListener"); jmethodID method = env->GetMethodID(clasz, "onResult", "(Ljava/lang/String;)V"); std::stringstream out; out << "benchResult:" << 1.f*cost2/cost1; env->CallVoidMethod(jObj, method, env->NewStringUTF(out.str().c_str())); }
void EAP_MemsetBuff_filterbank_Int32(int32 *ptr_left, int32 *ptr_right) { #ifdef __ARM_NEON__ int i; int32x4_t zero = vdupq_n_s32(0); /* Clear 8 int32 values per buffer per iteration: 240 * 8 per side, matching the memset fallback. */ for (i = 0; i < 240; i++, ptr_left += 8, ptr_right += 8) { vst1q_s32(ptr_left, zero); vst1q_s32(ptr_right, zero); vst1q_s32(ptr_left + 4, zero); vst1q_s32(ptr_right + 4, zero); } #else memset(ptr_left, 0, 240 * 8 * sizeof(int32)); memset(ptr_right, 0, 240 * 8 * sizeof(int32)); #endif }
int test_vdupq_lane_s32 () { int32x2_t a; int32x4_t b; int i; int32_t c[2] = { 0, 1 }; int32_t d[4]; a = vld1_s32 (c); b = wrap_vdupq_lane_s32_0 (a); vst1q_s32 (d, b); for (i = 0; i < 4; i++) if (c[0] != d[i]) return 1; b = wrap_vdupq_lane_s32_1 (a); vst1q_s32 (d, b); for (i = 0; i < 4; i++) if (c[1] != d[i]) return 1; return 0; }
static void add_int_neon(int* dst, int* src1, int* src2, int count) { int i; for (i = 0; i < count; i += 4) { int32x4_t in1, in2, out; in1 = vld1q_s32(src1); src1 += 4; in2 = vld1q_s32(src2); src2 += 4; out = vaddq_s32(in1, in2); vst1q_s32(dst, out); dst += 4; } }
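// add_int_neon above assumes count is a multiple of 4. A variant with a scalar tail for arbitrary counts (illustrative sketch, not from the original source):
static void add_int_neon_any(int* dst, const int* src1, const int* src2, int count) { int i = 0; for (; i + 4 <= count; i += 4) { vst1q_s32(dst + i, vaddq_s32(vld1q_s32(src1 + i), vld1q_s32(src2 + i))); } for (; i < count; ++i) { dst[i] = src1[i] + src2[i]; } }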
void matMult(int16_t mat1[], int16_t mat2[], int32_t prod[matrix_size][matrix_size]) { int output_size = 2 * matrix_size; int l, k; int16x4_t data1; int32x4_t mac_output[output_size/4]; int32x4_t MAC_addvalue[output_size/4]; int16x4_t constant_value; unsigned int index_input = 0; unsigned int transfer_index = 0; int32_t *pres_ver; /* Scratch result buffer. Rows are written with stride output_size, and mat2 is assumed to use the same stride. Note that this version leaves the result in pres_ver and never writes the prod argument. */ pres_ver = malloc(output_size * output_size * sizeof(int32_t)); for (l = 0; l < matrix_size/4; l++) { MAC_addvalue[l] = vmovq_n_s32(0); } /* Perform the multiplication: broadcast each mat1 element and multiply-accumulate a row of mat2, four int16 lanes at a time. */ for (l = 0; l < matrix_size*matrix_size; l++) { constant_value = vmov_n_s16(mat1[l]); for (k = 0; k < matrix_size/4; k++) { data1 = vld1_s16(&mat2[index_input]); MAC4(&MAC_addvalue[k], &constant_value, &data1, &mac_output[k]); MAC_addvalue[k] = mac_output[k]; index_input += 4; } index_input += output_size - matrix_size; /* At the end of each mat1 row, flush the accumulators to the result buffer and reset them. */ if ((l + 1) % matrix_size == 0) { index_input = 0; for (k = 0; k < matrix_size/4; k++) { vst1q_s32(&pres_ver[transfer_index], MAC_addvalue[k]); transfer_index += 4; } transfer_index += output_size - matrix_size; for (k = 0; k < matrix_size/4; k++) { MAC_addvalue[k] = vmovq_n_s32(0); } } } }
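// MAC4 is called above but not defined in the snippet. A plausible definition (hypothetical sketch inferred from the call site: widen the int16 lanes, multiply, and accumulate into 32 bits):
static inline void MAC4(int32x4_t *acc, int16x4_t *coeff, int16x4_t *data, int32x4_t *out) { *out = vmlal_s16(*acc, *coeff, *data); }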
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore* aecm) { assert((uintptr_t)(aecm->channelStored) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0); // The C code corresponding to the optimized code below. // for (i = 0; i < PART_LEN1; i++) { // aecm->channelAdapt16[i] = aecm->channelStored[i]; // aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32( // (int32_t)aecm->channelStored[i], 16); // } int16_t* start_stored_p = aecm->channelStored; int16_t* start_adapt16_p = aecm->channelAdapt16; int32_t* start_adapt32_p = aecm->channelAdapt32; const int16_t* end_stored_p = start_stored_p + PART_LEN; int16x8_t stored_v; int32x4_t adapt32_v_low, adapt32_v_high; while (start_stored_p < end_stored_p) { stored_v = vld1q_s16(start_stored_p); vst1q_s16(start_adapt16_p, stored_v); adapt32_v_low = vshll_n_s16(vget_low_s16(stored_v), 16); adapt32_v_high = vshll_n_s16(vget_high_s16(stored_v), 16); vst1q_s32(start_adapt32_p, adapt32_v_low); vst1q_s32(start_adapt32_p + 4, adapt32_v_high); start_stored_p += 8; start_adapt16_p += 8; start_adapt32_p += 8; } aecm->channelAdapt16[PART_LEN] = aecm->channelStored[PART_LEN]; aecm->channelAdapt32[PART_LEN] = (int32_t)aecm->channelStored[PART_LEN] << 16; }
OD_SIMD_INLINE void od_overflow_check_epi32(int32x4_t val, int32_t scale, int32_t offset, int idx) { #if defined(OD_DCT_TEST) && defined(OD_DCT_CHECK_OVERFLOW) int32_t mem[4]; int n; vst1q_s32((int *)mem, val); for (n = 0; n < 4; n++) { OD_DCT_OVERFLOW_CHECK(mem[n], scale, offset, idx); } #endif (void)val; (void)scale; (void)offset; (void)idx; }
int test_vreinterpretq_s32_f64 () { float64x2_t a; int32x4_t b; float64_t c[2] = { PI_F64, E_F64 }; int32_t d[4] = { 0x54442D18, 0x400921FB, 0x8B145769, 0x4005BF0A }; int32_t e[4]; int i; a = vld1q_f64 (c); b = wrap_vreinterpretq_s32_f64 (a); vst1q_s32 (e, b); for (i = 0; i < 4; i++) if (d[i] != e[i]) return 1; return 0; }
/* Performs one rotation/translation */ static void neon_coord_4( float32x4_t a_4, float32x4_t b_4, float32x4_t x_4, float32x4_t y_4, float32x4_t pos_4f, float32x4_t point5_4, int * result) { float32x4_t tmp1 = vmulq_f32(a_4, x_4); float32x4_t tmp2 = vmulq_f32(b_4, y_4); tmp2 = vaddq_f32(tmp1, tmp2); tmp2 = vaddq_f32(tmp2, pos_4f); tmp2 = vaddq_f32(tmp2, point5_4); int32x4_t c_4 = vcvtq_s32_f32(tmp2); vst1q_s32(result, c_4); }
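// How neon_coord_4 might be driven (illustrative sketch; the caller is not part of the snippet above): broadcast the rotation coefficients, the position offset, and the 0.5 rounding bias once, then convert four points per call.
static void neon_coord_4_example(float a, float b, const float *x, const float *y, float pos, int *result) { neon_coord_4(vdupq_n_f32(a), vdupq_n_f32(b), vld1q_f32(x), vld1q_f32(y), vdupq_n_f32(pos), vdupq_n_f32(0.5f), result); }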
/* s32x4 mv mul */ /* Note: A is addressed column-major, but the code mixes T and Row as strides (j = k * T + i, then j += Row); this is only consistent when the matrix is square (Row == T). */ void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C) { int i = 0; int k = 0; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { neon_c = vmovq_n_s32(0); for (k = 0; k < T; k+=4) { int j = k * T + i; neon_a0 = vld1q_s32(A + j); j+=Row; neon_a1 = vld1q_s32(A + j); j+=Row; neon_a2 = vld1q_s32(A + j); j+=Row; neon_a3 = vld1q_s32(A + j); neon_b = vld1q_s32(B + k); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); } vst1q_s32(C + i, neon_c); } }
/* s32x4 saturated sub */ void mw_neon_mm_qsub_s32x4(int * A, int Row, int Col, int * B, int * C) { int32x4_t neon_a, neon_b, neon_c; int size = Row * Col; int i = 0; int k = 0; for (i = 4; i <= size ; i+=4) { k = i - 4; neon_a = vld1q_s32(A + k); neon_b = vld1q_s32(B + k); neon_c = vqsubq_s32(neon_a, neon_b); vst1q_s32(C + k, neon_c); } /* Scalar tail: start at the first unprocessed element (size & ~3), which is also correct when size < 4, and keep the saturating semantics of vqsubq_s32. */ k = size & ~3; for (i = k; i < size; i++) { int64_t d = (int64_t)A[i] - (int64_t)B[i]; if (d > INT32_MAX) d = INT32_MAX; if (d < INT32_MIN) d = INT32_MIN; C[i] = (int)d; } }
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre, int16_t* inim, int32_t* outre1, int32_t* outre2, int32_t sh) { int k; int16_t* p_inre = inre; int16_t* p_inim = inim; int32_t* p_outre1 = outre1; int32_t* p_outre2 = outre2; const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0]; const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0]; int32x4_t shift = vdupq_n_s32(-sh - 16); // Divide through by the normalizing constant: // scale all values with 1/240, i.e. with 273 in Q16. // 273/65536 ~= 0.0041656 // 1/240 ~= 0.0041666 int16x8_t scale = vdupq_n_s16(273); // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727. int factQ19 = 31727 << 16; int32x4_t fact = vdupq_n_s32(factQ19); for (k = 0; k < FRAMESAMPLES/2; k += 8) { int16x8_t inre16x8 = vld1q_s16(p_inre); int16x8_t inim16x8 = vld1q_s16(p_inim); p_inre += 8; p_inim += 8; int16x8_t tmpr = vld1q_s16(kCosTab); int16x8_t tmpi = vld1q_s16(kSinTab); kCosTab += 8; kSinTab += 8; // By vshl and vmull, we effectively did "<< (-sh - 16)", // instead of "<< (-sh)" and ">> 16" as in the C code. int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale)); int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale); int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale); #else int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8), vget_high_s16(scale)); int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8), vget_high_s16(scale)); #endif outre1_0 = vshlq_s32(outre1_0, shift); outre1_1 = vshlq_s32(outre1_1, shift); outre2_0 = vshlq_s32(outre2_0, shift); outre2_1 = vshlq_s32(outre2_1, shift); // Demodulate and separate. int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr)); int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t tmpr_1 = vmovl_high_s16(tmpr); int32x4_t tmpi_1 = vmovl_high_s16(tmpi); #else int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr)); int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi)); #endif int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0)); int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0)); int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1)); int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1)); xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0)); xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0)); xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1)); xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1)); #if defined(WEBRTC_ARCH_ARM64) int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0); int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0); int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1); int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1); xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0); xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0); xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1); xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1); #else int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0)); int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0)); int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1)); int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1)); xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0)); xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0)); xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1)); xi3 = 
vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1)); #endif outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10)); outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10)); outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10)); outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10)); outre1_0 = vqdmulhq_s32(outre1_0, fact); outre2_0 = vqdmulhq_s32(outre2_0, fact); outre1_1 = vqdmulhq_s32(outre1_1, fact); outre2_1 = vqdmulhq_s32(outre2_1, fact); vst1q_s32(p_outre1, outre1_0); p_outre1 += 4; vst1q_s32(p_outre1, outre1_1); p_outre1 += 4; vst1q_s32(p_outre2, outre2_0); p_outre2 += 4; vst1q_s32(p_outre2, outre2_1); p_outre2 += 4; } }
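// The "<< (-sh - 16)" trick above works because vshlq_s32 shifts right when a lane's shift count is negative. A minimal sketch of the equivalence (illustrative, not from the WebRTC sources): for sh in [0, 31] this stores the same values as a lane-wise arithmetic right shift by sh.
static void shift_right_via_vshl(const int32_t *in, int32_t *out, int sh) { vst1q_s32(out, vshlq_s32(vld1q_s32(in), vdupq_n_s32(-sh))); }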
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count, const int* xofs, const short* alpha, int swidth, int dwidth, int cn, int xmin, int xmax) { int dx, k; int dx0 = 0; int16x4x2_t alpha_vec; uint8x8_t dS0_vec, dS1_vec; int16x8_t qS0_vec, qS1_vec; int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567; int32x4_t qT0_vec, qT1_vec; int16x4_t dCoeff; dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE); for (k = 0; k <= count - 2; k++) { const unsigned char *S0 = src[k], *S1 = src[k + 1]; int *D0 = dst[k], *D1 = dst[k + 1]; for (dx = dx0; dx < xmax; dx += 4) { int sx = xofs[dx]; alpha_vec = vld2_s16 (&alpha[dx * 2]); dS0_vec = vld1_u8 (&S0[sx]); dS1_vec = vld1_u8 (&S1[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS0_4567 = vget_high_s16 (qS0_vec); dS1_0123 = vget_low_s16 (qS1_vec); dS1_4567 = vget_high_s16 (qS1_vec); qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]); qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]); qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]); qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]); vst1q_s32 (&D0[dx], qT0_vec); vst1q_s32 (&D1[dx], qT1_vec); } for (; dx < dwidth; dx += 4) { int sx = xofs[dx]; dS0_vec = vld1_u8 (&S0[sx]); dS1_vec = vld1_u8 (&S1[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS1_0123 = vget_low_s16 (qS1_vec); qT0_vec = vmull_s16 (dS0_0123, dCoeff); qT1_vec = vmull_s16 (dS1_0123, dCoeff); vst1q_s32 (&D0[dx], qT0_vec); vst1q_s32 (&D1[dx], qT1_vec); } } for (; k < count; k++) { const unsigned char *S = src[k]; int *D = dst[k]; for (dx = 0; dx < xmax; dx += 4) { int sx = xofs[dx]; alpha_vec = vld2_s16 (&alpha[dx * 2]); dS0_vec = vld1_u8 (&S[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); dS0_0123 = vget_low_s16 (qS0_vec); dS0_4567 = vget_high_s16 (qS0_vec); qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]); qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]); vst1q_s32 (&D[dx], qT0_vec); } for (; dx < dwidth; dx += 4) { int sx = xofs[dx]; dS0_vec = vld1_u8 (&S[sx]); qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec)); dS0_0123 = vget_low_s16 (qS0_vec); qT0_vec = vmull_s16 (dS0_0123, dCoeff); vst1q_s32 (&D[dx], qT0_vec); } } }
// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero // coefficients as follows: // 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 // 0 0 2 5 10 17 25 38 47 62 83 101 121 // 1 1 4 8 15 22 30 45 58 74 92 112 133 // 2 3 7 12 18 28 36 52 64 82 102 118 // 3 6 11 16 23 31 43 60 73 90 109 126 // 4 9 14 19 29 37 50 65 78 98 116 134 // 5 13 20 26 35 44 54 72 85 105 123 // 6 21 27 33 42 53 63 80 94 113 132 // 7 24 32 39 48 57 71 88 104 120 // 8 34 40 46 56 68 81 96 111 130 // 9 41 49 55 67 77 91 107 124 // 10 51 59 66 76 89 99 119 131 // 11 61 69 75 87 100 114 129 // 12 70 79 86 97 108 122 // 13 84 93 103 110 125 // 14 95 106 115 127 // 15 117 128 static void vpx_highbd_idct32_12_neon(const tran_low_t *const input, int32_t *output) { int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32], s8[32]; load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], &in[7]); transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6], &in[7]); load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], &in[10].val[0], &in[10].val[1], &in[11].val[0], &in[11].val[1]); transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1], &in[10].val[0], &in[10].val[1], &in[11].val[0], &in[11].val[1]); // stage 1 s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64); s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64); s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64); s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64); s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64); s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64); s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64); s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64); s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64); s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64); s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64); s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64); // stage 2 s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64); s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64); s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64); s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64); s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64); s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64); s2[18] = highbd_idct_sub_dual(s1[19], s1[18]); s2[19] = highbd_idct_add_dual(s1[18], s1[19]); s2[20] = highbd_idct_add_dual(s1[20], s1[21]); s2[21] = highbd_idct_sub_dual(s1[20], s1[21]); s2[26] = highbd_idct_sub_dual(s1[27], s1[26]); s2[27] = highbd_idct_add_dual(s1[26], s1[27]); s2[28] = highbd_idct_add_dual(s1[28], s1[29]); s2[29] = highbd_idct_sub_dual(s1[28], s1[29]); // stage 3 s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64); s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64); s3[10] = highbd_idct_sub_dual(s2[11], s2[10]); s3[11] = highbd_idct_add_dual(s2[10], s2[11]); s3[12] = highbd_idct_add_dual(s2[12], s2[13]); s3[13] = highbd_idct_sub_dual(s2[12], s2[13]); s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64, s1[31], cospi_28_64); s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64, s1[31], cospi_4_64); s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64, s2[29], -cospi_4_64); s3[29] =
multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64, s2[29], cospi_28_64); s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64, s2[26], cospi_12_64); s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64, s2[26], cospi_20_64); s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64, s1[24], -cospi_20_64); s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64, s1[24], cospi_12_64); // stage 4 s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64); s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64); s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64); s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64, s2[15], cospi_24_64); s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64, s2[15], cospi_8_64); s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64, s3[13], -cospi_8_64); s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64, s3[13], cospi_24_64); s4[16] = highbd_idct_add_dual(s1[16], s2[19]); s4[17] = highbd_idct_add_dual(s3[17], s3[18]); s4[18] = highbd_idct_sub_dual(s3[17], s3[18]); s4[19] = highbd_idct_sub_dual(s1[16], s2[19]); s4[20] = highbd_idct_sub_dual(s1[23], s2[20]); s4[21] = highbd_idct_sub_dual(s3[22], s3[21]); s4[22] = highbd_idct_add_dual(s3[21], s3[22]); s4[23] = highbd_idct_add_dual(s2[20], s1[23]); s4[24] = highbd_idct_add_dual(s1[24], s2[27]); s4[25] = highbd_idct_add_dual(s3[25], s3[26]); s4[26] = highbd_idct_sub_dual(s3[25], s3[26]); s4[27] = highbd_idct_sub_dual(s1[24], s2[27]); s4[28] = highbd_idct_sub_dual(s1[31], s2[28]); s4[29] = highbd_idct_sub_dual(s3[30], s3[29]); s4[30] = highbd_idct_add_dual(s3[29], s3[30]); s4[31] = highbd_idct_add_dual(s2[28], s1[31]); // stage 5 s5[0] = highbd_idct_add_dual(s4[0], s4[3]); s5[1] = highbd_idct_add_dual(s4[0], s4[2]); s5[2] = highbd_idct_sub_dual(s4[0], s4[2]); s5[3] = highbd_idct_sub_dual(s4[0], s4[3]); s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64); s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64); s5[8] = highbd_idct_add_dual(s2[8], s3[11]); s5[9] = highbd_idct_add_dual(s4[9], s4[10]); s5[10] = highbd_idct_sub_dual(s4[9], s4[10]); s5[11] = highbd_idct_sub_dual(s2[8], s3[11]); s5[12] = highbd_idct_sub_dual(s2[15], s3[12]); s5[13] = highbd_idct_sub_dual(s4[14], s4[13]); s5[14] = highbd_idct_add_dual(s4[13], s4[14]); s5[15] = highbd_idct_add_dual(s2[15], s3[12]); s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64, s4[29], cospi_24_64); s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64, s4[29], cospi_8_64); s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64, s4[28], cospi_24_64); s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64, s4[28], cospi_8_64); s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64, s4[27], -cospi_8_64); s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64, s4[27], cospi_24_64); s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64, s4[26], -cospi_8_64); s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64, s4[26], cospi_24_64); // stage 6 s6[0] = highbd_idct_add_dual(s5[0], s3[7]); s6[1] = highbd_idct_add_dual(s5[1], s5[6]); s6[2] = highbd_idct_add_dual(s5[2], s5[5]); s6[3] = highbd_idct_add_dual(s5[3], s3[4]); s6[4] = highbd_idct_sub_dual(s5[3], s3[4]); 
s6[5] = highbd_idct_sub_dual(s5[2], s5[5]); s6[6] = highbd_idct_sub_dual(s5[1], s5[6]); s6[7] = highbd_idct_sub_dual(s5[0], s3[7]); s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64); s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64); s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64); s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64); s6[16] = highbd_idct_add_dual(s4[16], s4[23]); s6[17] = highbd_idct_add_dual(s4[17], s4[22]); s6[18] = highbd_idct_add_dual(s5[18], s5[21]); s6[19] = highbd_idct_add_dual(s5[19], s5[20]); s6[20] = highbd_idct_sub_dual(s5[19], s5[20]); s6[21] = highbd_idct_sub_dual(s5[18], s5[21]); s6[22] = highbd_idct_sub_dual(s4[17], s4[22]); s6[23] = highbd_idct_sub_dual(s4[16], s4[23]); s6[24] = highbd_idct_sub_dual(s4[31], s4[24]); s6[25] = highbd_idct_sub_dual(s4[30], s4[25]); s6[26] = highbd_idct_sub_dual(s5[29], s5[26]); s6[27] = highbd_idct_sub_dual(s5[28], s5[27]); s6[28] = highbd_idct_add_dual(s5[27], s5[28]); s6[29] = highbd_idct_add_dual(s5[26], s5[29]); s6[30] = highbd_idct_add_dual(s4[25], s4[30]); s6[31] = highbd_idct_add_dual(s4[24], s4[31]); // stage 7 s7[0] = highbd_idct_add_dual(s6[0], s5[15]); s7[1] = highbd_idct_add_dual(s6[1], s5[14]); s7[2] = highbd_idct_add_dual(s6[2], s6[13]); s7[3] = highbd_idct_add_dual(s6[3], s6[12]); s7[4] = highbd_idct_add_dual(s6[4], s6[11]); s7[5] = highbd_idct_add_dual(s6[5], s6[10]); s7[6] = highbd_idct_add_dual(s6[6], s5[9]); s7[7] = highbd_idct_add_dual(s6[7], s5[8]); s7[8] = highbd_idct_sub_dual(s6[7], s5[8]); s7[9] = highbd_idct_sub_dual(s6[6], s5[9]); s7[10] = highbd_idct_sub_dual(s6[5], s6[10]); s7[11] = highbd_idct_sub_dual(s6[4], s6[11]); s7[12] = highbd_idct_sub_dual(s6[3], s6[12]); s7[13] = highbd_idct_sub_dual(s6[2], s6[13]); s7[14] = highbd_idct_sub_dual(s6[1], s5[14]); s7[15] = highbd_idct_sub_dual(s6[0], s5[15]); s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64); s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64); s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64); s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64); s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64); s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64); s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64); s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64); // final stage s8[0] = highbd_idct_add_dual(s7[0], s6[31]); s8[1] = highbd_idct_add_dual(s7[1], s6[30]); s8[2] = highbd_idct_add_dual(s7[2], s6[29]); s8[3] = highbd_idct_add_dual(s7[3], s6[28]); s8[4] = highbd_idct_add_dual(s7[4], s7[27]); s8[5] = highbd_idct_add_dual(s7[5], s7[26]); s8[6] = highbd_idct_add_dual(s7[6], s7[25]); s8[7] = highbd_idct_add_dual(s7[7], s7[24]); s8[8] = highbd_idct_add_dual(s7[8], s7[23]); s8[9] = highbd_idct_add_dual(s7[9], s7[22]); s8[10] = highbd_idct_add_dual(s7[10], s7[21]); s8[11] = highbd_idct_add_dual(s7[11], s7[20]); s8[12] = highbd_idct_add_dual(s7[12], s6[19]); s8[13] = highbd_idct_add_dual(s7[13], s6[18]); s8[14] = highbd_idct_add_dual(s7[14], s6[17]); s8[15] = highbd_idct_add_dual(s7[15], s6[16]); s8[16] = highbd_idct_sub_dual(s7[15], s6[16]); s8[17] = highbd_idct_sub_dual(s7[14], s6[17]); s8[18] = highbd_idct_sub_dual(s7[13], s6[18]); s8[19] = highbd_idct_sub_dual(s7[12], s6[19]); s8[20] = highbd_idct_sub_dual(s7[11], s7[20]); s8[21] = 
highbd_idct_sub_dual(s7[10], s7[21]); s8[22] = highbd_idct_sub_dual(s7[9], s7[22]); s8[23] = highbd_idct_sub_dual(s7[8], s7[23]); s8[24] = highbd_idct_sub_dual(s7[7], s7[24]); s8[25] = highbd_idct_sub_dual(s7[6], s7[25]); s8[26] = highbd_idct_sub_dual(s7[5], s7[26]); s8[27] = highbd_idct_sub_dual(s7[4], s7[27]); s8[28] = highbd_idct_sub_dual(s7[3], s6[28]); s8[29] = highbd_idct_sub_dual(s7[2], s6[29]); s8[30] = highbd_idct_sub_dual(s7[1], s6[30]); s8[31] = highbd_idct_sub_dual(s7[0], s6[31]); vst1q_s32(output + 0, s8[0].val[0]); vst1q_s32(output + 4, s8[0].val[1]); output += 16; vst1q_s32(output + 0, s8[1].val[0]); vst1q_s32(output + 4, s8[1].val[1]); output += 16; vst1q_s32(output + 0, s8[2].val[0]); vst1q_s32(output + 4, s8[2].val[1]); output += 16; vst1q_s32(output + 0, s8[3].val[0]); vst1q_s32(output + 4, s8[3].val[1]); output += 16; vst1q_s32(output + 0, s8[4].val[0]); vst1q_s32(output + 4, s8[4].val[1]); output += 16; vst1q_s32(output + 0, s8[5].val[0]); vst1q_s32(output + 4, s8[5].val[1]); output += 16; vst1q_s32(output + 0, s8[6].val[0]); vst1q_s32(output + 4, s8[6].val[1]); output += 16; vst1q_s32(output + 0, s8[7].val[0]); vst1q_s32(output + 4, s8[7].val[1]); output += 16; vst1q_s32(output + 0, s8[8].val[0]); vst1q_s32(output + 4, s8[8].val[1]); output += 16; vst1q_s32(output + 0, s8[9].val[0]); vst1q_s32(output + 4, s8[9].val[1]); output += 16; vst1q_s32(output + 0, s8[10].val[0]); vst1q_s32(output + 4, s8[10].val[1]); output += 16; vst1q_s32(output + 0, s8[11].val[0]); vst1q_s32(output + 4, s8[11].val[1]); output += 16; vst1q_s32(output + 0, s8[12].val[0]); vst1q_s32(output + 4, s8[12].val[1]); output += 16; vst1q_s32(output + 0, s8[13].val[0]); vst1q_s32(output + 4, s8[13].val[1]); output += 16; vst1q_s32(output + 0, s8[14].val[0]); vst1q_s32(output + 4, s8[14].val[1]); output += 16; vst1q_s32(output + 0, s8[15].val[0]); vst1q_s32(output + 4, s8[15].val[1]); output += 16; vst1q_s32(output + 0, s8[16].val[0]); vst1q_s32(output + 4, s8[16].val[1]); output += 16; vst1q_s32(output + 0, s8[17].val[0]); vst1q_s32(output + 4, s8[17].val[1]); output += 16; vst1q_s32(output + 0, s8[18].val[0]); vst1q_s32(output + 4, s8[18].val[1]); output += 16; vst1q_s32(output + 0, s8[19].val[0]); vst1q_s32(output + 4, s8[19].val[1]); output += 16; vst1q_s32(output + 0, s8[20].val[0]); vst1q_s32(output + 4, s8[20].val[1]); output += 16; vst1q_s32(output + 0, s8[21].val[0]); vst1q_s32(output + 4, s8[21].val[1]); output += 16; vst1q_s32(output + 0, s8[22].val[0]); vst1q_s32(output + 4, s8[22].val[1]); output += 16; vst1q_s32(output + 0, s8[23].val[0]); vst1q_s32(output + 4, s8[23].val[1]); output += 16; vst1q_s32(output + 0, s8[24].val[0]); vst1q_s32(output + 4, s8[24].val[1]); output += 16; vst1q_s32(output + 0, s8[25].val[0]); vst1q_s32(output + 4, s8[25].val[1]); output += 16; vst1q_s32(output + 0, s8[26].val[0]); vst1q_s32(output + 4, s8[26].val[1]); output += 16; vst1q_s32(output + 0, s8[27].val[0]); vst1q_s32(output + 4, s8[27].val[1]); output += 16; vst1q_s32(output + 0, s8[28].val[0]); vst1q_s32(output + 4, s8[28].val[1]); output += 16; vst1q_s32(output + 0, s8[29].val[0]); vst1q_s32(output + 4, s8[29].val[1]); output += 16; vst1q_s32(output + 0, s8[30].val[0]); vst1q_s32(output + 4, s8[30].val[1]); output += 16; vst1q_s32(output + 0, s8[31].val[0]); vst1q_s32(output + 4, s8[31].val[1]); }
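// The 32 unrolled store pairs at the end of the function above all follow one pattern; an equivalent loop formulation (illustrative refactor with identical stores, trading unrolling for code size):
static void store_rows_s32x8(int32_t *output, const int32x4x2_t *rows) { int i; for (i = 0; i < 32; i++) { vst1q_s32(output + 0, rows[i].val[0]); vst1q_s32(output + 4, rows[i].val[1]); output += 16; } }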
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha) { const uint *src = (const uint *) srcPixels; uint *dst = (uint *) destPixels; int16x8_t half = vdupq_n_s16(0x80); int16x8_t full = vdupq_n_s16(0xff); if (const_alpha == 256) { for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) { // all opaque vst1q_s32((int32_t *)&dst[x], src32); } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s >= 0xff000000) dst[x] = s; else if (s != 0) dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 *)(((const uchar *) src) + sbpl); } } else if (const_alpha != 0) { const_alpha = (const_alpha * 255) >> 8; int16x8_t const_alpha16 = vdupq_n_s16(const_alpha); for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half); const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half); const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s != 0) { s = BYTE_MUL(s, const_alpha); dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 
*)(((const uchar *) src) + sbpl); } } }
inline void vst1q(s32 * ptr, const int32x4_t & v) { return vst1q_s32(ptr, v); }
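// The overload above lets templated code store through a single name regardless of lane type. A companion overload in the same style (sketch; the f32 typedef mirrors the s32 typedef used above):
inline void vst1q(f32 * ptr, const float32x4_t & v) { return vst1q_f32(ptr, v); }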
// Contains a function for the core loop in the normalized lattice MA // filter routine for iSAC codec, optimized for ARM Neon platform. // It does: // for 0 <= n < HALF_SUBFRAMELEN - 1: // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); // Output is not bit-exact with the reference C code, due to the replacement // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon // instructions. The difference should not be bigger than 1. void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient int16_t input1, // Filter coefficient int32_t input2, // Inverse coefficient int32_t* ptr0, // Sample buffer int32_t* ptr1, // Sample buffer int32_t* ptr2) // Sample buffer { int n = 0; int loop = (HALF_SUBFRAMELEN - 1) >> 3; int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); int32x4_t input2_v = vdupq_n_s32(input2); int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; int32x4_t ptr0va, ptr1va, ptr2va; int32x4_t ptr0vb, ptr1vb, ptr2vb; // Unroll to process 8 samples at once. for (n = 0; n < loop; n++) { ptr0va = vld1q_s32(ptr0); ptr0vb = vld1q_s32(ptr0 + 4); ptr0 += 8; ptr2va = vld1q_s32(ptr2); ptr2vb = vld1q_s32(ptr2 + 4); // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2b = vaddq_s32(tmp0b, ptr2vb); tmp2a = vshlq_n_s32(tmp2a, 15); tmp2b = vshlq_n_s32(tmp2b, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); ptr2vb = vqrdmulhq_s32(tmp2b, input2_v); vst1q_s32(ptr2, ptr2va); vst1q_s32(ptr2 + 4, ptr2vb); ptr2 += 8; // Calculate tmp3 = ptr2v * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); ptr1vb = vaddq_s32(tmp1b, tmp3b); vst1q_s32(ptr1, ptr1va); vst1q_s32(ptr1 + 4, ptr1vb); ptr1 += 8; } // Process four more samples. if (loop_tail & 0x4) { ptr0va = vld1q_s32(ptr0); ptr2va = vld1q_s32(ptr2); ptr0 += 4; // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2a = vshlq_n_s32(tmp2a, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); vst1q_s32(ptr2, ptr2va); ptr2 += 4; // Calculate tmp3 = *(ptr2) * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); vst1q_s32(ptr1, ptr1va); ptr1 += 4; } // Process two more samples. if (loop_tail & 0x2) { int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; ptr0v_tail = vld1_s32(ptr0); ptr2v_tail = vld1_s32(ptr2); ptr0 += 2; // Calculate tmp0 = (*ptr0) * input0. tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); // Calculate tmp1 = (*ptr0) * input1. tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); // Calculate tmp2 = tmp0 + *(ptr2). tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); tmp2_tail = vshl_n_s32(tmp2_tail, 15); // Calculate *ptr2 = input2 * tmp2. 
ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v)); vst1_s32(ptr2, ptr2v_tail); ptr2 += 2; // Calculate tmp3 = *(ptr2) * input0. tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); // Calculate *ptr1 = tmp1 + tmp3. ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); vst1_s32(ptr1, ptr1v_tail); ptr1 += 2; } // Process one more sample. if (loop_tail & 0x1) { int16_t t16a = (int16_t)(input2 >> 16); int16_t t16b = (int16_t)input2; if (t16b < 0) t16a++; int32_t tmp32a; int32_t tmp32b; // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); tmp32b = *ptr2 + tmp32a; *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); *ptr1 = tmp32a + tmp32b; } }
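// The loops above replace WEBRTC_SPL_MUL_16_32_RSFT15 with vqrdmulhq_s32 on coefficients pre-shifted into Q31 (input0 << 16). A scalar model of what one lane computes (illustrative sketch, ignoring the saturation vqrdmulh applies when both operands are 0x80000000): a rounding doubling multiply returning the high half.
static int32_t qrdmulh_model(int32_t a, int32_t b) { return (int32_t)(((int64_t)a * b + (1 << 30)) >> 31); }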
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9, int16_t* inre2Q9, int32_t* outreQ16, int32_t* outimQ16) { int k; const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0]; const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0]; // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921. // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code. int32_t fact = 16921 << 5; int32x4_t factq = vdupq_n_s32(fact); uint32x4_t max_r = vdupq_n_u32(0); uint32x4_t max_i = vdupq_n_u32(0); for (k = 0; k < FRAMESAMPLES/2; k += 8) { int16x8_t tmpr = vld1q_s16(kCosTab); int16x8_t tmpi = vld1q_s16(kSinTab); int16x8_t inre1 = vld1q_s16(inre1Q9); int16x8_t inre2 = vld1q_s16(inre2Q9); kCosTab += 8; kSinTab += 8; inre1Q9 += 8; inre2Q9 += 8; // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code. int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1)); int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2)); tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2)); tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1)); #if defined(WEBRTC_ARCH_ARM64) int32x4_t tmp2 = vmull_high_s16(tmpr, inre1); int32x4_t tmp3 = vmull_high_s16(tmpr, inre2); tmp2 = vmlal_high_s16(tmp2, tmpi, inre2); tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1); #else int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1)); int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2)); tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2)); tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1)); #endif int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq); int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq); int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq); int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq); vst1q_s32(outreQ16, outr_0); outreQ16 += 4; vst1q_s32(outreQ16, outr_1); outreQ16 += 4; vst1q_s32(outimQ16, outi_0); outimQ16 += 4; vst1q_s32(outimQ16, outi_1); outimQ16 += 4; // Find the absolute maximum in the vectors. tmp0 = vabsq_s32(outr_0); tmp1 = vabsq_s32(outr_1); tmp2 = vabsq_s32(outi_0); tmp3 = vabsq_s32(outi_1); // vabs doesn't change the value of 0x80000000. // Use u32 so we don't lose the value 0x80000000. max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); } max_r = vmaxq_u32(max_r, max_i); #if defined(WEBRTC_ARCH_ARM64) uint32_t maximum = vmaxvq_u32(max_r); #else uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); max32x2_r = vpmax_u32(max32x2_r, max32x2_r); uint32_t maximum = vget_lane_u32(max32x2_r, 0); #endif return (int32_t)maximum; }
void silk_warped_autocorrelation_FIX_neon( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ const opus_int warping_Q16, /* I Warping coefficient */ const opus_int length, /* I Length of input */ const opus_int order /* I Correlation order (even) */ ) { if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) { silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order ); } else { opus_int n, i, lsh; opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */ opus_int64 corr_QC_orderT; int64x2_t lsh_s64x2; const opus_int orderT = ( order + 3 ) & ~3; opus_int64 *corr_QCT; opus_int32 *input_QS; VARDECL( opus_int32, input_QST ); VARDECL( opus_int32, state ); SAVE_STACK; /* Order must be even */ silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; /* Loop over samples */ for( n = 0; n < length - 7; n += 8, input_QS += 8 ) { const int16x8_t t0_s16x4 = vld1q_s16( input + n ); vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) ); vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) ); } for( ; n < length; n++, input_QS++ ) { input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); } vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */ /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */ /* Keep the C code here to help understand the intrinsics optimization. */ /* { opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; opus_int32 *state_QST[ 3 ]; state_QST[ 0 ] = state_QS[ 0 ]; state_QST[ 1 ] = state_QS[ 1 ]; for( n = 0; n < length + order; n++, input_QS++ ) { state_QST[ 0 ][ orderT ] = input_QS[ orderT ]; for( i = 0; i < orderT; i++ ) { corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC ); state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 ); } state_QST[ 2 ] = state_QST[ 0 ]; state_QST[ 0 ] = state_QST[ 1 ]; state_QST[ 1 ] = state_QST[ 2 ]; } } */ { const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 ); const opus_int32 *in = input_QS + orderT; opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; ALLOC( state, length + orderT, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. 
*/ do { state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] = state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 ); n = 0; do { calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] ); calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] ); state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n ); vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 ); state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; } while( ++n < ( length + order ) ); in = state; o -= 8; } while( o > 4 ); if( o ) { /* Calculate the last 4 taps of all inputs. */ opus_int32 *stateT = state; silk_assert( o == 4 ); state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 ); n = length + order; do { calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] ); state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT ); vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; input_QS++; stateT++; } while( --n ); } } { const opus_int16 *inputT = input; int32x4_t t_s32x4; int64x1_t t_s64x1; int64x2_t t_s64x2 = vdupq_n_s64( 0 ); for( n = 0; n <= length - 8; n += 8 ) { int16x8_t input_s16x8 = vld1q_s16( inputT ); t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) ); t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) ); inputT += 8; } t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) ); corr_QC_orderT = vget_lane_s64( t_s64x1, 0 ); for( ; n < length; n++ ) { corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] ); } corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC ); corr_QC[ orderT ] = corr_QC_orderT; } corr_QCT = corr_QC + orderT - order; lsh = silk_CLZ64( corr_QC_orderT ) - 35; lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); *scale = -( QC + lsh ); silk_assert( *scale >= -30 && *scale <= 12 ); lsh_s64x2 = vdupq_n_s64( lsh ); for( i = 0; i <= order - 3; i += 4 ) { int32x4_t corr_s32x4; int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; corr_QC0_s64x2 = vld1q_s64( corr_QCT + i ); corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 ); corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 ); corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 ); corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) ); corr_s32x4 = vrev64q_s32( corr_s32x4 ); vst1q_s32( corr + order - i - 3, corr_s32x4 ); } if( lsh >= 0 ) { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) ); } } else { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) ); } } silk_assert( corr_QCT[ order ] >= 0 ); /* If 
breaking, decrease QC*/ RESTORE_STACK; } #ifdef OPUS_CHECK_ASM { opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ]; opus_int scale_c; silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order ); silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) ); silk_assert( scale_c == *scale ); } #endif }
static inline int32_t TransformAndFindMaxNeon(int16_t* inre, int16_t* inim, int32_t* outre, int32_t* outim) { int k; int16_t* inre1 = inre; int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4]; int16_t* inim1 = inim; int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4]; int32_t* outre1 = outre; int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4]; int32_t* outim1 = outim; int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4]; const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0]; const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4]; uint32x4_t max_r = vdupq_n_u32(0); uint32x4_t max_i = vdupq_n_u32(0); // Use ">> 5", instead of "<< 9" and then ">> 14" as in the C code. for (k = 0; k < FRAMESAMPLES/4; k += 4) { int16x4_t tmpi = vld1_s16(kSinTab1); kSinTab1 += 4; int16x4_t tmpr = vld1_s16(kSinTab2); kSinTab2 -= 4; int16x4_t inre_0 = vld1_s16(inre1); inre1 += 4; int16x4_t inre_1 = vld1_s16(inre2); inre2 -= 4; int16x4_t inim_0 = vld1_s16(inim1); inim1 += 4; int16x4_t inim_1 = vld1_s16(inim2); inim2 -= 4; tmpr = vneg_s16(tmpr); inre_1 = vrev64_s16(inre_1); inim_1 = vrev64_s16(inim_1); tmpr = vrev64_s16(tmpr); int32x4_t xr = vmull_s16(tmpr, inre_0); int32x4_t xi = vmull_s16(tmpr, inim_0); int32x4_t yr = vmull_s16(tmpr, inim_1); int32x4_t yi = vmull_s16(tmpi, inim_1); xr = vmlal_s16(xr, tmpi, inim_0); xi = vmlsl_s16(xi, tmpi, inre_0); yr = vmlal_s16(yr, tmpi, inre_1); yi = vmlsl_s16(yi, tmpr, inre_1); yr = vnegq_s32(yr); xr = vshrq_n_s32(xr, 5); xi = vshrq_n_s32(xi, 5); yr = vshrq_n_s32(yr, 5); yi = vshrq_n_s32(yi, 5); int32x4_t outr0 = vsubq_s32(xr, yi); int32x4_t outr1 = vaddq_s32(xr, yi); int32x4_t outi0 = vaddq_s32(xi, yr); int32x4_t outi1 = vsubq_s32(yr, xi); // Find the absolute maximum in the vectors. int32x4_t tmp0 = vabsq_s32(outr0); int32x4_t tmp1 = vabsq_s32(outr1); int32x4_t tmp2 = vabsq_s32(outi0); int32x4_t tmp3 = vabsq_s32(outi1); // vabs doesn't change the value of 0x80000000. // Use u32 so we don't lose the value 0x80000000. max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2)); max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1)); max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3)); // Store the vectors. outr1 = vrev64q_s32(outr1); outi1 = vrev64q_s32(outi1); int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1)); int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1)); vst1q_s32(outre1, outr0); outre1 += 4; vst1q_s32(outim1, outi0); outim1 += 4; vst1q_s32(outre2, outr_1); outre2 -= 4; vst1q_s32(outim2, outi_1); outim2 -= 4; } max_r = vmaxq_u32(max_r, max_i); #if defined(WEBRTC_ARCH_ARM64) uint32_t maximum = vmaxvq_u32(max_r); #else uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r)); max32x2_r = vpmax_u32(max32x2_r, max32x2_r); uint32_t maximum = vget_lane_u32(max32x2_r, 0); #endif return (int32_t)maximum; }
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm, const uint16_t* far_spectrum, int32_t* echo_est, uint32_t* far_energy, uint32_t* echo_energy_adapt, uint32_t* echo_energy_stored) { int16_t* start_stored_p = aecm->channelStored; int16_t* start_adapt_p = aecm->channelAdapt16; int32_t* echo_est_p = echo_est; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; const uint16_t* far_spectrum_p = far_spectrum; int16x8_t store_v, adapt_v; uint16x8_t spectrum_v; uint32x4_t echo_est_v_low, echo_est_v_high; uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v; far_energy_v = vdupq_n_u32(0); echo_adapt_v = vdupq_n_u32(0); echo_stored_v = vdupq_n_u32(0); // Get energy for the delayed far end signal and estimated // echo using both stored and adapted channels. // The C code: // for (i = 0; i < PART_LEN1; i++) { // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); // (*far_energy) += (uint32_t)(far_spectrum[i]); // *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i]; // (*echo_energy_stored) += (uint32_t)echo_est[i]; // } while (start_stored_p < end_stored_p) { spectrum_v = vld1q_u16(far_spectrum_p); adapt_v = vld1q_s16(start_adapt_p); store_v = vld1q_s16(start_stored_p); far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v)); far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v)); echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)), vget_low_u16(spectrum_v)); echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)), vget_high_u16(spectrum_v)); vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v); echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v); echo_adapt_v = vmlal_u16(echo_adapt_v, vreinterpret_u16_s16(vget_low_s16(adapt_v)), vget_low_u16(spectrum_v)); echo_adapt_v = vmlal_u16(echo_adapt_v, vreinterpret_u16_s16(vget_high_s16(adapt_v)), vget_high_u16(spectrum_v)); start_stored_p += 8; start_adapt_p += 8; far_spectrum_p += 8; echo_est_p += 8; } AddLanes(far_energy, far_energy_v); AddLanes(echo_energy_stored, echo_stored_v); AddLanes(echo_energy_adapt, echo_adapt_v); echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); *echo_energy_stored += (uint32_t)echo_est[PART_LEN]; *far_energy += (uint32_t)far_spectrum[PART_LEN]; *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN]; }
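// AddLanes is used above but not shown in this snippet. A plausible definition matching the call sites (hypothetical sketch: fold the uint32x4_t accumulator into the scalar output):
static void AddLanes(uint32_t* ptr, uint32x4_t v) { uint32x2_t tmp = vadd_u32(vget_low_u32(v), vget_high_u32(v)); tmp = vpadd_u32(tmp, tmp); *ptr = vget_lane_u32(tmp, 0); }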
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8) { int16_t scaling,n,k; int32_t ysum32,csum32, lys, lcs; int32_t oneQ8; const int16_t *x, *inptr; oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8); // 1.00 in Q8 x = in + PITCH_MAX_LAG/2 + 2; scaling = WebRtcSpl_GetScalingSquare ((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2); ysum32 = 1; csum32 = 0; x = in + PITCH_MAX_LAG/2 + 2; for (n = 0; n < PITCH_CORR_LEN2; n++) { ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[n],(int16_t) in[n], scaling); // Q0 csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t) x[n],(int16_t) in[n], scaling); // Q0 } logcorQ8 += PITCH_LAG_SPAN2 - 1; lys=Log2Q8((uint32_t) ysum32); // Q8 lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum); if (csum32>0) { lcs=Log2Q8((uint32_t) csum32); // 2log(csum) in Q8 if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 in Q8 *logcorQ8 = lcs - lys; // log2(csum/sqrt(ysum)) } else { *logcorQ8 = oneQ8; // 1.00 } } else { *logcorQ8 = 0; } for (k = 1; k < PITCH_LAG_SPAN2; k++) { inptr = &in[k]; ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[k-1],(int16_t) in[k-1], scaling); ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[PITCH_CORR_LEN2 + k - 1],(int16_t) in[PITCH_CORR_LEN2 + k - 1], scaling); #ifdef WEBRTC_ARCH_ARM_NEON { int32_t vbuff[4]; int32x4_t int_32x4_sum = vmovq_n_s32(0); // A Neon register can't be shifted right by a non-constant amount, so shift left by a negative value instead. int32x4_t int_32x4_scale = vdupq_n_s32(-scaling); // Assert a condition used in loop unrolling at compile time. COMPILE_ASSERT(PITCH_CORR_LEN2 % 4 == 0); for (n = 0; n < PITCH_CORR_LEN2; n += 4) { int16x4_t int_16x4_x = vld1_s16(&x[n]); int16x4_t int_16x4_in = vld1_s16(&inptr[n]); int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in); int_32x4 = vshlq_s32(int_32x4, int_32x4_scale); int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4); } // Use a vector store to avoid the long stall from transferring // data from vector to general registers. vst1q_s32(vbuff, int_32x4_sum); csum32 = vbuff[0] + vbuff[1]; csum32 += vbuff[2]; csum32 += vbuff[3]; } #else csum32 = 0; if(scaling == 0) { for (n = 0; n < PITCH_CORR_LEN2; n++) { csum32 += x[n] * inptr[n]; } } else { for (n = 0; n < PITCH_CORR_LEN2; n++) { csum32 += (x[n] * inptr[n]) >> scaling; } } #endif logcorQ8--; lys=Log2Q8((uint32_t)ysum32); // Q8 lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum); if (csum32>0) { lcs=Log2Q8((uint32_t) csum32); // 2log(csum) in Q8 if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 *logcorQ8 = lcs - lys; // log2(csum/sqrt(ysum)) } else { *logcorQ8 = oneQ8; // 1.00 } } else { *logcorQ8 = 0; } } }
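// The #ifdef block above stores the accumulator to vbuff and sums the lanes in general registers. On AArch64 the reduction can stay in-register (illustrative alternative, assuming an AArch64 target where vaddvq_s32 is available):
#if defined(__aarch64__)
static int32_t horizontal_add_s32x4(int32x4_t v) { return vaddvq_s32(v); }
#endif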