OD_SIMD_INLINE void od_load4(const od_coeff *x, int xstride, int32x4_t *t0, int32x4_t *t1, int32x4_t *t2, int32x4_t *t3) { *t0 = vld1q_s32((const int *)(x + 0*xstride)); *t1 = vld1q_s32((const int *)(x + 1*xstride)); *t2 = vld1q_s32((const int *)(x + 2*xstride)); *t3 = vld1q_s32((const int *)(x + 3*xstride)); }
void * scaled_sumi_thread_NEON(void * argument) { jsize i = 0; struct scaled_sumfneon_thread_data * data = (struct scaled_sumfneon_thread_data *) argument; int32_t * r = (int32_t *)data->r; const int32_t * x = (const int32_t *)data->x; const int32_t * y = (const int32_t *)data->y; const int32_t a = (const int32_t)data->a; const jsize size = data->size; int32x4_t rx4, xx4, yx4, ax4; ax4 = vdupq_n_s32(a); for(i; i < size ; i += 4) { xx4 = vld1q_s32(&(x[i])); yx4 = vld1q_s32(&(y[i])); rx4 = vmlaq_s32(xx4, ax4, yx4); vst1q_s32(&(r[i]), rx4); } }
static void add_int_neon(int* dst, int* src1, int* src2, int count) { int i; for (i = 0; i < count; i += 4) { int32x4_t in1, in2, out; in1 = vld1q_s32(src1); src1 += 4; in2 = vld1q_s32(src2); src2 += 4; out = vaddq_s32(in1, in2); vst1q_s32(dst, out); dst += 4; } }
/* s32x4 mm mul */ void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C) { int i, k, j; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { for (k = 0; k < Col; k+=1) { neon_c = vmovq_n_s32(0); for (j = 0; j < T; j+=4) { int j_T = j * T + i; int k_Row = k * Row; neon_a0 = vld1q_s32(A + j_T); j_T+=Row; neon_a1 = vld1q_s32(A + j_T); j_T+=Row; neon_a2 = vld1q_s32(A + j_T); j_T+=Row; neon_a3 = vld1q_s32(A + j_T); neon_b = vld1q_s32(B + k_Row + j); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); vst1q_lane_s32(C + k_Row + i, neon_c, 0); vst1q_lane_s32(C + k_Row + i + 1, neon_c, 1); vst1q_lane_s32(C + k_Row + i + 2, neon_c, 2); vst1q_lane_s32(C + k_Row + i + 3, neon_c, 3); } } } }
/* s32x4 mv mul */ void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C) { int i = 0; int k = 0; int32x4_t neon_b, neon_c; int32x4_t neon_a0, neon_a1, neon_a2, neon_a3; int32x4_t neon_b0, neon_b1, neon_b2, neon_b3; for (i = 0; i < Row; i+=4) { neon_c = vmovq_n_s32(0); for (k = 0; k < T; k+=4) { int j = k * T + i; neon_a0 = vld1q_s32(A + j); j+=Row; neon_a1 = vld1q_s32(A + j); j+=Row; neon_a2 = vld1q_s32(A + j); j+=Row; neon_a3 = vld1q_s32(A + j); neon_b = vld1q_s32(B + k); neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0)); neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1)); neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2)); neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3)); neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c); neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c); } vst1q_s32(C + i, neon_c); } }
/* s32x4 saturated sub */ void mw_neon_mm_qsub_s32x4(int * A, int Row, int Col, int * B, int * C) { int32x4_t neon_a, neon_b, neon_c; int size = Row * Col; int i = 0; int k = 0; for (i = 4; i <= size ; i+=4) { k = i - 4; neon_a = vld1q_s32(A + k); neon_b = vld1q_s32(B + k); neon_c = vqsubq_s32(neon_a, neon_b); vst1q_s32(C + k, neon_c); } k = i - 4; for (i = 0; i < size % 4; i++) { C[k + i] = A[k + i] - B[k + i]; } }
static OPUS_INLINE void calc_corr( const opus_int32 *const input_QS, opus_int64 *const corr_QC, const opus_int offset, const int32x4_t state_QS_s32x4 ) { int64x2_t corr_QC_s64x2[ 2 ], t_s64x2[ 2 ]; const int32x4_t input_QS_s32x4 = vld1q_s32( input_QS + offset ); corr_QC_s64x2[ 0 ] = vld1q_s64( corr_QC + offset + 0 ); corr_QC_s64x2[ 1 ] = vld1q_s64( corr_QC + offset + 2 ); t_s64x2[ 0 ] = vmull_s32( vget_low_s32( state_QS_s32x4 ), vget_low_s32( input_QS_s32x4 ) ); t_s64x2[ 1 ] = vmull_s32( vget_high_s32( state_QS_s32x4 ), vget_high_s32( input_QS_s32x4 ) ); corr_QC_s64x2[ 0 ] = vsraq_n_s64( corr_QC_s64x2[ 0 ], t_s64x2[ 0 ], 2 * QS - QC ); corr_QC_s64x2[ 1 ] = vsraq_n_s64( corr_QC_s64x2[ 1 ], t_s64x2[ 1 ], 2 * QS - QC ); vst1q_s64( corr_QC + offset + 0, corr_QC_s64x2[ 0 ] ); vst1q_s64( corr_QC + offset + 2, corr_QC_s64x2[ 1 ] ); }
void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); int32x4_t c0 = vld1q_s32(input); int32x4_t c1 = vld1q_s32(input + 4); int32x4_t c2 = vld1q_s32(input + 8); int32x4_t c3 = vld1q_s32(input + 12); int16x8_t a0, a1; if (bd == 8) { const int16x4_t cospis = vld1_s16(kCospi); // Rows a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); idct4x4_16_kernel_bd8(cospis, &a0, &a1); // Columns a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); idct4x4_16_kernel_bd8(cospis, &a0, &a1); a0 = vrshrq_n_s16(a0, 4); a1 = vrshrq_n_s16(a1, 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); } else { idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); } a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); } highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); }
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) { #ifdef _M_SSE // Size will always be 16-byte aligned as the hwBlockSize is. while (size >= 8) { __m128i in1 = _mm_loadu_si128((__m128i *)in); __m128i in2 = _mm_loadu_si128((__m128i *)(in + 4)); __m128i packed = _mm_packs_epi32(in1, in2); if (useShift) { packed = _mm_srai_epi16(packed, volShift); } _mm_storeu_si128((__m128i *)out, packed); out += 8; in += 8; size -= 8; } #elif PPSSPP_ARCH(ARM_NEON) int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer while (size >= 8) { int32x4_t in1 = vld1q_s32(in); int32x4_t in2 = vld1q_s32(in + 4); int16x4_t packed1 = vqmovn_s32(in1); int16x4_t packed2 = vqmovn_s32(in2); if (useShift) { packed1 = vshl_s16(packed1, signedVolShift); packed2 = vshl_s16(packed2, signedVolShift); } vst1_s16(out, packed1); vst1_s16(out + 4, packed2); out += 8; in += 8; size -= 8; } #endif // This does the remainder if SIMD was used, otherwise it does it all. for (size_t i = 0; i < size; i++) { out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]); } }
static inline void yuv2rgb_4x2(const uint8_t *y1, const uint8_t *y2, const uint8_t *u, const uint8_t *v, int16_t *r1, int16_t *g1, int16_t *b1, int16_t *r2, int16_t *g2, int16_t *b2){ int32x4_t ry1; int32x4_t ry2; int32x4_t rvug; int32x4_t rvr; int32x4_t rub; int32x4_t rr1,rg1,rb1,rr2,rg2,rb2; int32x4_t max; LOAD_Y_PREMULTS(0) LOAD_Y_PREMULTS(1) LOAD_Y_PREMULTS(2) LOAD_Y_PREMULTS(3) LOAD_UV_PREMULTS(0) LOAD_UV_PREMULTS(1) max=vld1q_s32(yuvmax); /*the following does not work */ //max=vdupq_n_s32(255); rr1=vaddq_s32(ry1,rvr); rr2=vaddq_s32(ry2,rvr); rg1=vaddq_s32(ry1,rvug); rg2=vaddq_s32(ry2,rvug); rb1=vaddq_s32(ry1,rub); rb2=vaddq_s32(ry2,rub); rr1=vminq_s32(vabsq_s32(rr1),max); rr2=vminq_s32(vabsq_s32(rr2),max); rg1=vminq_s32(vabsq_s32(rg1),max); rg2=vminq_s32(vabsq_s32(rg2),max); rb1=vminq_s32(vabsq_s32(rb1),max); rb2=vminq_s32(vabsq_s32(rb2),max); vst1_s16(r1,vqshrn_n_s32(rr1,13)); vst1_s16(r2,vqshrn_n_s32(rr2,13)); vst1_s16(g1,vqshrn_n_s32(rg1,13)); vst1_s16(g2,vqshrn_n_s32(rg2,13)); vst1_s16(b1,vqshrn_n_s32(rb1,13)); vst1_s16(b2,vqshrn_n_s32(rb2,13)); }
test_vreinterpretq_f64_s32 () { int32x4_t a; float64x2_t b; int32_t c[4] = { 0x54442D18, 0x400921FB, 0x8B145769, 0x4005BF0A }; float64_t d[2] = { PI_F64, E_F64 }; float64_t e[2]; int i; a = vld1q_s32 (c); b = wrap_vreinterpretq_f64_s32 (a); vst1q_f64 (e, b); for (i = 0; i < 2; i++) if (!DOUBLE_EQUALS (d[i], e[i], __DBL_EPSILON__)) return 1; return 0; };
static INLINE void load_4x8_s32_dual(const tran_low_t *input, int32x4_t *const in0, int32x4_t *const in1, int32x4_t *const in2, int32x4_t *const in3, int32x4_t *const in4, int32x4_t *const in5, int32x4_t *const in6, int32x4_t *const in7) { *in0 = vld1q_s32(input); input += 32; *in1 = vld1q_s32(input); input += 32; *in2 = vld1q_s32(input); input += 32; *in3 = vld1q_s32(input); input += 32; *in4 = vld1q_s32(input); input += 32; *in5 = vld1q_s32(input); input += 32; *in6 = vld1q_s32(input); input += 32; *in7 = vld1q_s32(input); }
void silk_biquad_alt_stride2_neon( const opus_int16 *in, /* I input signal */ const opus_int32 *B_Q28, /* I MA coefficients [3] */ const opus_int32 *A_Q28, /* I AR coefficients [2] */ opus_int32 *S, /* I/O State vector [4] */ opus_int16 *out, /* O output signal */ const opus_int32 len /* I signal length (must be even) */ ) { /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */ opus_int k = 0; const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 ); const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 ); int16x4_t in_s16x4 = vdup_n_s16( 0 ); int16x4_t out_s16x4; int32x2_t A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2; int32x4_t A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4; int32x2x2_t t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2; #ifdef OPUS_CHECK_ASM opus_int32 S_c[ 4 ]; VARDECL( opus_int16, out_c ); SAVE_STACK; ALLOC( out_c, 2 * len, opus_int16 ); silk_memcpy( &S_c, S, sizeof( S_c ) ); silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len ); #endif /* Negate A_Q28 values and split in two parts */ A_Q28_s32x2 = vld1_s32( A_Q28 ); A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 ); A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 ); /* ( -A_Q28[] & 0x00003FFF ) << 18 */ A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) ); /* ( -A_Q28[] & 0x00003FFF ) << 15 */ A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 ); /* silk_RSHIFT( -A_Q28[], 14 ) */ A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */ A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 ); /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */ B_Q28_s32x2 = vld1_s32( B_Q28 ); t_s32x2 = vld1_s32( B_Q28 + 1 ); t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 ); t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 ); t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 ); A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_L_Q28 */ A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] ); /* A{0,0,1,1}_U_Q28 */ B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] ); /* B_Q28[ {1,1,2,2} ] */ S_s32x4 = vld1q_s32( S ); /* S0 = S[ 0 ]; S3 = S[ 3 ]; */ S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) ); /* S2 = S[ 1 ]; S1 = S[ 2 ]; */ S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] ); for( ; k < len - 1; k += 2 ) { int32x4_t in_s32x4[ 2 ], t_s32x4; int32x2_t out32_Q14_s32x2[ 2 ]; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */ in_s16x4 = vld1_s16( &in[ 2 * k ] ); /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */ in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 ); /* in{0,1,2,3} << 15 */ t_s32x4 = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */ in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15 */ in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] ); silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] ); /* Scale back to Q0 and saturate */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] ); /* out32_Q14_{0,1,2,3} */ out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 ); /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */ vst1_s16( &out[ 2 * k ], out_s16x4 ); /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */ } /* Process leftover. */ if( k < len ) { int32x4_t in_s32x4; int32x2_t out32_Q14_s32x2; /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 ); /* in{0,1} = in[ 2 * k + {0,1} ]; */ in_s32x4 = vshll_n_s16( in_s16x4, 15 ); /* in{0,1} << 15 */ t_s32x2 = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 ); /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */ in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) ); /* in{0,1,0,1} << 15 */ silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 ); /* Scale back to Q0 and saturate */ out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 ); /* out32_Q14_{0,1} + (1<<14) - 1 */ out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 ); /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */ out_s16x4 = vqshrn_n_s32( out32_Q14_s32x4, 14 ); /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */ vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 ); /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */ vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 ); /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */ } vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 ); /* S[ 0 ] = S0; */ vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 ); /* S[ 1 ] = S2; */ vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 ); /* S[ 2 ] = S1; */ vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 ); /* S[ 3 ] = S3; */ #ifdef OPUS_CHECK_ASM silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) ); silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) ); RESTORE_STACK; #endif }
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl, const uchar *srcPixels, int sbpl, int w, int h, int const_alpha) { const uint *src = (const uint *) srcPixels; uint *dst = (uint *) destPixels; int16x8_t half = vdupq_n_s16(0x80); int16x8_t full = vdupq_n_s16(0xff); if (const_alpha == 256) { for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) { // all opaque vst1q_s32((int32_t *)&dst[x], src32); } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s >= 0xff000000) dst[x] = s; else if (s != 0) dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 *)(((const uchar *) src) + sbpl); } } else if (const_alpha != 0) { const_alpha = (const_alpha * 255) >> 8; int16x8_t const_alpha16 = vdupq_n_s16(const_alpha); for (int y = 0; y < h; ++y) { int x = 0; for (; x < w-3; x += 4) { if (src[x] | src[x+1] | src[x+2] | src[x+3]) { int32x4_t src32 = vld1q_s32((int32_t *)&src[x]); int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]); const uint8x16_t src8 = vreinterpretq_u8_s32(src32); const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32); const uint8x8_t src8_low = vget_low_u8(src8); const uint8x8_t dst8_low = vget_low_u8(dst8); const uint8x8_t src8_high = vget_high_u8(src8); const uint8x8_t dst8_high = vget_high_u8(dst8); const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low)); const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low)); const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high)); const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high)); const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half); const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half); const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full); const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full); const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low)); const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high)); vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high)); } } for (; x<w; ++x) { uint s = src[x]; if (s != 0) { s = BYTE_MUL(s, const_alpha); dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); } } dst = (quint32 *)(((uchar *) dst) + dbpl); src = (const quint32 *)(((const uchar *) src) + sbpl); } }
void vp9_highbd_iht8x8_64_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int tx_type, int bd) { int32x4_t a[16]; int16x8_t c[8]; a[0] = vld1q_s32(input); a[1] = vld1q_s32(input + 4); a[2] = vld1q_s32(input + 8); a[3] = vld1q_s32(input + 12); a[4] = vld1q_s32(input + 16); a[5] = vld1q_s32(input + 20); a[6] = vld1q_s32(input + 24); a[7] = vld1q_s32(input + 28); a[8] = vld1q_s32(input + 32); a[9] = vld1q_s32(input + 36); a[10] = vld1q_s32(input + 40); a[11] = vld1q_s32(input + 44); a[12] = vld1q_s32(input + 48); a[13] = vld1q_s32(input + 52); a[14] = vld1q_s32(input + 56); a[15] = vld1q_s32(input + 60); if (bd == 8) { c[0] = vcombine_s16(vmovn_s32(a[0]), vmovn_s32(a[1])); c[1] = vcombine_s16(vmovn_s32(a[2]), vmovn_s32(a[3])); c[2] = vcombine_s16(vmovn_s32(a[4]), vmovn_s32(a[5])); c[3] = vcombine_s16(vmovn_s32(a[6]), vmovn_s32(a[7])); c[4] = vcombine_s16(vmovn_s32(a[8]), vmovn_s32(a[9])); c[5] = vcombine_s16(vmovn_s32(a[10]), vmovn_s32(a[11])); c[6] = vcombine_s16(vmovn_s32(a[12]), vmovn_s32(a[13])); c[7] = vcombine_s16(vmovn_s32(a[14]), vmovn_s32(a[15])); switch (tx_type) { case DCT_DCT: { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 idct8x8_64_1d_bd8(cospis0, cospis1, c); idct8x8_64_1d_bd8(cospis0, cospis1, c); break; } case ADST_DCT: { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 idct8x8_64_1d_bd8(cospis0, cospis1, c); transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); break; } case DCT_ADST: { const int16x8_t cospis = vld1q_s16(kCospi); const int16x4_t cospis0 = vget_low_s16(cospis); // cospi 0, 8, 16, 24 const int16x4_t cospis1 = vget_high_s16(cospis); // cospi 4, 12, 20, 28 transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); idct8x8_64_1d_bd8(cospis0, cospis1, c); break; } default: { transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); transpose_s16_8x8(&c[0], &c[1], &c[2], &c[3], &c[4], &c[5], &c[6], &c[7]); iadst8(c); break; } } c[0] = vrshrq_n_s16(c[0], 5); c[1] = vrshrq_n_s16(c[1], 5); c[2] = vrshrq_n_s16(c[2], 5); c[3] = vrshrq_n_s16(c[3], 5); c[4] = vrshrq_n_s16(c[4], 5); c[5] = vrshrq_n_s16(c[5], 5); c[6] = vrshrq_n_s16(c[6], 5); c[7] = vrshrq_n_s16(c[7], 5); } else { switch (tx_type) { case DCT_DCT: { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } case ADST_DCT: { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } case DCT_ADST: { const int32x4_t cospis0 = vld1q_s32(kCospi32); // cospi 0, 8, 16, 24 const int32x4_t cospis1 = vld1q_s32(kCospi32 + 4); // cospi 4, 12, 20, 28 if (bd == 10) { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd10(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); idct8x8_64_half1d_bd12(cospis0, cospis1, &a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } default: { assert(tx_type == ADST_ADST); if (bd == 10) { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd10(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd10(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd10(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd10(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } else { transpose_s32_8x4(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); iadst8_bd12(&a[0], &a[1], &a[2], &a[3], &a[4], &a[5], &a[6], &a[7]); transpose_s32_8x4(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); iadst8_bd12(&a[8], &a[9], &a[10], &a[11], &a[12], &a[13], &a[14], &a[15]); transpose_s32_8x4(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); iadst8_bd12(&a[0], &a[8], &a[1], &a[9], &a[2], &a[10], &a[3], &a[11]); transpose_s32_8x4(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); iadst8_bd12(&a[4], &a[12], &a[5], &a[13], &a[6], &a[14], &a[7], &a[15]); } break; } } c[0] = vcombine_s16(vrshrn_n_s32(a[0], 5), vrshrn_n_s32(a[4], 5)); c[1] = vcombine_s16(vrshrn_n_s32(a[8], 5), vrshrn_n_s32(a[12], 5)); c[2] = vcombine_s16(vrshrn_n_s32(a[1], 5), vrshrn_n_s32(a[5], 5)); c[3] = vcombine_s16(vrshrn_n_s32(a[9], 5), vrshrn_n_s32(a[13], 5)); c[4] = vcombine_s16(vrshrn_n_s32(a[2], 5), vrshrn_n_s32(a[6], 5)); c[5] = vcombine_s16(vrshrn_n_s32(a[10], 5), vrshrn_n_s32(a[14], 5)); c[6] = vcombine_s16(vrshrn_n_s32(a[3], 5), vrshrn_n_s32(a[7], 5)); c[7] = vcombine_s16(vrshrn_n_s32(a[11], 5), vrshrn_n_s32(a[15], 5)); } highbd_add8x8(c, dest, stride, bd); }
void silk_warped_autocorrelation_FIX_neon( opus_int32 *corr, /* O Result [order + 1] */ opus_int *scale, /* O Scaling of the correlation vector */ const opus_int16 *input, /* I Input data to correlate */ const opus_int warping_Q16, /* I Warping coefficient */ const opus_int length, /* I Length of input */ const opus_int order /* I Correlation order (even) */ ) { if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) { silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order ); } else { opus_int n, i, lsh; opus_int64 corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */ opus_int64 corr_QC_orderT; int64x2_t lsh_s64x2; const opus_int orderT = ( order + 3 ) & ~3; opus_int64 *corr_QCT; opus_int32 *input_QS; VARDECL( opus_int32, input_QST ); VARDECL( opus_int32, state ); SAVE_STACK; /* Order must be even */ silk_assert( ( order & 1 ) == 0 ); silk_assert( 2 * QS - QC >= 0 ); ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 ); input_QS = input_QST; /* input_QS has zero paddings in the beginning and end. */ vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; /* Loop over samples */ for( n = 0; n < length - 7; n += 8, input_QS += 8 ) { const int16x8_t t0_s16x4 = vld1q_s16( input + n ); vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) ); vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) ); } for( ; n < length; n++, input_QS++ ) { input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS ); } vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS += 4; vst1q_s32( input_QS, vdupq_n_s32( 0 ) ); input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT; /* The following loop runs ( length + order ) times, with ( order ) extra epilogues. */ /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues. */ /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */ /* Keep the C code here to help understand the intrinsics optimization. */ /* { opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; opus_int32 *state_QST[ 3 ]; state_QST[ 0 ] = state_QS[ 0 ]; state_QST[ 1 ] = state_QS[ 1 ]; for( n = 0; n < length + order; n++, input_QS++ ) { state_QST[ 0 ][ orderT ] = input_QS[ orderT ]; for( i = 0; i < orderT; i++ ) { corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC ); state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 ); } state_QST[ 2 ] = state_QST[ 0 ]; state_QST[ 0 ] = state_QST[ 1 ]; state_QST[ 1 ] = state_QST[ 2 ]; } } */ { const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 ); const opus_int32 *in = input_QS + orderT; opus_int o = orderT; int32x4_t state_QS_s32x4[ 3 ][ 2 ]; ALLOC( state, length + orderT, opus_int32 ); state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 ); /* Calculate 8 taps of all inputs in each loop. */ do { state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] = state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 ); n = 0; do { calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] ); calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] ); state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n ); vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 ); state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ]; } while( ++n < ( length + order ) ); in = state; o -= 8; } while( o > 4 ); if( o ) { /* Calculate the last 4 taps of all inputs. */ opus_int32 *stateT = state; silk_assert( o == 4 ); state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 ); n = length + order; do { calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] ); state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT ); vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 ); state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 ); state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 ); state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ]; input_QS++; stateT++; } while( --n ); } } { const opus_int16 *inputT = input; int32x4_t t_s32x4; int64x1_t t_s64x1; int64x2_t t_s64x2 = vdupq_n_s64( 0 ); for( n = 0; n <= length - 8; n += 8 ) { int16x8_t input_s16x8 = vld1q_s16( inputT ); t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) ); t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) ); t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) ); inputT += 8; } t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) ); corr_QC_orderT = vget_lane_s64( t_s64x1, 0 ); for( ; n < length; n++ ) { corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] ); } corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC ); corr_QC[ orderT ] = corr_QC_orderT; } corr_QCT = corr_QC + orderT - order; lsh = silk_CLZ64( corr_QC_orderT ) - 35; lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC ); *scale = -( QC + lsh ); silk_assert( *scale >= -30 && *scale <= 12 ); lsh_s64x2 = vdupq_n_s64( lsh ); for( i = 0; i <= order - 3; i += 4 ) { int32x4_t corr_s32x4; int64x2_t corr_QC0_s64x2, corr_QC1_s64x2; corr_QC0_s64x2 = vld1q_s64( corr_QCT + i ); corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 ); corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 ); corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 ); corr_s32x4 = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) ); corr_s32x4 = vrev64q_s32( corr_s32x4 ); vst1q_s32( corr + order - i - 3, corr_s32x4 ); } if( lsh >= 0 ) { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) ); } } else { for( ; i < order + 1; i++ ) { corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) ); } } silk_assert( corr_QCT[ order ] >= 0 ); /* If breaking, decrease QC*/ RESTORE_STACK; } #ifdef OPUS_CHECK_ASM { opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ]; opus_int scale_c; silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order ); silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) ); silk_assert( scale_c == *scale ); } #endif }
void test_vld1Qs32 (void) { int32x4_t out_int32x4_t; out_int32x4_t = vld1q_s32 (0); }
inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }
static INLINE void load_8x8_s32_dual( const tran_low_t *input, int32x4x2_t *const in0, int32x4x2_t *const in1, int32x4x2_t *const in2, int32x4x2_t *const in3, int32x4x2_t *const in4, int32x4x2_t *const in5, int32x4x2_t *const in6, int32x4x2_t *const in7) { in0->val[0] = vld1q_s32(input); in0->val[1] = vld1q_s32(input + 4); input += 32; in1->val[0] = vld1q_s32(input); in1->val[1] = vld1q_s32(input + 4); input += 32; in2->val[0] = vld1q_s32(input); in2->val[1] = vld1q_s32(input + 4); input += 32; in3->val[0] = vld1q_s32(input); in3->val[1] = vld1q_s32(input + 4); input += 32; in4->val[0] = vld1q_s32(input); in4->val[1] = vld1q_s32(input + 4); input += 32; in5->val[0] = vld1q_s32(input); in5->val[1] = vld1q_s32(input + 4); input += 32; in6->val[0] = vld1q_s32(input); in6->val[1] = vld1q_s32(input + 4); input += 32; in7->val[0] = vld1q_s32(input); in7->val[1] = vld1q_s32(input + 4); }
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width) { const int *S0 = src[0], *S1 = src[1]; int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567; int32x4_t qT_0123, qT_4567; int16x4_t dT_0123, dT_4567; uint16x8_t qT_01234567; uint8x8_t dT_01234567, dDst_01234567; int32x2_t dBeta; dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0); dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1); int32x4_t qDelta, qMin, qMax; qDelta = vdupq_n_s32 (DELTA); qMin = vdupq_n_s32 (0); qMax = vdupq_n_s32 (255); int x = 0; for (; x <= width - 8; x += 8) { qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); vst1_u8 (&dst[x], dT_01234567); } if (x < width) { uint8x8_t dMask; dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)])); dDst_01234567 = vld1_u8 (&dst[x]); qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567); vst1_u8 (&dst[x], dMask); } }
// Contains a function for the core loop in the normalized lattice MA // filter routine for iSAC codec, optimized for ARM Neon platform. // It does: // for 0 <= n < HALF_SUBFRAMELEN - 1: // *ptr2 = input2 * (*ptr2) + input0 * (*ptr0)); // *ptr1 = input1 * (*ptr0) + input0 * (*ptr2); // Output is not bit-exact with the reference C code, due to the replacement // of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon // instructions. The difference should not be bigger than 1. void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0, // Filter coefficient int16_t input1, // Filter coefficient int32_t input2, // Inverse coefficient int32_t* ptr0, // Sample buffer int32_t* ptr1, // Sample buffer int32_t* ptr2) // Sample buffer { int n = 0; int loop = (HALF_SUBFRAMELEN - 1) >> 3; int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7; int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16); int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16); int32x4_t input2_v = vdupq_n_s32(input2); int32x4_t tmp0a, tmp1a, tmp2a, tmp3a; int32x4_t tmp0b, tmp1b, tmp2b, tmp3b; int32x4_t ptr0va, ptr1va, ptr2va; int32x4_t ptr0vb, ptr1vb, ptr2vb; // Unroll to process 8 samples at once. for (n = 0; n < loop; n++) { ptr0va = vld1q_s32(ptr0); ptr0vb = vld1q_s32(ptr0 + 4); ptr0 += 8; ptr2va = vld1q_s32(ptr2); ptr2vb = vld1q_s32(ptr2 + 4); // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); tmp0b = vqrdmulhq_s32(ptr0vb, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); tmp1b = vqrdmulhq_s32(ptr0vb, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2b = vaddq_s32(tmp0b, ptr2vb); tmp2a = vshlq_n_s32(tmp2a, 15); tmp2b = vshlq_n_s32(tmp2b, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); ptr2vb = vqrdmulhq_s32(tmp2b, input2_v); vst1q_s32(ptr2, ptr2va); vst1q_s32(ptr2 + 4, ptr2vb); ptr2 += 8; // Calculate tmp3 = ptr2v * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); tmp3b = vqrdmulhq_s32(ptr2vb, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); ptr1vb = vaddq_s32(tmp1b, tmp3b); vst1q_s32(ptr1, ptr1va); vst1q_s32(ptr1 + 4, ptr1vb); ptr1 += 8; } // Process four more samples. if (loop_tail & 0x4) { ptr0va = vld1q_s32(ptr0); ptr2va = vld1q_s32(ptr2); ptr0 += 4; // Calculate tmp0 = (*ptr0) * input0. tmp0a = vqrdmulhq_s32(ptr0va, input0_v); // Calculate tmp1 = (*ptr0) * input1. tmp1a = vqrdmulhq_s32(ptr0va, input1_v); // Calculate tmp2 = tmp0 + *(ptr2). tmp2a = vaddq_s32(tmp0a, ptr2va); tmp2a = vshlq_n_s32(tmp2a, 15); // Calculate *ptr2 = input2 * tmp2. ptr2va = vqrdmulhq_s32(tmp2a, input2_v); vst1q_s32(ptr2, ptr2va); ptr2 += 4; // Calculate tmp3 = *(ptr2) * input0. tmp3a = vqrdmulhq_s32(ptr2va, input0_v); // Calculate *ptr1 = tmp1 + tmp3. ptr1va = vaddq_s32(tmp1a, tmp3a); vst1q_s32(ptr1, ptr1va); ptr1 += 4; } // Process two more samples. if (loop_tail & 0x2) { int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail; int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail; ptr0v_tail = vld1_s32(ptr0); ptr2v_tail = vld1_s32(ptr2); ptr0 += 2; // Calculate tmp0 = (*ptr0) * input0. tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v)); // Calculate tmp1 = (*ptr0) * input1. tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v)); // Calculate tmp2 = tmp0 + *(ptr2). tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail); tmp2_tail = vshl_n_s32(tmp2_tail, 15); // Calculate *ptr2 = input2 * tmp2. ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v)); vst1_s32(ptr2, ptr2v_tail); ptr2 += 2; // Calculate tmp3 = *(ptr2) * input0. tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v)); // Calculate *ptr1 = tmp1 + tmp3. ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail); vst1_s32(ptr1, ptr1v_tail); ptr1 += 2; } // Process one more sample. if (loop_tail & 0x1) { int16_t t16a = (int16_t)(input2 >> 16); int16_t t16b = (int16_t)input2; if (t16b < 0) t16a++; int32_t tmp32a; int32_t tmp32b; // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0); tmp32b = *ptr2 + tmp32a; *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) + (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b))); // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2). tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0); tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2); *ptr1 = tmp32a + tmp32b; }