static INLINE void GENERATE_COSINE_CONSTANTS(int16x4_t *d0s16,
                                             int16x4_t *d1s16,
                                             int16x4_t *d2s16) {
  *d0s16 = vdup_n_s16(cospi_8_64);
  *d1s16 = vdup_n_s16(cospi_16_64);
  *d2s16 = vdup_n_s16(cospi_24_64);
}
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16,
                                           int16x4_t *d5s16,
                                           int16x8_t *q3s16) {
  *d3s16 = vdup_n_s16(sinpi_1_9);
  *d4s16 = vdup_n_s16(sinpi_2_9);
  *q3s16 = vdupq_n_s16(sinpi_3_9);
  *d5s16 = vdup_n_s16(sinpi_4_9);
}
int16x4_t sub_abs_to_vabd_16(void) {
  int16x4_t val1 = vdup_n_s16(10);
  int16x4_t val2 = vdup_n_s16(30);
  int16x4_t sres = vsub_s16(val1, val2);
  int16x4_t res = vabs_s16(sres);
  return res;
}
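For reference, the subtract-then-abs pattern above can be written directly with the absolute-difference intrinsic that the test name alludes to; a minimal sketch (the function name sub_abs_direct is illustrative):

#include <arm_neon.h>

/* Same result computed with vabd_s16, the single absolute-difference
   instruction the test above expects the compiler to select. */
int16x4_t sub_abs_direct(void) {
  int16x4_t val1 = vdup_n_s16(10);
  int16x4_t val2 = vdup_n_s16(30);
  return vabd_s16(val1, val2);  /* |val1 - val2| per lane */
}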
/* Compile-only intrinsic test; the argument value is never initialized
   because only the call signature is being checked. */
void test_vdup_ns16(void) {
  int16x4_t out_int16x4_t;
  int16_t arg0_int16_t;

  out_int16x4_t = vdup_n_s16(arg0_int16_t);
}
/* Return the sum of all elements in an array. This works by calculating
   4 totals (one for each lane) and adding those at the end to get the
   final total. */
int sum_array(int16_t *array, int size) {
  /* initialize the accumulator vector to zero */
  int16x4_t acc = vdup_n_s16(0);
  int32x2_t acc1;
  int64x1_t acc2;

  /* this implementation assumes the size of the array is a multiple of 4 */
  assert((size % 4) == 0);

  /* counting backwards gives better code */
  for (; size != 0; size -= 4) {
    int16x4_t vec;

    /* load 4 values in parallel from the array */
    vec = vld1_s16(array);
    /* advance the array pointer past the 4 loaded elements */
    array += 4;
    /* add the vector to the accumulator vector */
    acc = vadd_s16(acc, vec);
  }

  /* calculate the total with two pairwise-widening adds */
  acc1 = vpaddl_s16(acc);
  acc2 = vpaddl_s32(acc1);

  /* return the total as an integer */
  return (int)vget_lane_s64(acc2, 0);
}
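A minimal caller for the routine above, assuming a NEON-capable target; the sample data and expected total are illustrative:

#include <stdint.h>
#include <stdio.h>

int sum_array(int16_t *array, int size);

int main(void) {
  int16_t data[8] = { 1, 2, 3, 4, 5, 6, 7, 8 };
  /* size must be a multiple of 4, per the assert in sum_array */
  printf("%d\n", sum_array(data, 8));  /* prints 36 */
  return 0;
}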
static inline void ff_dct_unquantize_h263_neon(int qscale, int qadd,
                                               int nCoeffs, int16_t *block)
{
    int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16;
    int16x8_t q14s16, q15s16, qzs16;
    int16x4_t d0s16, d2s16, d3s16, dzs16;
    uint16x8_t q1u16, q9u16;
    uint16x4_t d1u16;

    dzs16  = vdup_n_s16(0);
    qzs16  = vdupq_n_s16(0);
    q15s16 = vdupq_n_s16(qscale << 1);
    q14s16 = vdupq_n_s16(qadd);
    q13s16 = vnegq_s16(q14s16);

    if (nCoeffs > 4) {
        for (; nCoeffs > 8; nCoeffs -= 16, block += 16) {
            q0s16  = vld1q_s16(block);
            q3s16  = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16));
            q8s16  = vld1q_s16(block + 8);
            q1u16  = vceqq_s16(q0s16, qzs16);
            q2s16  = vmulq_s16(q0s16, q15s16);
            q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16));
            q10s16 = vmulq_s16(q8s16, q15s16);
            q3s16  = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16);
            q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16);
            q2s16  = vaddq_s16(q2s16, q3s16);
            q9u16  = vceqq_s16(q8s16, qzs16);
            q10s16 = vaddq_s16(q10s16, q11s16);
            q0s16  = vbslq_s16(q1u16, q0s16, q2s16);
            q8s16  = vbslq_s16(q9u16, q8s16, q10s16);
            vst1q_s16(block, q0s16);
            vst1q_s16(block + 8, q8s16);
        }
    }
    if (nCoeffs <= 0)
        return;
    d0s16 = vld1_s16(block);
    d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16));
    d1u16 = vceq_s16(d0s16, dzs16);
    d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16));
    d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16),
                     vget_high_s16(q13s16), vget_high_s16(q14s16));
    d2s16 = vadd_s16(d2s16, d3s16);
    d0s16 = vbsl_s16(d1u16, d0s16, d2s16);
    vst1_s16(block, d0s16);
}
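A scalar sketch of the per-coefficient transform the NEON kernel implements; the loop bounds are simplified (the vector version walks 16 coefficients per iteration with a 4-wide tail) and the helper name dct_unquantize_h263_ref is illustrative:

#include <stdint.h>

/* Nonzero levels are scaled by 2*qscale and biased away from zero by qadd;
   zero coefficients are left untouched, matching the vceq/vbsl selects above. */
static void dct_unquantize_h263_ref(int qscale, int qadd, int n,
                                    int16_t *block) {
    for (int i = 0; i < n; i++) {
        int level = block[i];
        if (level)
            level = level * (qscale << 1) + (level < 0 ? -qadd : qadd);
        block[i] = (int16_t)level;
    }
}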
template <bool useShift>
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) {
#ifdef _M_SSE
  // Size will always be 16-byte aligned as the hwBlockSize is.
  while (size >= 8) {
    __m128i in1 = _mm_loadu_si128((__m128i *)in);
    __m128i in2 = _mm_loadu_si128((__m128i *)(in + 4));
    __m128i packed = _mm_packs_epi32(in1, in2);
    if (useShift) {
      packed = _mm_srai_epi16(packed, volShift);
    }
    _mm_storeu_si128((__m128i *)out, packed);
    out += 8;
    in += 8;
    size -= 8;
  }
#elif PPSSPP_ARCH(ARM_NEON)
  int16x4_t signedVolShift = vdup_n_s16(-volShift);  // Can only dynamic-shift right, but by a signed integer
  while (size >= 8) {
    int32x4_t in1 = vld1q_s32(in);
    int32x4_t in2 = vld1q_s32(in + 4);
    int16x4_t packed1 = vqmovn_s32(in1);
    int16x4_t packed2 = vqmovn_s32(in2);
    if (useShift) {
      packed1 = vshl_s16(packed1, signedVolShift);
      packed2 = vshl_s16(packed2, signedVolShift);
    }
    vst1_s16(out, packed1);
    vst1_s16(out + 4, packed2);
    out += 8;
    in += 8;
    size -= 8;
  }
#endif
  // This does the remainder if SIMD was used, otherwise it does it all.
  for (size_t i = 0; i < size; i++) {
    out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]);
  }
}
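The comment on signedVolShift refers to NEON having no register-count right shift: vshl with a negative per-lane count shifts right instead. A standalone sketch of that trick (shift_right_dyn is a hypothetical helper):

#include <arm_neon.h>

/* NEON's register-count shift (VSHL) only shifts left; passing a negative
   count per lane turns it into an arithmetic right shift. */
static int16x4_t shift_right_dyn(int16x4_t v, int8_t amount) {
  return vshl_s16(v, vdup_n_s16(-amount));
}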
// coeff: 16 bits, dynamic range [-32640, 32640].
// length: value range {16, 64, 256, 1024}.
int aom_satd_neon(const int16_t *coeff, int length) {
  const int16x4_t zero = vdup_n_s16(0);
  int32x4_t accum = vdupq_n_s32(0);

  do {
    const int16x8_t src0 = vld1q_s16(coeff);
    const int16x8_t src8 = vld1q_s16(coeff + 8);
    accum = vabal_s16(accum, vget_low_s16(src0), zero);
    accum = vabal_s16(accum, vget_high_s16(src0), zero);
    accum = vabal_s16(accum, vget_low_s16(src8), zero);
    accum = vabal_s16(accum, vget_high_s16(src8), zero);
    length -= 16;
    coeff += 16;
  } while (length != 0);

  {
    // satd: 26 bits, dynamic range [-32640 * 1024, 32640 * 1024]
    const int64x2_t s0 = vpaddlq_s32(accum);  // cascading summation of 'accum'.
    const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)),
                                  vreinterpret_s32_s64(vget_high_s64(s0)));
    const int satd = vget_lane_s32(s1, 0);
    return satd;
  }
}
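What the loop accumulates is the sum of absolute values (vabal_s16 against a zero vector); a scalar reference under that reading, with the name satd_ref as an assumption:

#include <stdint.h>
#include <stdlib.h>

/* Scalar reference: sum of |coeff[i]| over the block, the quantity the
   NEON loop above builds up with vabal_s16 against zero. */
static int satd_ref(const int16_t *coeff, int length) {
  int sum = 0;
  for (int i = 0; i < length; i++) sum += abs(coeff[i]);
  return sum;
}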
static INLINE void IDCT8x8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
                              int16x8_t *q10s16, int16x8_t *q11s16,
                              int16x8_t *q12s16, int16x8_t *q13s16,
                              int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int32x4_t q2s32, q3s32, q5s32, q6s32, q8s32, q9s32;
  int32x4_t q10s32, q11s32, q12s32, q13s32, q15s32;

  d0s16 = vdup_n_s16(cospi_28_64);
  d1s16 = vdup_n_s16(cospi_4_64);
  d2s16 = vdup_n_s16(cospi_12_64);
  d3s16 = vdup_n_s16(cospi_20_64);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  q2s32 = vmull_s16(d18s16, d0s16);
  q3s32 = vmull_s16(d19s16, d0s16);
  q5s32 = vmull_s16(d26s16, d2s16);
  q6s32 = vmull_s16(d27s16, d2s16);

  q2s32 = vmlsl_s16(q2s32, d30s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d31s16, d1s16);
  q5s32 = vmlsl_s16(q5s32, d22s16, d3s16);
  q6s32 = vmlsl_s16(q6s32, d23s16, d3s16);

  d8s16 = vqrshrn_n_s32(q2s32, 14);
  d9s16 = vqrshrn_n_s32(q3s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  q4s16 = vcombine_s16(d8s16, d9s16);
  q5s16 = vcombine_s16(d10s16, d11s16);

  q2s32 = vmull_s16(d18s16, d1s16);
  q3s32 = vmull_s16(d19s16, d1s16);
  q9s32 = vmull_s16(d26s16, d3s16);
  q13s32 = vmull_s16(d27s16, d3s16);

  q2s32 = vmlal_s16(q2s32, d30s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d31s16, d0s16);
  q9s32 = vmlal_s16(q9s32, d22s16, d2s16);
  q13s32 = vmlal_s16(q13s32, d23s16, d2s16);

  d14s16 = vqrshrn_n_s32(q2s32, 14);
  d15s16 = vqrshrn_n_s32(q3s32, 14);
  d12s16 = vqrshrn_n_s32(q9s32, 14);
  d13s16 = vqrshrn_n_s32(q13s32, 14);
  q6s16 = vcombine_s16(d12s16, d13s16);
  q7s16 = vcombine_s16(d14s16, d15s16);

  d0s16 = vdup_n_s16(cospi_16_64);

  q2s32 = vmull_s16(d16s16, d0s16);
  q3s32 = vmull_s16(d17s16, d0s16);
  q13s32 = vmull_s16(d16s16, d0s16);
  q15s32 = vmull_s16(d17s16, d0s16);

  q2s32 = vmlal_s16(q2s32, d24s16, d0s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d0s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d0s16);
  q15s32 = vmlsl_s16(q15s32, d25s16, d0s16);

  d0s16 = vdup_n_s16(cospi_24_64);
  d1s16 = vdup_n_s16(cospi_8_64);

  d18s16 = vqrshrn_n_s32(q2s32, 14);
  d19s16 = vqrshrn_n_s32(q3s32, 14);
  d22s16 = vqrshrn_n_s32(q13s32, 14);
  d23s16 = vqrshrn_n_s32(q15s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q2s32 = vmull_s16(d20s16, d0s16);
  q3s32 = vmull_s16(d21s16, d0s16);
  q8s32 = vmull_s16(d20s16, d1s16);
  q12s32 = vmull_s16(d21s16, d1s16);

  q2s32 = vmlsl_s16(q2s32, d28s16, d1s16);
  q3s32 = vmlsl_s16(q3s32, d29s16, d1s16);
  q8s32 = vmlal_s16(q8s32, d28s16, d0s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d0s16);

  d26s16 = vqrshrn_n_s32(q2s32, 14);
  d27s16 = vqrshrn_n_s32(q3s32, 14);
  d30s16 = vqrshrn_n_s32(q8s32, 14);
  d31s16 = vqrshrn_n_s32(q12s32, 14);
  *q13s16 = vcombine_s16(d26s16, d27s16);
  *q15s16 = vcombine_s16(d30s16, d31s16);

  q0s16 = vaddq_s16(*q9s16, *q15s16);
  q1s16 = vaddq_s16(*q11s16, *q13s16);
  q2s16 = vsubq_s16(*q11s16, *q13s16);
  q3s16 = vsubq_s16(*q9s16, *q15s16);

  *q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  *q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);

  d16s16 = vdup_n_s16(cospi_16_64);

  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  *q8s16 = vaddq_s16(q0s16, q7s16);
  *q9s16 = vaddq_s16(q1s16, q6s16);
  *q10s16 = vaddq_s16(q2s16, q5s16);
  *q11s16 = vaddq_s16(q3s16, q4s16);
  *q12s16 = vsubq_s16(q3s16, q4s16);
  *q13s16 = vsubq_s16(q2s16, q5s16);
  *q14s16 = vsubq_s16(q1s16, q6s16);
  *q15s16 = vsubq_s16(q0s16, q7s16);
}
void vpx_idct8x8_12_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8_t *d1, *d2;
  uint8x8_t d0u8, d1u8, d2u8, d3u8;
  int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16;
  int16x4_t d26s16, d27s16, d28s16, d29s16;
  uint64x1_t d0u64, d1u64, d2u64, d3u64;
  int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16;
  int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16;
  uint16x8_t q8u16, q9u16, q10u16, q11u16;
  int32x4_t q9s32, q10s32, q11s32, q12s32;

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);
  q10s16 = vld1q_s16(input + 16);
  q11s16 = vld1q_s16(input + 24);
  q12s16 = vld1q_s16(input + 32);
  q13s16 = vld1q_s16(input + 40);
  q14s16 = vld1q_s16(input + 48);
  q15s16 = vld1q_s16(input + 56);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

  // First transform rows
  // stage 1
  q0s16 = vdupq_n_s16(cospi_28_64 * 2);
  q1s16 = vdupq_n_s16(cospi_4_64 * 2);

  q4s16 = vqrdmulhq_s16(q9s16, q0s16);

  q0s16 = vdupq_n_s16(-cospi_20_64 * 2);

  q7s16 = vqrdmulhq_s16(q9s16, q1s16);

  q1s16 = vdupq_n_s16(cospi_12_64 * 2);

  q5s16 = vqrdmulhq_s16(q11s16, q0s16);

  q0s16 = vdupq_n_s16(cospi_16_64 * 2);

  q6s16 = vqrdmulhq_s16(q11s16, q1s16);

  // stage 2 & stage 3 - even half
  q1s16 = vdupq_n_s16(cospi_24_64 * 2);

  q9s16 = vqrdmulhq_s16(q8s16, q0s16);

  q0s16 = vdupq_n_s16(cospi_8_64 * 2);

  q13s16 = vqrdmulhq_s16(q10s16, q1s16);

  q15s16 = vqrdmulhq_s16(q10s16, q0s16);

  // stage 3 - odd half
  q0s16 = vaddq_s16(q9s16, q15s16);
  q1s16 = vaddq_s16(q9s16, q13s16);
  q2s16 = vsubq_s16(q9s16, q13s16);
  q3s16 = vsubq_s16(q9s16, q15s16);

  // stage 2 - odd half
  q13s16 = vsubq_s16(q4s16, q5s16);
  q4s16 = vaddq_s16(q4s16, q5s16);
  q14s16 = vsubq_s16(q7s16, q6s16);
  q7s16 = vaddq_s16(q7s16, q6s16);
  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  d16s16 = vdup_n_s16(cospi_16_64);
  q9s32 = vmull_s16(d28s16, d16s16);
  q10s32 = vmull_s16(d29s16, d16s16);
  q11s32 = vmull_s16(d28s16, d16s16);
  q12s32 = vmull_s16(d29s16, d16s16);

  q9s32 = vmlsl_s16(q9s32, d26s16, d16s16);
  q10s32 = vmlsl_s16(q10s32, d27s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d26s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d27s16, d16s16);

  d10s16 = vqrshrn_n_s32(q9s32, 14);
  d11s16 = vqrshrn_n_s32(q10s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q12s32, 14);
  q5s16 = vcombine_s16(d10s16, d11s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  // stage 4
  q8s16 = vaddq_s16(q0s16, q7s16);
  q9s16 = vaddq_s16(q1s16, q6s16);
  q10s16 = vaddq_s16(q2s16, q5s16);
  q11s16 = vaddq_s16(q3s16, q4s16);
  q12s16 = vsubq_s16(q3s16, q4s16);
  q13s16 = vsubq_s16(q2s16, q5s16);
  q14s16 = vsubq_s16(q1s16, q6s16);
  q15s16 = vsubq_s16(q0s16, q7s16);

  TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
               &q15s16);

  IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16,
             &q15s16);

  q8s16 = vrshrq_n_s16(q8s16, 5);
  q9s16 = vrshrq_n_s16(q9s16, 5);
  q10s16 = vrshrq_n_s16(q10s16, 5);
  q11s16 = vrshrq_n_s16(q11s16, 5);
  q12s16 = vrshrq_n_s16(q12s16, 5);
  q13s16 = vrshrq_n_s16(q13s16, 5);
  q14s16 = vrshrq_n_s16(q14s16, 5);
  q15s16 = vrshrq_n_s16(q15s16, 5);

  d1 = d2 = dest;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
  d2 += dest_stride;

  q8s16 = q12s16;
  q9s16 = q13s16;
  q10s16 = q14s16;
  q11s16 = q15s16;

  d0u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d1u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d2u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;
  d3u64 = vld1_u64((uint64_t *)d1);
  d1 += dest_stride;

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64));
  q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64));
  q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64));

  d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));
  d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16));
  d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16));

  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8));
  d2 += dest_stride;
  vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8));
}
void ne10_img_hresize_4channels_linear_neon(const unsigned char **src,
                                            int **dst, int count,
                                            const int *xofs,
                                            const short *alpha,
                                            int swidth, int dwidth, int cn,
                                            int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    int16x4_t dCoeff;
    dCoeff = vdup_n_s16(INTER_RESIZE_COEF_SCALE);

    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16(&alpha[dx * 2]);

            dS0_vec = vld1_u8(&S0[sx]);
            dS1_vec = vld1_u8(&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16(vmovl_u8(dS0_vec));
            qS1_vec = vreinterpretq_s16_u16(vmovl_u8(dS1_vec));

            dS0_0123 = vget_low_s16(qS0_vec);
            dS0_4567 = vget_high_s16(qS0_vec);
            dS1_0123 = vget_low_s16(qS1_vec);
            dS1_4567 = vget_high_s16(qS1_vec);

            qT0_vec = vmull_s16(dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16(dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16(qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16(qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32(&D0[dx], qT0_vec);
            vst1q_s32(&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8(&S0[sx]);
            dS1_vec = vld1_u8(&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16(vmovl_u8(dS0_vec));
            qS1_vec = vreinterpretq_s16_u16(vmovl_u8(dS1_vec));

            dS0_0123 = vget_low_s16(qS0_vec);
            dS1_0123 = vget_low_s16(qS1_vec);

            qT0_vec = vmull_s16(dS0_0123, dCoeff);
            qT1_vec = vmull_s16(dS1_0123, dCoeff);

            vst1q_s32(&D0[dx], qT0_vec);
            vst1q_s32(&D1[dx], qT1_vec);
        }
    }

    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];

        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16(&alpha[dx * 2]);

            dS0_vec = vld1_u8(&S[sx]);

            qS0_vec = vreinterpretq_s16_u16(vmovl_u8(dS0_vec));

            dS0_0123 = vget_low_s16(qS0_vec);
            dS0_4567 = vget_high_s16(qS0_vec);

            qT0_vec = vmull_s16(dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16(qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32(&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8(&S[sx]);
            qS0_vec = vreinterpretq_s16_u16(vmovl_u8(dS0_vec));
            dS0_0123 = vget_low_s16(qS0_vec);
            qT0_vec = vmull_s16(dS0_0123, dCoeff);

            vst1q_s32(&D[dx], qT0_vec);
        }
    }
}
int16x4_t test_vdup_n_s16(int16_t v1) {
  // CHECK: test_vdup_n_s16
  return vdup_n_s16(v1);
  // CHECK: dup {{v[0-9]+}}.4h, {{w[0-9]+}}
}
void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) {
  uint8x8_t d26u8, d27u8;
  uint32x2_t d26u32, d27u32;
  uint16x8_t q8u16, q9u16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16;
  int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16;
  int16x8_t q8s16, q9s16, q13s16, q14s16;
  int32x4_t q1s32, q13s32, q14s32, q15s32;
  int16x4x2_t d0x2s16, d1x2s16;
  int32x4x2_t q0x2s32;
  uint8_t *d;

  d26u32 = d27u32 = vdup_n_u32(0);

  q8s16 = vld1q_s16(input);
  q9s16 = vld1q_s16(input + 8);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_low_s16(q9s16);
  d19s16 = vget_high_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  d20s16 = vdup_n_s16((int16_t)cospi_8_64);
  d21s16 = vdup_n_s16((int16_t)cospi_16_64);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  d22s16 = vdup_n_s16((int16_t)cospi_24_64);

  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  d16s16 = vget_low_s16(q8s16);
  d17s16 = vget_high_s16(q8s16);
  d18s16 = vget_high_s16(q9s16);  // vswp d18 d19
  d19s16 = vget_low_s16(q9s16);

  d0x2s16 = vtrn_s16(d16s16, d17s16);
  d1x2s16 = vtrn_s16(d18s16, d19s16);
  q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]);
  q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]);

  q0x2s32 =
      vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16));
  d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0]));
  d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));
  d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1]));

  // do the transform on columns
  // stage 1
  d23s16 = vadd_s16(d16s16, d18s16);
  d24s16 = vsub_s16(d16s16, d18s16);

  q15s32 = vmull_s16(d17s16, d22s16);
  q1s32 = vmull_s16(d17s16, d20s16);
  q13s32 = vmull_s16(d23s16, d21s16);
  q14s32 = vmull_s16(d24s16, d21s16);

  q15s32 = vmlsl_s16(q15s32, d19s16, d20s16);
  q1s32 = vmlal_s16(q1s32, d19s16, d22s16);

  d26s16 = vqrshrn_n_s32(q13s32, 14);
  d27s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d28s16 = vqrshrn_n_s32(q1s32, 14);
  q13s16 = vcombine_s16(d26s16, d27s16);
  q14s16 = vcombine_s16(d28s16, d29s16);

  // stage 2
  q8s16 = vaddq_s16(q13s16, q14s16);
  q9s16 = vsubq_s16(q13s16, q14s16);

  q8s16 = vrshrq_n_s16(q8s16, 4);
  q9s16 = vrshrq_n_s16(q9s16, 4);

  d = dest;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0);
  d += dest_stride;
  d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1);
  d += dest_stride;
  d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0);

  q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32));
  q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32));

  d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16));
  d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16));

  d = dest;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1);
  d += dest_stride;
  vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0);
}
// Update the noise estimation information.
static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
  const int16_t kExp2Const = 11819;  // Q13
  int16_t* ptr_noiseEstLogQuantile = NULL;
  int16_t* ptr_noiseEstQuantile = NULL;
  int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
  int32x4_t twentyOne32x4 = vdupq_n_s32(21);
  int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
  int32x4_t constB32x4 = vdupq_n_s32(0x200000);

  int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                        inst->magnLen);

  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(
                     kExp2Const, tmp16, 21);

  int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);

  for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
       ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
       ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
       ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {

    // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
    int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
    int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);

    // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
    v32x4A = vorrq_s32(v32x4A, constB32x4);

    // tmp16 = (int16_t)(tmp32no2 >> 21);
    v32x4B = vshrq_n_s32(v32x4B, 21);

    // tmp16 -= 21; // shift 21 to get result in Q0
    v32x4B = vsubq_s32(v32x4B, twentyOne32x4);

    // tmp16 += (int16_t) inst->qNoise; // shift to get result in Q(qNoise)
    v32x4B = vaddq_s32(v32x4B, qNoise32x4);

    // if (tmp16 < 0) {
    //   tmp32no1 >>= -tmp16;
    // } else {
    //   tmp32no1 <<= tmp16;
    // }
    v32x4B = vshlq_s32(v32x4A, v32x4B);

    // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    v16x4 = vqmovn_s32(v32x4B);

    // inst->noiseEstQuantile[i] = tmp16;
    vst1_s16(ptr_noiseEstQuantile, v16x4);
  }

  // Last iteration:
  // inst->quantile[i] = exp(inst->lquantile[offset + i]); // in Q21
  int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
  int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF));  // 2^21 + frac

  tmp16 = (int16_t)(tmp32no2 >> 21);
  tmp16 -= 21;  // shift 21 to get result in Q0
  tmp16 += (int16_t) inst->qNoise;  // shift to get result in Q(qNoise)
  if (tmp16 < 0) {
    tmp32no1 >>= -tmp16;
  } else {
    tmp32no1 <<= tmp16;
  }
  *ptr_noiseEstQuantile = WebRtcSpl_SatW32ToW16(tmp32no1);
}
      frac = (int16_t)((((uint32_t)magn[i] << zeros) & 0x7FFFFFFF) >> 23);
      assert(frac < 256);
      // log2(magn(i))
      log2 = (int16_t)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]);
      // log2(magn(i))*log(2)
      lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15);
      // + log(2^stages)
      lmagn[i] += logval;
    } else {
      lmagn[i] = logval;
    }
  }

  int16x4_t Q3_16x4 = vdup_n_s16(3);
  int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8);
  int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor);

  int16_t factor = FACTOR_Q7;
  if (inst->blockIndex < END_STARTUP_LONG)
    factor = FACTOR_Q7_STARTUP;

  // Loop over simultaneous estimates
  for (s = 0; s < SIMULT; s++) {
    offset = s * inst->magnLen;

    // Get counter values from state
    counter = inst->noiseEstCounter[s];
    assert(counter < 201);
    countDiv = WebRtcNsx_kCounterDiv[counter];
void silk_biquad_alt_stride2_neon(
    const opus_int16 *in,     /* I  input signal                 */
    const opus_int32 *B_Q28,  /* I  MA coefficients [3]          */
    const opus_int32 *A_Q28,  /* I  AR coefficients [2]          */
    opus_int32       *S,      /* I/O State vector [4]            */
    opus_int16       *out,    /* O  output signal                */
    const opus_int32 len      /* I  signal length (must be even) */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4 = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );
    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2 = vshl_n_s32( A_Q28_s32x2, 18 );  /* ( -A_Q28[] & 0x00003FFF ) << 18 */
    A_L_s32x2 = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );  /* ( -A_Q28[] & 0x00003FFF ) << 15 */
    A_U_s32x2 = vshr_n_s32( A_Q28_s32x2, 14 );  /* silk_RSHIFT( -A_Q28[], 14 ) */
    A_U_s32x2 = vshl_n_s32( A_U_s32x2, 16 );    /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
    A_U_s32x2 = vshr_n_s32( A_U_s32x2, 1 );     /* silk_RSHIFT( -A_Q28[], 14 ) << 15 */

    B_Q28_s32x2 = vld1_s32( B_Q28 );
    t_s32x2 = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2 = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2 = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2 = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4 = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );    /* A{0,0,1,1}_L_Q28 */
    A_U_s32x4 = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );    /* A{0,0,1,1}_U_Q28 */
    B_Q28_s32x4 = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );  /* B_Q28[ {1,1,2,2} ] */
    S_s32x4 = vld1q_s32( S );  /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2 = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );  /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4 = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
        in_s16x4      = vld1_s16( &in[ 2 * k ] );     /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );  /* in{0,1,2,3} << 15 */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );  /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) );  /* in{2,3,2,3} << 15 */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) );  /* in{0,1,0,1} << 15 */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );  /* out32_Q14_{0,1,2,3} */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                  /* out32_Q14_{0,1,2,3} + (1<<14) - 1 */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                         /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ) */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                          /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        in_s16x4 = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );  /* in{0,1} = in[ 2 * k + {0,1} ]; */
        in_s16x4 = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );  /* in{0,1} = in[ 2 * k + {0,1} ]; */
        in_s32x4 = vshll_n_s16( in_s16x4, 15 );                     /* in{0,1} << 15 */
        t_s32x2  = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );        /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
        in_s32x4 = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );  /* in{0,1,0,1} << 15 */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );         /* out32_Q14_{0,1} + (1<<14) - 1 */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );  /* out32_Q14_{0,1,0,1} + (1<<14) - 1 */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );               /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) ) */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                    /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                    /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }

    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );  /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );  /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );  /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );  /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}
void vp8_short_fdct8x4_neon(int16_t *input, int16_t *output, int pitch) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16;
  uint16x4_t d28u16, d29u16;
  uint16x8_t q14u16;
  int16x8_t q0s16, q1s16, q2s16, q3s16;
  int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16;
  int32x4_t q9s32, q10s32, q11s32, q12s32;
  int16x8x2_t v2tmp0, v2tmp1;
  int32x4x2_t v2tmp2, v2tmp3;

  d16s16 = vdup_n_s16(5352);
  d17s16 = vdup_n_s16(2217);
  q9s32 = vdupq_n_s32(14500);
  q10s32 = vdupq_n_s32(7500);

  // Part one
  pitch >>= 1;
  q0s16 = vld1q_s16(input);
  input += pitch;
  q1s16 = vld1q_s16(input);
  input += pitch;
  q2s16 = vld1q_s16(input);
  input += pitch;
  q3s16 = vld1q_s16(input);

  v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
                     vreinterpretq_s32_s16(q2s16));
  v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
                     vreinterpretq_s32_s16(q3s16));
  v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
                     vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
  v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
                     vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3

  q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
  q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
  q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
  q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);

  q11s16 = vshlq_n_s16(q11s16, 3);
  q12s16 = vshlq_n_s16(q12s16, 3);
  q13s16 = vshlq_n_s16(q13s16, 3);
  q14s16 = vshlq_n_s16(q14s16, 3);

  q0s16 = vaddq_s16(q11s16, q12s16);
  q2s16 = vsubq_s16(q11s16, q12s16);

  q11s32 = q9s32;
  q12s32 = q10s32;

  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
  q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
  q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d17s16);

  q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
  q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
  q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);

  d2s16 = vshrn_n_s32(q9s32, 12);
  d6s16 = vshrn_n_s32(q10s32, 12);
  d3s16 = vshrn_n_s32(q11s32, 12);
  d7s16 = vshrn_n_s32(q12s32, 12);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q3s16 = vcombine_s16(d6s16, d7s16);

  // Part two
  q9s32 = vdupq_n_s32(12000);
  q10s32 = vdupq_n_s32(51000);

  v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16),
                     vreinterpretq_s32_s16(q2s16));
  v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16),
                     vreinterpretq_s32_s16(q3s16));
  v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]),   // q0
                     vreinterpretq_s16_s32(v2tmp3.val[0]));  // q1
  v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]),   // q2
                     vreinterpretq_s16_s32(v2tmp3.val[1]));  // q3

  q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]);
  q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]);
  q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]);
  q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]);

  q15s16 = vdupq_n_s16(7);
  q11s16 = vaddq_s16(q11s16, q15s16);
  q0s16 = vaddq_s16(q11s16, q12s16);
  q1s16 = vsubq_s16(q11s16, q12s16);

  q11s32 = q9s32;
  q12s32 = q10s32;

  d0s16 = vget_low_s16(q0s16);
  d1s16 = vget_high_s16(q0s16);
  d2s16 = vget_low_s16(q1s16);
  d3s16 = vget_high_s16(q1s16);

  d0s16 = vshr_n_s16(d0s16, 4);
  d4s16 = vshr_n_s16(d1s16, 4);
  d2s16 = vshr_n_s16(d2s16, 4);
  d6s16 = vshr_n_s16(d3s16, 4);

  d26s16 = vget_low_s16(q13s16);
  d27s16 = vget_high_s16(q13s16);
  d28s16 = vget_low_s16(q14s16);
  d29s16 = vget_high_s16(q14s16);

  q9s32 = vmlal_s16(q9s32, d28s16, d16s16);
  q10s32 = vmlal_s16(q10s32, d28s16, d17s16);
  q11s32 = vmlal_s16(q11s32, d29s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d29s16, d17s16);

  q9s32 = vmlal_s16(q9s32, d26s16, d17s16);
  q10s32 = vmlsl_s16(q10s32, d26s16, d16s16);
  q11s32 = vmlal_s16(q11s32, d27s16, d17s16);
  q12s32 = vmlsl_s16(q12s32, d27s16, d16s16);

  d1s16 = vshrn_n_s32(q9s32, 16);
  d3s16 = vshrn_n_s32(q10s32, 16);
  d5s16 = vshrn_n_s32(q11s32, 16);
  d7s16 = vshrn_n_s32(q12s32, 16);

  qEmptys16 = vdupq_n_s16(0);
  q14u16 = vceqq_s16(q14s16, qEmptys16);
  q14u16 = vmvnq_u16(q14u16);

  d28u16 = vget_low_u16(q14u16);
  d29u16 = vget_high_u16(q14u16);
  d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16));
  d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16));

  q0s16 = vcombine_s16(d0s16, d1s16);
  q1s16 = vcombine_s16(d2s16, d3s16);
  q2s16 = vcombine_s16(d4s16, d5s16);
  q3s16 = vcombine_s16(d6s16, d7s16);

  vst1q_s16(output, q0s16);
  vst1q_s16(output + 8, q1s16);
  vst1q_s16(output + 16, q2s16);
  vst1q_s16(output + 24, q3s16);
}
inline int16x4_t vdup_n(const s16 & val) { return vdup_n_s16(val); }
void vp8_short_fdct4x4_neon(int16_t *input, int16_t *output, int pitch) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d16s16, d17s16, d26s16, dEmptys16;
  uint16x4_t d4u16;
  int16x8_t q0s16, q1s16;
  int32x4_t q9s32, q10s32, q11s32, q12s32;
  int16x4x2_t v2tmp0, v2tmp1;
  int32x2x2_t v2tmp2, v2tmp3;

  d16s16 = vdup_n_s16(5352);
  d17s16 = vdup_n_s16(2217);
  q9s32 = vdupq_n_s32(14500);
  q10s32 = vdupq_n_s32(7500);
  q11s32 = vdupq_n_s32(12000);
  q12s32 = vdupq_n_s32(51000);

  // Part one
  pitch >>= 1;
  d0s16 = vld1_s16(input);
  input += pitch;
  d1s16 = vld1_s16(input);
  input += pitch;
  d2s16 = vld1_s16(input);
  input += pitch;
  d3s16 = vld1_s16(input);

  v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
                    vreinterpret_s32_s16(d2s16));
  v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
                    vreinterpret_s32_s16(d3s16));
  v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
                    vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
  v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
                    vreinterpret_s16_s32(v2tmp3.val[1]));  // d3

  d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
  d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
  d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
  d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);

  d4s16 = vshl_n_s16(d4s16, 3);
  d5s16 = vshl_n_s16(d5s16, 3);
  d6s16 = vshl_n_s16(d6s16, 3);
  d7s16 = vshl_n_s16(d7s16, 3);

  d0s16 = vadd_s16(d4s16, d5s16);
  d2s16 = vsub_s16(d4s16, d5s16);

  q9s32 = vmlal_s16(q9s32, d7s16, d16s16);
  q10s32 = vmlal_s16(q10s32, d7s16, d17s16);
  q9s32 = vmlal_s16(q9s32, d6s16, d17s16);
  q10s32 = vmlsl_s16(q10s32, d6s16, d16s16);

  d1s16 = vshrn_n_s32(q9s32, 12);
  d3s16 = vshrn_n_s32(q10s32, 12);

  // Part two
  v2tmp2 = vtrn_s32(vreinterpret_s32_s16(d0s16),
                    vreinterpret_s32_s16(d2s16));
  v2tmp3 = vtrn_s32(vreinterpret_s32_s16(d1s16),
                    vreinterpret_s32_s16(d3s16));
  v2tmp0 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[0]),   // d0
                    vreinterpret_s16_s32(v2tmp3.val[0]));  // d1
  v2tmp1 = vtrn_s16(vreinterpret_s16_s32(v2tmp2.val[1]),   // d2
                    vreinterpret_s16_s32(v2tmp3.val[1]));  // d3

  d4s16 = vadd_s16(v2tmp0.val[0], v2tmp1.val[1]);
  d5s16 = vadd_s16(v2tmp0.val[1], v2tmp1.val[0]);
  d6s16 = vsub_s16(v2tmp0.val[1], v2tmp1.val[0]);
  d7s16 = vsub_s16(v2tmp0.val[0], v2tmp1.val[1]);

  d26s16 = vdup_n_s16(7);
  d4s16 = vadd_s16(d4s16, d26s16);

  d0s16 = vadd_s16(d4s16, d5s16);
  d2s16 = vsub_s16(d4s16, d5s16);

  q11s32 = vmlal_s16(q11s32, d7s16, d16s16);
  q12s32 = vmlal_s16(q12s32, d7s16, d17s16);

  dEmptys16 = vdup_n_s16(0);
  d4u16 = vceq_s16(d7s16, dEmptys16);

  d0s16 = vshr_n_s16(d0s16, 4);
  d2s16 = vshr_n_s16(d2s16, 4);

  q11s32 = vmlal_s16(q11s32, d6s16, d17s16);
  q12s32 = vmlsl_s16(q12s32, d6s16, d16s16);

  d4u16 = vmvn_u16(d4u16);
  d1s16 = vshrn_n_s32(q11s32, 16);
  d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d4u16));
  d3s16 = vshrn_n_s32(q12s32, 16);

  q0s16 = vcombine_s16(d0s16, d1s16);
  q1s16 = vcombine_s16(d2s16, d3s16);

  vst1q_s16(output, q0s16);
  vst1q_s16(output + 8, q1s16);
}