static INLINE void TRANSPOSE8X8( int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16, int16x8_t *q14s16, int16x8_t *q15s16) { int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; int32x4x2_t q0x2s32, q1x2s32, q2x2s32, q3x2s32; int16x8x2_t q0x2s16, q1x2s16, q2x2s16, q3x2s16; d16s16 = vget_low_s16(*q8s16); d17s16 = vget_high_s16(*q8s16); d18s16 = vget_low_s16(*q9s16); d19s16 = vget_high_s16(*q9s16); d20s16 = vget_low_s16(*q10s16); d21s16 = vget_high_s16(*q10s16); d22s16 = vget_low_s16(*q11s16); d23s16 = vget_high_s16(*q11s16); d24s16 = vget_low_s16(*q12s16); d25s16 = vget_high_s16(*q12s16); d26s16 = vget_low_s16(*q13s16); d27s16 = vget_high_s16(*q13s16); d28s16 = vget_low_s16(*q14s16); d29s16 = vget_high_s16(*q14s16); d30s16 = vget_low_s16(*q15s16); d31s16 = vget_high_s16(*q15s16); *q8s16 = vcombine_s16(d16s16, d24s16); // vswp d17, d24 *q9s16 = vcombine_s16(d18s16, d26s16); // vswp d19, d26 *q10s16 = vcombine_s16(d20s16, d28s16); // vswp d21, d28 *q11s16 = vcombine_s16(d22s16, d30s16); // vswp d23, d30 *q12s16 = vcombine_s16(d17s16, d25s16); *q13s16 = vcombine_s16(d19s16, d27s16); *q14s16 = vcombine_s16(d21s16, d29s16); *q15s16 = vcombine_s16(d23s16, d31s16); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q8s16), vreinterpretq_s32_s16(*q10s16)); q1x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q9s16), vreinterpretq_s32_s16(*q11s16)); q2x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q12s16), vreinterpretq_s32_s16(*q14s16)); q3x2s32 = vtrnq_s32(vreinterpretq_s32_s16(*q13s16), vreinterpretq_s32_s16(*q15s16)); q0x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[0]), // q8 vreinterpretq_s16_s32(q1x2s32.val[0])); // q9 q1x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q0x2s32.val[1]), // q10 vreinterpretq_s16_s32(q1x2s32.val[1])); // q11 q2x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[0]), // q12 vreinterpretq_s16_s32(q3x2s32.val[0])); // q13 q3x2s16 = vtrnq_s16(vreinterpretq_s16_s32(q2x2s32.val[1]), // q14 vreinterpretq_s16_s32(q3x2s32.val[1])); // q15 *q8s16 = q0x2s16.val[0]; *q9s16 = q0x2s16.val[1]; *q10s16 = q1x2s16.val[0]; *q11s16 = q1x2s16.val[1]; *q12s16 = q2x2s16.val[0]; *q13s16 = q2x2s16.val[1]; *q14s16 = q3x2s16.val[0]; *q15s16 = q3x2s16.val[1]; return; }
int16x8x2_t test_vtrnq_s16(int16x8_t a, int16x8_t b) { // CHECK-LABEL: test_vtrnq_s16 return vtrnq_s16(a, b); // CHECK: trn1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h // CHECK: trn2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }
void vp8_short_fdct8x4_neon( int16_t *input, int16_t *output, int pitch) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d16s16, d17s16, d26s16, d27s16, d28s16, d29s16; uint16x4_t d28u16, d29u16; uint16x8_t q14u16; int16x8_t q0s16, q1s16, q2s16, q3s16; int16x8_t q11s16, q12s16, q13s16, q14s16, q15s16, qEmptys16; int32x4_t q9s32, q10s32, q11s32, q12s32; int16x8x2_t v2tmp0, v2tmp1; int32x4x2_t v2tmp2, v2tmp3; d16s16 = vdup_n_s16(5352); d17s16 = vdup_n_s16(2217); q9s32 = vdupq_n_s32(14500); q10s32 = vdupq_n_s32(7500); // Part one pitch >>= 1; q0s16 = vld1q_s16(input); input += pitch; q1s16 = vld1q_s16(input); input += pitch; q2s16 = vld1q_s16(input); input += pitch; q3s16 = vld1q_s16(input); v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); q11s16 = vshlq_n_s16(q11s16, 3); q12s16 = vshlq_n_s16(q12s16, 3); q13s16 = vshlq_n_s16(q13s16, 3); q14s16 = vshlq_n_s16(q14s16, 3); q0s16 = vaddq_s16(q11s16, q12s16); q2s16 = vsubq_s16(q11s16, q12s16); q11s32 = q9s32; q12s32 = q10s32; d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); q9s32 = vmlal_s16(q9s32, d28s16, d16s16); q10s32 = vmlal_s16(q10s32, d28s16, d17s16); q11s32 = vmlal_s16(q11s32, d29s16, d16s16); q12s32 = vmlal_s16(q12s32, d29s16, d17s16); q9s32 = vmlal_s16(q9s32, d26s16, d17s16); q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); q11s32 = vmlal_s16(q11s32, d27s16, d17s16); q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); d2s16 = vshrn_n_s32(q9s32, 12); d6s16 = vshrn_n_s32(q10s32, 12); d3s16 = vshrn_n_s32(q11s32, 12); d7s16 = vshrn_n_s32(q12s32, 12); q1s16 = vcombine_s16(d2s16, d3s16); q3s16 = vcombine_s16(d6s16, d7s16); // Part two q9s32 = vdupq_n_s32(12000); q10s32 = vdupq_n_s32(51000); v2tmp2 = vtrnq_s32(vreinterpretq_s32_s16(q0s16), vreinterpretq_s32_s16(q2s16)); v2tmp3 = vtrnq_s32(vreinterpretq_s32_s16(q1s16), vreinterpretq_s32_s16(q3s16)); v2tmp0 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[0]), // q0 vreinterpretq_s16_s32(v2tmp3.val[0])); // q1 v2tmp1 = vtrnq_s16(vreinterpretq_s16_s32(v2tmp2.val[1]), // q2 vreinterpretq_s16_s32(v2tmp3.val[1])); // q3 q11s16 = vaddq_s16(v2tmp0.val[0], v2tmp1.val[1]); q12s16 = vaddq_s16(v2tmp0.val[1], v2tmp1.val[0]); q13s16 = vsubq_s16(v2tmp0.val[1], v2tmp1.val[0]); q14s16 = vsubq_s16(v2tmp0.val[0], v2tmp1.val[1]); q15s16 = vdupq_n_s16(7); q11s16 = vaddq_s16(q11s16, q15s16); q0s16 = vaddq_s16(q11s16, q12s16); q1s16 = vsubq_s16(q11s16, q12s16); q11s32 = q9s32; q12s32 = q10s32; d0s16 = vget_low_s16(q0s16); d1s16 = vget_high_s16(q0s16); d2s16 = vget_low_s16(q1s16); d3s16 = vget_high_s16(q1s16); d0s16 = vshr_n_s16(d0s16, 4); d4s16 = vshr_n_s16(d1s16, 4); d2s16 = vshr_n_s16(d2s16, 4); d6s16 = vshr_n_s16(d3s16, 4); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); q9s32 = vmlal_s16(q9s32, d28s16, d16s16); q10s32 = vmlal_s16(q10s32, d28s16, d17s16); q11s32 = vmlal_s16(q11s32, d29s16, d16s16); q12s32 = vmlal_s16(q12s32, d29s16, d17s16); q9s32 = vmlal_s16(q9s32, d26s16, d17s16); q10s32 = vmlsl_s16(q10s32, d26s16, d16s16); q11s32 = vmlal_s16(q11s32, d27s16, d17s16); q12s32 = vmlsl_s16(q12s32, d27s16, d16s16); d1s16 = vshrn_n_s32(q9s32, 16); d3s16 = vshrn_n_s32(q10s32, 16); d5s16 = vshrn_n_s32(q11s32, 16); d7s16 = vshrn_n_s32(q12s32, 16); qEmptys16 = vdupq_n_s16(0); q14u16 = vceqq_s16(q14s16, qEmptys16); q14u16 = vmvnq_u16(q14u16); d28u16 = vget_low_u16(q14u16); d29u16 = vget_high_u16(q14u16); d1s16 = vsub_s16(d1s16, vreinterpret_s16_u16(d28u16)); d5s16 = vsub_s16(d5s16, vreinterpret_s16_u16(d29u16)); q0s16 = vcombine_s16(d0s16, d1s16); q1s16 = vcombine_s16(d2s16, d3s16); q2s16 = vcombine_s16(d4s16, d5s16); q3s16 = vcombine_s16(d6s16, d7s16); vst1q_s16(output, q0s16); vst1q_s16(output + 8, q1s16); vst1q_s16(output + 16, q2s16); vst1q_s16(output + 24, q3s16); return; }
// TODO(johannkoenig): Make a transpose library and dedup with idct. Consider // reversing transpose order which may make it easier for the compiler to // reconcile the vtrn.64 moves. static void transpose8x8(int16x8_t *a0, int16x8_t *a1, int16x8_t *a2, int16x8_t *a3, int16x8_t *a4, int16x8_t *a5, int16x8_t *a6, int16x8_t *a7) { // Swap 64 bit elements. Goes from: // a0: 00 01 02 03 04 05 06 07 // a1: 08 09 10 11 12 13 14 15 // a2: 16 17 18 19 20 21 22 23 // a3: 24 25 26 27 28 29 30 31 // a4: 32 33 34 35 36 37 38 39 // a5: 40 41 42 43 44 45 46 47 // a6: 48 49 50 51 52 53 54 55 // a7: 56 57 58 59 60 61 62 63 // to: // a04_lo: 00 01 02 03 32 33 34 35 // a15_lo: 08 09 10 11 40 41 42 43 // a26_lo: 16 17 18 19 48 49 50 51 // a37_lo: 24 25 26 27 56 57 58 59 // a04_hi: 04 05 06 07 36 37 38 39 // a15_hi: 12 13 14 15 44 45 46 47 // a26_hi: 20 21 22 23 52 53 54 55 // a37_hi: 28 29 30 31 60 61 62 63 const int16x8_t a04_lo = vcombine_s16(vget_low_s16(*a0), vget_low_s16(*a4)); const int16x8_t a15_lo = vcombine_s16(vget_low_s16(*a1), vget_low_s16(*a5)); const int16x8_t a26_lo = vcombine_s16(vget_low_s16(*a2), vget_low_s16(*a6)); const int16x8_t a37_lo = vcombine_s16(vget_low_s16(*a3), vget_low_s16(*a7)); const int16x8_t a04_hi = vcombine_s16(vget_high_s16(*a0), vget_high_s16(*a4)); const int16x8_t a15_hi = vcombine_s16(vget_high_s16(*a1), vget_high_s16(*a5)); const int16x8_t a26_hi = vcombine_s16(vget_high_s16(*a2), vget_high_s16(*a6)); const int16x8_t a37_hi = vcombine_s16(vget_high_s16(*a3), vget_high_s16(*a7)); // Swap 32 bit elements resulting in: // a0246_lo: // 00 01 16 17 32 33 48 49 // 02 03 18 19 34 35 50 51 // a1357_lo: // 08 09 24 25 40 41 56 57 // 10 11 26 27 42 43 58 59 // a0246_hi: // 04 05 20 21 36 37 52 53 // 06 07 22 23 38 39 54 55 // a1657_hi: // 12 13 28 29 44 45 60 61 // 14 15 30 31 46 47 62 63 const int32x4x2_t a0246_lo = vtrnq_s32(vreinterpretq_s32_s16(a04_lo), vreinterpretq_s32_s16(a26_lo)); const int32x4x2_t a1357_lo = vtrnq_s32(vreinterpretq_s32_s16(a15_lo), vreinterpretq_s32_s16(a37_lo)); const int32x4x2_t a0246_hi = vtrnq_s32(vreinterpretq_s32_s16(a04_hi), vreinterpretq_s32_s16(a26_hi)); const int32x4x2_t a1357_hi = vtrnq_s32(vreinterpretq_s32_s16(a15_hi), vreinterpretq_s32_s16(a37_hi)); // Swap 16 bit elements resulting in: // b0: // 00 08 16 24 32 40 48 56 // 01 09 17 25 33 41 49 57 // b1: // 02 10 18 26 34 42 50 58 // 03 11 19 27 35 43 51 59 // b2: // 04 12 20 28 36 44 52 60 // 05 13 21 29 37 45 53 61 // b3: // 06 14 22 30 38 46 54 62 // 07 15 23 31 39 47 55 63 const int16x8x2_t b0 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[0]), vreinterpretq_s16_s32(a1357_lo.val[0])); const int16x8x2_t b1 = vtrnq_s16(vreinterpretq_s16_s32(a0246_lo.val[1]), vreinterpretq_s16_s32(a1357_lo.val[1])); const int16x8x2_t b2 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[0]), vreinterpretq_s16_s32(a1357_hi.val[0])); const int16x8x2_t b3 = vtrnq_s16(vreinterpretq_s16_s32(a0246_hi.val[1]), vreinterpretq_s16_s32(a1357_hi.val[1])); *a0 = b0.val[0]; *a1 = b0.val[1]; *a2 = b1.val[0]; *a3 = b1.val[1]; *a4 = b2.val[0]; *a5 = b2.val[1]; *a6 = b3.val[0]; *a7 = b3.val[1]; }
void idct_dequant_full_2x_neon( int16_t *q, int16_t *dq, unsigned char *dst, int stride) { unsigned char *dst0, *dst1; int32x2_t d28, d29, d30, d31; int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; int16x8_t qEmpty = vdupq_n_s16(0); int32x4x2_t q2tmp0, q2tmp1; int16x8x2_t q2tmp2, q2tmp3; int16x4_t dLow0, dLow1, dHigh0, dHigh1; d28 = d29 = d30 = d31 = vdup_n_s32(0); // load dq q0 = vld1q_s16(dq); dq += 8; q1 = vld1q_s16(dq); // load q q2 = vld1q_s16(q); vst1q_s16(q, qEmpty); q += 8; q3 = vld1q_s16(q); vst1q_s16(q, qEmpty); q += 8; q4 = vld1q_s16(q); vst1q_s16(q, qEmpty); q += 8; q5 = vld1q_s16(q); vst1q_s16(q, qEmpty); // load src from dst dst0 = dst; dst1 = dst + 4; d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); dst0 += stride; d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); dst1 += stride; d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); dst0 += stride; d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); dst1 += stride; d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); dst0 += stride; d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); dst1 += stride; d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); q2 = vmulq_s16(q2, q0); q3 = vmulq_s16(q3, q1); q4 = vmulq_s16(q4, q0); q5 = vmulq_s16(q5, q1); // vswp dLow0 = vget_low_s16(q2); dHigh0 = vget_high_s16(q2); dLow1 = vget_low_s16(q4); dHigh1 = vget_high_s16(q4); q2 = vcombine_s16(dLow0, dLow1); q4 = vcombine_s16(dHigh0, dHigh1); dLow0 = vget_low_s16(q3); dHigh0 = vget_high_s16(q3); dLow1 = vget_low_s16(q5); dHigh1 = vget_high_s16(q5); q3 = vcombine_s16(dLow0, dLow1); q5 = vcombine_s16(dHigh0, dHigh1); q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); q10 = vqaddq_s16(q2, q3); q11 = vqsubq_s16(q2, q3); q8 = vshrq_n_s16(q8, 1); q9 = vshrq_n_s16(q9, 1); q4 = vqaddq_s16(q4, q8); q5 = vqaddq_s16(q5, q9); q2 = vqsubq_s16(q6, q5); q3 = vqaddq_s16(q7, q4); q4 = vqaddq_s16(q10, q3); q5 = vqaddq_s16(q11, q2); q6 = vqsubq_s16(q11, q2); q7 = vqsubq_s16(q10, q3); q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), vreinterpretq_s16_s32(q2tmp1.val[0])); q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), vreinterpretq_s16_s32(q2tmp1.val[1])); // loop 2 q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); q10 = vshrq_n_s16(q10, 1); q11 = vshrq_n_s16(q11, 1); q10 = vqaddq_s16(q2tmp2.val[1], q10); q11 = vqaddq_s16(q2tmp3.val[1], q11); q8 = vqsubq_s16(q8, q11); q9 = vqaddq_s16(q9, q10); q4 = vqaddq_s16(q2, q9); q5 = vqaddq_s16(q3, q8); q6 = vqsubq_s16(q3, q8); q7 = vqsubq_s16(q2, q9); q4 = vrshrq_n_s16(q4, 3); q5 = vrshrq_n_s16(q5, 3); q6 = vrshrq_n_s16(q6, 3); q7 = vrshrq_n_s16(q7, 3); q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), vreinterpretq_s16_s32(q2tmp1.val[0])); q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), vreinterpretq_s16_s32(q2tmp1.val[1])); q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); dst0 = dst; dst1 = dst + 4; vst1_lane_s32((int32_t *)dst0, d28, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst1, d28, 1); dst1 += stride; vst1_lane_s32((int32_t *)dst0, d29, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst1, d29, 1); dst1 += stride; vst1_lane_s32((int32_t *)dst0, d30, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst1, d30, 1); dst1 += stride; vst1_lane_s32((int32_t *)dst0, d31, 0); vst1_lane_s32((int32_t *)dst1, d31, 1); return; }