static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) { int i; const int16x8_t zero = vdupq_n_s16(0); const int16x8_t max = vdupq_n_s16(MAX_Y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; for (i = 0; i + 8 <= len; i += 8) { const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); const int16x8_t D = vsubq_s16(A, B); // diff_y const int16x8_t F = vaddq_s16(C, D); // new_y const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); const int16x8_t I = vabsq_s16(D); // abs(diff_y) vst1q_u16(dst + i, H); sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); } diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; dst[i] = clip_y(new_y); diff += (uint64_t)(abs(diff_y)); } return diff; }
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm, const uint16_t* far_spectrum, int32_t* echo_est) { assert((uintptr_t)echo_est % 32 == 0); assert((uintptr_t)(aecm->channelStored) % 16 == 0); assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0); // This is C code of following optimized code. // During startup we store the channel every block. // memcpy(aecm->channelStored, // aecm->channelAdapt16, // sizeof(int16_t) * PART_LEN1); // Recalculate echo estimate // for (i = 0; i < PART_LEN; i += 4) { // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); // echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1], // far_spectrum[i + 1]); // echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2], // far_spectrum[i + 2]); // echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3], // far_spectrum[i + 3]); // } // echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i], // far_spectrum[i]); const uint16_t* far_spectrum_p = far_spectrum; int16_t* start_adapt_p = aecm->channelAdapt16; int16_t* start_stored_p = aecm->channelStored; const int16_t* end_stored_p = aecm->channelStored + PART_LEN; int32_t* echo_est_p = echo_est; uint16x8_t far_spectrum_v; int16x8_t adapt_v; uint32x4_t echo_est_v_low, echo_est_v_high; while (start_stored_p < end_stored_p) { far_spectrum_v = vld1q_u16(far_spectrum_p); adapt_v = vld1q_s16(start_adapt_p); vst1q_s16(start_stored_p, adapt_v); echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v), vget_low_u16(vreinterpretq_u16_s16(adapt_v))); echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v), vget_high_u16(vreinterpretq_u16_s16(adapt_v))); vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low)); vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high)); far_spectrum_p += 8; start_adapt_p += 8; start_stored_p += 8; echo_est_p += 8; } aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN]; echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN], far_spectrum[PART_LEN]); }
void idct_dequant_0_2x_neon( int16_t *q, int16_t dq, unsigned char *dst, int stride) { unsigned char *dst0; int i, a0, a1; int16x8x2_t q2Add; int32x2_t d2s32, d4s32; uint8x8_t d2u8, d4u8; uint16x8_t q1u16, q2u16; a0 = ((q[0] * dq) + 4) >> 3; a1 = ((q[16] * dq) + 4) >> 3; q[0] = q[16] = 0; q2Add.val[0] = vdupq_n_s16((int16_t)a0); q2Add.val[1] = vdupq_n_s16((int16_t)a1); for (i = 0; i < 2; i++, dst += 4) { dst0 = dst; d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); dst0 += stride; d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); dst0 += stride; d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); dst0 += stride; d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), vreinterpret_u8_s32(d2s32)); q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), vreinterpret_u8_s32(d4s32)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); d2s32 = vreinterpret_s32_u8(d2u8); d4s32 = vreinterpret_s32_u8(d4u8); dst0 = dst; vst1_lane_s32((int32_t *)dst0, d2s32, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d2s32, 1); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d4s32, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d4s32, 1); } return; }
static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs, int16_t *block) { int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16; int16x8_t q14s16, q15s16, qzs16; int16x4_t d0s16, d2s16, d3s16, dzs16; uint16x8_t q1u16, q9u16; uint16x4_t d1u16; dzs16 = vdup_n_s16(0); qzs16 = vdupq_n_s16(0); q15s16 = vdupq_n_s16(qscale << 1); q14s16 = vdupq_n_s16(qadd); q13s16 = vnegq_s16(q14s16); if (nCoeffs > 4) { for (; nCoeffs > 8; nCoeffs -= 16, block += 16) { q0s16 = vld1q_s16(block); q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16)); q8s16 = vld1q_s16(block + 8); q1u16 = vceqq_s16(q0s16, qzs16); q2s16 = vmulq_s16(q0s16, q15s16); q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16)); q10s16 = vmulq_s16(q8s16, q15s16); q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16); q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16); q2s16 = vaddq_s16(q2s16, q3s16); q9u16 = vceqq_s16(q8s16, qzs16); q10s16 = vaddq_s16(q10s16, q11s16); q0s16 = vbslq_s16(q1u16, q0s16, q2s16); q8s16 = vbslq_s16(q9u16, q8s16, q10s16); vst1q_s16(block, q0s16); vst1q_s16(block + 8, q8s16); } } if (nCoeffs <= 0) return; d0s16 = vld1_s16(block); d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16)); d1u16 = vceq_s16(d0s16, dzs16); d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16)); d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16), vget_high_s16(q13s16), vget_high_s16(q14s16)); d2s16 = vadd_s16(d2s16, d3s16); d0s16 = vbsl_s16(d1u16, d0s16, d2s16); vst1_s16(block, d0s16); }
void vpx_idct4x4_1_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d6u8; uint32x2_t d2u32 = vdup_n_u32(0); uint16x8_t q8u16; int16x8_t q0s16; uint8_t *d1, *d2; int16_t i, a1, cospi_16_64 = 11585; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 4); q0s16 = vdupq_n_s16(a1); // dc_only_idct_add d1 = d2 = dest; for (i = 0; i < 2; i++) { d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); d1 += dest_stride; d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); d2 += dest_stride; vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); d2 += dest_stride; } return; }
void test_vreinterpretQu16_s16 (void) { uint16x8_t out_uint16x8_t; int16x8_t arg0_int16x8_t; out_uint16x8_t = vreinterpretq_u16_s16 (arg0_int16x8_t); }
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out) { int i; const int16x8_t max = vdupq_n_s16(MAX_Y); const int16x8_t zero = vdupq_n_s16(0); for (i = 0; i + 8 <= len; i += 8) { const int16x8_t a0 = vld1q_s16(A + i + 0); const int16x8_t a1 = vld1q_s16(A + i + 1); const int16x8_t b0 = vld1q_s16(B + i + 0); const int16x8_t b1 = vld1q_s16(B + i + 1); const int16x8_t a0b1 = vaddq_s16(a0, b1); const int16x8_t a1b0 = vaddq_s16(a1, b0); const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); const int16x8_t d0 = vaddq_s16(c1, a0); const int16x8_t d1 = vaddq_s16(c0, a1); const int16x8_t e0 = vrshrq_n_s16(d0, 1); const int16x8_t e1 = vrshrq_n_s16(d1, 1); const int16x8x2_t f = vzipq_s16(e0, e1); const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); const int16x8_t h0 = vaddq_s16(g0, f.val[0]); const int16x8_t h1 = vaddq_s16(g1, f.val[1]); const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); } for (; i < len; ++i) { const int a0b1 = A[i + 0] + B[i + 1]; const int a1b0 = A[i + 1] + B[i + 0]; const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } }
QT_BEGIN_NAMESPACE static inline int16x8_t qvdiv_255_s16(int16x8_t x, int16x8_t half) { // result = (x + (x >> 8) + 0x80) >> 8 const int16x8_t temp = vshrq_n_s16(x, 8); // x >> 8 const int16x8_t sum_part = vaddq_s16(x, half); // x + 0x80 const int16x8_t sum = vaddq_s16(temp, sum_part); return vreinterpretq_s16_u16(vshrq_n_u16(vreinterpretq_u16_s16(sum), 8)); }
static INLINE void highbd_idct32x32_1_add_pos_kernel(uint16_t **dest, const int stride, const int16x8_t res, const int16x8_t max) { const uint16x8_t a0 = vld1q_u16(*dest); const uint16x8_t a1 = vld1q_u16(*dest + 8); const uint16x8_t a2 = vld1q_u16(*dest + 16); const uint16x8_t a3 = vld1q_u16(*dest + 24); const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0)); const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1)); const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2)); const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3)); const int16x8_t c0 = vminq_s16(b0, max); const int16x8_t c1 = vminq_s16(b1, max); const int16x8_t c2 = vminq_s16(b2, max); const int16x8_t c3 = vminq_s16(b3, max); vst1q_u16(*dest, vreinterpretq_u16_s16(c0)); vst1q_u16(*dest + 8, vreinterpretq_u16_s16(c1)); vst1q_u16(*dest + 16, vreinterpretq_u16_s16(c2)); vst1q_u16(*dest + 24, vreinterpretq_u16_s16(c3)); *dest += stride; }
static INLINE void idct4x4_1_add_kernel(uint8_t **dest, const int stride, const int16x8_t res, uint32x2_t *const d) { uint16x8_t a; uint8x8_t b; *d = vld1_lane_u32((const uint32_t *)*dest, *d, 0); *d = vld1_lane_u32((const uint32_t *)(*dest + stride), *d, 1); a = vaddw_u8(vreinterpretq_u16_s16(res), vreinterpret_u8_u32(*d)); b = vqmovun_s16(vreinterpretq_s16_u16(a)); vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 0); *dest += stride; vst1_lane_u32((uint32_t *)*dest, vreinterpret_u32_u8(b), 1); *dest += stride; }
void vp9_idct8x8_1_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d2u8, d3u8, d30u8, d31u8; uint64x1_t d2u64, d3u64, d4u64, d5u64; uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; int16x8_t q0s16; uint8_t *d1, *d2; int16_t i, a1, cospi_16_64 = 11585; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 5); q0s16 = vdupq_n_s16(a1); q0u16 = vreinterpretq_u16_s16(q0s16); d1 = d2 = dest; for (i = 0; i < 2; i++) { d2u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d4u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d5u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); d2 += dest_stride; } return; }
void vpx_idct8x8_12_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; int16x4_t d26s16, d27s16, d28s16, d29s16; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 16); q11s16 = vld1q_s16(input + 24); q12s16 = vld1q_s16(input + 32); q13s16 = vld1q_s16(input + 40); q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // First transform rows // stage 1 q0s16 = vdupq_n_s16(cospi_28_64 * 2); q1s16 = vdupq_n_s16(cospi_4_64 * 2); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q0s16 = vdupq_n_s16(-cospi_20_64 * 2); q7s16 = vqrdmulhq_s16(q9s16, q1s16); q1s16 = vdupq_n_s16(cospi_12_64 * 2); q5s16 = vqrdmulhq_s16(q11s16, q0s16); q0s16 = vdupq_n_s16(cospi_16_64 * 2); q6s16 = vqrdmulhq_s16(q11s16, q1s16); // stage 2 & stage 3 - even half q1s16 = vdupq_n_s16(cospi_24_64 * 2); q9s16 = vqrdmulhq_s16(q8s16, q0s16); q0s16 = vdupq_n_s16(cospi_8_64 * 2); q13s16 = vqrdmulhq_s16(q10s16, q1s16); q15s16 = vqrdmulhq_s16(q10s16, q0s16); // stage 3 -odd half q0s16 = vaddq_s16(q9s16, q15s16); q1s16 = vaddq_s16(q9s16, q13s16); q2s16 = vsubq_s16(q9s16, q13s16); q3s16 = vsubq_s16(q9s16, q15s16); // stage 2 - odd half q13s16 = vsubq_s16(q4s16, q5s16); q4s16 = vaddq_s16(q4s16, q5s16); q14s16 = vsubq_s16(q7s16, q6s16); q7s16 = vaddq_s16(q7s16, q6s16); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); d16s16 = vdup_n_s16(cospi_16_64); q9s32 = vmull_s16(d28s16, d16s16); q10s32 = vmull_s16(d29s16, d16s16); q11s32 = vmull_s16(d28s16, d16s16); q12s32 = vmull_s16(d29s16, d16s16); q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); d10s16 = vqrshrn_n_s32(q9s32, 14); d11s16 = vqrshrn_n_s32(q10s32, 14); d12s16 = vqrshrn_n_s32(q11s32, 14); d13s16 = vqrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 q8s16 = vaddq_s16(q0s16, q7s16); q9s16 = vaddq_s16(q1s16, q6s16); q10s16 = vaddq_s16(q2s16, q5s16); q11s16 = vaddq_s16(q3s16, q4s16); q12s16 = vsubq_s16(q3s16, q4s16); q13s16 = vsubq_s16(q2s16, q5s16); q14s16 = vsubq_s16(q1s16, q6s16); q15s16 = vsubq_s16(q0s16, q7s16); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); d1 = d2 = dest; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; return; }
void vpx_idct8x8_64_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 16); q11s16 = vld1q_s16(input + 24); q12s16 = vld1q_s16(input + 32); q13s16 = vld1q_s16(input + 40); q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); d1 = d2 = dest; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; return; }
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width) { const int *S0 = src[0], *S1 = src[1]; int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567; int32x4_t qT_0123, qT_4567; int16x4_t dT_0123, dT_4567; uint16x8_t qT_01234567; uint8x8_t dT_01234567, dDst_01234567; int32x2_t dBeta; dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0); dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1); int32x4_t qDelta, qMin, qMax; qDelta = vdupq_n_s32 (DELTA); qMin = vdupq_n_s32 (0); qMax = vdupq_n_s32 (255); int x = 0; for (; x <= width - 8; x += 8) { qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); vst1_u8 (&dst[x], dT_01234567); } if (x < width) { uint8x8_t dMask; dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)])); dDst_01234567 = vld1_u8 (&dst[x]); qS0_0123 = vld1q_s32 (&S0[x]); qS0_4567 = vld1q_s32 (&S0[x + 4]); qS1_0123 = vld1q_s32 (&S1[x]); qS1_4567 = vld1q_s32 (&S1[x + 4]); qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0); qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0); qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1); qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1); qT_0123 = vaddq_s32 (qT_0123, qDelta); qT_4567 = vaddq_s32 (qT_4567, qDelta); qT_0123 = vshrq_n_s32 (qT_0123, BITS); qT_4567 = vshrq_n_s32 (qT_4567, BITS); qT_0123 = vmaxq_s32 (qT_0123, qMin); qT_4567 = vmaxq_s32 (qT_4567, qMin); qT_0123 = vminq_s32 (qT_0123, qMax); qT_4567 = vminq_s32 (qT_4567, qMax); dT_0123 = vmovn_s32 (qT_0123); dT_4567 = vmovn_s32 (qT_4567); qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567)); dT_01234567 = vmovn_u16 (qT_01234567); dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567); vst1_u8 (&dst[x], dMask); } }
void vp8_short_idct4x4llm_neon(int16_t *input, unsigned char *pred_ptr, int pred_stride, unsigned char *dst_ptr, int dst_stride) { int i; uint32x2_t d6u32 = vdup_n_u32(0); uint8x8_t d1u8; int16x4_t d2, d3, d4, d5, d10, d11, d12, d13; uint16x8_t q1u16; int16x8_t q1s16, q2s16, q3s16, q4s16; int32x2x2_t v2tmp0, v2tmp1; int16x4x2_t v2tmp2, v2tmp3; d2 = vld1_s16(input); d3 = vld1_s16(input + 4); d4 = vld1_s16(input + 8); d5 = vld1_s16(input + 12); // 1st for loop q1s16 = vcombine_s16(d2, d4); // Swap d3 d4 here q2s16 = vcombine_s16(d3, d5); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); // 2nd for loop q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp3.val[0]); q2s16 = vcombine_s16(v2tmp2.val[1], v2tmp3.val[1]); q3s16 = vqdmulhq_n_s16(q2s16, sinpi8sqrt2); q4s16 = vqdmulhq_n_s16(q2s16, cospi8sqrt2minus1); d12 = vqadd_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // a1 d13 = vqsub_s16(vget_low_s16(q1s16), vget_high_s16(q1s16)); // b1 q3s16 = vshrq_n_s16(q3s16, 1); q4s16 = vshrq_n_s16(q4s16, 1); q3s16 = vqaddq_s16(q3s16, q2s16); q4s16 = vqaddq_s16(q4s16, q2s16); d10 = vqsub_s16(vget_low_s16(q3s16), vget_high_s16(q4s16)); // c1 d11 = vqadd_s16(vget_high_s16(q3s16), vget_low_s16(q4s16)); // d1 d2 = vqadd_s16(d12, d11); d3 = vqadd_s16(d13, d10); d4 = vqsub_s16(d13, d10); d5 = vqsub_s16(d12, d11); d2 = vrshr_n_s16(d2, 3); d3 = vrshr_n_s16(d3, 3); d4 = vrshr_n_s16(d4, 3); d5 = vrshr_n_s16(d5, 3); v2tmp0 = vtrn_s32(vreinterpret_s32_s16(d2), vreinterpret_s32_s16(d4)); v2tmp1 = vtrn_s32(vreinterpret_s32_s16(d3), vreinterpret_s32_s16(d5)); v2tmp2 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[0]), vreinterpret_s16_s32(v2tmp1.val[0])); v2tmp3 = vtrn_s16(vreinterpret_s16_s32(v2tmp0.val[1]), vreinterpret_s16_s32(v2tmp1.val[1])); q1s16 = vcombine_s16(v2tmp2.val[0], v2tmp2.val[1]); q2s16 = vcombine_s16(v2tmp3.val[0], v2tmp3.val[1]); // dc_only_idct_add for (i = 0; i < 2; i++, q1s16 = q2s16) { d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 0); pred_ptr += pred_stride; d6u32 = vld1_lane_u32((const uint32_t *)pred_ptr, d6u32, 1); pred_ptr += pred_stride; q1u16 = vaddw_u8(vreinterpretq_u16_s16(q1s16), vreinterpret_u8_u32(d6u32)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 0); dst_ptr += dst_stride; vst1_lane_u32((uint32_t *)dst_ptr, vreinterpret_u32_u8(d1u8), 1); dst_ptr += dst_stride; } return; }
void aom_idct4x4_16_add_neon(int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d26u8, d27u8; uint32x2_t d26u32, d27u32; uint16x8_t q8u16, q9u16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16; int16x4_t d22s16, d23s16, d24s16, d26s16, d27s16, d28s16, d29s16; int16x8_t q8s16, q9s16, q13s16, q14s16; int32x4_t q1s32, q13s32, q14s32, q15s32; int16x4x2_t d0x2s16, d1x2s16; int32x4x2_t q0x2s32; uint8_t *d; d26u32 = d27u32 = vdup_n_u32(0); q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); d18s16 = vget_low_s16(q9s16); d19s16 = vget_high_s16(q9s16); d0x2s16 = vtrn_s16(d16s16, d17s16); d1x2s16 = vtrn_s16(d18s16, d19s16); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); d20s16 = vdup_n_s16((int16_t)cospi_8_64); d21s16 = vdup_n_s16((int16_t)cospi_16_64); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d22s16 = vdup_n_s16((int16_t)cospi_24_64); // stage 1 d23s16 = vadd_s16(d16s16, d18s16); d24s16 = vsub_s16(d16s16, d18s16); q15s32 = vmull_s16(d17s16, d22s16); q1s32 = vmull_s16(d17s16, d20s16); q13s32 = vmull_s16(d23s16, d21s16); q14s32 = vmull_s16(d24s16, d21s16); q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); q1s32 = vmlal_s16(q1s32, d19s16, d22s16); d26s16 = vqrshrn_n_s32(q13s32, 14); d27s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d28s16 = vqrshrn_n_s32(q1s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); // stage 2 q8s16 = vaddq_s16(q13s16, q14s16); q9s16 = vsubq_s16(q13s16, q14s16); d16s16 = vget_low_s16(q8s16); d17s16 = vget_high_s16(q8s16); d18s16 = vget_high_s16(q9s16); // vswp d18 d19 d19s16 = vget_low_s16(q9s16); d0x2s16 = vtrn_s16(d16s16, d17s16); d1x2s16 = vtrn_s16(d18s16, d19s16); q8s16 = vcombine_s16(d0x2s16.val[0], d0x2s16.val[1]); q9s16 = vcombine_s16(d1x2s16.val[0], d1x2s16.val[1]); q0x2s32 = vtrnq_s32(vreinterpretq_s32_s16(q8s16), vreinterpretq_s32_s16(q9s16)); d16s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d17s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[0])); d18s16 = vget_low_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); d19s16 = vget_high_s16(vreinterpretq_s16_s32(q0x2s32.val[1])); // do the transform on columns // stage 1 d23s16 = vadd_s16(d16s16, d18s16); d24s16 = vsub_s16(d16s16, d18s16); q15s32 = vmull_s16(d17s16, d22s16); q1s32 = vmull_s16(d17s16, d20s16); q13s32 = vmull_s16(d23s16, d21s16); q14s32 = vmull_s16(d24s16, d21s16); q15s32 = vmlsl_s16(q15s32, d19s16, d20s16); q1s32 = vmlal_s16(q1s32, d19s16, d22s16); d26s16 = vqrshrn_n_s32(q13s32, 14); d27s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = vqrshrn_n_s32(q15s32, 14); d28s16 = vqrshrn_n_s32(q1s32, 14); q13s16 = vcombine_s16(d26s16, d27s16); q14s16 = vcombine_s16(d28s16, d29s16); // stage 2 q8s16 = vaddq_s16(q13s16, q14s16); q9s16 = vsubq_s16(q13s16, q14s16); q8s16 = vrshrq_n_s16(q8s16, 4); q9s16 = vrshrq_n_s16(q9s16, 4); d = dest; d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 0); d += dest_stride; d26u32 = vld1_lane_u32((const uint32_t *)d, d26u32, 1); d += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 1); d += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)d, d27u32, 0); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d = dest; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 0); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d26u8), 1); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 1); d += dest_stride; vst1_lane_u32((uint32_t *)d, vreinterpret_u32_u8(d27u8), 0); return; }
/* * Notice: * - nb_pkts < RTE_I40E_DESCS_PER_LOOP, just return no packet * - nb_pkts > RTE_I40E_VPMD_RX_BURST, only scan RTE_I40E_VPMD_RX_BURST * numbers of DD bits */ static inline uint16_t _recv_raw_pkts_vec(struct i40e_rx_queue *rxq, struct rte_mbuf **rx_pkts, uint16_t nb_pkts, uint8_t *split_packet) { volatile union i40e_rx_desc *rxdp; struct i40e_rx_entry *sw_ring; uint16_t nb_pkts_recd; int pos; uint32_t *ptype_tbl = rxq->vsi->adapter->ptype_tbl; /* mask to shuffle from desc. to mbuf */ uint8x16_t shuf_msk = { 0xFF, 0xFF, /* pkt_type set as unknown */ 0xFF, 0xFF, /* pkt_type set as unknown */ 14, 15, /* octet 15~14, low 16 bits pkt_len */ 0xFF, 0xFF, /* skip high 16 bits pkt_len, zero out */ 14, 15, /* octet 15~14, 16 bits data_len */ 2, 3, /* octet 2~3, low 16 bits vlan_macip */ 4, 5, 6, 7 /* octet 4~7, 32bits rss */ }; uint8x16_t eop_check = { 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 }; uint16x8_t crc_adjust = { 0, 0, /* ignore pkt_type field */ rxq->crc_len, /* sub crc on pkt_len */ 0, /* ignore high-16bits of pkt_len */ rxq->crc_len, /* sub crc on data_len */ 0, 0, 0 /* ignore non-length fields */ }; /* nb_pkts shall be less equal than RTE_I40E_MAX_RX_BURST */ nb_pkts = RTE_MIN(nb_pkts, RTE_I40E_MAX_RX_BURST); /* nb_pkts has to be floor-aligned to RTE_I40E_DESCS_PER_LOOP */ nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_I40E_DESCS_PER_LOOP); /* Just the act of getting into the function from the application is * going to cost about 7 cycles */ rxdp = rxq->rx_ring + rxq->rx_tail; rte_prefetch_non_temporal(rxdp); /* See if we need to rearm the RX queue - gives the prefetch a bit * of time to act */ if (rxq->rxrearm_nb > RTE_I40E_RXQ_REARM_THRESH) i40e_rxq_rearm(rxq); /* Before we start moving massive data around, check to see if * there is actually a packet available */ if (!(rxdp->wb.qword1.status_error_len & rte_cpu_to_le_32(1 << I40E_RX_DESC_STATUS_DD_SHIFT))) return 0; /* Cache is empty -> need to scan the buffer rings, but first move * the next 'n' mbufs into the cache */ sw_ring = &rxq->sw_ring[rxq->rx_tail]; /* A. load 4 packet in one loop * [A*. mask out 4 unused dirty field in desc] * B. copy 4 mbuf point from swring to rx_pkts * C. calc the number of DD bits among the 4 packets * [C*. extract the end-of-packet bit, if requested] * D. fill info. from desc to mbuf */ for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts; pos += RTE_I40E_DESCS_PER_LOOP, rxdp += RTE_I40E_DESCS_PER_LOOP) { uint64x2_t descs[RTE_I40E_DESCS_PER_LOOP]; uint8x16_t pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4; uint16x8x2_t sterr_tmp1, sterr_tmp2; uint64x2_t mbp1, mbp2; uint16x8_t staterr; uint16x8_t tmp; uint64_t stat; int32x4_t len_shl = {0, 0, 0, PKTLEN_SHIFT}; /* B.1 load 1 mbuf point */ mbp1 = vld1q_u64((uint64_t *)&sw_ring[pos]); /* Read desc statuses backwards to avoid race condition */ /* A.1 load 4 pkts desc */ descs[3] = vld1q_u64((uint64_t *)(rxdp + 3)); rte_rmb(); /* B.2 copy 2 mbuf point into rx_pkts */ vst1q_u64((uint64_t *)&rx_pkts[pos], mbp1); /* B.1 load 1 mbuf point */ mbp2 = vld1q_u64((uint64_t *)&sw_ring[pos + 2]); descs[2] = vld1q_u64((uint64_t *)(rxdp + 2)); /* B.1 load 2 mbuf point */ descs[1] = vld1q_u64((uint64_t *)(rxdp + 1)); descs[0] = vld1q_u64((uint64_t *)(rxdp)); /* B.2 copy 2 mbuf point into rx_pkts */ vst1q_u64((uint64_t *)&rx_pkts[pos + 2], mbp2); if (split_packet) { rte_mbuf_prefetch_part2(rx_pkts[pos]); rte_mbuf_prefetch_part2(rx_pkts[pos + 1]); rte_mbuf_prefetch_part2(rx_pkts[pos + 2]); rte_mbuf_prefetch_part2(rx_pkts[pos + 3]); } /* avoid compiler reorder optimization */ rte_compiler_barrier(); /* pkt 3,4 shift the pktlen field to be 16-bit aligned*/ uint32x4_t len3 = vshlq_u32(vreinterpretq_u32_u64(descs[3]), len_shl); descs[3] = vreinterpretq_u64_u32(len3); uint32x4_t len2 = vshlq_u32(vreinterpretq_u32_u64(descs[2]), len_shl); descs[2] = vreinterpretq_u64_u32(len2); /* D.1 pkt 3,4 convert format from desc to pktmbuf */ pkt_mb4 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[3]), shuf_msk); pkt_mb3 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[2]), shuf_msk); /* C.1 4=>2 filter staterr info only */ sterr_tmp2 = vzipq_u16(vreinterpretq_u16_u64(descs[1]), vreinterpretq_u16_u64(descs[3])); /* C.1 4=>2 filter staterr info only */ sterr_tmp1 = vzipq_u16(vreinterpretq_u16_u64(descs[0]), vreinterpretq_u16_u64(descs[2])); /* C.2 get 4 pkts staterr value */ staterr = vzipq_u16(sterr_tmp1.val[1], sterr_tmp2.val[1]).val[0]; desc_to_olflags_v(rxq, descs, &rx_pkts[pos]); /* D.2 pkt 3,4 set in_port/nb_seg and remove crc */ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb4), crc_adjust); pkt_mb4 = vreinterpretq_u8_u16(tmp); tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb3), crc_adjust); pkt_mb3 = vreinterpretq_u8_u16(tmp); /* pkt 1,2 shift the pktlen field to be 16-bit aligned*/ uint32x4_t len1 = vshlq_u32(vreinterpretq_u32_u64(descs[1]), len_shl); descs[1] = vreinterpretq_u64_u32(len1); uint32x4_t len0 = vshlq_u32(vreinterpretq_u32_u64(descs[0]), len_shl); descs[0] = vreinterpretq_u64_u32(len0); /* D.1 pkt 1,2 convert format from desc to pktmbuf */ pkt_mb2 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[1]), shuf_msk); pkt_mb1 = vqtbl1q_u8(vreinterpretq_u8_u64(descs[0]), shuf_msk); /* D.3 copy final 3,4 data to rx_pkts */ vst1q_u8((void *)&rx_pkts[pos + 3]->rx_descriptor_fields1, pkt_mb4); vst1q_u8((void *)&rx_pkts[pos + 2]->rx_descriptor_fields1, pkt_mb3); /* D.2 pkt 1,2 set in_port/nb_seg and remove crc */ tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb2), crc_adjust); pkt_mb2 = vreinterpretq_u8_u16(tmp); tmp = vsubq_u16(vreinterpretq_u16_u8(pkt_mb1), crc_adjust); pkt_mb1 = vreinterpretq_u8_u16(tmp); /* C* extract and record EOP bit */ if (split_packet) { uint8x16_t eop_shuf_mask = { 0x00, 0x02, 0x04, 0x06, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF}; uint8x16_t eop_bits; /* and with mask to extract bits, flipping 1-0 */ eop_bits = vmvnq_u8(vreinterpretq_u8_u16(staterr)); eop_bits = vandq_u8(eop_bits, eop_check); /* the staterr values are not in order, as the count * count of dd bits doesn't care. However, for end of * packet tracking, we do care, so shuffle. This also * compresses the 32-bit values to 8-bit */ eop_bits = vqtbl1q_u8(eop_bits, eop_shuf_mask); /* store the resulting 32-bit value */ vst1q_lane_u32((uint32_t *)split_packet, vreinterpretq_u32_u8(eop_bits), 0); split_packet += RTE_I40E_DESCS_PER_LOOP; /* zero-out next pointers */ rx_pkts[pos]->next = NULL; rx_pkts[pos + 1]->next = NULL; rx_pkts[pos + 2]->next = NULL; rx_pkts[pos + 3]->next = NULL; } staterr = vshlq_n_u16(staterr, I40E_UINT16_BIT - 1); staterr = vreinterpretq_u16_s16( vshrq_n_s16(vreinterpretq_s16_u16(staterr), I40E_UINT16_BIT - 1)); stat = ~vgetq_lane_u64(vreinterpretq_u64_u16(staterr), 0); rte_prefetch_non_temporal(rxdp + RTE_I40E_DESCS_PER_LOOP); /* D.3 copy final 1,2 data to rx_pkts */ vst1q_u8((void *)&rx_pkts[pos + 1]->rx_descriptor_fields1, pkt_mb2); vst1q_u8((void *)&rx_pkts[pos]->rx_descriptor_fields1, pkt_mb1); desc_to_ptype_v(descs, &rx_pkts[pos], ptype_tbl); /* C.4 calc avaialbe number of desc */ if (unlikely(stat == 0)) { nb_pkts_recd += RTE_I40E_DESCS_PER_LOOP; } else { nb_pkts_recd += __builtin_ctzl(stat) / I40E_UINT16_BIT; break; } } /* Update our internal tail pointer */ rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd); rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1)); rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd); return nb_pkts_recd; }
void idct_dequant_full_2x_neon( int16_t *q, int16_t *dq, unsigned char *dst, int stride) { unsigned char *dst0, *dst1; int32x2_t d28, d29, d30, d31; int16x8_t q0, q1, q2, q3, q4, q5, q6, q7, q8, q9, q10, q11; int16x8_t qEmpty = vdupq_n_s16(0); int32x4x2_t q2tmp0, q2tmp1; int16x8x2_t q2tmp2, q2tmp3; int16x4_t dLow0, dLow1, dHigh0, dHigh1; d28 = d29 = d30 = d31 = vdup_n_s32(0); // load dq q0 = vld1q_s16(dq); dq += 8; q1 = vld1q_s16(dq); // load q q2 = vld1q_s16(q); vst1q_s16(q, qEmpty); q += 8; q3 = vld1q_s16(q); vst1q_s16(q, qEmpty); q += 8; q4 = vld1q_s16(q); vst1q_s16(q, qEmpty); q += 8; q5 = vld1q_s16(q); vst1q_s16(q, qEmpty); // load src from dst dst0 = dst; dst1 = dst + 4; d28 = vld1_lane_s32((const int32_t *)dst0, d28, 0); dst0 += stride; d28 = vld1_lane_s32((const int32_t *)dst1, d28, 1); dst1 += stride; d29 = vld1_lane_s32((const int32_t *)dst0, d29, 0); dst0 += stride; d29 = vld1_lane_s32((const int32_t *)dst1, d29, 1); dst1 += stride; d30 = vld1_lane_s32((const int32_t *)dst0, d30, 0); dst0 += stride; d30 = vld1_lane_s32((const int32_t *)dst1, d30, 1); dst1 += stride; d31 = vld1_lane_s32((const int32_t *)dst0, d31, 0); d31 = vld1_lane_s32((const int32_t *)dst1, d31, 1); q2 = vmulq_s16(q2, q0); q3 = vmulq_s16(q3, q1); q4 = vmulq_s16(q4, q0); q5 = vmulq_s16(q5, q1); // vswp dLow0 = vget_low_s16(q2); dHigh0 = vget_high_s16(q2); dLow1 = vget_low_s16(q4); dHigh1 = vget_high_s16(q4); q2 = vcombine_s16(dLow0, dLow1); q4 = vcombine_s16(dHigh0, dHigh1); dLow0 = vget_low_s16(q3); dHigh0 = vget_high_s16(q3); dLow1 = vget_low_s16(q5); dHigh1 = vget_high_s16(q5); q3 = vcombine_s16(dLow0, dLow1); q5 = vcombine_s16(dHigh0, dHigh1); q6 = vqdmulhq_n_s16(q4, sinpi8sqrt2); q7 = vqdmulhq_n_s16(q5, sinpi8sqrt2); q8 = vqdmulhq_n_s16(q4, cospi8sqrt2minus1); q9 = vqdmulhq_n_s16(q5, cospi8sqrt2minus1); q10 = vqaddq_s16(q2, q3); q11 = vqsubq_s16(q2, q3); q8 = vshrq_n_s16(q8, 1); q9 = vshrq_n_s16(q9, 1); q4 = vqaddq_s16(q4, q8); q5 = vqaddq_s16(q5, q9); q2 = vqsubq_s16(q6, q5); q3 = vqaddq_s16(q7, q4); q4 = vqaddq_s16(q10, q3); q5 = vqaddq_s16(q11, q2); q6 = vqsubq_s16(q11, q2); q7 = vqsubq_s16(q10, q3); q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), vreinterpretq_s16_s32(q2tmp1.val[0])); q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), vreinterpretq_s16_s32(q2tmp1.val[1])); // loop 2 q8 = vqdmulhq_n_s16(q2tmp2.val[1], sinpi8sqrt2); q9 = vqdmulhq_n_s16(q2tmp3.val[1], sinpi8sqrt2); q10 = vqdmulhq_n_s16(q2tmp2.val[1], cospi8sqrt2minus1); q11 = vqdmulhq_n_s16(q2tmp3.val[1], cospi8sqrt2minus1); q2 = vqaddq_s16(q2tmp2.val[0], q2tmp3.val[0]); q3 = vqsubq_s16(q2tmp2.val[0], q2tmp3.val[0]); q10 = vshrq_n_s16(q10, 1); q11 = vshrq_n_s16(q11, 1); q10 = vqaddq_s16(q2tmp2.val[1], q10); q11 = vqaddq_s16(q2tmp3.val[1], q11); q8 = vqsubq_s16(q8, q11); q9 = vqaddq_s16(q9, q10); q4 = vqaddq_s16(q2, q9); q5 = vqaddq_s16(q3, q8); q6 = vqsubq_s16(q3, q8); q7 = vqsubq_s16(q2, q9); q4 = vrshrq_n_s16(q4, 3); q5 = vrshrq_n_s16(q5, 3); q6 = vrshrq_n_s16(q6, 3); q7 = vrshrq_n_s16(q7, 3); q2tmp0 = vtrnq_s32(vreinterpretq_s32_s16(q4), vreinterpretq_s32_s16(q6)); q2tmp1 = vtrnq_s32(vreinterpretq_s32_s16(q5), vreinterpretq_s32_s16(q7)); q2tmp2 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[0]), vreinterpretq_s16_s32(q2tmp1.val[0])); q2tmp3 = vtrnq_s16(vreinterpretq_s16_s32(q2tmp0.val[1]), vreinterpretq_s16_s32(q2tmp1.val[1])); q4 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[0]), vreinterpret_u8_s32(d28))); q5 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp2.val[1]), vreinterpret_u8_s32(d29))); q6 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[0]), vreinterpret_u8_s32(d30))); q7 = vreinterpretq_s16_u16(vaddw_u8(vreinterpretq_u16_s16(q2tmp3.val[1]), vreinterpret_u8_s32(d31))); d28 = vreinterpret_s32_u8(vqmovun_s16(q4)); d29 = vreinterpret_s32_u8(vqmovun_s16(q5)); d30 = vreinterpret_s32_u8(vqmovun_s16(q6)); d31 = vreinterpret_s32_u8(vqmovun_s16(q7)); dst0 = dst; dst1 = dst + 4; vst1_lane_s32((int32_t *)dst0, d28, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst1, d28, 1); dst1 += stride; vst1_lane_s32((int32_t *)dst0, d29, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst1, d29, 1); dst1 += stride; vst1_lane_s32((int32_t *)dst0, d30, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst1, d30, 1); dst1 += stride; vst1_lane_s32((int32_t *)dst0, d31, 0); vst1_lane_s32((int32_t *)dst1, d31, 1); return; }
void av1_iht8x8_64_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, const TxfmParam *txfm_param) { int i; uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 8 * 2); q11s16 = vld1q_s16(input + 8 * 3); q12s16 = vld1q_s16(input + 8 * 4); q13s16 = vld1q_s16(input + 8 * 5); q14s16 = vld1q_s16(input + 8 * 6); q15s16 = vld1q_s16(input + 8 * 7); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); const TX_TYPE tx_type = txfm_param->tx_type; switch (tx_type) { case DCT_DCT: // idct_idct is not supported. Fall back to C av1_iht8x8_64_add_c(input, dest, dest_stride, txfm_param); return; break; case ADST_DCT: // iadst_idct // generate IDCT constants // GENERATE_IDCT_CONSTANTS // first transform rows IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // transpose the matrix TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // generate IADST constants // GENERATE_IADST_CONSTANTS // then transform columns IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); break; case DCT_ADST: // idct_iadst // generate IADST constants // GENERATE_IADST_CONSTANTS // first transform rows IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // transpose the matrix TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // generate IDCT constants // GENERATE_IDCT_CONSTANTS // then transform columns IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); break; case ADST_ADST: // iadst_iadst // generate IADST constants // GENERATE_IADST_CONSTANTS // first transform rows IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // transpose the matrix TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // then transform columns IADST8X8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); break; default: // iadst_idct assert(0); break; } q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); for (d1 = d2 = dest, i = 0; i < 2; i++) { if (i != 0) { q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; } d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; } return; }
void vp10_iht4x4_16_add_neon(const tran_low_t *input, uint8_t *dest, int dest_stride, int tx_type) { uint8x8_t d26u8, d27u8; int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16; uint32x2_t d26u32, d27u32; int16x8_t q3s16, q8s16, q9s16; uint16x8_t q8u16, q9u16; d26u32 = d27u32 = vdup_n_u32(0); q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); TRANSPOSE4X4(&q8s16, &q9s16); switch (tx_type) { case 0: // idct_idct is not supported. Fall back to C vp10_iht4x4_16_add_c(input, dest, dest_stride, tx_type); return; break; case 1: // iadst_idct // generate constants GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); // first transform rows IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); // transpose the matrix TRANSPOSE4X4(&q8s16, &q9s16); // then transform columns IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); break; case 2: // idct_iadst // generate constantsyy GENERATE_COSINE_CONSTANTS(&d0s16, &d1s16, &d2s16); GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); // first transform rows IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); // transpose the matrix TRANSPOSE4X4(&q8s16, &q9s16); // then transform columns IDCT4x4_1D(&d0s16, &d1s16, &d2s16, &q8s16, &q9s16); break; case 3: // iadst_iadst // generate constants GENERATE_SINE_CONSTANTS(&d3s16, &d4s16, &d5s16, &q3s16); // first transform rows IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); // transpose the matrix TRANSPOSE4X4(&q8s16, &q9s16); // then transform columns IADST4x4_1D(&d3s16, &d4s16, &d5s16, &q3s16, &q8s16, &q9s16); break; default: // iadst_idct assert(0); break; } q8s16 = vrshrq_n_s16(q8s16, 4); q9s16 = vrshrq_n_s16(q9s16, 4); d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 0); dest += dest_stride; d26u32 = vld1_lane_u32((const uint32_t *)dest, d26u32, 1); dest += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 0); dest += dest_stride; d27u32 = vld1_lane_u32((const uint32_t *)dest, d27u32, 1); q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u32(d26u32)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u32(d27u32)); d26u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d27u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 1); dest -= dest_stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d27u8), 0); dest -= dest_stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 1); dest -= dest_stride; vst1_lane_u32((uint32_t *)dest, vreinterpret_u32_u8(d26u8), 0); return; }