void
test_vqshluQ_ns16 (void)
{
  uint16x8_t out_uint16x8_t;
  int16x8_t arg0_int16x8_t;

  out_uint16x8_t = vqshluq_n_s16 (arg0_int16x8_t, 1);
}
static INLINE void highbd_idct32x32_1_add_neg_kernel(uint16_t **dest,
                                                     const int stride,
                                                     const int16x8_t res) {
  // Load one row of 32 pixels.
  const uint16x8_t a0 = vld1q_u16(*dest);
  const uint16x8_t a1 = vld1q_u16(*dest + 8);
  const uint16x8_t a2 = vld1q_u16(*dest + 16);
  const uint16x8_t a3 = vld1q_u16(*dest + 24);
  // Add the residual (non-positive in this "neg" kernel) to each pixel.
  const int16x8_t b0 = vaddq_s16(res, vreinterpretq_s16_u16(a0));
  const int16x8_t b1 = vaddq_s16(res, vreinterpretq_s16_u16(a1));
  const int16x8_t b2 = vaddq_s16(res, vreinterpretq_s16_u16(a2));
  const int16x8_t b3 = vaddq_s16(res, vreinterpretq_s16_u16(a3));
  // The saturating shift by 0 clamps negative lanes to 0 while converting
  // back to unsigned; since res is non-positive, no upper clamp is needed.
  const uint16x8_t c0 = vqshluq_n_s16(b0, 0);
  const uint16x8_t c1 = vqshluq_n_s16(b1, 0);
  const uint16x8_t c2 = vqshluq_n_s16(b2, 0);
  const uint16x8_t c3 = vqshluq_n_s16(b3, 0);
  vst1q_u16(*dest, c0);
  vst1q_u16(*dest + 8, c1);
  vst1q_u16(*dest + 16, c2);
  vst1q_u16(*dest + 24, c3);
  *dest += stride;
}
// res is in reverse row order
static INLINE void highbd_idct4x4_1_add_kernel2(uint16_t **dest,
                                                const int stride,
                                                const int16x8_t res,
                                                const int16x8_t max) {
  const uint16x4_t a0 = vld1_u16(*dest);
  const uint16x4_t a1 = vld1_u16(*dest + stride);
  const int16x8_t a = vreinterpretq_s16_u16(vcombine_u16(a1, a0));
  // Note: In some profile tests, res is quite close to +/-32767.
  // We use saturating addition.
  const int16x8_t b = vqaddq_s16(res, a);
  const int16x8_t c = vminq_s16(b, max);
  const uint16x8_t d = vqshluq_n_s16(c, 0);
  vst1_u16(*dest, vget_high_u16(d));
  *dest += stride;
  vst1_u16(*dest, vget_low_u16(d));
  *dest += stride;
}
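// The two libvpx kernels above use vqshluq_n_s16 with a shift count of 0 as a
// cheap clamp when converting summed pixels back to unsigned: the saturating
// signed-to-unsigned shift maps every negative lane to 0, and a preceding
// vminq_s16 against the bit-depth maximum bounds the top end. Below is a
// minimal standalone sketch of that idiom; the function name, the test values,
// and the 10-bit maximum of 1023 are illustrative, not taken from libvpx.
#include <arm_neon.h>
#include <stdio.h>

void vqshlu_clamp_demo(void) {
  const int16x8_t max = vdupq_n_s16(1023);  // e.g. 10-bit pixel maximum
  const int16_t in[8] = { -300, -1, 0, 1, 512, 1023, 1024, 32767 };
  const int16x8_t v = vld1q_s16(in);
  // Upper clamp first, then the shift-by-0 saturates negative lanes to 0.
  const uint16x8_t out = vqshluq_n_s16(vminq_s16(v, max), 0);
  uint16_t res[8];
  vst1q_u16(res, out);
  for (int i = 0; i < 8; ++i)
    printf("%6d -> %4u\n", in[i], res[i]);  // e.g. -300 -> 0, 1024 -> 1023
}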
uint16x8_t test_vqshluq_n_s16(int16x8_t in) {
  // CHECK-LABEL: @test_vqshluq_n_s16
  // CHECK: call <8 x i16> @llvm.arm64.neon.sqshlu.v8i16(<8 x i16> %in, <8 x i16> <i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1, i16 1>)
  return vqshluq_n_s16(in, 1);
}
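// For reference, vqshluq_n_s16(in, n) shifts each signed 16-bit lane left by n
// and saturates the result to the unsigned range [0, 65535]: negative inputs
// clamp to 0 and any shifted value above 65535 clamps to 65535 (with n == 1,
// as in the test above, only the lower clamp can fire, since 32767 << 1 still
// fits in 16 unsigned bits). A minimal sketch with illustrative values and a
// hypothetical function name:
#include <arm_neon.h>
#include <stdio.h>

void vqshlu_shift_demo(void) {
  const int16_t in[8] = { -32768, -1, 0, 1, 100, 16383, 16384, 32767 };
  const uint16x8_t out = vqshluq_n_s16(vld1q_s16(in), 1);
  uint16_t res[8];
  vst1q_u16(res, out);
  // Expected: -32768 -> 0, -1 -> 0, 0 -> 0, 1 -> 2, 100 -> 200,
  //           16383 -> 32766, 16384 -> 32768, 32767 -> 65534.
  for (int i = 0; i < 8; ++i)
    printf("%6d << 1 -> %5u\n", in[i], res[i]);
}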