/*
 * Compile-only exercise of the vzipq_s16 intrinsic: two int16x8_t operands
 * are passed in and the int16x8x2_t result is stored in a local.
 * The operands are never initialized and the result is never read, so the
 * only thing this function does is issue the intrinsic call itself.
 */
void test_vzipQs16 (void)
{
  int16x8_t lhs;
  int16x8_t rhs;
  int16x8x2_t zipped;

  zipped = vzipq_s16 (lhs, rhs);
}
void multadd_real_vector_complex_scalar(int16_t *x, int16_t *alpha, int16_t *y, uint32_t N) { uint32_t i; // do 8 multiplications at a time simd_q15_t alpha_r_128,alpha_i_128,yr,yi,*x_128=(simd_q15_t*)x,*y_128=(simd_q15_t*)y; int j; // printf("alpha = %d,%d\n",alpha[0],alpha[1]); alpha_r_128 = set1_int16(alpha[0]); alpha_i_128 = set1_int16(alpha[1]); j=0; for (i=0; i<N>>3; i++) { yr = mulhi_s1_int16(alpha_r_128,x_128[i]); yi = mulhi_s1_int16(alpha_i_128,x_128[i]); #if defined(__x86_64__) || defined(__i386__) y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpacklo_epi16(yr,yi)); j++; y_128[j] = _mm_adds_epi16(y_128[j],_mm_unpackhi_epi16(yr,yi)); j++; #elif defined(__arm__) int16x8x2_t yint; yint = vzipq_s16(yr,yi); y_128[j] = adds_int16(y_128[j],yint.val[0]); j++; y_128[j] = adds_int16(y_128[j],yint.val[1]); j++; #endif } _mm_empty(); _m_empty(); }
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out) { int i; const int16x8_t max = vdupq_n_s16(MAX_Y); const int16x8_t zero = vdupq_n_s16(0); for (i = 0; i + 8 <= len; i += 8) { const int16x8_t a0 = vld1q_s16(A + i + 0); const int16x8_t a1 = vld1q_s16(A + i + 1); const int16x8_t b0 = vld1q_s16(B + i + 0); const int16x8_t b1 = vld1q_s16(B + i + 1); const int16x8_t a0b1 = vaddq_s16(a0, b1); const int16x8_t a1b0 = vaddq_s16(a1, b0); const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); const int16x8_t d0 = vaddq_s16(c1, a0); const int16x8_t d1 = vaddq_s16(c0, a1); const int16x8_t e0 = vrshrq_n_s16(d0, 1); const int16x8_t e1 = vrshrq_n_s16(d1, 1); const int16x8x2_t f = vzipq_s16(e0, e1); const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); const int16x8_t h0 = vaddq_s16(g0, f.val[0]); const int16x8_t h1 = vaddq_s16(g1, f.val[1]); const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); } for (; i < len; ++i) { const int a0b1 = A[i + 0] + B[i + 1]; const int a1b0 = A[i + 1] + B[i + 0]; const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } }
// Codegen test for the vzipq_s16 intrinsic: the function simply returns
// vzipq_s16(a, b) for its two int16x8_t arguments.
// The directive comments inside the body name the function label and expect
// one zip1 and one zip2 instruction operating on .8h vector registers.
// NOTE(review): these look like LLVM FileCheck directives, so their text is
// effectively part of the test -- keep it intact when editing.
int16x8x2_t test_vzipq_s16(int16x8_t a, int16x8_t b) { // CHECK-LABEL: test_vzipq_s16 return vzipq_s16(a, b); // CHECK: zip1 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h // CHECK: zip2 {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }