Example #1
OD_SIMD_INLINE void od_store4(od_coeff *x, int xstride,
 int32x4_t t0, int32x4_t t1, int32x4_t t2, int32x4_t t3) {
  vst1q_s32((int *)(x + 0*xstride), t0);
  vst1q_s32((int *)(x + 1*xstride), t1);
  vst1q_s32((int *)(x + 2*xstride), t2);
  vst1q_s32((int *)(x + 3*xstride), t3);
}
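For reference, a matching load helper would mirror these stores. This is only a sketch; od_load4 and its signature are assumptions, not taken from the original source:

OD_SIMD_INLINE void od_load4(const od_coeff *x, int xstride,
 int32x4_t *t0, int32x4_t *t1, int32x4_t *t2, int32x4_t *t3) {
  /* Hypothetical counterpart to od_store4: gather a 4x4 coefficient block,
   * one row per 128-bit register. */
  *t0 = vld1q_s32((const int *)(x + 0*xstride));
  *t1 = vld1q_s32((const int *)(x + 1*xstride));
  *t2 = vld1q_s32((const int *)(x + 2*xstride));
  *t3 = vld1q_s32((const int *)(x + 3*xstride));
}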
Example #2
void WebRtcAecm_StoreAdaptiveChannelNeon(AecmCore* aecm,
                                         const uint16_t* far_spectrum,
                                         int32_t* echo_est) {
  assert((uintptr_t)echo_est % 32 == 0);
  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);

  // This is C code of following optimized code.
  // During startup we store the channel every block.
  //  memcpy(aecm->channelStored,
  //         aecm->channelAdapt16,
  //         sizeof(int16_t) * PART_LEN1);
  // Recalculate echo estimate
  //  for (i = 0; i < PART_LEN; i += 4) {
  //    echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                        far_spectrum[i]);
  //    echo_est[i + 1] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 1],
  //                                            far_spectrum[i + 1]);
  //    echo_est[i + 2] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 2],
  //                                            far_spectrum[i + 2]);
  //    echo_est[i + 3] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i + 3],
  //                                            far_spectrum[i + 3]);
  //  }
  //  echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                     far_spectrum[i]);
  const uint16_t* far_spectrum_p = far_spectrum;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int16_t* start_stored_p = aecm->channelStored;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  int32_t* echo_est_p = echo_est;

  uint16x8_t far_spectrum_v;
  int16x8_t adapt_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;

  while (start_stored_p < end_stored_p) {
    far_spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);

    vst1q_s16(start_stored_p, adapt_v);

    echo_est_v_low = vmull_u16(vget_low_u16(far_spectrum_v),
                               vget_low_u16(vreinterpretq_u16_s16(adapt_v)));
    echo_est_v_high = vmull_u16(vget_high_u16(far_spectrum_v),
                                vget_high_u16(vreinterpretq_u16_s16(adapt_v)));

    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    far_spectrum_p += 8;
    start_adapt_p += 8;
    start_stored_p += 8;
    echo_est_p += 8;
  }
  aecm->channelStored[PART_LEN] = aecm->channelAdapt16[PART_LEN];
  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
}
Example #3
void * scaled_sumi_thread_NEON(void * argument)
{
    jsize i = 0;
    struct scaled_sumfneon_thread_data * data = (struct scaled_sumfneon_thread_data *) argument;
    int32_t * r = (int32_t *)data->r;
    const int32_t * x = (const int32_t *)data->x;
    const int32_t * y = (const int32_t *)data->y;
    const int32_t a = (const int32_t)data->a;
    const jsize size = data->size;

    int32x4_t rx4, xx4, yx4, ax4;

    ax4 = vdupq_n_s32(a);

    /* Process four elements per iteration; size is assumed to be a multiple of 4. */
    for (i = 0; i < size; i += 4)
    {
        xx4 = vld1q_s32(&(x[i]));
        yx4 = vld1q_s32(&(y[i]));

        rx4 = vmlaq_s32(xx4, ax4, yx4);

        vst1q_s32(&(r[i]), rx4);
    }

    return NULL;
}
Example #4
void test_vst1Qs32 (void)
{
  int32_t *arg0_int32_t;
  int32x4_t arg1_int32x4_t;

  vst1q_s32 (arg0_int32_t, arg1_int32x4_t);
}
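The test above only checks that the intrinsic compiles; the arguments are deliberately left uninitialized. A minimal runnable round-trip, assuming an ARM/AArch64 target with <arm_neon.h> available, could look like this sketch:

#include <arm_neon.h>
#include <stdio.h>

int main (void)
{
  int32_t in[4] = { 1, 2, 3, 4 };
  int32_t out[4];
  int32x4_t v = vld1q_s32 (in);          /* load four 32-bit lanes */
  v = vaddq_s32 (v, vdupq_n_s32 (10));   /* add 10 to every lane */
  vst1q_s32 (out, v);                    /* store the result back to memory */
  printf ("%d %d %d %d\n", out[0], out[1], out[2], out[3]);
  return 0;
}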
Example #5
void test(JNIEnv * env, jobject jRoot, jobject jObj) {
    int *testSet1 = (int*)malloc(sizeof(int)*DATA_SIZE);

    for(uint32_t i = 0; i<DATA_SIZE; i++) {
        testSet1[i] = i;
    }


    clock_t begin = clock();

    // Reverse the array four elements at a time: load a block from each end,
    // fully reverse each vector (vrev64q_s32 only swaps within 64-bit halves,
    // so the halves are swapped afterwards), then store the blocks crosswise.
    for (uint32_t i = 0; i < DATA_SIZE/4/2; i++) {
        int32_t *src = testSet1 + i*4;
        int32_t *dest = testSet1 + DATA_SIZE - 4*(i+1);
        int32x4_t srcData = vld1q_s32(src);
        int32x4_t destData = vld1q_s32(dest);
        int32x4_t rSrcData = vrev64q_s32(srcData);
        int32x4_t rDestData = vrev64q_s32(destData);
        rSrcData = vcombine_s32(vget_high_s32(rSrcData), vget_low_s32(rSrcData));
        rDestData = vcombine_s32(vget_high_s32(rDestData), vget_low_s32(rDestData));
        vst1q_s32(src, rDestData);
        vst1q_s32(dest, rSrcData);
    }

    clock_t end = clock();

    for (uint32_t i = 0; i<DATA_SIZE/2; i++) {
        int t = testSet1[i];
        int d = testSet1[DATA_SIZE-1-i];
        testSet1[i] = d;
        testSet1[DATA_SIZE-1-i] = t;
    }

    clock_t end2 = clock();

    clock_t cost1 = end-begin;
    clock_t cost2 = end2-end;

    __android_log_print(ANDROID_LOG_DEBUG, "NEON", "last number is %d, acc=%.1fx", testSet1[DATA_SIZE-1], 1.f*cost2/cost1);

    free(testSet1);

    jclass clasz = env->FindClass("com/tencent/helloneon/BenchListener");
    jmethodID method = env->GetMethodID(clasz, "onResult", "(Ljava/lang/String;)V");

    std::stringstream out;
    out << "benchResult:" << 1.f*cost2/cost1;
    env->CallVoidMethod(jObj, method, env->NewStringUTF(out.str().c_str()));
}
Example #6
void
EAP_MemsetBuff_filterbank_Int32(int32 *ptr_left, int32 *ptr_right)
{
#ifdef __ARM_NEON__
  int i;
  /* Zero 240 blocks of 8 samples per channel (same as the memset fallback below). */
  const int32x4_t zero = vdupq_n_s32(0);

  for (i = 0; i < 240; i++, ptr_left += 8, ptr_right += 8)
  {
    vst1q_s32(ptr_left, zero);
    vst1q_s32(ptr_right, zero);
    vst1q_s32(ptr_left + 4, zero);
    vst1q_s32(ptr_right + 4, zero);
  }
#else
  memset(ptr_left, 0, 240 * 8 * sizeof(int32));
  memset(ptr_right, 0, 240 * 8 * sizeof(int32));
#endif
}
Example #7
int
test_vdupq_lane_s32 ()
{
  int32x2_t a;
  int32x4_t b;
  int i;
  int32_t c[2] = { 0, 1 };
  int32_t d[4];

  a = vld1_s32 (c);
  b = wrap_vdupq_lane_s32_0 (a);
  vst1q_s32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[0] != d[i])
      return 1;

  b = wrap_vdupq_lane_s32_1 (a);
  vst1q_s32 (d, b);
  for (i = 0; i < 4; i++)
    if (c[1] != d[i])
      return 1;
  return 0;
}
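The wrap_vdupq_lane_s32_* helpers are defined elsewhere in the test file. A plausible sketch (an assumption, not the original code) keeps them noinline so the lane index stays a compile-time constant inside each wrapper:

__attribute__ ((noinline)) int32x4_t
wrap_vdupq_lane_s32_0 (int32x2_t a)
{
  return vdupq_lane_s32 (a, 0);
}

__attribute__ ((noinline)) int32x4_t
wrap_vdupq_lane_s32_1 (int32x2_t a)
{
  return vdupq_lane_s32 (a, 1);
}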
Example #8
// Adds two int arrays element-wise, four lanes per iteration. Assumes count is
// a multiple of 4; see the caller sketch after this example for a scalar tail.
static void add_int_neon(int* dst, int* src1, int* src2, int count)
{
  int i;
  for (i = 0; i < count; i += 4) {
      int32x4_t in1, in2, out;
      in1 = vld1q_s32(src1);
      src1 += 4;
      in2 = vld1q_s32(src2);
      src2 += 4;
      out = vaddq_s32(in1, in2);
      vst1q_s32(dst, out);
      dst += 4;
    }
}
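add_int_neon never handles a remainder, so a caller has to supply the scalar tail. A hedged caller sketch (the add_int wrapper name is made up for illustration):

static void add_int(int* dst, int* src1, int* src2, int count)
{
  int i;
  int vec_count = count & ~3;               /* largest multiple of 4 */
  add_int_neon(dst, src1, src2, vec_count); /* vectorized bulk */
  for (i = vec_count; i < count; i++)       /* scalar remainder */
    dst[i] = src1[i] + src2[i];
}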
Example #9
void matMult(int16_t mat1[], int16_t mat2[], int32_t prod[matrix_size][matrix_size])
{
	int output_size = 2 * matrix_size;
	int l,k;
	int16x4_t data1;
	int32x4_t mac_output[output_size/4];
	int32x4_t MAC_addvalue[output_size/4];
	int16x4_t constant_value;
	unsigned int index_input = 0;
	unsigned int transfer_index = 0 ;
	int32_t *pres_ver;

	/* Allocate the output scratch buffer. In this excerpt the results stay in
	 * pres_ver; the prod parameter is never written and the buffer is not freed. */
	pres_ver = malloc(output_size * output_size * sizeof(int32_t));
	for(l = 0 ; l < matrix_size/4; l++)
	{
	    MAC_addvalue[l] = vmovq_n_s32(0);
	}

	/* Perform the multiplication */
	for(l = 0; l < matrix_size*matrix_size; l++)
	{
	    constant_value = vmov_n_s16 (mat1[l]);
	    for(k = 0 ; k < matrix_size/4 ; k++)
	    {
	        data1 = vld1_s16 (&mat2[index_input]);
	        MAC4 (&MAC_addvalue[k], &constant_value, &data1,&mac_output[k]);
	        MAC_addvalue[k] = mac_output[k];
	        index_input +=4;
	    }

		index_input+=output_size-matrix_size;
	    if ((l + 1) % matrix_size == 0 )
	    {
	        index_input = 0;

	        for(k = 0 ; k < matrix_size/4 ; k++)
	        {
	            vst1q_s32(&pres_ver[transfer_index],MAC_addvalue[k]);
	            transfer_index +=4;
	        }

			transfer_index += output_size-matrix_size;
	        for(k = 0 ; k < matrix_size/4; k++)
	        {
	            MAC_addvalue[k] = vmovq_n_s32(0);
	        }
	    }
	}
}
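MAC4 is not shown in this example. Judging from the call site (an int32x4_t accumulator, a broadcast int16x4_t coefficient and an int16x4_t data vector), a plausible definition is the widening multiply-accumulate below; this is an assumption, not the original macro:

#define MAC4(acc, coeff, data, out) \
    (*(out) = vmlal_s16(*(acc), *(data), *(coeff)))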
Example #10
void WebRtcAecm_ResetAdaptiveChannelNeon(AecmCore* aecm) {
  assert((uintptr_t)(aecm->channelStored) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt16) % 16 == 0);
  assert((uintptr_t)(aecm->channelAdapt32) % 32 == 0);

  // The C code of following optimized code.
  // for (i = 0; i < PART_LEN1; i++) {
  //   aecm->channelAdapt16[i] = aecm->channelStored[i];
  //   aecm->channelAdapt32[i] = WEBRTC_SPL_LSHIFT_W32(
  //              (int32_t)aecm->channelStored[i], 16);
  // }

  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt16_p = aecm->channelAdapt16;
  int32_t* start_adapt32_p = aecm->channelAdapt32;
  const int16_t* end_stored_p = start_stored_p + PART_LEN;

  int16x8_t stored_v;
  int32x4_t adapt32_v_low, adapt32_v_high;

  while (start_stored_p < end_stored_p) {
    stored_v = vld1q_s16(start_stored_p);
    vst1q_s16(start_adapt16_p, stored_v);

    adapt32_v_low = vshll_n_s16(vget_low_s16(stored_v), 16);
    adapt32_v_high = vshll_n_s16(vget_high_s16(stored_v), 16);

    vst1q_s32(start_adapt32_p, adapt32_v_low);
    vst1q_s32(start_adapt32_p + 4, adapt32_v_high);

    start_stored_p += 8;
    start_adapt16_p += 8;
    start_adapt32_p += 8;
  }
  aecm->channelAdapt16[PART_LEN] = aecm->channelStored[PART_LEN];
  aecm->channelAdapt32[PART_LEN] = (int32_t)aecm->channelStored[PART_LEN] << 16;
}
Example #11
OD_SIMD_INLINE void od_overflow_check_epi32(int32x4_t val, int32_t scale,
 int32_t offset, int idx) {
#if defined(OD_DCT_TEST) && defined(OD_DCT_CHECK_OVERFLOW)
  int32_t mem[4];
  int n;
  vst1q_s32((int *)mem, val);
  for (n = 0; n < 4; n++) {
    OD_DCT_OVERFLOW_CHECK(mem[n], scale, offset, idx);
  }
#endif
  (void)val;
  (void)scale;
  (void)offset;
  (void)idx;
}
Example #12
test_vreinterpretq_s32_f64 ()
{
  float64x2_t a;
  int32x4_t b;
  float64_t c[2] = { PI_F64, E_F64 };
  int32_t d[4] = { 0x54442D18, 0x400921FB, 0x8B145769, 0x4005BF0A };
  int32_t e[4];
  int i;

  a = vld1q_f64 (c);
  b = wrap_vreinterpretq_s32_f64 (a);
  vst1q_s32 (e, b);
  for (i = 0; i < 4; i++)
    if (d[i] != e[i])
      return 1;
  return 0;
};
Example #13
/* Performs one rotation/translation */
static void 
neon_coord_4(
    float32x4_t a_4, 
    float32x4_t b_4,
    float32x4_t x_4, 
    float32x4_t y_4,
    float32x4_t pos_4f, 
    float32x4_t point5_4, 
    int * result)
{
    float32x4_t tmp1 = vmulq_f32(a_4, x_4);
    float32x4_t tmp2 = vmulq_f32(b_4, y_4);
    tmp2 = vaddq_f32(tmp1, tmp2);
    tmp2 = vaddq_f32(tmp2, pos_4f);
    tmp2 = vaddq_f32(tmp2, point5_4);
    int32x4_t c_4 = vcvtq_s32_f32(tmp2);
    vst1q_s32(result, c_4);
}
Example #14
/* s32x4 mv mul */
void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
	int i = 0;
	int k = 0;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_s32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_s32(A + j);
			j+=Row;
			neon_a1 = vld1q_s32(A + j);
			j+=Row;
			neon_a2 = vld1q_s32(A + j);
			j+=Row;
			neon_a3 = vld1q_s32(A + j);

			neon_b = vld1q_s32(B + k);
			neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
			neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
			neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
			neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

			neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

		}

		vst1q_s32(C + i, neon_c);
	}
}
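The index arithmetic above (j = k * T + i, then j += Row for each of the four columns in a block) only lines up when Row == T, i.e. a square matrix stored column-major with leading dimension Row. Under that assumption, a scalar reference for checking the routine could look like this sketch (mw_ref_mv_mul_s32x4 is an invented name):

void mw_ref_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
	int i, k;

	for (i = 0; i < Row; i++)
	{
		int acc = 0;
		for (k = 0; k < T; k++)
			acc += A[k * Row + i] * B[k]; /* column-major element (i, k) */
		C[i] = acc;
	}
}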
Example #15
/* s32x4 saturated sub */
void mw_neon_mm_qsub_s32x4(int * A, int Row, int Col, int * B, int * C)
{
	int32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_s32(A + k);
		neon_b = vld1q_s32(B + k);
		neon_c = vqsubq_s32(neon_a, neon_b);
		vst1q_s32(C + k, neon_c);
	}

	k = i - 4;
	/* Scalar tail: saturate to match vqsubq_s32 used in the vector loop. */
	for (i = 0; i < size % 4; i++)
	{
		long long d = (long long)A[k + i] - (long long)B[k + i];
		if (d > 2147483647LL) d = 2147483647LL;
		if (d < -2147483648LL) d = -2147483648LL;
		C[k + i] = (int)d;
	}
}
Example #16
static inline void PostShiftAndDivideAndDemodulateNeon(int16_t* inre,
                                                       int16_t* inim,
                                                       int32_t* outre1,
                                                       int32_t* outre2,
                                                       int32_t sh) {
  int k;
  int16_t* p_inre = inre;
  int16_t* p_inim = inim;
  int32_t* p_outre1 = outre1;
  int32_t* p_outre2 = outre2;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  int32x4_t shift = vdupq_n_s32(-sh - 16);
  // Divide through by the normalizing constant:
  // scale all values with 1/240, i.e. with 273 in Q16.
  // 273/65536 ~= 0.0041656
  // 1/240 ~= 0.0041666
  int16x8_t scale = vdupq_n_s16(273);
  // Sqrt(240) in Q11 is round(15.49193338482967 * 2048) = 31727.
  int factQ19 = 31727 << 16;
  int32x4_t fact = vdupq_n_s32(factQ19);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t inre16x8 = vld1q_s16(p_inre);
    int16x8_t inim16x8 = vld1q_s16(p_inim);
    p_inre += 8;
    p_inim += 8;
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    kCosTab += 8;
    kSinTab += 8;
    // By vshl and vmull, we effectively did "<< (-sh - 16)",
    // instead of "<< (-sh)" and ">> 16" as in the C code.
    int32x4_t outre1_0 = vmull_s16(vget_low_s16(inre16x8), vget_low_s16(scale));
    int32x4_t outre2_0 = vmull_s16(vget_low_s16(inim16x8), vget_low_s16(scale));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t outre1_1 = vmull_high_s16(inre16x8, scale);
    int32x4_t outre2_1 = vmull_high_s16(inim16x8, scale);
#else
    int32x4_t outre1_1 = vmull_s16(vget_high_s16(inre16x8),
                                   vget_high_s16(scale));
    int32x4_t outre2_1 = vmull_s16(vget_high_s16(inim16x8),
                                   vget_high_s16(scale));
#endif

    outre1_0 = vshlq_s32(outre1_0, shift);
    outre1_1 = vshlq_s32(outre1_1, shift);
    outre2_0 = vshlq_s32(outre2_0, shift);
    outre2_1 = vshlq_s32(outre2_1, shift);

    // Demodulate and separate.
    int32x4_t tmpr_0 = vmovl_s16(vget_low_s16(tmpr));
    int32x4_t tmpi_0 = vmovl_s16(vget_low_s16(tmpi));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmpr_1 = vmovl_high_s16(tmpr);
    int32x4_t tmpi_1 = vmovl_high_s16(tmpi);
#else
    int32x4_t tmpr_1 = vmovl_s16(vget_high_s16(tmpr));
    int32x4_t tmpi_1 = vmovl_s16(vget_high_s16(tmpi));
#endif

    int64x2_t xr0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre1_0));
    int64x2_t xi0 = vmull_s32(vget_low_s32(tmpr_0), vget_low_s32(outre2_0));
    int64x2_t xr2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre1_1));
    int64x2_t xi2 = vmull_s32(vget_low_s32(tmpr_1), vget_low_s32(outre2_1));
    xr0 = vmlsl_s32(xr0, vget_low_s32(tmpi_0), vget_low_s32(outre2_0));
    xi0 = vmlal_s32(xi0, vget_low_s32(tmpi_0), vget_low_s32(outre1_0));
    xr2 = vmlsl_s32(xr2, vget_low_s32(tmpi_1), vget_low_s32(outre2_1));
    xi2 = vmlal_s32(xi2, vget_low_s32(tmpi_1), vget_low_s32(outre1_1));

#if defined(WEBRTC_ARCH_ARM64)
    int64x2_t xr1 = vmull_high_s32(tmpr_0, outre1_0);
    int64x2_t xi1 = vmull_high_s32(tmpr_0, outre2_0);
    int64x2_t xr3 = vmull_high_s32(tmpr_1, outre1_1);
    int64x2_t xi3 = vmull_high_s32(tmpr_1, outre2_1);
    xr1 = vmlsl_high_s32(xr1, tmpi_0, outre2_0);
    xi1 = vmlal_high_s32(xi1, tmpi_0, outre1_0);
    xr3 = vmlsl_high_s32(xr3, tmpi_1, outre2_1);
    xi3 = vmlal_high_s32(xi3, tmpi_1, outre1_1);
#else
    int64x2_t xr1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre1_0));
    int64x2_t xi1 = vmull_s32(vget_high_s32(tmpr_0), vget_high_s32(outre2_0));
    int64x2_t xr3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre1_1));
    int64x2_t xi3 = vmull_s32(vget_high_s32(tmpr_1), vget_high_s32(outre2_1));
    xr1 = vmlsl_s32(xr1, vget_high_s32(tmpi_0), vget_high_s32(outre2_0));
    xi1 = vmlal_s32(xi1, vget_high_s32(tmpi_0), vget_high_s32(outre1_0));
    xr3 = vmlsl_s32(xr3, vget_high_s32(tmpi_1), vget_high_s32(outre2_1));
    xi3 = vmlal_s32(xi3, vget_high_s32(tmpi_1), vget_high_s32(outre1_1));
#endif

    outre1_0 = vcombine_s32(vshrn_n_s64(xr0, 10), vshrn_n_s64(xr1, 10));
    outre2_0 = vcombine_s32(vshrn_n_s64(xi0, 10), vshrn_n_s64(xi1, 10));
    outre1_1 = vcombine_s32(vshrn_n_s64(xr2, 10), vshrn_n_s64(xr3, 10));
    outre2_1 = vcombine_s32(vshrn_n_s64(xi2, 10), vshrn_n_s64(xi3, 10));
    outre1_0 = vqdmulhq_s32(outre1_0, fact);
    outre2_0 = vqdmulhq_s32(outre2_0, fact);
    outre1_1 = vqdmulhq_s32(outre1_1, fact);
    outre2_1 = vqdmulhq_s32(outre2_1, fact);

    vst1q_s32(p_outre1, outre1_0);
    p_outre1 += 4;
    vst1q_s32(p_outre1, outre1_1);
    p_outre1 += 4;
    vst1q_s32(p_outre2, outre2_0);
    p_outre2 += 4;
    vst1q_s32(p_outre2, outre2_1);
    p_outre2 += 4;
  }
}
Example #17
void ne10_img_hresize_4channels_linear_neon (const unsigned char** src, int** dst, int count,
        const int* xofs, const short* alpha,
        int swidth, int dwidth, int cn, int xmin, int xmax)
{
    int dx, k;
    int dx0 = 0;

    int16x4x2_t alpha_vec;

    uint8x8_t dS0_vec, dS1_vec;
    int16x8_t qS0_vec, qS1_vec;
    int16x4_t dS0_0123, dS0_4567, dS1_0123, dS1_4567;

    int32x4_t qT0_vec, qT1_vec;

    int16x4_t dCoeff;
    dCoeff = vdup_n_s16 (INTER_RESIZE_COEF_SCALE);

    for (k = 0; k <= count - 2; k++)
    {
        const unsigned char *S0 = src[k], *S1 = src[k + 1];
        int *D0 = dst[k], *D1 = dst[k + 1];

        for (dx = dx0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);
            dS1_4567 = vget_high_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT1_vec = vmull_s16 (dS1_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);
            qT1_vec = vmlal_s16 (qT1_vec, dS1_4567, alpha_vec.val[1]);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S0[sx]);
            dS1_vec = vld1_u8 (&S1[sx]);

            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            qS1_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS1_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS1_0123 = vget_low_s16 (qS1_vec);

            qT0_vec = vmull_s16 (dS0_0123, dCoeff);
            qT1_vec = vmull_s16 (dS1_0123, dCoeff);

            vst1q_s32 (&D0[dx], qT0_vec);
            vst1q_s32 (&D1[dx], qT1_vec);
        }
    }

    for (; k < count; k++)
    {
        const unsigned char *S = src[k];
        int *D = dst[k];
        for (dx = 0; dx < xmax; dx += 4)
        {
            int sx = xofs[dx];

            alpha_vec = vld2_s16 (&alpha[dx * 2]);

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));

            dS0_0123 = vget_low_s16 (qS0_vec);
            dS0_4567 = vget_high_s16 (qS0_vec);

            qT0_vec = vmull_s16 (dS0_0123, alpha_vec.val[0]);
            qT0_vec = vmlal_s16 (qT0_vec, dS0_4567, alpha_vec.val[1]);

            vst1q_s32 (&D[dx], qT0_vec);
        }

        for (; dx < dwidth; dx += 4)
        {
            int sx = xofs[dx];

            dS0_vec = vld1_u8 (&S[sx]);
            qS0_vec = vreinterpretq_s16_u16 (vmovl_u8 (dS0_vec));
            dS0_0123 = vget_low_s16 (qS0_vec);
            qT0_vec = vmull_s16 (dS0_0123, dCoeff);

            vst1q_s32 (&D[dx], qT0_vec);
        }
    }
}
Example #18
// vp9/common/vp9_scan.c:vp9_default_iscan_32x32 arranges the first 135 non-zero
// coefficients as follows:
//      0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15
//  0   0   2   5  10  17  25  38  47  62  83 101 121
//  1   1   4   8  15  22  30  45  58  74  92 112 133
//  2   3   7  12  18  28  36  52  64  82 102 118
//  3   6  11  16  23  31  43  60  73  90 109 126
//  4   9  14  19  29  37  50  65  78  98 116 134
//  5  13  20  26  35  44  54  72  85 105 123
//  6  21  27  33  42  53  63  80  94 113 132
//  7  24  32  39  48  57  71  88 104 120
//  8  34  40  46  56  68  81  96 111 130
//  9  41  49  55  67  77  91 107 124
// 10  51  59  66  76  89  99 119 131
// 11  61  69  75  87 100 114 129
// 12  70  79  86  97 108 122
// 13  84  93 103 110 125
// 14  98 106 115 127
// 15 117 128
static void vpx_highbd_idct32_12_neon(const tran_low_t *const input,
                                      int32_t *output) {
  int32x4x2_t in[12], s1[32], s2[32], s3[32], s4[32], s5[32], s6[32], s7[32],
      s8[32];

  load_8x8_s32_dual(input, &in[0], &in[1], &in[2], &in[3], &in[4], &in[5],
                    &in[6], &in[7]);
  transpose_s32_8x8(&in[0], &in[1], &in[2], &in[3], &in[4], &in[5], &in[6],
                    &in[7]);

  load_4x8_s32_dual(input + 8, &in[8].val[0], &in[8].val[1], &in[9].val[0],
                    &in[9].val[1], &in[10].val[0], &in[10].val[1],
                    &in[11].val[0], &in[11].val[1]);
  transpose_s32_4x8(&in[8].val[0], &in[8].val[1], &in[9].val[0], &in[9].val[1],
                    &in[10].val[0], &in[10].val[1], &in[11].val[0],
                    &in[11].val[1]);

  // stage 1
  s1[16] = multiply_shift_and_narrow_s32_dual(in[1], cospi_31_64);
  s1[31] = multiply_shift_and_narrow_s32_dual(in[1], cospi_1_64);

  s1[18] = multiply_shift_and_narrow_s32_dual(in[9], cospi_23_64);
  s1[29] = multiply_shift_and_narrow_s32_dual(in[9], cospi_9_64);

  s1[19] = multiply_shift_and_narrow_s32_dual(in[7], -cospi_25_64);
  s1[28] = multiply_shift_and_narrow_s32_dual(in[7], cospi_7_64);

  s1[20] = multiply_shift_and_narrow_s32_dual(in[5], cospi_27_64);
  s1[27] = multiply_shift_and_narrow_s32_dual(in[5], cospi_5_64);

  s1[21] = multiply_shift_and_narrow_s32_dual(in[11], -cospi_21_64);
  s1[26] = multiply_shift_and_narrow_s32_dual(in[11], cospi_11_64);

  s1[23] = multiply_shift_and_narrow_s32_dual(in[3], -cospi_29_64);
  s1[24] = multiply_shift_and_narrow_s32_dual(in[3], cospi_3_64);

  // stage 2
  s2[8] = multiply_shift_and_narrow_s32_dual(in[2], cospi_30_64);
  s2[15] = multiply_shift_and_narrow_s32_dual(in[2], cospi_2_64);

  s2[10] = multiply_shift_and_narrow_s32_dual(in[10], cospi_22_64);
  s2[13] = multiply_shift_and_narrow_s32_dual(in[10], cospi_10_64);

  s2[11] = multiply_shift_and_narrow_s32_dual(in[6], -cospi_26_64);
  s2[12] = multiply_shift_and_narrow_s32_dual(in[6], cospi_6_64);

  s2[18] = highbd_idct_sub_dual(s1[19], s1[18]);
  s2[19] = highbd_idct_add_dual(s1[18], s1[19]);
  s2[20] = highbd_idct_add_dual(s1[20], s1[21]);
  s2[21] = highbd_idct_sub_dual(s1[20], s1[21]);
  s2[26] = highbd_idct_sub_dual(s1[27], s1[26]);
  s2[27] = highbd_idct_add_dual(s1[26], s1[27]);
  s2[28] = highbd_idct_add_dual(s1[28], s1[29]);
  s2[29] = highbd_idct_sub_dual(s1[28], s1[29]);

  // stage 3
  s3[4] = multiply_shift_and_narrow_s32_dual(in[4], cospi_28_64);
  s3[7] = multiply_shift_and_narrow_s32_dual(in[4], cospi_4_64);

  s3[10] = highbd_idct_sub_dual(s2[11], s2[10]);
  s3[11] = highbd_idct_add_dual(s2[10], s2[11]);
  s3[12] = highbd_idct_add_dual(s2[12], s2[13]);
  s3[13] = highbd_idct_sub_dual(s2[12], s2[13]);

  s3[17] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], -cospi_4_64,
                                                         s1[31], cospi_28_64);
  s3[30] = multiply_accumulate_shift_and_narrow_s32_dual(s1[16], cospi_28_64,
                                                         s1[31], cospi_4_64);

  s3[18] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_28_64,
                                                         s2[29], -cospi_4_64);
  s3[29] = multiply_accumulate_shift_and_narrow_s32_dual(s2[18], -cospi_4_64,
                                                         s2[29], cospi_28_64);

  s3[21] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], -cospi_20_64,
                                                         s2[26], cospi_12_64);
  s3[26] = multiply_accumulate_shift_and_narrow_s32_dual(s2[21], cospi_12_64,
                                                         s2[26], cospi_20_64);

  s3[22] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_12_64,
                                                         s1[24], -cospi_20_64);
  s3[25] = multiply_accumulate_shift_and_narrow_s32_dual(s1[23], -cospi_20_64,
                                                         s1[24], cospi_12_64);

  // stage 4
  s4[0] = multiply_shift_and_narrow_s32_dual(in[0], cospi_16_64);
  s4[2] = multiply_shift_and_narrow_s32_dual(in[8], cospi_24_64);
  s4[3] = multiply_shift_and_narrow_s32_dual(in[8], cospi_8_64);

  s4[9] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], -cospi_8_64,
                                                        s2[15], cospi_24_64);
  s4[14] = multiply_accumulate_shift_and_narrow_s32_dual(s2[8], cospi_24_64,
                                                         s2[15], cospi_8_64);

  s4[10] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_24_64,
                                                         s3[13], -cospi_8_64);
  s4[13] = multiply_accumulate_shift_and_narrow_s32_dual(s3[10], -cospi_8_64,
                                                         s3[13], cospi_24_64);

  s4[16] = highbd_idct_add_dual(s1[16], s2[19]);
  s4[17] = highbd_idct_add_dual(s3[17], s3[18]);
  s4[18] = highbd_idct_sub_dual(s3[17], s3[18]);
  s4[19] = highbd_idct_sub_dual(s1[16], s2[19]);
  s4[20] = highbd_idct_sub_dual(s1[23], s2[20]);
  s4[21] = highbd_idct_sub_dual(s3[22], s3[21]);
  s4[22] = highbd_idct_add_dual(s3[21], s3[22]);
  s4[23] = highbd_idct_add_dual(s2[20], s1[23]);
  s4[24] = highbd_idct_add_dual(s1[24], s2[27]);
  s4[25] = highbd_idct_add_dual(s3[25], s3[26]);
  s4[26] = highbd_idct_sub_dual(s3[25], s3[26]);
  s4[27] = highbd_idct_sub_dual(s1[24], s2[27]);
  s4[28] = highbd_idct_sub_dual(s1[31], s2[28]);
  s4[29] = highbd_idct_sub_dual(s3[30], s3[29]);
  s4[30] = highbd_idct_add_dual(s3[29], s3[30]);
  s4[31] = highbd_idct_add_dual(s2[28], s1[31]);

  // stage 5
  s5[0] = highbd_idct_add_dual(s4[0], s4[3]);
  s5[1] = highbd_idct_add_dual(s4[0], s4[2]);
  s5[2] = highbd_idct_sub_dual(s4[0], s4[2]);
  s5[3] = highbd_idct_sub_dual(s4[0], s4[3]);

  s5[5] = sub_multiply_shift_and_narrow_s32_dual(s3[7], s3[4], cospi_16_64);
  s5[6] = add_multiply_shift_and_narrow_s32_dual(s3[4], s3[7], cospi_16_64);

  s5[8] = highbd_idct_add_dual(s2[8], s3[11]);
  s5[9] = highbd_idct_add_dual(s4[9], s4[10]);
  s5[10] = highbd_idct_sub_dual(s4[9], s4[10]);
  s5[11] = highbd_idct_sub_dual(s2[8], s3[11]);
  s5[12] = highbd_idct_sub_dual(s2[15], s3[12]);
  s5[13] = highbd_idct_sub_dual(s4[14], s4[13]);
  s5[14] = highbd_idct_add_dual(s4[13], s4[14]);
  s5[15] = highbd_idct_add_dual(s2[15], s3[12]);

  s5[18] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], -cospi_8_64,
                                                         s4[29], cospi_24_64);
  s5[29] = multiply_accumulate_shift_and_narrow_s32_dual(s4[18], cospi_24_64,
                                                         s4[29], cospi_8_64);

  s5[19] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], -cospi_8_64,
                                                         s4[28], cospi_24_64);
  s5[28] = multiply_accumulate_shift_and_narrow_s32_dual(s4[19], cospi_24_64,
                                                         s4[28], cospi_8_64);

  s5[20] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_24_64,
                                                         s4[27], -cospi_8_64);
  s5[27] = multiply_accumulate_shift_and_narrow_s32_dual(s4[20], -cospi_8_64,
                                                         s4[27], cospi_24_64);

  s5[21] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_24_64,
                                                         s4[26], -cospi_8_64);
  s5[26] = multiply_accumulate_shift_and_narrow_s32_dual(s4[21], -cospi_8_64,
                                                         s4[26], cospi_24_64);

  // stage 6
  s6[0] = highbd_idct_add_dual(s5[0], s3[7]);
  s6[1] = highbd_idct_add_dual(s5[1], s5[6]);
  s6[2] = highbd_idct_add_dual(s5[2], s5[5]);
  s6[3] = highbd_idct_add_dual(s5[3], s3[4]);
  s6[4] = highbd_idct_sub_dual(s5[3], s3[4]);
  s6[5] = highbd_idct_sub_dual(s5[2], s5[5]);
  s6[6] = highbd_idct_sub_dual(s5[1], s5[6]);
  s6[7] = highbd_idct_sub_dual(s5[0], s3[7]);

  s6[10] = sub_multiply_shift_and_narrow_s32_dual(s5[13], s5[10], cospi_16_64);
  s6[13] = add_multiply_shift_and_narrow_s32_dual(s5[10], s5[13], cospi_16_64);

  s6[11] = sub_multiply_shift_and_narrow_s32_dual(s5[12], s5[11], cospi_16_64);
  s6[12] = add_multiply_shift_and_narrow_s32_dual(s5[11], s5[12], cospi_16_64);

  s6[16] = highbd_idct_add_dual(s4[16], s4[23]);
  s6[17] = highbd_idct_add_dual(s4[17], s4[22]);
  s6[18] = highbd_idct_add_dual(s5[18], s5[21]);
  s6[19] = highbd_idct_add_dual(s5[19], s5[20]);
  s6[20] = highbd_idct_sub_dual(s5[19], s5[20]);
  s6[21] = highbd_idct_sub_dual(s5[18], s5[21]);
  s6[22] = highbd_idct_sub_dual(s4[17], s4[22]);
  s6[23] = highbd_idct_sub_dual(s4[16], s4[23]);

  s6[24] = highbd_idct_sub_dual(s4[31], s4[24]);
  s6[25] = highbd_idct_sub_dual(s4[30], s4[25]);
  s6[26] = highbd_idct_sub_dual(s5[29], s5[26]);
  s6[27] = highbd_idct_sub_dual(s5[28], s5[27]);
  s6[28] = highbd_idct_add_dual(s5[27], s5[28]);
  s6[29] = highbd_idct_add_dual(s5[26], s5[29]);
  s6[30] = highbd_idct_add_dual(s4[25], s4[30]);
  s6[31] = highbd_idct_add_dual(s4[24], s4[31]);

  // stage 7
  s7[0] = highbd_idct_add_dual(s6[0], s5[15]);
  s7[1] = highbd_idct_add_dual(s6[1], s5[14]);
  s7[2] = highbd_idct_add_dual(s6[2], s6[13]);
  s7[3] = highbd_idct_add_dual(s6[3], s6[12]);
  s7[4] = highbd_idct_add_dual(s6[4], s6[11]);
  s7[5] = highbd_idct_add_dual(s6[5], s6[10]);
  s7[6] = highbd_idct_add_dual(s6[6], s5[9]);
  s7[7] = highbd_idct_add_dual(s6[7], s5[8]);
  s7[8] = highbd_idct_sub_dual(s6[7], s5[8]);
  s7[9] = highbd_idct_sub_dual(s6[6], s5[9]);
  s7[10] = highbd_idct_sub_dual(s6[5], s6[10]);
  s7[11] = highbd_idct_sub_dual(s6[4], s6[11]);
  s7[12] = highbd_idct_sub_dual(s6[3], s6[12]);
  s7[13] = highbd_idct_sub_dual(s6[2], s6[13]);
  s7[14] = highbd_idct_sub_dual(s6[1], s5[14]);
  s7[15] = highbd_idct_sub_dual(s6[0], s5[15]);

  s7[20] = sub_multiply_shift_and_narrow_s32_dual(s6[27], s6[20], cospi_16_64);
  s7[27] = add_multiply_shift_and_narrow_s32_dual(s6[20], s6[27], cospi_16_64);

  s7[21] = sub_multiply_shift_and_narrow_s32_dual(s6[26], s6[21], cospi_16_64);
  s7[26] = add_multiply_shift_and_narrow_s32_dual(s6[21], s6[26], cospi_16_64);

  s7[22] = sub_multiply_shift_and_narrow_s32_dual(s6[25], s6[22], cospi_16_64);
  s7[25] = add_multiply_shift_and_narrow_s32_dual(s6[22], s6[25], cospi_16_64);

  s7[23] = sub_multiply_shift_and_narrow_s32_dual(s6[24], s6[23], cospi_16_64);
  s7[24] = add_multiply_shift_and_narrow_s32_dual(s6[23], s6[24], cospi_16_64);

  // final stage
  s8[0] = highbd_idct_add_dual(s7[0], s6[31]);
  s8[1] = highbd_idct_add_dual(s7[1], s6[30]);
  s8[2] = highbd_idct_add_dual(s7[2], s6[29]);
  s8[3] = highbd_idct_add_dual(s7[3], s6[28]);
  s8[4] = highbd_idct_add_dual(s7[4], s7[27]);
  s8[5] = highbd_idct_add_dual(s7[5], s7[26]);
  s8[6] = highbd_idct_add_dual(s7[6], s7[25]);
  s8[7] = highbd_idct_add_dual(s7[7], s7[24]);
  s8[8] = highbd_idct_add_dual(s7[8], s7[23]);
  s8[9] = highbd_idct_add_dual(s7[9], s7[22]);
  s8[10] = highbd_idct_add_dual(s7[10], s7[21]);
  s8[11] = highbd_idct_add_dual(s7[11], s7[20]);
  s8[12] = highbd_idct_add_dual(s7[12], s6[19]);
  s8[13] = highbd_idct_add_dual(s7[13], s6[18]);
  s8[14] = highbd_idct_add_dual(s7[14], s6[17]);
  s8[15] = highbd_idct_add_dual(s7[15], s6[16]);
  s8[16] = highbd_idct_sub_dual(s7[15], s6[16]);
  s8[17] = highbd_idct_sub_dual(s7[14], s6[17]);
  s8[18] = highbd_idct_sub_dual(s7[13], s6[18]);
  s8[19] = highbd_idct_sub_dual(s7[12], s6[19]);
  s8[20] = highbd_idct_sub_dual(s7[11], s7[20]);
  s8[21] = highbd_idct_sub_dual(s7[10], s7[21]);
  s8[22] = highbd_idct_sub_dual(s7[9], s7[22]);
  s8[23] = highbd_idct_sub_dual(s7[8], s7[23]);
  s8[24] = highbd_idct_sub_dual(s7[7], s7[24]);
  s8[25] = highbd_idct_sub_dual(s7[6], s7[25]);
  s8[26] = highbd_idct_sub_dual(s7[5], s7[26]);
  s8[27] = highbd_idct_sub_dual(s7[4], s7[27]);
  s8[28] = highbd_idct_sub_dual(s7[3], s6[28]);
  s8[29] = highbd_idct_sub_dual(s7[2], s6[29]);
  s8[30] = highbd_idct_sub_dual(s7[1], s6[30]);
  s8[31] = highbd_idct_sub_dual(s7[0], s6[31]);

  vst1q_s32(output + 0, s8[0].val[0]);
  vst1q_s32(output + 4, s8[0].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[1].val[0]);
  vst1q_s32(output + 4, s8[1].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[2].val[0]);
  vst1q_s32(output + 4, s8[2].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[3].val[0]);
  vst1q_s32(output + 4, s8[3].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[4].val[0]);
  vst1q_s32(output + 4, s8[4].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[5].val[0]);
  vst1q_s32(output + 4, s8[5].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[6].val[0]);
  vst1q_s32(output + 4, s8[6].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[7].val[0]);
  vst1q_s32(output + 4, s8[7].val[1]);
  output += 16;

  vst1q_s32(output + 0, s8[8].val[0]);
  vst1q_s32(output + 4, s8[8].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[9].val[0]);
  vst1q_s32(output + 4, s8[9].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[10].val[0]);
  vst1q_s32(output + 4, s8[10].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[11].val[0]);
  vst1q_s32(output + 4, s8[11].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[12].val[0]);
  vst1q_s32(output + 4, s8[12].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[13].val[0]);
  vst1q_s32(output + 4, s8[13].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[14].val[0]);
  vst1q_s32(output + 4, s8[14].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[15].val[0]);
  vst1q_s32(output + 4, s8[15].val[1]);
  output += 16;

  vst1q_s32(output + 0, s8[16].val[0]);
  vst1q_s32(output + 4, s8[16].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[17].val[0]);
  vst1q_s32(output + 4, s8[17].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[18].val[0]);
  vst1q_s32(output + 4, s8[18].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[19].val[0]);
  vst1q_s32(output + 4, s8[19].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[20].val[0]);
  vst1q_s32(output + 4, s8[20].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[21].val[0]);
  vst1q_s32(output + 4, s8[21].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[22].val[0]);
  vst1q_s32(output + 4, s8[22].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[23].val[0]);
  vst1q_s32(output + 4, s8[23].val[1]);
  output += 16;

  vst1q_s32(output + 0, s8[24].val[0]);
  vst1q_s32(output + 4, s8[24].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[25].val[0]);
  vst1q_s32(output + 4, s8[25].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[26].val[0]);
  vst1q_s32(output + 4, s8[26].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[27].val[0]);
  vst1q_s32(output + 4, s8[27].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[28].val[0]);
  vst1q_s32(output + 4, s8[28].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[29].val[0]);
  vst1q_s32(output + 4, s8[29].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[30].val[0]);
  vst1q_s32(output + 4, s8[30].val[1]);
  output += 16;
  vst1q_s32(output + 0, s8[31].val[0]);
  vst1q_s32(output + 4, s8[31].val[1]);
}
Example #19
void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const uint *src = (const uint *) srcPixels;
    uint *dst = (uint *) destPixels;
    int16x8_t half = vdupq_n_s16(0x80);
    int16x8_t full = vdupq_n_s16(0xff);
    if (const_alpha == 256) {
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                if ((src[x] & src[x+1] & src[x+2] & src[x+3]) >= 0xff000000) {
                    // all opaque
                    vst1q_s32((int32_t *)&dst[x], src32);
                } else if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t result16_low = qvsource_over_s16(src16_low, dst16_low, half, full);
                    const int16x8_t result16_high = qvsource_over_s16(src16_high, dst16_high, half, full);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s >= 0xff000000)
                    dst[x] = s;
                else if (s != 0)
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        const_alpha = (const_alpha * 255) >> 8;
        int16x8_t const_alpha16 = vdupq_n_s16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    int32x4_t src32 = vld1q_s32((int32_t *)&src[x]);
                    int32x4_t dst32 = vld1q_s32((int32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_s32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_s32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const int16x8_t src16_low = vreinterpretq_s16_u16(vmovl_u8(src8_low));
                    const int16x8_t dst16_low = vreinterpretq_s16_u16(vmovl_u8(dst8_low));

                    const int16x8_t src16_high = vreinterpretq_s16_u16(vmovl_u8(src8_high));
                    const int16x8_t dst16_high = vreinterpretq_s16_u16(vmovl_u8(dst8_high));

                    const int16x8_t srcalpha16_low = qvbyte_mul_s16(src16_low, const_alpha16, half);
                    const int16x8_t srcalpha16_high = qvbyte_mul_s16(src16_high, const_alpha16, half);

                    const int16x8_t result16_low = qvsource_over_s16(srcalpha16_low, dst16_low, half, full);
                    const int16x8_t result16_high = qvsource_over_s16(srcalpha16_high, dst16_high, half, full);

                    const int32x2_t result32_low = vreinterpret_s32_s8(vmovn_s16(result16_low));
                    const int32x2_t result32_high = vreinterpret_s32_s8(vmovn_s16(result16_high));

                    vst1q_s32((int32_t *)&dst[x], vcombine_s32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
Example #20
inline void vst1q(s32 * ptr, const int32x4_t   & v) { return vst1q_s32(ptr, v); }
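Wrapper families like this one usually pair the store with a matching load overload; a sketch, assuming the same s32 typedef used above:

inline int32x4_t vld1q(const s32 * ptr) { return vld1q_s32(ptr); }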
Example #21
// Contains a function for the core loop in the normalized lattice MA
// filter routine for iSAC codec, optimized for ARM Neon platform.
// It does:
//  for 0 <= n < HALF_SUBFRAMELEN - 1:
//    *ptr2 = input2 * ((*ptr2) + input0 * (*ptr0));
//    *ptr1 = input1 * (*ptr0) + input0 * (*ptr2);
// Output is not bit-exact with the reference C code, due to the replacement
// of WEBRTC_SPL_MUL_16_32_RSFT15 and LATTICE_MUL_32_32_RSFT16 with Neon
// instructions. The difference should not be bigger than 1.
void WebRtcIsacfix_FilterMaLoopNeon(int16_t input0,  // Filter coefficient
                                    int16_t input1,  // Filter coefficient
                                    int32_t input2,  // Inverse coefficient
                                    int32_t* ptr0,   // Sample buffer
                                    int32_t* ptr1,   // Sample buffer
                                    int32_t* ptr2)   // Sample buffer
{
  int n = 0;
  int loop = (HALF_SUBFRAMELEN - 1) >> 3;
  int loop_tail = (HALF_SUBFRAMELEN - 1) & 0x7;

  int32x4_t input0_v = vdupq_n_s32((int32_t)input0 << 16);
  int32x4_t input1_v = vdupq_n_s32((int32_t)input1 << 16);
  int32x4_t input2_v = vdupq_n_s32(input2);
  int32x4_t tmp0a, tmp1a, tmp2a, tmp3a;
  int32x4_t tmp0b, tmp1b, tmp2b, tmp3b;
  int32x4_t ptr0va, ptr1va, ptr2va;
  int32x4_t ptr0vb, ptr1vb, ptr2vb;

  // Unroll to process 8 samples at once.
  for (n = 0; n < loop; n++) {
    ptr0va = vld1q_s32(ptr0);
    ptr0vb = vld1q_s32(ptr0 + 4);
    ptr0 += 8;

    ptr2va = vld1q_s32(ptr2);
    ptr2vb = vld1q_s32(ptr2 + 4);

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);
    tmp0b = vqrdmulhq_s32(ptr0vb, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);
    tmp1b = vqrdmulhq_s32(ptr0vb, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2b = vaddq_s32(tmp0b, ptr2vb);
    tmp2a = vshlq_n_s32(tmp2a, 15);
    tmp2b = vshlq_n_s32(tmp2b, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);
    ptr2vb = vqrdmulhq_s32(tmp2b, input2_v);

    vst1q_s32(ptr2, ptr2va);
    vst1q_s32(ptr2 + 4, ptr2vb);
    ptr2 += 8;

    // Calculate tmp3 = ptr2v * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);
    tmp3b = vqrdmulhq_s32(ptr2vb, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);
    ptr1vb = vaddq_s32(tmp1b, tmp3b);

    vst1q_s32(ptr1, ptr1va);
    vst1q_s32(ptr1 + 4, ptr1vb);
    ptr1 += 8;
  }

  // Process four more samples.
  if (loop_tail & 0x4) {
    ptr0va = vld1q_s32(ptr0);
    ptr2va = vld1q_s32(ptr2);
    ptr0 += 4;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0a = vqrdmulhq_s32(ptr0va, input0_v);

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1a = vqrdmulhq_s32(ptr0va, input1_v);

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2a = vaddq_s32(tmp0a, ptr2va);
    tmp2a = vshlq_n_s32(tmp2a, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2va = vqrdmulhq_s32(tmp2a, input2_v);

    vst1q_s32(ptr2, ptr2va);
    ptr2 += 4;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3a = vqrdmulhq_s32(ptr2va, input0_v);

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1va = vaddq_s32(tmp1a, tmp3a);

    vst1q_s32(ptr1, ptr1va);
    ptr1 += 4;
  }

  // Process two more samples.
  if (loop_tail & 0x2) {
    int32x2_t ptr0v_tail, ptr2v_tail, ptr1v_tail;
    int32x2_t tmp0_tail, tmp1_tail, tmp2_tail, tmp3_tail;
    ptr0v_tail = vld1_s32(ptr0);
    ptr2v_tail = vld1_s32(ptr2);
    ptr0 += 2;

    // Calculate tmp0 = (*ptr0) * input0.
    tmp0_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input0_v));

    // Calculate tmp1 = (*ptr0) * input1.
    tmp1_tail = vqrdmulh_s32(ptr0v_tail, vget_low_s32(input1_v));

    // Calculate tmp2 = tmp0 + *(ptr2).
    tmp2_tail = vadd_s32(tmp0_tail, ptr2v_tail);
    tmp2_tail = vshl_n_s32(tmp2_tail, 15);

    // Calculate *ptr2 = input2 * tmp2.
    ptr2v_tail = vqrdmulh_s32(tmp2_tail, vget_low_s32(input2_v));

    vst1_s32(ptr2, ptr2v_tail);
    ptr2 += 2;

    // Calculate tmp3 = *(ptr2) * input0.
    tmp3_tail = vqrdmulh_s32(ptr2v_tail, vget_low_s32(input0_v));

    // Calculate *ptr1 = tmp1 + tmp3.
    ptr1v_tail = vadd_s32(tmp1_tail, tmp3_tail);

    vst1_s32(ptr1, ptr1v_tail);
    ptr1 += 2;
  }

  // Process one more sample.
  if (loop_tail & 0x1) {
    int16_t t16a = (int16_t)(input2 >> 16);
    int16_t t16b = (int16_t)input2;
    if (t16b < 0) t16a++;
    int32_t tmp32a;
    int32_t tmp32b;

    // Calculate *ptr2 = input2 * (*ptr2 + input0 * (*ptr0)).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr0);
    tmp32b = *ptr2 + tmp32a;
    *ptr2 = (int32_t)(WEBRTC_SPL_MUL(t16a, tmp32b) +
                       (WEBRTC_SPL_MUL_16_32_RSFT16(t16b, tmp32b)));

    // Calculate *ptr1 = input1 * (*ptr0) + input0 * (*ptr2).
    tmp32a = WEBRTC_SPL_MUL_16_32_RSFT15(input1, *ptr0);
    tmp32b = WEBRTC_SPL_MUL_16_32_RSFT15(input0, *ptr2);
    *ptr1 = tmp32a + tmp32b;
  }
}
Example #22
static inline int32_t ComplexMulAndFindMaxNeon(int16_t* inre1Q9,
                                               int16_t* inre2Q9,
                                               int32_t* outreQ16,
                                               int32_t* outimQ16) {
  int k;
  const int16_t* kCosTab = &WebRtcIsacfix_kCosTab1[0];
  const int16_t* kSinTab = &WebRtcIsacfix_kSinTab1[0];
  // 0.5 / sqrt(240) in Q19 is round((.5 / sqrt(240)) * (2^19)) = 16921.
  // Use "16921 << 5" and vqdmulh, instead of ">> 26" as in the C code.
  int32_t fact  = 16921 << 5;
  int32x4_t factq = vdupq_n_s32(fact);
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  for (k = 0; k < FRAMESAMPLES/2; k += 8) {
    int16x8_t tmpr = vld1q_s16(kCosTab);
    int16x8_t tmpi = vld1q_s16(kSinTab);
    int16x8_t inre1 = vld1q_s16(inre1Q9);
    int16x8_t inre2 = vld1q_s16(inre2Q9);
    kCosTab += 8;
    kSinTab += 8;
    inre1Q9 += 8;
    inre2Q9 += 8;

    // Use ">> 26", instead of ">> 7", ">> 16" and then ">> 3" as in the C code.
    int32x4_t tmp0 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre1));
    int32x4_t tmp1 = vmull_s16(vget_low_s16(tmpr), vget_low_s16(inre2));
    tmp0 = vmlal_s16(tmp0, vget_low_s16(tmpi), vget_low_s16(inre2));
    tmp1 = vmlsl_s16(tmp1, vget_low_s16(tmpi), vget_low_s16(inre1));
#if defined(WEBRTC_ARCH_ARM64)
    int32x4_t tmp2 = vmull_high_s16(tmpr, inre1);
    int32x4_t tmp3 = vmull_high_s16(tmpr, inre2);
    tmp2 = vmlal_high_s16(tmp2, tmpi, inre2);
    tmp3 = vmlsl_high_s16(tmp3, tmpi, inre1);
#else
    int32x4_t tmp2 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre1));
    int32x4_t tmp3 = vmull_s16(vget_high_s16(tmpr), vget_high_s16(inre2));
    tmp2 = vmlal_s16(tmp2, vget_high_s16(tmpi), vget_high_s16(inre2));
    tmp3 = vmlsl_s16(tmp3, vget_high_s16(tmpi), vget_high_s16(inre1));
#endif

    int32x4_t outr_0 = vqdmulhq_s32(tmp0, factq);
    int32x4_t outr_1 = vqdmulhq_s32(tmp2, factq);
    int32x4_t outi_0 = vqdmulhq_s32(tmp1, factq);
    int32x4_t outi_1 = vqdmulhq_s32(tmp3, factq);
    vst1q_s32(outreQ16, outr_0);
    outreQ16 += 4;
    vst1q_s32(outreQ16, outr_1);
    outreQ16 += 4;
    vst1q_s32(outimQ16, outi_0);
    outimQ16 += 4;
    vst1q_s32(outimQ16, outi_1);
    outimQ16 += 4;

    // Find the absolute maximum in the vectors.
    tmp0 = vabsq_s32(outr_0);
    tmp1 = vabsq_s32(outr_1);
    tmp2 = vabsq_s32(outi_0);
    tmp3 = vabsq_s32(outi_1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
Example #23
void silk_warped_autocorrelation_FIX_neon(
          opus_int32                *corr,                                  /* O    Result [order + 1]                                                          */
          opus_int                  *scale,                                 /* O    Scaling of the correlation vector                                           */
    const opus_int16                *input,                                 /* I    Input data to correlate                                                     */
    const opus_int                  warping_Q16,                            /* I    Warping coefficient                                                         */
    const opus_int                  length,                                 /* I    Length of input                                                             */
    const opus_int                  order                                   /* I    Correlation order (even)                                                    */
)
{
    if( ( MAX_SHAPE_LPC_ORDER > 24 ) || ( order < 6 ) ) {
        silk_warped_autocorrelation_FIX_c( corr, scale, input, warping_Q16, length, order );
    } else {
        opus_int       n, i, lsh;
        opus_int64     corr_QC[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 }; /* In reverse order */
        opus_int64     corr_QC_orderT;
        int64x2_t      lsh_s64x2;
        const opus_int orderT = ( order + 3 ) & ~3;
        opus_int64     *corr_QCT;
        opus_int32     *input_QS;
        VARDECL( opus_int32, input_QST );
        VARDECL( opus_int32, state );
        SAVE_STACK;

        /* Order must be even */
        silk_assert( ( order & 1 ) == 0 );
        silk_assert( 2 * QS - QC >= 0 );

        ALLOC( input_QST, length + 2 * MAX_SHAPE_LPC_ORDER, opus_int32 );

        input_QS = input_QST;
        /* input_QS has zero paddings in the beginning and end. */
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;

        /* Loop over samples */
        for( n = 0; n < length - 7; n += 8, input_QS += 8 ) {
            const int16x8_t t0_s16x4 = vld1q_s16( input + n );
            vst1q_s32( input_QS + 0, vshll_n_s16( vget_low_s16( t0_s16x4 ), QS ) );
            vst1q_s32( input_QS + 4, vshll_n_s16( vget_high_s16( t0_s16x4 ), QS ) );
        }
        for( ; n < length; n++, input_QS++ ) {
            input_QS[ 0 ] = silk_LSHIFT32( (opus_int32)input[ n ], QS );
        }
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS += 4;
        vst1q_s32( input_QS, vdupq_n_s32( 0 ) );
        input_QS = input_QST + MAX_SHAPE_LPC_ORDER - orderT;

        /* The following loop runs ( length + order ) times, with ( order ) extra epilogues.                  */
        /* The zero paddings in input_QS guarantee corr_QC's correctness even with the extra epilogues.       */
        /* The values of state_QS will be polluted by the extra epilogues, however they are temporary values. */

        /* Keep the C code here to help understand the intrinsics optimization. */
        /*
        {
            opus_int32 state_QS[ 2 ][ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
            opus_int32 *state_QST[ 3 ];
            state_QST[ 0 ] = state_QS[ 0 ];
            state_QST[ 1 ] = state_QS[ 1 ];
            for( n = 0; n < length + order; n++, input_QS++ ) {
                state_QST[ 0 ][ orderT ] = input_QS[ orderT ];
                for( i = 0; i < orderT; i++ ) {
                    corr_QC[ i ] += silk_RSHIFT64( silk_SMULL( state_QST[ 0 ][ i ], input_QS[ i ] ), 2 * QS - QC );
                    state_QST[ 1 ][ i ] = silk_SMLAWB( state_QST[ 1 ][ i + 1 ], state_QST[ 0 ][ i ] - state_QST[ 0 ][ i + 1 ], warping_Q16 );
                }
                state_QST[ 2 ] = state_QST[ 0 ];
                state_QST[ 0 ] = state_QST[ 1 ];
                state_QST[ 1 ] = state_QST[ 2 ];
            }
        }
        */

        {
            const int32x4_t warping_Q16_s32x4 = vdupq_n_s32( warping_Q16 << 15 );
            const opus_int32 *in = input_QS + orderT;
            opus_int o = orderT;
            int32x4_t state_QS_s32x4[ 3 ][ 2 ];

            ALLOC( state, length + orderT, opus_int32 );
            state_QS_s32x4[ 2 ][ 1 ] = vdupq_n_s32( 0 );

            /* Calculate 8 taps of all inputs in each loop. */
            do {
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 0 ][ 1 ] =
                state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 1 ][ 1 ] = vdupq_n_s32( 0 );
                n = 0;
                do {
                    calc_corr( input_QS + n, corr_QC, o - 8, state_QS_s32x4[ 0 ][ 0 ] );
                    calc_corr( input_QS + n, corr_QC, o - 4, state_QS_s32x4[ 0 ][ 1 ] );
                    state_QS_s32x4[ 2 ][ 1 ] = vld1q_s32( in + n );
                    vst1q_lane_s32( state + n, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 0 ][ 1 ], 1 );
                    state_QS_s32x4[ 2 ][ 1 ] = vextq_s32( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 0 ][ 1 ] = calc_state( state_QS_s32x4[ 0 ][ 1 ], state_QS_s32x4[ 2 ][ 1 ], state_QS_s32x4[ 1 ][ 1 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    state_QS_s32x4[ 1 ][ 1 ] = state_QS_s32x4[ 2 ][ 1 ];
                } while( ++n < ( length + order ) );
                in = state;
                o -= 8;
            } while( o > 4 );

            if( o ) {
                /* Calculate the last 4 taps of all inputs. */
                opus_int32 *stateT = state;
                silk_assert( o == 4 );
                state_QS_s32x4[ 0 ][ 0 ] = state_QS_s32x4[ 1 ][ 0 ] = vdupq_n_s32( 0 );
                n = length + order;
                do {
                    calc_corr( input_QS, corr_QC, 0, state_QS_s32x4[ 0 ][ 0 ] );
                    state_QS_s32x4[ 2 ][ 0 ] = vld1q_s32( stateT );
                    vst1q_lane_s32( stateT, state_QS_s32x4[ 0 ][ 0 ], 0 );
                    state_QS_s32x4[ 2 ][ 0 ] = vextq_s32( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], 1 );
                    state_QS_s32x4[ 0 ][ 0 ] = calc_state( state_QS_s32x4[ 0 ][ 0 ], state_QS_s32x4[ 2 ][ 0 ], state_QS_s32x4[ 1 ][ 0 ], warping_Q16_s32x4 );
                    state_QS_s32x4[ 1 ][ 0 ] = state_QS_s32x4[ 2 ][ 0 ];
                    input_QS++;
                    stateT++;
                } while( --n );
            }
        }

        {
            const opus_int16 *inputT = input;
            int32x4_t t_s32x4;
            int64x1_t t_s64x1;
            int64x2_t t_s64x2 = vdupq_n_s64( 0 );
            for( n = 0; n <= length - 8; n += 8 ) {
                int16x8_t input_s16x8 = vld1q_s16( inputT );
                t_s32x4 = vmull_s16( vget_low_s16( input_s16x8 ), vget_low_s16( input_s16x8 ) );
                t_s32x4 = vmlal_s16( t_s32x4, vget_high_s16( input_s16x8 ), vget_high_s16( input_s16x8 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_low_s32( t_s32x4 ) );
                t_s64x2 = vaddw_s32( t_s64x2, vget_high_s32( t_s32x4 ) );
                inputT += 8;
            }
            t_s64x1 = vadd_s64( vget_low_s64( t_s64x2 ), vget_high_s64( t_s64x2 ) );
            corr_QC_orderT = vget_lane_s64( t_s64x1, 0 );
            for( ; n < length; n++ ) {
                corr_QC_orderT += silk_SMULL( input[ n ], input[ n ] );
            }
            corr_QC_orderT = silk_LSHIFT64( corr_QC_orderT, QC );
            corr_QC[ orderT ] = corr_QC_orderT;
        }

        corr_QCT = corr_QC + orderT - order;
        lsh = silk_CLZ64( corr_QC_orderT ) - 35;
        lsh = silk_LIMIT( lsh, -12 - QC, 30 - QC );
        *scale = -( QC + lsh );
        silk_assert( *scale >= -30 && *scale <= 12 );
        lsh_s64x2 = vdupq_n_s64( lsh );
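        /* corr_QCT[ j ] holds lag j; corr[] is written in reverse so that
           corr[ order - j ] receives the shifted, narrowed corr_QCT[ j ],
           four lags per loop iteration. */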
        for( i = 0; i <= order - 3; i += 4 ) {
            int32x4_t corr_s32x4;
            int64x2_t corr_QC0_s64x2, corr_QC1_s64x2;
            corr_QC0_s64x2 = vld1q_s64( corr_QCT + i );
            corr_QC1_s64x2 = vld1q_s64( corr_QCT + i + 2 );
            corr_QC0_s64x2 = vshlq_s64( corr_QC0_s64x2, lsh_s64x2 );
            corr_QC1_s64x2 = vshlq_s64( corr_QC1_s64x2, lsh_s64x2 );
            corr_s32x4     = vcombine_s32( vmovn_s64( corr_QC1_s64x2 ), vmovn_s64( corr_QC0_s64x2 ) );
            corr_s32x4     = vrev64q_s32( corr_s32x4 );
            vst1q_s32( corr + order - i - 3, corr_s32x4 );
        }
        if( lsh >= 0 ) {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_LSHIFT64( corr_QCT[ i ], lsh ) );
            }
        } else {
            for( ; i < order + 1; i++ ) {
                corr[ order - i ] = (opus_int32)silk_CHECK_FIT32( silk_RSHIFT64( corr_QCT[ i ], -lsh ) );
            }
        }
        silk_assert( corr_QCT[ order ] >= 0 ); /* If this fails, decrease QC. */
        RESTORE_STACK;
    }

#ifdef OPUS_CHECK_ASM
    {
        opus_int32 corr_c[ MAX_SHAPE_LPC_ORDER + 1 ];
        opus_int   scale_c;
        silk_warped_autocorrelation_FIX_c( corr_c, &scale_c, input, warping_Q16, length, order );
        silk_assert( !memcmp( corr_c, corr, sizeof( corr_c[ 0 ] ) * ( order + 1 ) ) );
        silk_assert( scale_c == *scale );
    }
#endif
}
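For orientation, here is a minimal sketch of the scalar per-sample recursion that this NEON routine vectorizes, written with the standard SILK fixed-point helpers. It is a paraphrase for illustration, not a copy of the reference silk_warped_autocorrelation_FIX_c, and the Q-format constants QS and QC are assumed from context; state_QS[] starts zeroed.
opus_int   n, i;
opus_int32 tmp1_QS, tmp2_QS;
opus_int32 state_QS[ MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };
opus_int64 corr_QC[  MAX_SHAPE_LPC_ORDER + 1 ] = { 0 };

for( n = 0; n < length; n++ ) {
    tmp1_QS = silk_LSHIFT32( (opus_int32)input[ n ], QS );
    for( i = 0; i < order; i++ ) {
        /* First-order allpass section i; its output feeds section i + 1. */
        tmp2_QS = silk_SMLAWB( state_QS[ i ], state_QS[ i + 1 ] - tmp1_QS, warping_Q16 );
        state_QS[ i ]  = tmp1_QS;
        /* Correlate this tap's new state against the current input, state_QS[ 0 ]. */
        corr_QC[ i ]  += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC );
        tmp1_QS = tmp2_QS;
    }
    state_QS[ order ]  = tmp1_QS;
    corr_QC[ order ]  += silk_RSHIFT64( silk_SMULL( tmp1_QS, state_QS[ 0 ] ), 2 * QS - QC );
}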
Exemplo n.º 24
0
static inline int32_t TransformAndFindMaxNeon(int16_t* inre,
                                              int16_t* inim,
                                              int32_t* outre,
                                              int32_t* outim) {
  int k;
  int16_t* inre1 = inre;
  int16_t* inre2 = &inre[FRAMESAMPLES/2 - 4];
  int16_t* inim1 = inim;
  int16_t* inim2 = &inim[FRAMESAMPLES/2 - 4];
  int32_t* outre1 = outre;
  int32_t* outre2 = &outre[FRAMESAMPLES/2 - 4];
  int32_t* outim1 = outim;
  int32_t* outim2 = &outim[FRAMESAMPLES/2 - 4];
  const int16_t* kSinTab1 = &WebRtcIsacfix_kSinTab2[0];
  const int16_t* kSinTab2 = &WebRtcIsacfix_kSinTab2[FRAMESAMPLES/4 - 4];
  uint32x4_t max_r = vdupq_n_u32(0);
  uint32x4_t max_i = vdupq_n_u32(0);

  // Use ">> 5" instead of "<< 9" and then ">> 14" as in the C code.
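  // Arithmetically, (x << 9) >> 14 equals x >> 5 as long as the intermediate
  // "<< 9" does not overflow, so the two shifts collapse into one.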
  for (k = 0; k < FRAMESAMPLES/4; k += 4) {
    int16x4_t tmpi = vld1_s16(kSinTab1);
    kSinTab1 += 4;
    int16x4_t tmpr = vld1_s16(kSinTab2);
    kSinTab2 -= 4;
    int16x4_t inre_0 = vld1_s16(inre1);
    inre1 += 4;
    int16x4_t inre_1 = vld1_s16(inre2);
    inre2 -= 4;
    int16x4_t inim_0 = vld1_s16(inim1);
    inim1 += 4;
    int16x4_t inim_1 = vld1_s16(inim2);
    inim2 -= 4;
    tmpr = vneg_s16(tmpr);
    inre_1 = vrev64_s16(inre_1);
    inim_1 = vrev64_s16(inim_1);
    tmpr = vrev64_s16(tmpr);

    int32x4_t xr = vmull_s16(tmpr, inre_0);
    int32x4_t xi = vmull_s16(tmpr, inim_0);
    int32x4_t yr = vmull_s16(tmpr, inim_1);
    int32x4_t yi = vmull_s16(tmpi, inim_1);
    xr = vmlal_s16(xr, tmpi, inim_0);
    xi = vmlsl_s16(xi, tmpi, inre_0);
    yr = vmlal_s16(yr, tmpi, inre_1);
    yi = vmlsl_s16(yi, tmpr, inre_1);
    yr = vnegq_s32(yr);

    xr = vshrq_n_s32(xr, 5);
    xi = vshrq_n_s32(xi, 5);
    yr = vshrq_n_s32(yr, 5);
    yi = vshrq_n_s32(yi, 5);

    int32x4_t outr0 = vsubq_s32(xr, yi);
    int32x4_t outr1 = vaddq_s32(xr, yi);
    int32x4_t outi0 = vaddq_s32(xi, yr);
    int32x4_t outi1 = vsubq_s32(yr, xi);

    // Find the absolute maximum in the vectors.
    int32x4_t tmp0 = vabsq_s32(outr0);
    int32x4_t tmp1 = vabsq_s32(outr1);
    int32x4_t tmp2 = vabsq_s32(outi0);
    int32x4_t tmp3 = vabsq_s32(outi1);
    // vabs doesn't change the value of 0x80000000.
    // Use u32 so we don't lose the value 0x80000000.
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp0));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp2));
    max_r = vmaxq_u32(max_r, vreinterpretq_u32_s32(tmp1));
    max_i = vmaxq_u32(max_i, vreinterpretq_u32_s32(tmp3));

    // Store the vectors.
    outr1 = vrev64q_s32(outr1);
    outi1 = vrev64q_s32(outi1);
    int32x4_t outr_1 = vcombine_s32(vget_high_s32(outr1), vget_low_s32(outr1));
    int32x4_t outi_1 = vcombine_s32(vget_high_s32(outi1), vget_low_s32(outi1));

    vst1q_s32(outre1, outr0);
    outre1 += 4;
    vst1q_s32(outim1, outi0);
    outim1 += 4;
    vst1q_s32(outre2, outr_1);
    outre2 -= 4;
    vst1q_s32(outim2, outi_1);
    outim2 -= 4;
  }

  max_r = vmaxq_u32(max_r, max_i);
#if defined(WEBRTC_ARCH_ARM64)
  uint32_t maximum = vmaxvq_u32(max_r);
#else
  uint32x2_t max32x2_r = vmax_u32(vget_low_u32(max_r), vget_high_u32(max_r));
  max32x2_r = vpmax_u32(max32x2_r, max32x2_r);
  uint32_t maximum = vget_lane_u32(max32x2_r, 0);
#endif

  return (int32_t)maximum;
}
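For reference, a scalar sketch (an assumption, not taken from the original file) of the maximum that the vectorized loop tracks; it mirrors the unsigned handling noted in the comments so that 0x80000000 survives the absolute value.
  uint32_t maximum = 0;
  for (k = 0; k < FRAMESAMPLES/2; k++) {
    uint32_t abs_r = (uint32_t)outre[k];
    uint32_t abs_i = (uint32_t)outim[k];
    if (outre[k] < 0) abs_r = 0u - abs_r;  // Unsigned negate keeps 0x80000000.
    if (outim[k] < 0) abs_i = 0u - abs_i;
    if (abs_r > maximum) maximum = abs_r;
    if (abs_i > maximum) maximum = abs_i;
  }
  return (int32_t)maximum;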
Exemplo n.º 25
0
void WebRtcAecm_CalcLinearEnergiesNeon(AecmCore* aecm,
                                       const uint16_t* far_spectrum,
                                       int32_t* echo_est,
                                       uint32_t* far_energy,
                                       uint32_t* echo_energy_adapt,
                                       uint32_t* echo_energy_stored) {
  int16_t* start_stored_p = aecm->channelStored;
  int16_t* start_adapt_p = aecm->channelAdapt16;
  int32_t* echo_est_p = echo_est;
  const int16_t* end_stored_p = aecm->channelStored + PART_LEN;
  const uint16_t* far_spectrum_p = far_spectrum;
  int16x8_t store_v, adapt_v;
  uint16x8_t spectrum_v;
  uint32x4_t echo_est_v_low, echo_est_v_high;
  uint32x4_t far_energy_v, echo_stored_v, echo_adapt_v;

  far_energy_v = vdupq_n_u32(0);
  echo_adapt_v = vdupq_n_u32(0);
  echo_stored_v = vdupq_n_u32(0);

  // Get energy for the delayed far end signal and estimated
  // echo using both stored and adapted channels.
  // The C code:
  //  for (i = 0; i < PART_LEN1; i++) {
  //      echo_est[i] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[i],
  //                                         far_spectrum[i]);
  //      (*far_energy) += (uint32_t)(far_spectrum[i]);
  //      *echo_energy_adapt += aecm->channelAdapt16[i] * far_spectrum[i];
  //      (*echo_energy_stored) += (uint32_t)echo_est[i];
  //  }
  while (start_stored_p < end_stored_p) {
    spectrum_v = vld1q_u16(far_spectrum_p);
    adapt_v = vld1q_s16(start_adapt_p);
    store_v = vld1q_s16(start_stored_p);

    far_energy_v = vaddw_u16(far_energy_v, vget_low_u16(spectrum_v));
    far_energy_v = vaddw_u16(far_energy_v, vget_high_u16(spectrum_v));

    echo_est_v_low = vmull_u16(vreinterpret_u16_s16(vget_low_s16(store_v)),
                               vget_low_u16(spectrum_v));
    echo_est_v_high = vmull_u16(vreinterpret_u16_s16(vget_high_s16(store_v)),
                                vget_high_u16(spectrum_v));
    vst1q_s32(echo_est_p, vreinterpretq_s32_u32(echo_est_v_low));
    vst1q_s32(echo_est_p + 4, vreinterpretq_s32_u32(echo_est_v_high));

    echo_stored_v = vaddq_u32(echo_est_v_low, echo_stored_v);
    echo_stored_v = vaddq_u32(echo_est_v_high, echo_stored_v);

    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_low_s16(adapt_v)),
                             vget_low_u16(spectrum_v));
    echo_adapt_v = vmlal_u16(echo_adapt_v,
                             vreinterpret_u16_s16(vget_high_s16(adapt_v)),
                             vget_high_u16(spectrum_v));

    start_stored_p += 8;
    start_adapt_p += 8;
    far_spectrum_p += 8;
    echo_est_p += 8;
  }

  AddLanes(far_energy, far_energy_v);
  AddLanes(echo_energy_stored, echo_stored_v);
  AddLanes(echo_energy_adapt, echo_adapt_v);

  echo_est[PART_LEN] = WEBRTC_SPL_MUL_16_U16(aecm->channelStored[PART_LEN],
                                             far_spectrum[PART_LEN]);
  *echo_energy_stored += (uint32_t)echo_est[PART_LEN];
  *far_energy += (uint32_t)far_spectrum[PART_LEN];
  *echo_energy_adapt += aecm->channelAdapt16[PART_LEN] * far_spectrum[PART_LEN];
}
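AddLanes() is defined elsewhere in the file and is not shown in this example. A plausible implementation (an assumption, included only for context) sums the four u32 lanes and stores the result through the pointer:
static inline void AddLanes(uint32_t* ptr, uint32x4_t v) {
#if defined(WEBRTC_ARCH_ARM64)
  *ptr = vaddvq_u32(v);  // Across-vector add is a single instruction on AArch64.
#else
  uint32x2_t tmp = vadd_u32(vget_low_u32(v), vget_high_u32(v));
  tmp = vpadd_u32(tmp, tmp);     // Pairwise add folds the remaining two lanes.
  *ptr = vget_lane_u32(tmp, 0);
#endif
}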
Exemplo n.º 26
0
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8)
{
  int16_t scaling, n, k;
  int32_t ysum32, csum32, lys, lcs;
  int32_t oneQ8;
  const int16_t *x, *inptr;

  oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8);  // 1.00 in Q8

  x = in + PITCH_MAX_LAG/2 + 2;
  scaling = WebRtcSpl_GetScalingSquare ((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2);
  ysum32 = 1;
  csum32 = 0;
  x = in + PITCH_MAX_LAG/2 + 2;
  for (n = 0; n < PITCH_CORR_LEN2; n++) {
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t)in[n], (int16_t)in[n], scaling);  // Q0
    csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t)x[n], (int16_t)in[n], scaling);   // Q0
  }

  logcorQ8 += PITCH_LAG_SPAN2 - 1;

  lys = Log2Q8((uint32_t)ysum32);       // Q8
  lys = WEBRTC_SPL_RSHIFT_W32(lys, 1);  // sqrt(ysum)

  if (csum32 > 0) {
    lcs = Log2Q8((uint32_t)csum32);  // 2log(csum) in Q8

    if (lcs > (lys + oneQ8)) {  // csum/sqrt(ysum) > 2 in Q8
      *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
    } else {
      *logcorQ8 = oneQ8;  // 1.00
    }

  } else {
    *logcorQ8 = 0;
  }


  for (k = 1; k < PITCH_LAG_SPAN2; k++) {
    inptr = &in[k];
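    // Slide the correlation window one sample: remove in[k - 1], which leaves
    // the window, and add in[PITCH_CORR_LEN2 + k - 1], which enters it.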
    ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT((int16_t)in[k - 1], (int16_t)in[k - 1], scaling);
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t)in[PITCH_CORR_LEN2 + k - 1], (int16_t)in[PITCH_CORR_LEN2 + k - 1], scaling);

#ifdef WEBRTC_ARCH_ARM_NEON
    {
      int32_t vbuff[4];
      int32x4_t int_32x4_sum = vmovq_n_s32(0);
      // A NEON register can't be right-shifted by a non-constant amount,
      // so shift left by the negated scaling value instead.
      int32x4_t int_32x4_scale = vdupq_n_s32(-scaling);
      // Assert a condition used in loop unrolling at compile time.
      COMPILE_ASSERT(PITCH_CORR_LEN2 % 4 == 0);

      for (n = 0; n < PITCH_CORR_LEN2; n += 4) {
        int16x4_t int_16x4_x = vld1_s16(&x[n]);
        int16x4_t int_16x4_in = vld1_s16(&inptr[n]);
        int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in);
        int_32x4 = vshlq_s32(int_32x4, int_32x4_scale);
        int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4);
      }

      // Use a vector store to avoid a long stall from transferring data
      // from a vector register to a general-purpose register.
      vst1q_s32(vbuff, int_32x4_sum);
      csum32 = vbuff[0] + vbuff[1];
      csum32 += vbuff[2];
      csum32 += vbuff[3];
    }
#else
    csum32 = 0;
    if (scaling == 0) {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += x[n] * inptr[n];
      }
    } else {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += (x[n] * inptr[n]) >> scaling;
      }
    }
#endif

    logcorQ8--;

    lys = Log2Q8((uint32_t)ysum32);       // Q8
    lys = WEBRTC_SPL_RSHIFT_W32(lys, 1);  // sqrt(ysum)

    if (csum32 > 0) {
      lcs = Log2Q8((uint32_t)csum32);  // 2log(csum) in Q8

      if (lcs > (lys + oneQ8)) {  // csum/sqrt(ysum) > 2
        *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
      } else {
        *logcorQ8 = oneQ8;  // 1.00
      }

    } else {
      *logcorQ8 = 0;
    }
  }
}
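For clarity, the Q8 log arithmetic behind the threshold test above works out as follows (a reading of the comments in the example, not text from the original source):
  lcs   = Log2Q8(csum32)        ~  256 * log2(csum)
  lys   = Log2Q8(ysum32) >> 1   ~  256 * log2(sqrt(ysum))
  oneQ8 = 256                      (1.00 in Q8, i.e. log2(2) in Q8)
  lcs > lys + oneQ8        <=>     csum / sqrt(ysum) > 2
  logcorQ8 = lcs - lys      ~      256 * log2(csum / sqrt(ysum))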