예제 #1
0
static INLINE void idct4x4_16_kernel_bd10(const int32x4_t cospis,
                                          int32x4_t *const a0,
                                          int32x4_t *const a1,
                                          int32x4_t *const a2,
                                          int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;

  transpose_s32_4x4(a0, a1, a2, a3);
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  b0 = vmulq_lane_s32(b0, vget_high_s32(cospis), 0);
  b1 = vmulq_lane_s32(b1, vget_high_s32(cospis), 0);
  b2 = vmulq_lane_s32(*a1, vget_high_s32(cospis), 1);
  b3 = vmulq_lane_s32(*a1, vget_low_s32(cospis), 1);
  b2 = vmlsq_lane_s32(b2, *a3, vget_low_s32(cospis), 1);
  b3 = vmlaq_lane_s32(b3, *a3, vget_high_s32(cospis), 1);
  b0 = vrshrq_n_s32(b0, DCT_CONST_BITS);
  b1 = vrshrq_n_s32(b1, DCT_CONST_BITS);
  b2 = vrshrq_n_s32(b2, DCT_CONST_BITS);
  b3 = vrshrq_n_s32(b3, DCT_CONST_BITS);
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
예제 #2
0
unsigned int vpx_get4x4sse_cs_neon(
        const unsigned char *src_ptr,
        int source_stride,
        const unsigned char *ref_ptr,
        int recon_stride) {
    int16x4_t d22s16, d24s16, d26s16, d28s16;
    int64x1_t d0s64;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int32x4_t q7s32, q8s32, q9s32, q10s32;
    uint16x8_t q11u16, q12u16, q13u16, q14u16;
    int64x2_t q1s64;

    d0u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d4u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d1u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d5u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d2u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d6u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;
    d3u8 = vld1_u8(src_ptr);
    src_ptr += source_stride;
    d7u8 = vld1_u8(ref_ptr);
    ref_ptr += recon_stride;

    q11u16 = vsubl_u8(d0u8, d4u8);
    q12u16 = vsubl_u8(d1u8, d5u8);
    q13u16 = vsubl_u8(d2u8, d6u8);
    q14u16 = vsubl_u8(d3u8, d7u8);

    d22s16 = vget_low_s16(vreinterpretq_s16_u16(q11u16));
    d24s16 = vget_low_s16(vreinterpretq_s16_u16(q12u16));
    d26s16 = vget_low_s16(vreinterpretq_s16_u16(q13u16));
    d28s16 = vget_low_s16(vreinterpretq_s16_u16(q14u16));

    q7s32 = vmull_s16(d22s16, d22s16);
    q8s32 = vmull_s16(d24s16, d24s16);
    q9s32 = vmull_s16(d26s16, d26s16);
    q10s32 = vmull_s16(d28s16, d28s16);

    q7s32 = vaddq_s32(q7s32, q8s32);
    q9s32 = vaddq_s32(q9s32, q10s32);
    q9s32 = vaddq_s32(q7s32, q9s32);

    q1s64 = vpaddlq_s32(q9s32);
    d0s64 = vadd_s64(vget_low_s64(q1s64), vget_high_s64(q1s64));

    return vget_lane_u32(vreinterpret_u32_s64(d0s64), 0);
}
예제 #3
0
파일: mw_neon.c 프로젝트: Daniel-Hwang/eeg
/* s32x4 mm mul */
void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C)
{
	int i, k, j;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_s32(0);

			for (j = 0; j < T; j+=4)
			{

				int j_T = j * T + i;
				int k_Row = k * Row;

				neon_a0 = vld1q_s32(A + j_T);
				j_T+=Row;
				neon_a1 = vld1q_s32(A + j_T);
				j_T+=Row;
				neon_a2 = vld1q_s32(A + j_T);
				j_T+=Row;
				neon_a3 = vld1q_s32(A + j_T);

				neon_b = vld1q_s32(B + k_Row + j);
				neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
				neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
				neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
				neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

				neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

				vst1q_lane_s32(C + k_Row + i, neon_c, 0);
				vst1q_lane_s32(C + k_Row + i + 1, neon_c, 1);
				vst1q_lane_s32(C + k_Row + i + 2, neon_c, 2);
				vst1q_lane_s32(C + k_Row + i + 3, neon_c, 3);

			}
		}
	}
}
예제 #4
0
static inline void silk_biquad_alt_stride2_kernel( const int32x4_t A_L_s32x4, const int32x4_t A_U_s32x4, const int32x4_t B_Q28_s32x4, const int32x2_t t_s32x2, const int32x4_t in_s32x4, int32x4_t *S_s32x4, int32x2_t *out32_Q14_s32x2 )
{
    int32x4_t t_s32x4, out32_Q14_s32x4;

    *out32_Q14_s32x2 = vadd_s32( vget_low_s32( *S_s32x4 ), t_s32x2 );              /* silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} )                                      */
    *S_s32x4         = vcombine_s32( vget_high_s32( *S_s32x4 ), vdup_n_s32( 0 ) ); /* S{0,1} = S{2,3}; S{2,3} = 0;                                                    */
    *out32_Q14_s32x2 = vshl_n_s32( *out32_Q14_s32x2, 2 );                          /* out32_Q14_{0,1} = silk_LSHIFT( silk_SMLAWB( S{0,1}, B_Q28[ 0 ], in{0,1} ), 2 ); */
    out32_Q14_s32x4  = vcombine_s32( *out32_Q14_s32x2, *out32_Q14_s32x2 );         /* out32_Q14_{0,1,0,1}                                                             */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_L_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_L_Q28 )                            */
    *S_s32x4         = vrsraq_n_s32( *S_s32x4, t_s32x4, 14 );                      /* S{0,1} = S{2,3} + silk_RSHIFT_ROUND();  S{2,3} = silk_RSHIFT_ROUND();           */
    t_s32x4          = vqdmulhq_s32( out32_Q14_s32x4, A_U_s32x4 );                 /* silk_SMULWB( out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 )                            */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S{0,1,2,3}, out32_Q14_{0,1,0,1}, A{0,0,1,1}_U_Q28 );          */
    t_s32x4          = vqdmulhq_s32( in_s32x4, B_Q28_s32x4 );                      /* silk_SMULWB( B_Q28[ {1,1,2,2} ], in{0,1,0,1} )                                  */
    *S_s32x4         = vaddq_s32( *S_s32x4, t_s32x4 );                             /* S0 = silk_SMLAWB( S0, B_Q28[ {1,1,2,2} ], in{0,1,0,1} );                        */
}
예제 #5
0
inline v_int32x4 v_dotprod(const v_int16x8& a, const v_int16x8& b)
{
    int32x4_t c = vmull_s16(vget_low_s16(a.val), vget_low_s16(b.val));
    int32x4_t d = vmull_s16(vget_high_s16(a.val), vget_high_s16(b.val));
    int32x4x2_t cd = vtrnq_s32(c, d);
    return v_int32x4(vaddq_s32(cd.val[0], cd.val[1]));
}
static void variance_neon_w8(const uint8_t *a, int a_stride,
                             const uint8_t *b, int b_stride,
                             int w, int h, unsigned int *sse, int *sum) {
  int i, j;
  int16x8_t v_sum = vdupq_n_s16(0);
  int32x4_t v_sse_lo = vdupq_n_s32(0);
  int32x4_t v_sse_hi = vdupq_n_s32(0);

  for (i = 0; i < h; ++i) {
    for (j = 0; j < w; j += 8) {
      const uint8x8_t v_a = vld1_u8(&a[j]);
      const uint8x8_t v_b = vld1_u8(&b[j]);
      const uint16x8_t v_diff = vsubl_u8(v_a, v_b);
      const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff);
      v_sum = vaddq_s16(v_sum, sv_diff);
      v_sse_lo = vmlal_s16(v_sse_lo,
                           vget_low_s16(sv_diff),
                           vget_low_s16(sv_diff));
      v_sse_hi = vmlal_s16(v_sse_hi,
                           vget_high_s16(sv_diff),
                           vget_high_s16(sv_diff));
    }
    a += a_stride;
    b += b_stride;
  }

  *sum = horizontal_add_s16x8(v_sum);
  *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi));
}
void decal_nofilter_scale_neon(uint32_t dst[], SkFixed fx, SkFixed dx, int count)
{
    int i;

    if (count >= 8) {
        /* SkFixed is 16.16 fixed point */
        SkFixed dx2 = dx+dx;
        SkFixed dx4 = dx2+dx2;
        SkFixed dx8 = dx4+dx4;

        /* now build fx/fx+dx/fx+2dx/fx+3dx */
        SkFixed fx1, fx2, fx3;
        int32x4_t lbase, hbase;
        uint16_t *dst16 = (uint16_t *)dst;

        fx1 = fx+dx;
        fx2 = fx1+dx;
        fx3 = fx2+dx;

        /* avoid an 'lbase unitialized' warning */
        lbase = vdupq_n_s32(fx);
        lbase = vsetq_lane_s32(fx1, lbase, 1);
        lbase = vsetq_lane_s32(fx2, lbase, 2);
        lbase = vsetq_lane_s32(fx3, lbase, 3);
        hbase = vaddq_s32(lbase, vdupq_n_s32(dx4));

        /* take upper 16 of each, store, and bump everything */
        do {
            int32x4_t lout, hout;
            uint16x8_t hi16;

            lout = lbase;
            hout = hbase;
            /* gets hi's of all louts then hi's of all houts */
            asm ("vuzpq.16 %q0, %q1" : "+w" (lout), "+w" (hout));
            hi16 = vreinterpretq_u16_s32(hout);
            vst1q_u16(dst16, hi16);

            /* on to the next */
            lbase = vaddq_s32 (lbase, vdupq_n_s32(dx8));
            hbase = vaddq_s32 (hbase, vdupq_n_s32(dx8));
            dst16 += 8;
            count -= 8;
            fx += dx8;
        } while (count >= 8);
        dst = (uint32_t *) dst16;
    }
예제 #8
0
static INLINE void IADST4x4_1D(int16x4_t *d3s16, int16x4_t *d4s16,
                               int16x4_t *d5s16, int16x8_t *q3s16,
                               int16x8_t *q8s16, int16x8_t *q9s16) {
  int16x4_t d6s16, d16s16, d17s16, d18s16, d19s16;
  int32x4_t q8s32, q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d6s16 = vget_low_s16(*q3s16);

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);

  q10s32 = vmull_s16(*d3s16, d16s16);
  q11s32 = vmull_s16(*d4s16, d16s16);
  q12s32 = vmull_s16(d6s16, d17s16);
  q13s32 = vmull_s16(*d5s16, d18s16);
  q14s32 = vmull_s16(*d3s16, d18s16);
  q15s32 = vmovl_s16(d16s16);
  q15s32 = vaddw_s16(q15s32, d19s16);
  q8s32 = vmull_s16(*d4s16, d19s16);
  q15s32 = vsubw_s16(q15s32, d18s16);
  q9s32 = vmull_s16(*d5s16, d19s16);

  q10s32 = vaddq_s32(q10s32, q13s32);
  q10s32 = vaddq_s32(q10s32, q8s32);
  q11s32 = vsubq_s32(q11s32, q14s32);
  q8s32 = vdupq_n_s32(sinpi_3_9);
  q11s32 = vsubq_s32(q11s32, q9s32);
  q15s32 = vmulq_s32(q15s32, q8s32);

  q13s32 = vaddq_s32(q10s32, q12s32);
  q10s32 = vaddq_s32(q10s32, q11s32);
  q14s32 = vaddq_s32(q11s32, q12s32);
  q10s32 = vsubq_s32(q10s32, q12s32);

  d16s16 = vqrshrn_n_s32(q13s32, 14);
  d17s16 = vqrshrn_n_s32(q14s32, 14);
  d18s16 = vqrshrn_n_s32(q15s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);

  *q8s16 = vcombine_s16(d16s16, d17s16);
  *q9s16 = vcombine_s16(d18s16, d19s16);
  return;
}
static INLINE void iadst_half_butterfly_bd10_neon(int32x4_t *const x,
                                                  const int32x2_t c) {
  const int32x4_t sum = vaddq_s32(x[0], x[1]);
  const int32x4_t sub = vsubq_s32(x[0], x[1]);

  x[0] = vmulq_lane_s32(sum, c, 0);
  x[1] = vmulq_lane_s32(sub, c, 0);
  x[0] = vrshrq_n_s32(x[0], DCT_CONST_BITS);
  x[1] = vrshrq_n_s32(x[1], DCT_CONST_BITS);
}
예제 #10
0
파일: armdct.c 프로젝트: AlecGamble/daala
OD_SIMD_INLINE void od_idct4_kernel(int32x4_t *y0, int32x4_t *y1,
 int32x4_t *y2, int32x4_t *y3) {
  int32x4_t t0 = *y0;
  int32x4_t t1 = *y1;
  int32x4_t t2 = *y2;
  int32x4_t t3 = *y3;
  int32x4_t t2h;
  od_transpose4(&t0, &t1, &t2, &t3);
  t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14));
  t1 = vsubq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15));
  t3 = vaddq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15));
  t2 = vsubq_s32(t0, t2);
  t2h = OD_UNBIASED_RSHIFT(t2, 1);
  t0 = vsubq_s32(t0, vsubq_s32(t2h, OD_UNBIASED_RSHIFT(t3, 1)));
  t1 = vsubq_s32(t2h, t1);
  *y0 = t0;
  *y1 = vsubq_s32(t2, t1);
  *y2 = t1;
  *y3 = vsubq_s32(t0, t3);
}
예제 #11
0
파일: mw_neon.c 프로젝트: Daniel-Hwang/eeg
/* s32x4 mv mul */
void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
	int i = 0;
	int k = 0;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_s32(0);

		for (k = 0; k < T; k+=4)
		{
			int j = k * T + i;

			neon_a0 = vld1q_s32(A + j);
			j+=Row;
			neon_a1 = vld1q_s32(A + j);
			j+=Row;
			neon_a2 = vld1q_s32(A + j);
			j+=Row;
			neon_a3 = vld1q_s32(A + j);

			neon_b = vld1q_s32(B + k);
			neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
			neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
			neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
			neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

			neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

		}

		vst1q_s32(C + i, neon_c);
	}
}
예제 #12
0
static INLINE void idct4x4_16_kernel_bd12(const int32x4_t cospis,
                                          int32x4_t *const a0,
                                          int32x4_t *const a1,
                                          int32x4_t *const a2,
                                          int32x4_t *const a3) {
  int32x4_t b0, b1, b2, b3;
  int64x2_t c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, c10, c11;

  transpose_s32_4x4(a0, a1, a2, a3);
  b0 = vaddq_s32(*a0, *a2);
  b1 = vsubq_s32(*a0, *a2);
  c0 = vmull_lane_s32(vget_low_s32(b0), vget_high_s32(cospis), 0);
  c1 = vmull_lane_s32(vget_high_s32(b0), vget_high_s32(cospis), 0);
  c2 = vmull_lane_s32(vget_low_s32(b1), vget_high_s32(cospis), 0);
  c3 = vmull_lane_s32(vget_high_s32(b1), vget_high_s32(cospis), 0);
  c4 = vmull_lane_s32(vget_low_s32(*a1), vget_high_s32(cospis), 1);
  c5 = vmull_lane_s32(vget_high_s32(*a1), vget_high_s32(cospis), 1);
  c6 = vmull_lane_s32(vget_low_s32(*a1), vget_low_s32(cospis), 1);
  c7 = vmull_lane_s32(vget_high_s32(*a1), vget_low_s32(cospis), 1);
  c8 = vmull_lane_s32(vget_low_s32(*a3), vget_low_s32(cospis), 1);
  c9 = vmull_lane_s32(vget_high_s32(*a3), vget_low_s32(cospis), 1);
  c10 = vmull_lane_s32(vget_low_s32(*a3), vget_high_s32(cospis), 1);
  c11 = vmull_lane_s32(vget_high_s32(*a3), vget_high_s32(cospis), 1);
  c4 = vsubq_s64(c4, c8);
  c5 = vsubq_s64(c5, c9);
  c6 = vaddq_s64(c6, c10);
  c7 = vaddq_s64(c7, c11);
  b0 = vcombine_s32(vrshrn_n_s64(c0, DCT_CONST_BITS),
                    vrshrn_n_s64(c1, DCT_CONST_BITS));
  b1 = vcombine_s32(vrshrn_n_s64(c2, DCT_CONST_BITS),
                    vrshrn_n_s64(c3, DCT_CONST_BITS));
  b2 = vcombine_s32(vrshrn_n_s64(c4, DCT_CONST_BITS),
                    vrshrn_n_s64(c5, DCT_CONST_BITS));
  b3 = vcombine_s32(vrshrn_n_s64(c6, DCT_CONST_BITS),
                    vrshrn_n_s64(c7, DCT_CONST_BITS));
  *a0 = vaddq_s32(b0, b3);
  *a1 = vaddq_s32(b1, b2);
  *a2 = vsubq_s32(b1, b2);
  *a3 = vsubq_s32(b0, b3);
}
예제 #13
0
파일: add.c 프로젝트: Winddoing/MyCode
static void add_int_neon(int* dst, int* src1, int* src2, int count)
{
  int i;
  for (i = 0; i < count; i += 4) {
      int32x4_t in1, in2, out;
      in1 = vld1q_s32(src1);
      src1 += 4;
      in2 = vld1q_s32(src2);
      src2 += 4;
      out = vaddq_s32(in1, in2);
      vst1q_s32(dst, out);
      dst += 4;
    }
}
예제 #14
0
파일: armdct.c 프로젝트: AlecGamble/daala
OD_SIMD_INLINE void od_fdct4_kernel(int32x4_t *x0, int32x4_t *x1,
 int32x4_t *x2, int32x4_t *x3) {
  /*9 adds, 2 shifts, 3 "muls".*/
  int32x4_t t0 = *x0;
  int32x4_t t2 = *x1;
  int32x4_t t1 = *x2;
  int32x4_t t3 = *x3;
  int32x4_t t2h;
  /*+1/-1 butterflies:*/
  t3 = vsubq_s32(t0, t3);
  t2 = vaddq_s32(t2, t1);
  t2h = OD_UNBIASED_RSHIFT(t2, 1);
  t1 = vsubq_s32(t2h, t1);
  t0 = vsubq_s32(t0, OD_UNBIASED_RSHIFT(t3, 1));
  /*+ Embedded 2-point type-II DCT.*/
  t0 = vaddq_s32(t0, t2h);
  t2 = vsubq_s32(t0, t2);
  /*+ Embedded 2-point type-IV DST.*/
  /*23013/32768 ~= 4*sin(\frac{\pi}{8}) - 2*tan(\frac{\pi}{8}) ~=
     0.70230660471416898931046248770220*/
  od_overflow_check_epi32(t1, 23013, 16384, 0);
  t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 23013, 16384, 15));
  /*21407/32768~=\sqrt{1/2}*cos(\frac{\pi}{8}))
     ~=0.65328148243818826392832158671359*/
  od_overflow_check_epi32(t3, 21407, 16384, 1);
  t1 = vaddq_s32(t1, OD_DCT_MUL(t3, 21407, 16384, 15));
  /*18293/16384 ~= 4*sin(\frac{\pi}{8}) - tan(\frac{\pi}{8}) ~=
     1.1165201670872640381121512119119*/
  od_overflow_check_epi32(t3, 18293, 8192, 2);
  t3 = vsubq_s32(t3, OD_DCT_MUL(t1, 18293, 8192, 14));
  od_transpose4(&t0, &t1, &t2, &t3);
  *x0 = t0;
  *x1 = t1;
  *x2 = t2;
  *x3 = t3;
}
static INLINE void iadst_half_butterfly_bd12_neon(int32x4_t *const x,
                                                  const int32x2_t c) {
  const int32x4_t sum = vaddq_s32(x[0], x[1]);
  const int32x4_t sub = vsubq_s32(x[0], x[1]);
  const int64x2_t t0_lo = vmull_lane_s32(vget_low_s32(sum), c, 0);
  const int64x2_t t1_lo = vmull_lane_s32(vget_low_s32(sub), c, 0);
  const int64x2_t t0_hi = vmull_lane_s32(vget_high_s32(sum), c, 0);
  const int64x2_t t1_hi = vmull_lane_s32(vget_high_s32(sub), c, 0);
  const int32x2_t out0_lo = vrshrn_n_s64(t0_lo, DCT_CONST_BITS);
  const int32x2_t out1_lo = vrshrn_n_s64(t1_lo, DCT_CONST_BITS);
  const int32x2_t out0_hi = vrshrn_n_s64(t0_hi, DCT_CONST_BITS);
  const int32x2_t out1_hi = vrshrn_n_s64(t1_hi, DCT_CONST_BITS);

  x[0] = vcombine_s32(out0_lo, out0_hi);
  x[1] = vcombine_s32(out1_lo, out1_hi);
}
예제 #16
0
파일: mw_neon.c 프로젝트: Daniel-Hwang/eeg
/* s32x4 add */
void mw_neon_mm_add_s32x4(int * A, int Row, int Col, int * B, int * C)
{
	int32x4_t neon_a, neon_b, neon_c;
	int size = Row * Col;
	int i = 0;
	int k = 0;

	for (i = 4; i <= size ; i+=4)
	{
		k = i - 4;
		neon_a = vld1q_s32(A + k);
		neon_b = vld1q_s32(B + k);
		neon_c = vaddq_s32(neon_a, neon_b);
		vst1q_s32(C + k, neon_c);
	}

	k = i - 4;
    for (i = 0; i < size % 4; i++)
	{
		C[k + i] = A[k + i] + B[k + i];
	}
}
예제 #17
0
static inline void yuv2rgb_4x2(const uint8_t *y1, const uint8_t *y2, const uint8_t *u, const uint8_t *v, int16_t *r1, int16_t *g1, int16_t *b1, int16_t *r2, int16_t *g2, int16_t *b2){
	int32x4_t ry1;
	int32x4_t ry2;
	int32x4_t rvug;
	int32x4_t rvr;
	int32x4_t rub;
	int32x4_t rr1,rg1,rb1,rr2,rg2,rb2;
	int32x4_t max;

	LOAD_Y_PREMULTS(0)
	LOAD_Y_PREMULTS(1)
	LOAD_Y_PREMULTS(2)
	LOAD_Y_PREMULTS(3)

	LOAD_UV_PREMULTS(0)
	LOAD_UV_PREMULTS(1)

	max=vld1q_s32(yuvmax);
	/*the following does not work */
	//max=vdupq_n_s32(255);

	rr1=vaddq_s32(ry1,rvr);
	rr2=vaddq_s32(ry2,rvr);
	rg1=vaddq_s32(ry1,rvug);
	rg2=vaddq_s32(ry2,rvug);
	rb1=vaddq_s32(ry1,rub);
	rb2=vaddq_s32(ry2,rub);



	rr1=vminq_s32(vabsq_s32(rr1),max);
	rr2=vminq_s32(vabsq_s32(rr2),max);
	rg1=vminq_s32(vabsq_s32(rg1),max);
	rg2=vminq_s32(vabsq_s32(rg2),max);
	rb1=vminq_s32(vabsq_s32(rb1),max);
	rb2=vminq_s32(vabsq_s32(rb2),max);

	vst1_s16(r1,vqshrn_n_s32(rr1,13));
	vst1_s16(r2,vqshrn_n_s32(rr2,13));

	vst1_s16(g1,vqshrn_n_s32(rg1,13));
	vst1_s16(g2,vqshrn_n_s32(rg2,13));

	vst1_s16(b1,vqshrn_n_s32(rb1,13));
	vst1_s16(b2,vqshrn_n_s32(rb2,13));
}
static INLINE void iadst8_bd12(int32x4_t *const io0, int32x4_t *const io1,
                               int32x4_t *const io2, int32x4_t *const io3,
                               int32x4_t *const io4, int32x4_t *const io5,
                               int32x4_t *const io6, int32x4_t *const io7) {
  const int32x4_t c0 =
      create_s32x4_neon(cospi_2_64, cospi_30_64, cospi_10_64, cospi_22_64);
  const int32x4_t c1 =
      create_s32x4_neon(cospi_18_64, cospi_14_64, cospi_26_64, cospi_6_64);
  const int32x4_t c2 =
      create_s32x4_neon(cospi_16_64, 0, cospi_8_64, cospi_24_64);
  int32x4_t x[8], t[4];
  int64x2_t s[8][2];

  x[0] = *io7;
  x[1] = *io0;
  x[2] = *io5;
  x[3] = *io2;
  x[4] = *io3;
  x[5] = *io4;
  x[6] = *io1;
  x[7] = *io6;

  // stage 1
  iadst_butterfly_lane_0_1_bd12_neon(x[0], x[1], vget_low_s32(c0), s[0], s[1]);
  iadst_butterfly_lane_0_1_bd12_neon(x[2], x[3], vget_high_s32(c0), s[2], s[3]);
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_low_s32(c1), s[4], s[5]);
  iadst_butterfly_lane_0_1_bd12_neon(x[6], x[7], vget_high_s32(c1), s[6], s[7]);

  x[0] = add_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[1] = add_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[2] = add_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[3] = add_dct_const_round_shift_low_8_bd12(s[3], s[7]);
  x[4] = sub_dct_const_round_shift_low_8_bd12(s[0], s[4]);
  x[5] = sub_dct_const_round_shift_low_8_bd12(s[1], s[5]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[2], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[3], s[7]);

  // stage 2
  t[0] = x[0];
  t[1] = x[1];
  t[2] = x[2];
  t[3] = x[3];
  iadst_butterfly_lane_0_1_bd12_neon(x[4], x[5], vget_high_s32(c2), s[4], s[5]);
  iadst_butterfly_lane_1_0_bd12_neon(x[7], x[6], vget_high_s32(c2), s[7], s[6]);

  x[0] = vaddq_s32(t[0], t[2]);
  x[1] = vaddq_s32(t[1], t[3]);
  x[2] = vsubq_s32(t[0], t[2]);
  x[3] = vsubq_s32(t[1], t[3]);
  x[4] = add_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[5] = add_dct_const_round_shift_low_8_bd12(s[5], s[7]);
  x[6] = sub_dct_const_round_shift_low_8_bd12(s[4], s[6]);
  x[7] = sub_dct_const_round_shift_low_8_bd12(s[5], s[7]);

  // stage 3
  iadst_half_butterfly_bd12_neon(x + 2, vget_low_s32(c2));
  iadst_half_butterfly_bd12_neon(x + 6, vget_low_s32(c2));

  *io0 = x[0];
  *io1 = vnegq_s32(x[4]);
  *io2 = x[6];
  *io3 = vnegq_s32(x[2]);
  *io4 = x[3];
  *io5 = vnegq_s32(x[7]);
  *io6 = x[5];
  *io7 = vnegq_s32(x[1]);
}
static INLINE int32x4_t
add_dct_const_round_shift_low_8_bd10(const int32x4_t in0, const int32x4_t in1) {
  const int32x4_t sum = vaddq_s32(in0, in1);
  return vrshrq_n_s32(sum, DCT_CONST_BITS);
}
unsigned int vp8_sub_pixel_variance16x16_neon_func(
        const unsigned char *src_ptr,
        int src_pixels_per_line,
        int xoffset,
        int yoffset,
        const unsigned char *dst_ptr,
        int dst_pixels_per_line,
        unsigned int *sse) {
    int i;
    DECLARE_ALIGNED_ARRAY(16, unsigned char, tmp, 528);
    unsigned char *tmpp;
    unsigned char *tmpp2;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8, d8u8, d9u8;
    uint8x8_t d10u8, d11u8, d12u8, d13u8, d14u8, d15u8, d16u8, d17u8, d18u8;
    uint8x8_t d19u8, d20u8, d21u8;
    int16x4_t d22s16, d23s16, d24s16, d25s16, d26s16, d27s16, d28s16, d29s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint8x16_t q10u8, q11u8, q12u8, q13u8, q14u8, q15u8;
    uint16x8_t q1u16, q2u16, q3u16, q4u16, q5u16, q6u16, q7u16, q8u16;
    uint16x8_t q9u16, q10u16, q11u16, q12u16, q13u16, q14u16;
    int32x4_t q8s32, q9s32, q10s32;
    int64x2_t q0s64, q1s64, q5s64;

    tmpp2 = tmp + 272;
    tmpp = tmp;
    if (xoffset == 0) {  // secondpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        q11u8 = vld1q_u8(src_ptr);
        src_ptr += src_pixels_per_line;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q13u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q14u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;
            q15u8 = vld1q_u8(src_ptr);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    } else if (yoffset == 0) {  // firstpass_bfilter16x16_only
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        for (i = 4; i > 0 ; i--) {
            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            __builtin_prefetch(src_ptr);
            __builtin_prefetch(src_ptr + src_pixels_per_line);
            __builtin_prefetch(src_ptr + src_pixels_per_line * 2);

            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            q7u8  = vcombine_u8(d14u8, d15u8);
            q8u8  = vcombine_u8(d16u8, d17u8);
            q9u8  = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp2, q7u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q8u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q9u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q10u8);
            tmpp2 += 16;
        }
    } else {
        d0u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[xoffset][1]);

        d2u8 = vld1_u8(src_ptr);
        d3u8 = vld1_u8(src_ptr + 8);
        d4u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d5u8 = vld1_u8(src_ptr);
        d6u8 = vld1_u8(src_ptr + 8);
        d7u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d8u8 = vld1_u8(src_ptr);
        d9u8 = vld1_u8(src_ptr + 8);
        d10u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;
        d11u8 = vld1_u8(src_ptr);
        d12u8 = vld1_u8(src_ptr + 8);
        d13u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        // First Pass: output_height lines x output_width columns (17x16)
        for (i = 3; i > 0; i--) {
            q7u16  = vmull_u8(d2u8, d0u8);
            q8u16  = vmull_u8(d3u8, d0u8);
            q9u16  = vmull_u8(d5u8, d0u8);
            q10u16 = vmull_u8(d6u8, d0u8);
            q11u16 = vmull_u8(d8u8, d0u8);
            q12u16 = vmull_u8(d9u8, d0u8);
            q13u16 = vmull_u8(d11u8, d0u8);
            q14u16 = vmull_u8(d12u8, d0u8);

            d2u8  = vext_u8(d2u8, d3u8, 1);
            d5u8  = vext_u8(d5u8, d6u8, 1);
            d8u8  = vext_u8(d8u8, d9u8, 1);
            d11u8 = vext_u8(d11u8, d12u8, 1);

            q7u16  = vmlal_u8(q7u16, d2u8, d1u8);
            q9u16  = vmlal_u8(q9u16, d5u8, d1u8);
            q11u16 = vmlal_u8(q11u16, d8u8, d1u8);
            q13u16 = vmlal_u8(q13u16, d11u8, d1u8);

            d3u8  = vext_u8(d3u8, d4u8, 1);
            d6u8  = vext_u8(d6u8, d7u8, 1);
            d9u8  = vext_u8(d9u8, d10u8, 1);
            d12u8 = vext_u8(d12u8, d13u8, 1);

            q8u16  = vmlal_u8(q8u16,  d3u8, d1u8);
            q10u16 = vmlal_u8(q10u16, d6u8, d1u8);
            q12u16 = vmlal_u8(q12u16, d9u8, d1u8);
            q14u16 = vmlal_u8(q14u16, d12u8, d1u8);

            d14u8 = vqrshrn_n_u16(q7u16, 7);
            d15u8 = vqrshrn_n_u16(q8u16, 7);
            d16u8 = vqrshrn_n_u16(q9u16, 7);
            d17u8 = vqrshrn_n_u16(q10u16, 7);
            d18u8 = vqrshrn_n_u16(q11u16, 7);
            d19u8 = vqrshrn_n_u16(q12u16, 7);
            d20u8 = vqrshrn_n_u16(q13u16, 7);
            d21u8 = vqrshrn_n_u16(q14u16, 7);

            d2u8 = vld1_u8(src_ptr);
            d3u8 = vld1_u8(src_ptr + 8);
            d4u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d5u8 = vld1_u8(src_ptr);
            d6u8 = vld1_u8(src_ptr + 8);
            d7u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d8u8 = vld1_u8(src_ptr);
            d9u8 = vld1_u8(src_ptr + 8);
            d10u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;
            d11u8 = vld1_u8(src_ptr);
            d12u8 = vld1_u8(src_ptr + 8);
            d13u8 = vld1_u8(src_ptr + 16);
            src_ptr += src_pixels_per_line;

            q7u8 = vcombine_u8(d14u8, d15u8);
            q8u8 = vcombine_u8(d16u8, d17u8);
            q9u8 = vcombine_u8(d18u8, d19u8);
            q10u8 = vcombine_u8(d20u8, d21u8);

            vst1q_u8((uint8_t *)tmpp, q7u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q8u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q9u8);
            tmpp += 16;
            vst1q_u8((uint8_t *)tmpp, q10u8);
            tmpp += 16;
        }

        // First-pass filtering for rest 5 lines
        d14u8 = vld1_u8(src_ptr);
        d15u8 = vld1_u8(src_ptr + 8);
        d16u8 = vld1_u8(src_ptr + 16);
        src_ptr += src_pixels_per_line;

        q9u16  = vmull_u8(d2u8, d0u8);
        q10u16 = vmull_u8(d3u8, d0u8);
        q11u16 = vmull_u8(d5u8, d0u8);
        q12u16 = vmull_u8(d6u8, d0u8);
        q13u16 = vmull_u8(d8u8, d0u8);
        q14u16 = vmull_u8(d9u8, d0u8);

        d2u8  = vext_u8(d2u8, d3u8, 1);
        d5u8  = vext_u8(d5u8, d6u8, 1);
        d8u8  = vext_u8(d8u8, d9u8, 1);

        q9u16  = vmlal_u8(q9u16, d2u8, d1u8);
        q11u16 = vmlal_u8(q11u16, d5u8, d1u8);
        q13u16 = vmlal_u8(q13u16, d8u8, d1u8);

        d3u8  = vext_u8(d3u8, d4u8, 1);
        d6u8  = vext_u8(d6u8, d7u8, 1);
        d9u8  = vext_u8(d9u8, d10u8, 1);

        q10u16 = vmlal_u8(q10u16, d3u8, d1u8);
        q12u16 = vmlal_u8(q12u16, d6u8, d1u8);
        q14u16 = vmlal_u8(q14u16, d9u8, d1u8);

        q1u16 = vmull_u8(d11u8, d0u8);
        q2u16 = vmull_u8(d12u8, d0u8);
        q3u16 = vmull_u8(d14u8, d0u8);
        q4u16 = vmull_u8(d15u8, d0u8);

        d11u8 = vext_u8(d11u8, d12u8, 1);
        d14u8 = vext_u8(d14u8, d15u8, 1);

        q1u16 = vmlal_u8(q1u16, d11u8, d1u8);
        q3u16 = vmlal_u8(q3u16, d14u8, d1u8);

        d12u8 = vext_u8(d12u8, d13u8, 1);
        d15u8 = vext_u8(d15u8, d16u8, 1);

        q2u16 = vmlal_u8(q2u16, d12u8, d1u8);
        q4u16 = vmlal_u8(q4u16, d15u8, d1u8);

        d10u8 = vqrshrn_n_u16(q9u16, 7);
        d11u8 = vqrshrn_n_u16(q10u16, 7);
        d12u8 = vqrshrn_n_u16(q11u16, 7);
        d13u8 = vqrshrn_n_u16(q12u16, 7);
        d14u8 = vqrshrn_n_u16(q13u16, 7);
        d15u8 = vqrshrn_n_u16(q14u16, 7);
        d16u8 = vqrshrn_n_u16(q1u16, 7);
        d17u8 = vqrshrn_n_u16(q2u16, 7);
        d18u8 = vqrshrn_n_u16(q3u16, 7);
        d19u8 = vqrshrn_n_u16(q4u16, 7);

        q5u8 = vcombine_u8(d10u8, d11u8);
        q6u8 = vcombine_u8(d12u8, d13u8);
        q7u8 = vcombine_u8(d14u8, d15u8);
        q8u8 = vcombine_u8(d16u8, d17u8);
        q9u8 = vcombine_u8(d18u8, d19u8);

        vst1q_u8((uint8_t *)tmpp, q5u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q6u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q7u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q8u8);
        tmpp += 16;
        vst1q_u8((uint8_t *)tmpp, q9u8);

        // secondpass_filter
        d0u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][0]);
        d1u8 = vdup_n_u8(bilinear_taps_coeff[yoffset][1]);

        tmpp = tmp;
        tmpp2 = tmpp + 272;
        q11u8 = vld1q_u8(tmpp);
        tmpp += 16;
        for (i = 4; i > 0; i--) {
            q12u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q13u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q14u8 = vld1q_u8(tmpp);
            tmpp += 16;
            q15u8 = vld1q_u8(tmpp);
            tmpp += 16;

            q1u16 = vmull_u8(vget_low_u8(q11u8), d0u8);
            q2u16 = vmull_u8(vget_high_u8(q11u8), d0u8);
            q3u16 = vmull_u8(vget_low_u8(q12u8), d0u8);
            q4u16 = vmull_u8(vget_high_u8(q12u8), d0u8);
            q5u16 = vmull_u8(vget_low_u8(q13u8), d0u8);
            q6u16 = vmull_u8(vget_high_u8(q13u8), d0u8);
            q7u16 = vmull_u8(vget_low_u8(q14u8), d0u8);
            q8u16 = vmull_u8(vget_high_u8(q14u8), d0u8);

            q1u16 = vmlal_u8(q1u16, vget_low_u8(q12u8), d1u8);
            q2u16 = vmlal_u8(q2u16, vget_high_u8(q12u8), d1u8);
            q3u16 = vmlal_u8(q3u16, vget_low_u8(q13u8), d1u8);
            q4u16 = vmlal_u8(q4u16, vget_high_u8(q13u8), d1u8);
            q5u16 = vmlal_u8(q5u16, vget_low_u8(q14u8), d1u8);
            q6u16 = vmlal_u8(q6u16, vget_high_u8(q14u8), d1u8);
            q7u16 = vmlal_u8(q7u16, vget_low_u8(q15u8), d1u8);
            q8u16 = vmlal_u8(q8u16, vget_high_u8(q15u8), d1u8);

            d2u8 = vqrshrn_n_u16(q1u16, 7);
            d3u8 = vqrshrn_n_u16(q2u16, 7);
            d4u8 = vqrshrn_n_u16(q3u16, 7);
            d5u8 = vqrshrn_n_u16(q4u16, 7);
            d6u8 = vqrshrn_n_u16(q5u16, 7);
            d7u8 = vqrshrn_n_u16(q6u16, 7);
            d8u8 = vqrshrn_n_u16(q7u16, 7);
            d9u8 = vqrshrn_n_u16(q8u16, 7);

            q1u8 = vcombine_u8(d2u8, d3u8);
            q2u8 = vcombine_u8(d4u8, d5u8);
            q3u8 = vcombine_u8(d6u8, d7u8);
            q4u8 = vcombine_u8(d8u8, d9u8);

            q11u8 = q15u8;

            vst1q_u8((uint8_t *)tmpp2, q1u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q2u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q3u8);
            tmpp2 += 16;
            vst1q_u8((uint8_t *)tmpp2, q4u8);
            tmpp2 += 16;
        }
    }

    // sub_pixel_variance16x16_neon
    q8s32 = vdupq_n_s32(0);
    q9s32 = vdupq_n_s32(0);
    q10s32 = vdupq_n_s32(0);

    tmpp = tmp + 272;
    for (i = 0; i < 8; i++) {  // sub_pixel_variance16x16_neon_loop
        q0u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q1u8 = vld1q_u8(tmpp);
        tmpp += 16;
        q2u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;
        q3u8 = vld1q_u8(dst_ptr);
        dst_ptr += dst_pixels_per_line;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);

        q11u16 = vsubl_u8(d0u8, vget_low_u8(q2u8));
        q12u16 = vsubl_u8(d1u8, vget_high_u8(q2u8));
        q13u16 = vsubl_u8(d2u8, vget_low_u8(q3u8));
        q14u16 = vsubl_u8(d3u8, vget_high_u8(q3u8));

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q11u16));
        q9s32 = vmlal_s16(q9s32, d22s16, d22s16);
        q10s32 = vmlal_s16(q10s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q12u16));
        q9s32 = vmlal_s16(q9s32, d24s16, d24s16);
        q10s32 = vmlal_s16(q10s32, d25s16, d25s16);

        d26s16 = vreinterpret_s16_u16(vget_low_u16(q13u16));
        d27s16 = vreinterpret_s16_u16(vget_high_u16(q13u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q13u16));
        q9s32 = vmlal_s16(q9s32, d26s16, d26s16);
        q10s32 = vmlal_s16(q10s32, d27s16, d27s16);

        d28s16 = vreinterpret_s16_u16(vget_low_u16(q14u16));
        d29s16 = vreinterpret_s16_u16(vget_high_u16(q14u16));
        q8s32 = vpadalq_s16(q8s32, vreinterpretq_s16_u16(q14u16));
        q9s32 = vmlal_s16(q9s32, d28s16, d28s16);
        q10s32 = vmlal_s16(q10s32, d29s16, d29s16);
    }

    q10s32 = vaddq_s32(q10s32, q9s32);
    q0s64 = vpaddlq_s32(q8s32);
    q1s64 = vpaddlq_s32(q10s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
예제 #21
0
// Update the noise estimation information.
static void UpdateNoiseEstimateNeon(NoiseSuppressionFixedC* inst, int offset) {
  const int16_t kExp2Const = 11819; // Q13
  int16_t* ptr_noiseEstLogQuantile = NULL;
  int16_t* ptr_noiseEstQuantile = NULL;
  int16x4_t kExp2Const16x4 = vdup_n_s16(kExp2Const);
  int32x4_t twentyOne32x4 = vdupq_n_s32(21);
  int32x4_t constA32x4 = vdupq_n_s32(0x1fffff);
  int32x4_t constB32x4 = vdupq_n_s32(0x200000);

  int16_t tmp16 = WebRtcSpl_MaxValueW16(inst->noiseEstLogQuantile + offset,
                                        inst->magnLen);

  // Guarantee a Q-domain as high as possible and still fit in int16
  inst->qNoise = 14 - (int) WEBRTC_SPL_MUL_16_16_RSFT_WITH_ROUND(kExp2Const,
                                                                 tmp16,
                                                                 21);

  int32x4_t qNoise32x4 = vdupq_n_s32(inst->qNoise);

  for (ptr_noiseEstLogQuantile = &inst->noiseEstLogQuantile[offset],
       ptr_noiseEstQuantile = &inst->noiseEstQuantile[0];
       ptr_noiseEstQuantile < &inst->noiseEstQuantile[inst->magnLen - 3];
       ptr_noiseEstQuantile += 4, ptr_noiseEstLogQuantile += 4) {

    // tmp32no2 = kExp2Const * inst->noiseEstLogQuantile[offset + i];
    int16x4_t v16x4 = vld1_s16(ptr_noiseEstLogQuantile);
    int32x4_t v32x4B = vmull_s16(v16x4, kExp2Const16x4);

    // tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac
    int32x4_t v32x4A = vandq_s32(v32x4B, constA32x4);
    v32x4A = vorrq_s32(v32x4A, constB32x4);

    // tmp16 = (int16_t)(tmp32no2 >> 21);
    v32x4B = vshrq_n_s32(v32x4B, 21);

    // tmp16 -= 21;// shift 21 to get result in Q0
    v32x4B = vsubq_s32(v32x4B, twentyOne32x4);

    // tmp16 += (int16_t) inst->qNoise;
    // shift to get result in Q(qNoise)
    v32x4B = vaddq_s32(v32x4B, qNoise32x4);

    // if (tmp16 < 0) {
    //   tmp32no1 >>= -tmp16;
    // } else {
    //   tmp32no1 <<= tmp16;
    // }
    v32x4B = vshlq_s32(v32x4A, v32x4B);

    // tmp16 = WebRtcSpl_SatW32ToW16(tmp32no1);
    v16x4 = vqmovn_s32(v32x4B);

    //inst->noiseEstQuantile[i] = tmp16;
    vst1_s16(ptr_noiseEstQuantile, v16x4);
  }

  // Last iteration:

  // inst->quantile[i]=exp(inst->lquantile[offset+i]);
  // in Q21
  int32_t tmp32no2 = kExp2Const * *ptr_noiseEstLogQuantile;
  int32_t tmp32no1 = (0x00200000 | (tmp32no2 & 0x001FFFFF)); // 2^21 + frac

  tmp16 = (int16_t)(tmp32no2 >> 21);
  tmp16 -= 21;// shift 21 to get result in Q0
  tmp16 += (int16_t) inst->qNoise; //shift to get result in Q(qNoise)
  if (tmp16 < 0) {
    tmp32no1 >>= -tmp16;
  } else {
예제 #22
0
void silk_biquad_alt_stride2_neon(
    const opus_int16            *in,                /* I     input signal                                               */
    const opus_int32            *B_Q28,             /* I     MA coefficients [3]                                        */
    const opus_int32            *A_Q28,             /* I     AR coefficients [2]                                        */
    opus_int32                  *S,                 /* I/O   State vector [4]                                           */
    opus_int16                  *out,               /* O     output signal                                              */
    const opus_int32            len                 /* I     signal length (must be even)                               */
)
{
    /* DIRECT FORM II TRANSPOSED (uses 2 element state vector) */
    opus_int        k            = 0;
    const int32x2_t offset_s32x2 = vdup_n_s32( (1<<14) - 1 );
    const int32x4_t offset_s32x4 = vcombine_s32( offset_s32x2, offset_s32x2 );
    int16x4_t       in_s16x4  = vdup_n_s16( 0 );
    int16x4_t       out_s16x4;
    int32x2_t       A_Q28_s32x2, A_L_s32x2, A_U_s32x2, B_Q28_s32x2, t_s32x2;
    int32x4_t       A_L_s32x4, A_U_s32x4, B_Q28_s32x4, S_s32x4, out32_Q14_s32x4;
    int32x2x2_t     t0_s32x2x2, t1_s32x2x2, t2_s32x2x2, S_s32x2x2;

#ifdef OPUS_CHECK_ASM
    opus_int32 S_c[ 4 ];
    VARDECL( opus_int16, out_c );
    SAVE_STACK;
    ALLOC( out_c, 2 * len, opus_int16 );

    silk_memcpy( &S_c, S, sizeof( S_c ) );
    silk_biquad_alt_stride2_c( in, B_Q28, A_Q28, S_c, out_c, len );
#endif

    /* Negate A_Q28 values and split in two parts */
    A_Q28_s32x2 = vld1_s32( A_Q28 );
    A_Q28_s32x2 = vneg_s32( A_Q28_s32x2 );
    A_L_s32x2   = vshl_n_s32( A_Q28_s32x2, 18 );                                                        /* ( -A_Q28[] & 0x00003FFF ) << 18                                                     */
    A_L_s32x2   = vreinterpret_s32_u32( vshr_n_u32( vreinterpret_u32_s32( A_L_s32x2 ), 3 ) );           /* ( -A_Q28[] & 0x00003FFF ) << 15                                                     */
    A_U_s32x2   = vshr_n_s32( A_Q28_s32x2, 14 );                                                        /* silk_RSHIFT( -A_Q28[], 14 )                                                         */
    A_U_s32x2   = vshl_n_s32( A_U_s32x2, 16 );                                                          /* silk_RSHIFT( -A_Q28[], 14 ) << 16 (Clip two leading bits to conform to C function.) */
    A_U_s32x2   = vshr_n_s32( A_U_s32x2, 1 );                                                           /* silk_RSHIFT( -A_Q28[], 14 ) << 15                                                   */

    B_Q28_s32x2  = vld1_s32( B_Q28 );
    t_s32x2      = vld1_s32( B_Q28 + 1 );
    t0_s32x2x2   = vzip_s32( A_L_s32x2, A_L_s32x2 );
    t1_s32x2x2   = vzip_s32( A_U_s32x2, A_U_s32x2 );
    t2_s32x2x2   = vzip_s32( t_s32x2, t_s32x2 );
    A_L_s32x4    = vcombine_s32( t0_s32x2x2.val[ 0 ], t0_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_L_Q28          */
    A_U_s32x4    = vcombine_s32( t1_s32x2x2.val[ 0 ], t1_s32x2x2.val[ 1 ] );                            /* A{0,0,1,1}_U_Q28          */
    B_Q28_s32x4  = vcombine_s32( t2_s32x2x2.val[ 0 ], t2_s32x2x2.val[ 1 ] );                            /* B_Q28[ {1,1,2,2} ]        */
    S_s32x4      = vld1q_s32( S );                                                                      /* S0 = S[ 0 ]; S3 = S[ 3 ]; */
    S_s32x2x2    = vtrn_s32( vget_low_s32( S_s32x4 ), vget_high_s32( S_s32x4 ) );                       /* S2 = S[ 1 ]; S1 = S[ 2 ]; */
    S_s32x4      = vcombine_s32( S_s32x2x2.val[ 0 ], S_s32x2x2.val[ 1 ] );

    for( ; k < len - 1; k += 2 ) {
        int32x4_t in_s32x4[ 2 ], t_s32x4;
        int32x2_t out32_Q14_s32x2[ 2 ];

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ], S[ 2 * i + 2 ], S[ 2 * i + 3 ]: Q12 */
        in_s16x4      = vld1_s16( &in[ 2 * k ] );                                                       /* in{0,1,2,3} = in[ 2 * k + {0,1,2,3} ]; */
        in_s32x4[ 0 ] = vshll_n_s16( in_s16x4, 15 );                                                    /* in{0,1,2,3} << 15                      */
        t_s32x4       = vqdmulhq_lane_s32( in_s32x4[ 0 ], B_Q28_s32x2, 0 );                             /* silk_SMULWB( B_Q28[ 0 ], in{0,1,2,3} ) */
        in_s32x4[ 1 ] = vcombine_s32( vget_high_s32( in_s32x4[ 0 ] ), vget_high_s32( in_s32x4[ 0 ] ) ); /* in{2,3,2,3} << 15                      */
        in_s32x4[ 0 ] = vcombine_s32( vget_low_s32 ( in_s32x4[ 0 ] ), vget_low_s32 ( in_s32x4[ 0 ] ) ); /* in{0,1,0,1} << 15                      */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_low_s32 ( t_s32x4 ), in_s32x4[ 0 ], &S_s32x4, &out32_Q14_s32x2[ 0 ] );
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, vget_high_s32( t_s32x4 ), in_s32x4[ 1 ], &S_s32x4, &out32_Q14_s32x2[ 1 ] );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2[ 0 ], out32_Q14_s32x2[ 1 ] );                   /* out32_Q14_{0,1,2,3}                                                                                        */
        out32_Q14_s32x4 = vaddq_s32( out32_Q14_s32x4, offset_s32x4 );                                   /* out32_Q14_{0,1,2,3} + (1<<14) - 1                                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) )                             */
        vst1_s16( &out[ 2 * k ], out_s16x4 );                                                           /* out[ 2 * k + {0,1,2,3} ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,2,3} + (1<<14) - 1, 14 ) ); */
    }

    /* Process leftover. */
    if( k < len ) {
        int32x4_t in_s32x4;
        int32x2_t out32_Q14_s32x2;

        /* S[ 2 * i + 0 ], S[ 2 * i + 1 ]: Q12 */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 0 ], in_s16x4, 0 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s16x4     = vld1_lane_s16( &in[ 2 * k + 1 ], in_s16x4, 1 );                                  /* in{0,1} = in[ 2 * k + {0,1} ];     */
        in_s32x4     = vshll_n_s16( in_s16x4, 15 );                                                     /* in{0,1} << 15                      */
        t_s32x2      = vqdmulh_lane_s32( vget_low_s32( in_s32x4 ), B_Q28_s32x2, 0 );                    /* silk_SMULWB( B_Q28[ 0 ], in{0,1} ) */
        in_s32x4     = vcombine_s32( vget_low_s32( in_s32x4 ), vget_low_s32( in_s32x4 ) );              /* in{0,1,0,1} << 15                  */
        silk_biquad_alt_stride2_kernel( A_L_s32x4, A_U_s32x4, B_Q28_s32x4, t_s32x2, in_s32x4, &S_s32x4, &out32_Q14_s32x2 );

        /* Scale back to Q0 and saturate */
        out32_Q14_s32x2 = vadd_s32( out32_Q14_s32x2, offset_s32x2 );                                    /* out32_Q14_{0,1} + (1<<14) - 1                                                              */
        out32_Q14_s32x4 = vcombine_s32( out32_Q14_s32x2, out32_Q14_s32x2 );                             /* out32_Q14_{0,1,0,1} + (1<<14) - 1                                                          */
        out_s16x4       = vqshrn_n_s32( out32_Q14_s32x4, 14 );                                          /* (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_{0,1,0,1} + (1<<14) - 1, 14 ) )             */
        vst1_lane_s16( &out[ 2 * k + 0 ], out_s16x4, 0 );                                               /* out[ 2 * k + 0 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_0 + (1<<14) - 1, 14 ) ); */
        vst1_lane_s16( &out[ 2 * k + 1 ], out_s16x4, 1 );                                               /* out[ 2 * k + 1 ] = (opus_int16)silk_SAT16( silk_RSHIFT( out32_Q14_1 + (1<<14) - 1, 14 ) ); */
    }

    vst1q_lane_s32( &S[ 0 ], S_s32x4, 0 );                                                              /* S[ 0 ] = S0; */
    vst1q_lane_s32( &S[ 1 ], S_s32x4, 2 );                                                              /* S[ 1 ] = S2; */
    vst1q_lane_s32( &S[ 2 ], S_s32x4, 1 );                                                              /* S[ 2 ] = S1; */
    vst1q_lane_s32( &S[ 3 ], S_s32x4, 3 );                                                              /* S[ 3 ] = S3; */

#ifdef OPUS_CHECK_ASM
    silk_assert( !memcmp( S_c, S, sizeof( S_c ) ) );
    silk_assert( !memcmp( out_c, out, 2 * len * sizeof( opus_int16 ) ) );
    RESTORE_STACK;
#endif
}
예제 #23
0
inline   int32x4_t vaddq(const int32x4_t   & v0, const int32x4_t   & v1) { return vaddq_s32(v0, v1); }
예제 #24
0
static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16,
                               int16x8_t *q10s16, int16x8_t *q11s16,
                               int16x8_t *q12s16, int16x8_t *q13s16,
                               int16x8_t *q14s16, int16x8_t *q15s16) {
  int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16;
  int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16;
  int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16;
  int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16;
  int16x8_t q2s16, q4s16, q5s16, q6s16;
  int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32;
  int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32;

  d16s16 = vget_low_s16(*q8s16);
  d17s16 = vget_high_s16(*q8s16);
  d18s16 = vget_low_s16(*q9s16);
  d19s16 = vget_high_s16(*q9s16);
  d20s16 = vget_low_s16(*q10s16);
  d21s16 = vget_high_s16(*q10s16);
  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  d26s16 = vget_low_s16(*q13s16);
  d27s16 = vget_high_s16(*q13s16);
  d28s16 = vget_low_s16(*q14s16);
  d29s16 = vget_high_s16(*q14s16);
  d30s16 = vget_low_s16(*q15s16);
  d31s16 = vget_high_s16(*q15s16);

  d14s16 = vdup_n_s16((int16_t)cospi_2_64);
  d15s16 = vdup_n_s16((int16_t)cospi_30_64);

  q1s32 = vmull_s16(d30s16, d14s16);
  q2s32 = vmull_s16(d31s16, d14s16);
  q3s32 = vmull_s16(d30s16, d15s16);
  q4s32 = vmull_s16(d31s16, d15s16);

  d30s16 = vdup_n_s16((int16_t)cospi_18_64);
  d31s16 = vdup_n_s16((int16_t)cospi_14_64);

  q1s32 = vmlal_s16(q1s32, d16s16, d15s16);
  q2s32 = vmlal_s16(q2s32, d17s16, d15s16);
  q3s32 = vmlsl_s16(q3s32, d16s16, d14s16);
  q4s32 = vmlsl_s16(q4s32, d17s16, d14s16);

  q5s32 = vmull_s16(d22s16, d30s16);
  q6s32 = vmull_s16(d23s16, d30s16);
  q7s32 = vmull_s16(d22s16, d31s16);
  q8s32 = vmull_s16(d23s16, d31s16);

  q5s32 = vmlal_s16(q5s32, d24s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d25s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d24s16, d30s16);
  q8s32 = vmlsl_s16(q8s32, d25s16, d30s16);

  q11s32 = vaddq_s32(q1s32, q5s32);
  q12s32 = vaddq_s32(q2s32, q6s32);
  q1s32 = vsubq_s32(q1s32, q5s32);
  q2s32 = vsubq_s32(q2s32, q6s32);

  d22s16 = vqrshrn_n_s32(q11s32, 14);
  d23s16 = vqrshrn_n_s32(q12s32, 14);
  *q11s16 = vcombine_s16(d22s16, d23s16);

  q12s32 = vaddq_s32(q3s32, q7s32);
  q15s32 = vaddq_s32(q4s32, q8s32);
  q3s32 = vsubq_s32(q3s32, q7s32);
  q4s32 = vsubq_s32(q4s32, q8s32);

  d2s16 = vqrshrn_n_s32(q1s32, 14);
  d3s16 = vqrshrn_n_s32(q2s32, 14);
  d24s16 = vqrshrn_n_s32(q12s32, 14);
  d25s16 = vqrshrn_n_s32(q15s32, 14);
  d6s16 = vqrshrn_n_s32(q3s32, 14);
  d7s16 = vqrshrn_n_s32(q4s32, 14);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  d0s16 = vdup_n_s16((int16_t)cospi_10_64);
  d1s16 = vdup_n_s16((int16_t)cospi_22_64);
  q4s32 = vmull_s16(d26s16, d0s16);
  q5s32 = vmull_s16(d27s16, d0s16);
  q2s32 = vmull_s16(d26s16, d1s16);
  q6s32 = vmull_s16(d27s16, d1s16);

  d30s16 = vdup_n_s16((int16_t)cospi_26_64);
  d31s16 = vdup_n_s16((int16_t)cospi_6_64);

  q4s32 = vmlal_s16(q4s32, d20s16, d1s16);
  q5s32 = vmlal_s16(q5s32, d21s16, d1s16);
  q2s32 = vmlsl_s16(q2s32, d20s16, d0s16);
  q6s32 = vmlsl_s16(q6s32, d21s16, d0s16);

  q0s32 = vmull_s16(d18s16, d30s16);
  q13s32 = vmull_s16(d19s16, d30s16);

  q0s32 = vmlal_s16(q0s32, d28s16, d31s16);
  q13s32 = vmlal_s16(q13s32, d29s16, d31s16);

  q10s32 = vmull_s16(d18s16, d31s16);
  q9s32 = vmull_s16(d19s16, d31s16);

  q10s32 = vmlsl_s16(q10s32, d28s16, d30s16);
  q9s32 = vmlsl_s16(q9s32, d29s16, d30s16);

  q14s32 = vaddq_s32(q2s32, q10s32);
  q15s32 = vaddq_s32(q6s32, q9s32);
  q2s32 = vsubq_s32(q2s32, q10s32);
  q6s32 = vsubq_s32(q6s32, q9s32);

  d28s16 = vqrshrn_n_s32(q14s32, 14);
  d29s16 = vqrshrn_n_s32(q15s32, 14);
  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q6s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  q9s32 = vaddq_s32(q4s32, q0s32);
  q10s32 = vaddq_s32(q5s32, q13s32);
  q4s32 = vsubq_s32(q4s32, q0s32);
  q5s32 = vsubq_s32(q5s32, q13s32);

  d30s16 = vdup_n_s16((int16_t)cospi_8_64);
  d31s16 = vdup_n_s16((int16_t)cospi_24_64);

  d18s16 = vqrshrn_n_s32(q9s32, 14);
  d19s16 = vqrshrn_n_s32(q10s32, 14);
  d8s16 = vqrshrn_n_s32(q4s32, 14);
  d9s16 = vqrshrn_n_s32(q5s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);

  q5s32 = vmull_s16(d2s16, d30s16);
  q6s32 = vmull_s16(d3s16, d30s16);
  q7s32 = vmull_s16(d2s16, d31s16);
  q0s32 = vmull_s16(d3s16, d31s16);

  q5s32 = vmlal_s16(q5s32, d6s16, d31s16);
  q6s32 = vmlal_s16(q6s32, d7s16, d31s16);
  q7s32 = vmlsl_s16(q7s32, d6s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d7s16, d30s16);

  q1s32 = vmull_s16(d4s16, d30s16);
  q3s32 = vmull_s16(d5s16, d30s16);
  q10s32 = vmull_s16(d4s16, d31s16);
  q2s32 = vmull_s16(d5s16, d31s16);

  q1s32 = vmlsl_s16(q1s32, d8s16, d31s16);
  q3s32 = vmlsl_s16(q3s32, d9s16, d31s16);
  q10s32 = vmlal_s16(q10s32, d8s16, d30s16);
  q2s32 = vmlal_s16(q2s32, d9s16, d30s16);

  *q8s16 = vaddq_s16(*q11s16, *q9s16);
  *q11s16 = vsubq_s16(*q11s16, *q9s16);
  q4s16 = vaddq_s16(*q12s16, *q14s16);
  *q12s16 = vsubq_s16(*q12s16, *q14s16);

  q14s32 = vaddq_s32(q5s32, q1s32);
  q15s32 = vaddq_s32(q6s32, q3s32);
  q5s32 = vsubq_s32(q5s32, q1s32);
  q6s32 = vsubq_s32(q6s32, q3s32);

  d18s16 = vqrshrn_n_s32(q14s32, 14);
  d19s16 = vqrshrn_n_s32(q15s32, 14);
  d10s16 = vqrshrn_n_s32(q5s32, 14);
  d11s16 = vqrshrn_n_s32(q6s32, 14);
  *q9s16 = vcombine_s16(d18s16, d19s16);

  q1s32 = vaddq_s32(q7s32, q10s32);
  q3s32 = vaddq_s32(q0s32, q2s32);
  q7s32 = vsubq_s32(q7s32, q10s32);
  q0s32 = vsubq_s32(q0s32, q2s32);

  d28s16 = vqrshrn_n_s32(q1s32, 14);
  d29s16 = vqrshrn_n_s32(q3s32, 14);
  d14s16 = vqrshrn_n_s32(q7s32, 14);
  d15s16 = vqrshrn_n_s32(q0s32, 14);
  *q14s16 = vcombine_s16(d28s16, d29s16);

  d30s16 = vdup_n_s16((int16_t)cospi_16_64);

  d22s16 = vget_low_s16(*q11s16);
  d23s16 = vget_high_s16(*q11s16);
  q2s32 = vmull_s16(d22s16, d30s16);
  q3s32 = vmull_s16(d23s16, d30s16);
  q13s32 = vmull_s16(d22s16, d30s16);
  q1s32 = vmull_s16(d23s16, d30s16);

  d24s16 = vget_low_s16(*q12s16);
  d25s16 = vget_high_s16(*q12s16);
  q2s32 = vmlal_s16(q2s32, d24s16, d30s16);
  q3s32 = vmlal_s16(q3s32, d25s16, d30s16);
  q13s32 = vmlsl_s16(q13s32, d24s16, d30s16);
  q1s32 = vmlsl_s16(q1s32, d25s16, d30s16);

  d4s16 = vqrshrn_n_s32(q2s32, 14);
  d5s16 = vqrshrn_n_s32(q3s32, 14);
  d24s16 = vqrshrn_n_s32(q13s32, 14);
  d25s16 = vqrshrn_n_s32(q1s32, 14);
  q2s16 = vcombine_s16(d4s16, d5s16);
  *q12s16 = vcombine_s16(d24s16, d25s16);

  q13s32 = vmull_s16(d10s16, d30s16);
  q1s32 = vmull_s16(d11s16, d30s16);
  q11s32 = vmull_s16(d10s16, d30s16);
  q0s32 = vmull_s16(d11s16, d30s16);

  q13s32 = vmlal_s16(q13s32, d14s16, d30s16);
  q1s32 = vmlal_s16(q1s32, d15s16, d30s16);
  q11s32 = vmlsl_s16(q11s32, d14s16, d30s16);
  q0s32 = vmlsl_s16(q0s32, d15s16, d30s16);

  d20s16 = vqrshrn_n_s32(q13s32, 14);
  d21s16 = vqrshrn_n_s32(q1s32, 14);
  d12s16 = vqrshrn_n_s32(q11s32, 14);
  d13s16 = vqrshrn_n_s32(q0s32, 14);
  *q10s16 = vcombine_s16(d20s16, d21s16);
  q6s16 = vcombine_s16(d12s16, d13s16);

  q5s16 = vdupq_n_s16(0);

  *q9s16 = vsubq_s16(q5s16, *q9s16);
  *q11s16 = vsubq_s16(q5s16, q2s16);
  *q13s16 = vsubq_s16(q5s16, q6s16);
  *q15s16 = vsubq_s16(q5s16, q4s16);
  return;
}
예제 #25
0
bool decode_yuv_neon(unsigned char* out, unsigned char const* y, unsigned char const* uv, int width, int height, unsigned char fill_alpha=0xff)
{
    // pre-condition : width, height must be even
    if (0!=(width&1) || width<2 || 0!=(height&1) || height<2 || !out || !y || !uv)
        return false;

    // in & out pointers
    unsigned char* dst = out;

    // constants
    int const stride = width*trait::bytes_per_pixel;
    int const itHeight = height>>1;
    int const itWidth = width>>3;

    uint8x8_t const Yshift = vdup_n_u8(16);
    int16x8_t const half = vdupq_n_u16(128);
    int32x4_t const rounding = vdupq_n_s32(128);

    // tmp variable
    uint16x8_t t;

    // pixel block to temporary store 8 pixels
    typename trait::PixelBlock pblock = trait::init_pixelblock(fill_alpha);    

    for (int j=0; j<itHeight; ++j, y+=width, dst+=stride) {
        for (int i=0; i<itWidth; ++i, y+=8, uv+=8, dst+=(8*trait::bytes_per_pixel)) {
            t = vmovl_u8(vqsub_u8(vld1_u8(y), Yshift));
            int32x4_t const Y00 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y01 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            t = vmovl_u8(vqsub_u8(vld1_u8(y+width), Yshift));
            int32x4_t const Y10 = vmulq_n_u32(vmovl_u16(vget_low_u16(t)), 298);
            int32x4_t const Y11 = vmulq_n_u32(vmovl_u16(vget_high_u16(t)), 298);

            // trait::loadvu pack 4 sets of uv into a uint8x8_t, layout : { v0,u0, v1,u1, v2,u2, v3,u3 }
            t = vsubq_s16((int16x8_t)vmovl_u8(trait::loadvu(uv)), half);

            // UV.val[0] : v0, v1, v2, v3
            // UV.val[1] : u0, u1, u2, u3
            int16x4x2_t const UV = vuzp_s16(vget_low_s16(t), vget_high_s16(t));

            // tR : 128+409V
            // tG : 128-100U-208V
            // tB : 128+516U
            int32x4_t const tR = vmlal_n_s16(rounding, UV.val[0], 409);
            int32x4_t const tG = vmlal_n_s16(vmlal_n_s16(rounding, UV.val[0], -208), UV.val[1], -100);
            int32x4_t const tB = vmlal_n_s16(rounding, UV.val[1], 516);

            int32x4x2_t const R = vzipq_s32(tR, tR); // [tR0, tR0, tR1, tR1] [ tR2, tR2, tR3, tR3]
            int32x4x2_t const G = vzipq_s32(tG, tG); // [tG0, tG0, tG1, tG1] [ tG2, tG2, tG3, tG3]
            int32x4x2_t const B = vzipq_s32(tB, tB); // [tB0, tB0, tB1, tB1] [ tB2, tB2, tB3, tB3]

            // upper 8 pixels
            trait::store_pixel_block(dst, pblock,
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y00)), vqmovun_s32(vaddq_s32(R.val[1], Y01))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y00)), vqmovun_s32(vaddq_s32(G.val[1], Y01))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y00)), vqmovun_s32(vaddq_s32(B.val[1], Y01))), 8));

            // lower 8 pixels
            trait::store_pixel_block(dst+stride, pblock,
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(R.val[0], Y10)), vqmovun_s32(vaddq_s32(R.val[1], Y11))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(G.val[0], Y10)), vqmovun_s32(vaddq_s32(G.val[1], Y11))), 8),
                    vshrn_n_u16(vcombine_u16(vqmovun_s32(vaddq_s32(B.val[0], Y10)), vqmovun_s32(vaddq_s32(B.val[1], Y11))), 8));
        }
    }
    return true;
}
예제 #26
0
inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}
예제 #27
0
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit of
    //    the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by substracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
                                       vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
                                          vec_mantissa_mask);
    const float32x4_t y =
        vreinterpretq_f32_u32(vorrq_u32(mantissa,
                                        vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
unsigned int vp8_variance_halfpixvar16x16_hv_neon(
        const unsigned char *src_ptr,
        int  source_stride,
        const unsigned char *ref_ptr,
        int  recon_stride,
        unsigned int *sse) {
    int i;
    uint8x8_t d0u8, d1u8, d2u8, d3u8, d4u8, d5u8, d6u8, d7u8;
    int16x4_t d0s16, d1s16, d2s16, d3s16, d10s16, d11s16, d12s16, d13s16;
    int16x4_t d18s16, d19s16, d20s16, d21s16, d22s16, d23s16, d24s16, d25s16;
    uint32x2_t d0u32, d10u32;
    int64x1_t d0s64, d1s64, d2s64, d3s64;
    uint8x16_t q0u8, q1u8, q2u8, q3u8, q4u8, q5u8, q6u8, q7u8, q8u8, q9u8;
    uint16x8_t q0u16, q1u16, q5u16, q6u16, q9u16, q10u16, q11u16, q12u16;
    int32x4_t q13s32, q14s32, q15s32;
    int64x2_t q0s64, q1s64, q5s64;

    q13s32 = vdupq_n_s32(0);
    q14s32 = vdupq_n_s32(0);
    q15s32 = vdupq_n_s32(0);

    q0u8 = vld1q_u8(src_ptr);
    q1u8 = vld1q_u8(src_ptr + 16);
    src_ptr += source_stride;
    q1u8 = vextq_u8(q0u8, q1u8, 1);
    q0u8 = vrhaddq_u8(q0u8, q1u8);
    for (i = 0; i < 4; i++) {  // vp8_filt_fpo16x16s_4_0_loop_neon
        q2u8 = vld1q_u8(src_ptr);
        q3u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q4u8 = vld1q_u8(src_ptr);
        q5u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q6u8 = vld1q_u8(src_ptr);
        q7u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;
        q8u8 = vld1q_u8(src_ptr);
        q9u8 = vld1q_u8(src_ptr + 16);
        src_ptr += source_stride;

        q3u8 = vextq_u8(q2u8, q3u8, 1);
        q5u8 = vextq_u8(q4u8, q5u8, 1);
        q7u8 = vextq_u8(q6u8, q7u8, 1);
        q9u8 = vextq_u8(q8u8, q9u8, 1);

        q1u8 = vrhaddq_u8(q2u8, q3u8);
        q2u8 = vrhaddq_u8(q4u8, q5u8);
        q3u8 = vrhaddq_u8(q6u8, q7u8);
        q4u8 = vrhaddq_u8(q8u8, q9u8);
        q0u8 = vrhaddq_u8(q0u8, q1u8);
        q1u8 = vrhaddq_u8(q1u8, q2u8);
        q2u8 = vrhaddq_u8(q2u8, q3u8);
        q3u8 = vrhaddq_u8(q3u8, q4u8);

        q5u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q6u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q7u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;
        q8u8 = vld1q_u8(ref_ptr);
        ref_ptr += recon_stride;

        d0u8 = vget_low_u8(q0u8);
        d1u8 = vget_high_u8(q0u8);
        d2u8 = vget_low_u8(q1u8);
        d3u8 = vget_high_u8(q1u8);
        d4u8 = vget_low_u8(q2u8);
        d5u8 = vget_high_u8(q2u8);
        d6u8 = vget_low_u8(q3u8);
        d7u8 = vget_high_u8(q3u8);

        q9u16  = vsubl_u8(d0u8, vget_low_u8(q5u8));
        q10u16 = vsubl_u8(d1u8, vget_high_u8(q5u8));
        q11u16 = vsubl_u8(d2u8, vget_low_u8(q6u8));
        q12u16 = vsubl_u8(d3u8, vget_high_u8(q6u8));
        q0u16  = vsubl_u8(d4u8, vget_low_u8(q7u8));
        q1u16  = vsubl_u8(d5u8, vget_high_u8(q7u8));
        q5u16  = vsubl_u8(d6u8, vget_low_u8(q8u8));
        q6u16  = vsubl_u8(d7u8, vget_high_u8(q8u8));

        d18s16 = vreinterpret_s16_u16(vget_low_u16(q9u16));
        d19s16 = vreinterpret_s16_u16(vget_high_u16(q9u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q9u16));
        q14s32 = vmlal_s16(q14s32, d18s16, d18s16);
        q15s32 = vmlal_s16(q15s32, d19s16, d19s16);

        d20s16 = vreinterpret_s16_u16(vget_low_u16(q10u16));
        d21s16 = vreinterpret_s16_u16(vget_high_u16(q10u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q10u16));
        q14s32 = vmlal_s16(q14s32, d20s16, d20s16);
        q15s32 = vmlal_s16(q15s32, d21s16, d21s16);

        d22s16 = vreinterpret_s16_u16(vget_low_u16(q11u16));
        d23s16 = vreinterpret_s16_u16(vget_high_u16(q11u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q11u16));
        q14s32 = vmlal_s16(q14s32, d22s16, d22s16);
        q15s32 = vmlal_s16(q15s32, d23s16, d23s16);

        d24s16 = vreinterpret_s16_u16(vget_low_u16(q12u16));
        d25s16 = vreinterpret_s16_u16(vget_high_u16(q12u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q12u16));
        q14s32 = vmlal_s16(q14s32, d24s16, d24s16);
        q15s32 = vmlal_s16(q15s32, d25s16, d25s16);

        d0s16 = vreinterpret_s16_u16(vget_low_u16(q0u16));
        d1s16 = vreinterpret_s16_u16(vget_high_u16(q0u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q0u16));
        q14s32 = vmlal_s16(q14s32, d0s16, d0s16);
        q15s32 = vmlal_s16(q15s32, d1s16, d1s16);

        d2s16 = vreinterpret_s16_u16(vget_low_u16(q1u16));
        d3s16 = vreinterpret_s16_u16(vget_high_u16(q1u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q1u16));
        q14s32 = vmlal_s16(q14s32, d2s16, d2s16);
        q15s32 = vmlal_s16(q15s32, d3s16, d3s16);

        d10s16 = vreinterpret_s16_u16(vget_low_u16(q5u16));
        d11s16 = vreinterpret_s16_u16(vget_high_u16(q5u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q5u16));
        q14s32 = vmlal_s16(q14s32, d10s16, d10s16);
        q15s32 = vmlal_s16(q15s32, d11s16, d11s16);

        d12s16 = vreinterpret_s16_u16(vget_low_u16(q6u16));
        d13s16 = vreinterpret_s16_u16(vget_high_u16(q6u16));
        q13s32 = vpadalq_s16(q13s32, vreinterpretq_s16_u16(q6u16));
        q14s32 = vmlal_s16(q14s32, d12s16, d12s16);
        q15s32 = vmlal_s16(q15s32, d13s16, d13s16);

        q0u8 = q4u8;
    }

    q15s32 = vaddq_s32(q14s32, q15s32);
    q0s64 = vpaddlq_s32(q13s32);
    q1s64 = vpaddlq_s32(q15s32);

    d0s64 = vget_low_s64(q0s64);
    d1s64 = vget_high_s64(q0s64);
    d2s64 = vget_low_s64(q1s64);
    d3s64 = vget_high_s64(q1s64);
    d0s64 = vadd_s64(d0s64, d1s64);
    d1s64 = vadd_s64(d2s64, d3s64);

    q5s64 = vmull_s32(vreinterpret_s32_s64(d0s64),
                      vreinterpret_s32_s64(d0s64));
    vst1_lane_u32((uint32_t *)sse, vreinterpret_u32_s64(d1s64), 0);

    d10u32 = vshr_n_u32(vreinterpret_u32_s64(vget_low_s64(q5s64)), 8);
    d0u32 = vsub_u32(vreinterpret_u32_s64(d1s64), d10u32);

    return vget_lane_u32(d0u32, 0);
}
예제 #29
0
void ne10_img_vresize_linear_neon (const int** src, unsigned char* dst, const short* beta, int width)
{
    const int *S0 = src[0], *S1 = src[1];

    int32x4_t qS0_0123, qS0_4567, qS1_0123, qS1_4567;
    int32x4_t qT_0123, qT_4567;
    int16x4_t dT_0123, dT_4567;
    uint16x8_t qT_01234567;
    uint8x8_t dT_01234567, dDst_01234567;

    int32x2_t dBeta;
    dBeta = vset_lane_s32 ( (int) (beta[0]), dBeta, 0);
    dBeta = vset_lane_s32 ( (int) (beta[1]), dBeta, 1);

    int32x4_t qDelta, qMin, qMax;
    qDelta = vdupq_n_s32 (DELTA);
    qMin = vdupq_n_s32 (0);
    qMax = vdupq_n_s32 (255);

    int x = 0;
    for (; x <= width - 8; x += 8)
    {
        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        vst1_u8 (&dst[x], dT_01234567);
    }

    if (x < width)
    {
        uint8x8_t dMask;
        dMask = vld1_u8 ( (uint8_t *) (&ne10_img_vresize_linear_mask_residual_table[ (width - x - 1)]));
        dDst_01234567 = vld1_u8 (&dst[x]);

        qS0_0123 = vld1q_s32 (&S0[x]);
        qS0_4567 = vld1q_s32 (&S0[x + 4]);
        qS1_0123 = vld1q_s32 (&S1[x]);
        qS1_4567 = vld1q_s32 (&S1[x + 4]);

        qT_0123 = vmulq_lane_s32 (qS0_0123, dBeta, 0);
        qT_4567 = vmulq_lane_s32 (qS0_4567, dBeta, 0);
        qT_0123 = vmlaq_lane_s32 (qT_0123, qS1_0123, dBeta, 1);
        qT_4567 = vmlaq_lane_s32 (qT_4567, qS1_4567, dBeta, 1);

        qT_0123 = vaddq_s32 (qT_0123, qDelta);
        qT_4567 = vaddq_s32 (qT_4567, qDelta);

        qT_0123 = vshrq_n_s32 (qT_0123, BITS);
        qT_4567 = vshrq_n_s32 (qT_4567, BITS);

        qT_0123 = vmaxq_s32 (qT_0123, qMin);
        qT_4567 = vmaxq_s32 (qT_4567, qMin);
        qT_0123 = vminq_s32 (qT_0123, qMax);
        qT_4567 = vminq_s32 (qT_4567, qMax);

        dT_0123 = vmovn_s32 (qT_0123);
        dT_4567 = vmovn_s32 (qT_4567);
        qT_01234567 = vreinterpretq_u16_s16 (vcombine_s16 (dT_0123, dT_4567));
        dT_01234567 = vmovn_u16 (qT_01234567);

        dMask = vbsl_u8 (dMask, dT_01234567, dDst_01234567);
        vst1_u8 (&dst[x], dMask);
    }
}
예제 #30
0
static void PCorr2Q32(const int16_t *in, int32_t *logcorQ8)
{
  int16_t scaling,n,k;
  int32_t ysum32,csum32, lys, lcs;
  int32_t oneQ8;


  const int16_t *x, *inptr;

  oneQ8 = WEBRTC_SPL_LSHIFT_W32((int32_t)1, 8);  // 1.00 in Q8

  x = in + PITCH_MAX_LAG/2 + 2;
  scaling = WebRtcSpl_GetScalingSquare ((int16_t *) in, PITCH_CORR_LEN2, PITCH_CORR_LEN2);
  ysum32 = 1;
  csum32 = 0;
  x = in + PITCH_MAX_LAG/2 + 2;
  for (n = 0; n < PITCH_CORR_LEN2; n++) {
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[n],(int16_t) in[n], scaling);  // Q0
    csum32 += WEBRTC_SPL_MUL_16_16_RSFT((int16_t) x[n],(int16_t) in[n], scaling); // Q0
  }

  logcorQ8 += PITCH_LAG_SPAN2 - 1;

  lys=Log2Q8((uint32_t) ysum32); // Q8
  lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum);

  if (csum32>0) {

    lcs=Log2Q8((uint32_t) csum32);   // 2log(csum) in Q8

    if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2 in Q8
      *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
    } else {
      *logcorQ8 = oneQ8;  // 1.00
    }

  } else {
    *logcorQ8 = 0;
  }


  for (k = 1; k < PITCH_LAG_SPAN2; k++) {
    inptr = &in[k];
    ysum32 -= WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[k-1],(int16_t) in[k-1], scaling);
    ysum32 += WEBRTC_SPL_MUL_16_16_RSFT( (int16_t) in[PITCH_CORR_LEN2 + k - 1],(int16_t) in[PITCH_CORR_LEN2 + k - 1], scaling);

#ifdef WEBRTC_ARCH_ARM_NEON
    {
      int32_t vbuff[4];
      int32x4_t int_32x4_sum = vmovq_n_s32(0);
      // Can't shift a Neon register to right with a non-constant shift value.
      int32x4_t int_32x4_scale = vdupq_n_s32(-scaling);
      // Assert a codition used in loop unrolling at compile-time.
      COMPILE_ASSERT(PITCH_CORR_LEN2 %4 == 0);

      for (n = 0; n < PITCH_CORR_LEN2; n += 4) {
        int16x4_t int_16x4_x = vld1_s16(&x[n]);
        int16x4_t int_16x4_in = vld1_s16(&inptr[n]);
        int32x4_t int_32x4 = vmull_s16(int_16x4_x, int_16x4_in);
        int_32x4 = vshlq_s32(int_32x4, int_32x4_scale);
        int_32x4_sum = vaddq_s32(int_32x4_sum, int_32x4);
      }

      // Use vector store to avoid long stall from data trasferring
      // from vector to general register.
      vst1q_s32(vbuff, int_32x4_sum);
      csum32 = vbuff[0] + vbuff[1];
      csum32 += vbuff[2];
      csum32 += vbuff[3];
    }
#else
    csum32 = 0;
    if(scaling == 0) {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += x[n] * inptr[n];
      }
    } else {
      for (n = 0; n < PITCH_CORR_LEN2; n++) {
        csum32 += (x[n] * inptr[n]) >> scaling;
      }
    }
#endif

    logcorQ8--;

    lys=Log2Q8((uint32_t)ysum32); // Q8
    lys=WEBRTC_SPL_RSHIFT_W32(lys, 1); //sqrt(ysum);

    if (csum32>0) {

      lcs=Log2Q8((uint32_t) csum32);   // 2log(csum) in Q8

      if (lcs>(lys + oneQ8) ){ // csum/sqrt(ysum) > 2
        *logcorQ8 = lcs - lys;  // log2(csum/sqrt(ysum))
      } else {
        *logcorQ8 = oneQ8;  // 1.00
      }

    } else {
      *logcorQ8 = 0;
    }
  }
}