void vpx_highbd_idct32x32_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const tran_low_t out0 = HIGHBD_WRAPLOW( dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); const tran_low_t out1 = HIGHBD_WRAPLOW( dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 6); const int16x8_t dc = vdupq_n_s16(a1); int i; if (a1 >= 0) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); for (i = 0; i < 8; ++i) { highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); highbd_idct32x32_1_add_pos_kernel(&dest, stride, dc, max); } } else { for (i = 0; i < 8; ++i) { highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); highbd_idct32x32_1_add_neg_kernel(&dest, stride, dc); } } }
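// The two per-row helpers are not shown here. A minimal sketch of one 8-pixel step of the
// a1 >= 0 path (an assumed shape, not the library's actual helper; assumes <arm_neon.h>):
// the pixels already occupy 16-bit lanes, so add the broadcast DC and clamp to the
// bit-depth maximum held in 'max'.
static inline void add_dc_clamp_u16x8(uint16_t *dst, const int16x8_t dc,
                                      const int16x8_t max) {
  const int16x8_t px = vreinterpretq_s16_u16(vld1q_u16(dst));  // <= (1 << bd) - 1, fits s16
  const int16x8_t sum = vminq_s16(vaddq_s16(px, dc), max);     // dc >= 0, so no lower clamp
  vst1q_u16(dst, vreinterpretq_u16_s16(sum));
}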
static uint64_t SharpYUVUpdateY_NEON(const uint16_t* ref, const uint16_t* src, uint16_t* dst, int len) { int i; const int16x8_t zero = vdupq_n_s16(0); const int16x8_t max = vdupq_n_s16(MAX_Y); uint64x2_t sum = vdupq_n_u64(0); uint64_t diff; for (i = 0; i + 8 <= len; i += 8) { const int16x8_t A = vreinterpretq_s16_u16(vld1q_u16(ref + i)); const int16x8_t B = vreinterpretq_s16_u16(vld1q_u16(src + i)); const int16x8_t C = vreinterpretq_s16_u16(vld1q_u16(dst + i)); const int16x8_t D = vsubq_s16(A, B); // diff_y const int16x8_t F = vaddq_s16(C, D); // new_y const uint16x8_t H = vreinterpretq_u16_s16(vmaxq_s16(vminq_s16(F, max), zero)); const int16x8_t I = vabsq_s16(D); // abs(diff_y) vst1q_u16(dst + i, H); sum = vpadalq_u32(sum, vpaddlq_u16(vreinterpretq_u16_s16(I))); } diff = vgetq_lane_u64(sum, 0) + vgetq_lane_u64(sum, 1); for (; i < len; ++i) { const int diff_y = ref[i] - src[i]; const int new_y = (int)(dst[i]) + diff_y; dst[i] = clip_y(new_y); diff += (uint64_t)(abs(diff_y)); } return diff; }
void freq_equalization(LTE_DL_FRAME_PARMS *frame_parms, int32_t **rxdataF_comp, int32_t **ul_ch_mag, int32_t **ul_ch_magb, uint8_t symbol, uint16_t Msc_RS, uint8_t Qm) { uint16_t re; int16_t amp; #if defined(__x86_64__) || defined(__i386__) __m128i *ul_ch_mag128,*ul_ch_magb128,*rxdataF_comp128; rxdataF_comp128 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_magb128 = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; #elif defined(__arm__) int16x8_t *ul_ch_mag128,*ul_ch_magb128,*rxdataF_comp128; rxdataF_comp128 = (int16x8_t*)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_mag128 = (int16x8_t*)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12]; ul_ch_magb128 = (int16x8_t*)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12]; #endif for (re=0; re<(Msc_RS>>2); re++) { amp=(*((int16_t*)&ul_ch_mag128[re])); if (amp>255) amp=255; // printf("freq_eq: symbol %d re %d => %d,%d,%d, (%d) (%d,%d) => ",symbol,re,*((int16_t*)(&ul_ch_mag128[re])),amp,inv_ch[8*amp],*((int16_t*)(&ul_ch_mag128[re]))*inv_ch[8*amp],*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re]))); #if defined(__x86_64__) || defined(__i386__) rxdataF_comp128[re] = _mm_mullo_epi16(rxdataF_comp128[re],*((__m128i *)&inv_ch[8*amp])); if (Qm==4) ul_ch_mag128[re] = _mm_set1_epi16(324); // this is 512*2/sqrt(10) else { ul_ch_mag128[re] = _mm_set1_epi16(316); // this is 512*4/sqrt(42) ul_ch_magb128[re] = _mm_set1_epi16(158); // this is 512*2/sqrt(42) } #elif defined(__arm__) rxdataF_comp128[re] = vmulq_s16(rxdataF_comp128[re],*((int16x8_t *)&inv_ch[8*amp])); if (Qm==4) ul_ch_mag128[re] = vdupq_n_s16(324); // this is 512*2/sqrt(10) else { ul_ch_mag128[re] = vdupq_n_s16(316); // this is 512*4/sqrt(42) ul_ch_magb128[re] = vdupq_n_s16(158); // this is 512*2/sqrt(42) } #endif // printf("(%d,%d)\n",*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re]))); } }
void vpx_highbd_idct4x4_1_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); const tran_low_t out0 = HIGHBD_WRAPLOW( dct_const_round_shift(input[0] * (tran_high_t)cospi_16_64), bd); const tran_low_t out1 = HIGHBD_WRAPLOW( dct_const_round_shift(out0 * (tran_high_t)cospi_16_64), bd); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); highbd_idct4x4_1_add_kernel1(&dest, stride, dc, max); }
void rfx_decode_YCbCr_to_RGB_NEON(sint16 * y_r_buffer, sint16 * cb_g_buffer, sint16 * cr_b_buffer) { int16x8_t zero = vdupq_n_s16(0); int16x8_t max = vdupq_n_s16(255); int16x8_t y_add = vdupq_n_s16(128); int16x8_t* y_r_buf = (int16x8_t*)y_r_buffer; int16x8_t* cb_g_buf = (int16x8_t*)cb_g_buffer; int16x8_t* cr_b_buf = (int16x8_t*)cr_b_buffer; int i; for (i = 0; i < 4096 / 8; i++) { int16x8_t y = vld1q_s16((sint16*)&y_r_buf[i]); y = vaddq_s16(y, y_add); int16x8_t cr = vld1q_s16((sint16*)&cr_b_buf[i]); // r = between((y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5)), 0, 255); int16x8_t r = vaddq_s16(y, cr); r = vaddq_s16(r, vshrq_n_s16(cr, 2)); r = vaddq_s16(r, vshrq_n_s16(cr, 3)); r = vaddq_s16(r, vshrq_n_s16(cr, 5)); r = vminq_s16(vmaxq_s16(r, zero), max); vst1q_s16((sint16*)&y_r_buf[i], r); // cb = cb_g_buf[i]; int16x8_t cb = vld1q_s16((sint16*)&cb_g_buf[i]); // g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5) - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255); int16x8_t g = vsubq_s16(y, vshrq_n_s16(cb, 2)); g = vsubq_s16(g, vshrq_n_s16(cb, 4)); g = vsubq_s16(g, vshrq_n_s16(cb, 5)); g = vsubq_s16(g, vshrq_n_s16(cr, 1)); g = vsubq_s16(g, vshrq_n_s16(cr, 3)); g = vsubq_s16(g, vshrq_n_s16(cr, 4)); g = vsubq_s16(g, vshrq_n_s16(cr, 5)); g = vminq_s16(vmaxq_s16(g, zero), max); vst1q_s16((sint16*)&cb_g_buf[i], g); // b = between((y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6)), 0, 255); int16x8_t b = vaddq_s16(y, cb); b = vaddq_s16(b, vshrq_n_s16(cb, 1)); b = vaddq_s16(b, vshrq_n_s16(cb, 2)); b = vaddq_s16(b, vshrq_n_s16(cb, 6)); b = vminq_s16(vmaxq_s16(b, zero), max); vst1q_s16((sint16*)&cr_b_buf[i], b); } }
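// For reference, the shift chains above are fixed-point approximations of the usual
// YCbCr-to-RGB coefficients (1 + 1/4 + 1/8 + 1/32 = 1.40625 ~ 1.403, and so on). A scalar
// sketch of one pixel, assuming y already has the +128 bias applied; between() is an
// illustrative clamp helper, not part of the library.
static int between(int v, int lo, int hi) { return v < lo ? lo : (v > hi ? hi : v); }
static void ycbcr_to_rgb_px(int y, int cb, int cr, int *r, int *g, int *b) {
  *r = between(y + cr + (cr >> 2) + (cr >> 3) + (cr >> 5), 0, 255);
  *g = between(y - (cb >> 2) - (cb >> 4) - (cb >> 5)
                 - (cr >> 1) - (cr >> 3) - (cr >> 4) - (cr >> 5), 0, 255);
  *b = between(y + cb + (cb >> 1) + (cb >> 2) + (cb >> 6), 0, 255);
}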
void idct_dequant_0_2x_neon( int16_t *q, int16_t dq, unsigned char *dst, int stride) { unsigned char *dst0; int i, a0, a1; int16x8x2_t q2Add; int32x2_t d2s32, d4s32; uint8x8_t d2u8, d4u8; uint16x8_t q1u16, q2u16; a0 = ((q[0] * dq) + 4) >> 3; a1 = ((q[16] * dq) + 4) >> 3; q[0] = q[16] = 0; q2Add.val[0] = vdupq_n_s16((int16_t)a0); q2Add.val[1] = vdupq_n_s16((int16_t)a1); for (i = 0; i < 2; i++, dst += 4) { dst0 = dst; d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 0); dst0 += stride; d2s32 = vld1_lane_s32((const int32_t *)dst0, d2s32, 1); dst0 += stride; d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 0); dst0 += stride; d4s32 = vld1_lane_s32((const int32_t *)dst0, d4s32, 1); q1u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), vreinterpret_u8_s32(d2s32)); q2u16 = vaddw_u8(vreinterpretq_u16_s16(q2Add.val[i]), vreinterpret_u8_s32(d4s32)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q1u16)); d4u8 = vqmovun_s16(vreinterpretq_s16_u16(q2u16)); d2s32 = vreinterpret_s32_u8(d2u8); d4s32 = vreinterpret_s32_u8(d4u8); dst0 = dst; vst1_lane_s32((int32_t *)dst0, d2s32, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d2s32, 1); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d4s32, 0); dst0 += stride; vst1_lane_s32((int32_t *)dst0, d4s32, 1); } return; }
static void inline ff_dct_unquantize_h263_neon(int qscale, int qadd, int nCoeffs, int16_t *block) { int16x8_t q0s16, q2s16, q3s16, q8s16, q10s16, q11s16, q13s16; int16x8_t q14s16, q15s16, qzs16; int16x4_t d0s16, d2s16, d3s16, dzs16; uint16x8_t q1u16, q9u16; uint16x4_t d1u16; dzs16 = vdup_n_s16(0); qzs16 = vdupq_n_s16(0); q15s16 = vdupq_n_s16(qscale << 1); q14s16 = vdupq_n_s16(qadd); q13s16 = vnegq_s16(q14s16); if (nCoeffs > 4) { for (; nCoeffs > 8; nCoeffs -= 16, block += 16) { q0s16 = vld1q_s16(block); q3s16 = vreinterpretq_s16_u16(vcltq_s16(q0s16, qzs16)); q8s16 = vld1q_s16(block + 8); q1u16 = vceqq_s16(q0s16, qzs16); q2s16 = vmulq_s16(q0s16, q15s16); q11s16 = vreinterpretq_s16_u16(vcltq_s16(q8s16, qzs16)); q10s16 = vmulq_s16(q8s16, q15s16); q3s16 = vbslq_s16(vreinterpretq_u16_s16(q3s16), q13s16, q14s16); q11s16 = vbslq_s16(vreinterpretq_u16_s16(q11s16), q13s16, q14s16); q2s16 = vaddq_s16(q2s16, q3s16); q9u16 = vceqq_s16(q8s16, qzs16); q10s16 = vaddq_s16(q10s16, q11s16); q0s16 = vbslq_s16(q1u16, q0s16, q2s16); q8s16 = vbslq_s16(q9u16, q8s16, q10s16); vst1q_s16(block, q0s16); vst1q_s16(block + 8, q8s16); } } if (nCoeffs <= 0) return; d0s16 = vld1_s16(block); d3s16 = vreinterpret_s16_u16(vclt_s16(d0s16, dzs16)); d1u16 = vceq_s16(d0s16, dzs16); d2s16 = vmul_s16(d0s16, vget_high_s16(q15s16)); d3s16 = vbsl_s16(vreinterpret_u16_s16(d3s16), vget_high_s16(q13s16), vget_high_s16(q14s16)); d2s16 = vadd_s16(d2s16, d3s16); d0s16 = vbsl_s16(d1u16, d0s16, d2s16); vst1_s16(block, d0s16); }
static void variance_neon_w8(const uint8_t *a, int a_stride, const uint8_t *b, int b_stride, int w, int h, unsigned int *sse, int *sum) { int i, j; int16x8_t v_sum = vdupq_n_s16(0); int32x4_t v_sse_lo = vdupq_n_s32(0); int32x4_t v_sse_hi = vdupq_n_s32(0); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { const uint8x8_t v_a = vld1_u8(&a[j]); const uint8x8_t v_b = vld1_u8(&b[j]); const uint16x8_t v_diff = vsubl_u8(v_a, v_b); const int16x8_t sv_diff = vreinterpretq_s16_u16(v_diff); v_sum = vaddq_s16(v_sum, sv_diff); v_sse_lo = vmlal_s16(v_sse_lo, vget_low_s16(sv_diff), vget_low_s16(sv_diff)); v_sse_hi = vmlal_s16(v_sse_hi, vget_high_s16(sv_diff), vget_high_s16(sv_diff)); } a += a_stride; b += b_stride; } *sum = horizontal_add_s16x8(v_sum); *sse = (unsigned int)horizontal_add_s32x4(vaddq_s32(v_sse_lo, v_sse_hi)); }
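// A typical caller (sketch only; the wrapper name is illustrative) turns the two
// accumulators into a variance as sse - sum^2 / (w * h); for an 8x8 block the divide
// is a shift by 6.
static unsigned int variance8x8_sketch(const uint8_t *a, int a_stride,
                                       const uint8_t *b, int b_stride) {
  unsigned int sse;
  int sum;
  variance_neon_w8(a, a_stride, b, b_stride, 8, 8, &sse, &sum);
  return sse - (unsigned int)(((int64_t)sum * sum) >> 6);
}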
void vpx_idct4x4_1_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d6u8; uint32x2_t d2u32 = vdup_n_u32(0); uint16x8_t q8u16; int16x8_t q0s16; uint8_t *d1, *d2; int16_t i, a1, cospi_16_64 = 11585; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 4); q0s16 = vdupq_n_s16(a1); // dc_only_idct_add d1 = d2 = dest; for (i = 0; i < 2; i++) { d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 0); d1 += dest_stride; d2u32 = vld1_lane_u32((const uint32_t *)d1, d2u32, 1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q0s16), vreinterpret_u8_u32(d2u32)); d6u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 0); d2 += dest_stride; vst1_lane_u32((uint32_t *)d2, vreinterpret_u32_u8(d6u8), 1); d2 += dest_stride; } return; }
void test_vdupQ_ns16 (void) { int16x8_t out_int16x8_t; int16_t arg0_int16_t; out_int16x8_t = vdupq_n_s16 (arg0_int16_t); }
/* s16x8 mv mul */ void mw_neon_mv_mul_s16x8(short * A, int Row, int T, short * B, short * C) { int i = 0; int k = 0; int16x8_t neon_b, neon_c; int16x8_t neon_a0, neon_a1, neon_a2, neon_a3, neon_a4, neon_a5, neon_a6, neon_a7; int16x8_t neon_b0, neon_b1, neon_b2, neon_b3, neon_b4, neon_b5, neon_b6, neon_b7; for (i = 0; i < Row; i+=8) { neon_c = vmovq_n_s16(0); for (k = 0; k < T; k+=8) { int j = k * T + i; neon_a0 = vld1q_s16(A + j); j+=Row; neon_a1 = vld1q_s16(A + j); j+=Row; neon_a2 = vld1q_s16(A + j); j+=Row; neon_a3 = vld1q_s16(A + j); j+=Row; neon_a4 = vld1q_s16(A + j); j+=Row; neon_a5 = vld1q_s16(A + j); j+=Row; neon_a6 = vld1q_s16(A + j); j+=Row; neon_a7 = vld1q_s16(A + j); neon_b = vld1q_s16(B + k); neon_b0 = vdupq_n_s16(vgetq_lane_s16(neon_b, 0)); neon_b1 = vdupq_n_s16(vgetq_lane_s16(neon_b, 1)); neon_b2 = vdupq_n_s16(vgetq_lane_s16(neon_b, 2)); neon_b3 = vdupq_n_s16(vgetq_lane_s16(neon_b, 3)); neon_b4 = vdupq_n_s16(vgetq_lane_s16(neon_b, 4)); neon_b5 = vdupq_n_s16(vgetq_lane_s16(neon_b, 5)); neon_b6 = vdupq_n_s16(vgetq_lane_s16(neon_b, 6)); neon_b7 = vdupq_n_s16(vgetq_lane_s16(neon_b, 7)); neon_c = vaddq_s16(vmulq_s16(neon_a0, neon_b0), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a1, neon_b1), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a2, neon_b2), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a3, neon_b3), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a4, neon_b4), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a5, neon_b5), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a6, neon_b6), neon_c); neon_c = vaddq_s16(vmulq_s16(neon_a7, neon_b7), neon_c); } vst1q_s16(C + i, neon_c); } }
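// On AArch64 the eight explicit vdupq_n_s16/vgetq_lane_s16 broadcasts can be folded into
// lane-indexed multiply-accumulates; a sketch of the equivalent inner step, assuming
// vmlaq_laneq_s16 (ARMv8 only) is available.
neon_c = vmlaq_laneq_s16(neon_c, neon_a0, neon_b, 0);
neon_c = vmlaq_laneq_s16(neon_c, neon_a1, neon_b, 1);
neon_c = vmlaq_laneq_s16(neon_c, neon_a2, neon_b, 2);
neon_c = vmlaq_laneq_s16(neon_c, neon_a3, neon_b, 3);
neon_c = vmlaq_laneq_s16(neon_c, neon_a4, neon_b, 4);
neon_c = vmlaq_laneq_s16(neon_c, neon_a5, neon_b, 5);
neon_c = vmlaq_laneq_s16(neon_c, neon_a6, neon_b, 6);
neon_c = vmlaq_laneq_s16(neon_c, neon_a7, neon_b, 7);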
static INLINE void GENERATE_SINE_CONSTANTS(int16x4_t *d3s16, int16x4_t *d4s16, int16x4_t *d5s16, int16x8_t *q3s16) { *d3s16 = vdup_n_s16(sinpi_1_9); *d4s16 = vdup_n_s16(sinpi_2_9); *q3s16 = vdupq_n_s16(sinpi_3_9); *d5s16 = vdup_n_s16(sinpi_4_9); return; }
static void SharpYUVFilterRow_NEON(const int16_t* A, const int16_t* B, int len, const uint16_t* best_y, uint16_t* out) { int i; const int16x8_t max = vdupq_n_s16(MAX_Y); const int16x8_t zero = vdupq_n_s16(0); for (i = 0; i + 8 <= len; i += 8) { const int16x8_t a0 = vld1q_s16(A + i + 0); const int16x8_t a1 = vld1q_s16(A + i + 1); const int16x8_t b0 = vld1q_s16(B + i + 0); const int16x8_t b1 = vld1q_s16(B + i + 1); const int16x8_t a0b1 = vaddq_s16(a0, b1); const int16x8_t a1b0 = vaddq_s16(a1, b0); const int16x8_t a0a1b0b1 = vaddq_s16(a0b1, a1b0); // A0+A1+B0+B1 const int16x8_t a0b1_2 = vaddq_s16(a0b1, a0b1); // 2*(A0+B1) const int16x8_t a1b0_2 = vaddq_s16(a1b0, a1b0); // 2*(A1+B0) const int16x8_t c0 = vshrq_n_s16(vaddq_s16(a0b1_2, a0a1b0b1), 3); const int16x8_t c1 = vshrq_n_s16(vaddq_s16(a1b0_2, a0a1b0b1), 3); const int16x8_t d0 = vaddq_s16(c1, a0); const int16x8_t d1 = vaddq_s16(c0, a1); const int16x8_t e0 = vrshrq_n_s16(d0, 1); const int16x8_t e1 = vrshrq_n_s16(d1, 1); const int16x8x2_t f = vzipq_s16(e0, e1); const int16x8_t g0 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 0)); const int16x8_t g1 = vreinterpretq_s16_u16(vld1q_u16(best_y + 2 * i + 8)); const int16x8_t h0 = vaddq_s16(g0, f.val[0]); const int16x8_t h1 = vaddq_s16(g1, f.val[1]); const int16x8_t i0 = vmaxq_s16(vminq_s16(h0, max), zero); const int16x8_t i1 = vmaxq_s16(vminq_s16(h1, max), zero); vst1q_u16(out + 2 * i + 0, vreinterpretq_u16_s16(i0)); vst1q_u16(out + 2 * i + 8, vreinterpretq_u16_s16(i1)); } for (; i < len; ++i) { const int a0b1 = A[i + 0] + B[i + 1]; const int a1b0 = A[i + 1] + B[i + 0]; const int a0a1b0b1 = a0b1 + a1b0 + 8; const int v0 = (8 * A[i + 0] + 2 * a1b0 + a0a1b0b1) >> 4; const int v1 = (8 * A[i + 1] + 2 * a0b1 + a0a1b0b1) >> 4; out[2 * i + 0] = clip_y(best_y[2 * i + 0] + v0); out[2 * i + 1] = clip_y(best_y[2 * i + 1] + v1); } }
void vp9_int_pro_row_neon(int16_t hbuf[16], uint8_t const *ref, const int ref_stride, const int height) { int i; uint16x8_t vec_sum_lo = vdupq_n_u16(0); uint16x8_t vec_sum_hi = vdupq_n_u16(0); const int shift_factor = ((height >> 5) + 3) * -1; const int16x8_t vec_shift = vdupq_n_s16(shift_factor); for (i = 0; i < height; i += 8) { const uint8x16_t vec_row1 = vld1q_u8(ref); const uint8x16_t vec_row2 = vld1q_u8(ref + ref_stride); const uint8x16_t vec_row3 = vld1q_u8(ref + ref_stride * 2); const uint8x16_t vec_row4 = vld1q_u8(ref + ref_stride * 3); const uint8x16_t vec_row5 = vld1q_u8(ref + ref_stride * 4); const uint8x16_t vec_row6 = vld1q_u8(ref + ref_stride * 5); const uint8x16_t vec_row7 = vld1q_u8(ref + ref_stride * 6); const uint8x16_t vec_row8 = vld1q_u8(ref + ref_stride * 7); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row1)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row1)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row2)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row2)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row3)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row3)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row4)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row4)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row5)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row5)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row6)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row6)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row7)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row7)); vec_sum_lo = vaddw_u8(vec_sum_lo, vget_low_u8(vec_row8)); vec_sum_hi = vaddw_u8(vec_sum_hi, vget_high_u8(vec_row8)); ref += ref_stride * 8; } vec_sum_lo = vshlq_u16(vec_sum_lo, vec_shift); vec_sum_hi = vshlq_u16(vec_sum_hi, vec_shift); vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_lo)); hbuf += 8; vst1q_s16(hbuf, vreinterpretq_s16_u16(vec_sum_hi)); }
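// Note the shift count is negative: vshlq_u16 with a negative per-lane count is a right
// shift, so the accumulated column sums are divided by a power of two tied to height.
// Standalone illustration (assumes <arm_neon.h>); with height == 32, shift_factor is
// -((32 >> 5) + 3) = -4.
const uint16x8_t sums = vdupq_n_u16(256);
const uint16x8_t avg = vshlq_u16(sums, vdupq_n_s16(-4));  // every lane becomes 256 >> 4 = 16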
void vpx_idct4x4_1_add_neon(const tran_low_t *input, uint8_t *dest, int stride) { const int16_t out0 = WRAPLOW(dct_const_round_shift(input[0] * cospi_16_64)); const int16_t out1 = WRAPLOW(dct_const_round_shift(out0 * cospi_16_64)); const int16_t a1 = ROUND_POWER_OF_TWO(out1, 4); const int16x8_t dc = vdupq_n_s16(a1); uint32x2_t d = vdup_n_u32(0); assert(!((intptr_t)dest % sizeof(uint32_t))); assert(!(stride % sizeof(uint32_t))); idct4x4_1_add_kernel(&dest, stride, dc, &d); idct4x4_1_add_kernel(&dest, stride, dc, &d); }
void vp9_idct8x8_1_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8x8_t d2u8, d3u8, d30u8, d31u8; uint64x1_t d2u64, d3u64, d4u64, d5u64; uint16x8_t q0u16, q9u16, q10u16, q11u16, q12u16; int16x8_t q0s16; uint8_t *d1, *d2; int16_t i, a1, cospi_16_64 = 11585; int16_t out = dct_const_round_shift(input[0] * cospi_16_64); out = dct_const_round_shift(out * cospi_16_64); a1 = ROUND_POWER_OF_TWO(out, 5); q0s16 = vdupq_n_s16(a1); q0u16 = vreinterpretq_u16_s16(q0s16); d1 = d2 = dest; for (i = 0; i < 2; i++) { d2u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d4u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; d5u64 = vld1_u64((const uint64_t *)d1); d1 += dest_stride; q9u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d2u64)); q10u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d3u64)); q11u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d4u64)); q12u16 = vaddw_u8(q0u16, vreinterpret_u8_u64(d5u64)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d30u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); d31u8 = vqmovun_s16(vreinterpretq_s16_u16(q12u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d30u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d31u8)); d2 += dest_stride; } return; }
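// In both DC-only paths the value added to every pixel comes from two rounded
// multiplications by cospi_16_64 (sqrt(2)/2 in Q14) plus a final rounding shift. A scalar
// sketch, assuming dct_const_round_shift(x) is (x + (1 << 13)) >> 14 and
// ROUND_POWER_OF_TWO(x, n) is (x + (1 << (n - 1))) >> n; the 8x8 path shifts by 5, the
// 4x4 path by 4.
static int16_t dc_only_value_8x8(int16_t in0) {
  const int cospi_16_64 = 11585;                      // round(sqrt(2)/2 * 2^14)
  int out = ((in0 * cospi_16_64) + (1 << 13)) >> 14;  // dct_const_round_shift
  out = ((out * cospi_16_64) + (1 << 13)) >> 14;
  return (int16_t)((out + (1 << 4)) >> 5);            // ROUND_POWER_OF_TWO(out, 5)
}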
static void rfx_quantization_decode_block_NEON(INT16 * buffer, const int buffer_size, const UINT32 factor) { int16x8_t quantFactors = vdupq_n_s16((INT16)factor); int16x8_t* buf = (int16x8_t*)buffer; int16x8_t* buf_end = (int16x8_t*)(buffer + buffer_size); do { int16x8_t val = vld1q_s16((INT16*)buf); val = vshlq_s16(val, quantFactors); vst1q_s16((INT16*)buf, val); buf++; } while(buf < buf_end); }
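// Because the shift count is broadcast and non-negative, the routine is equivalent to a
// plain left shift of every coefficient; a scalar reference sketch under that assumption.
static void rfx_quantization_decode_block_ref(INT16 *buffer, int buffer_size, UINT32 factor) {
  int i;
  for (i = 0; i < buffer_size; i++)
    buffer[i] = (INT16)(buffer[i] << factor);
}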
// Noise Estimation static void NoiseEstimationNeon(NsxInst_t* inst, uint16_t* magn, uint32_t* noise, int16_t* q_noise) { int16_t lmagn[HALF_ANAL_BLOCKL], counter, countDiv; int16_t countProd, delta, zeros, frac; int16_t log2, tabind, logval, tmp16, tmp16no1, tmp16no2; const int16_t log2_const = 22713; const int16_t width_factor = 21845; int i, s, offset; tabind = inst->stages - inst->normData; assert(tabind < 9); assert(tabind > -9); if (tabind < 0) { logval = -WebRtcNsx_kLogTable[-tabind]; } else { logval = WebRtcNsx_kLogTable[tabind]; } int16x8_t logval_16x8 = vdupq_n_s16(logval); // lmagn(i)=log(magn(i))=log(2)*log2(magn(i)) // magn is in Q(-stages), and the real lmagn values are: // real_lmagn(i)=log(magn(i)*2^stages)=log(magn(i))+log(2^stages) // lmagn in Q8 for (i = 0; i < inst->magnLen; i++) { if (magn[i]) { zeros = WebRtcSpl_NormU32((uint32_t)magn[i]); frac = (int16_t)((((uint32_t)magn[i] << zeros) & 0x7FFFFFFF) >> 23); assert(frac < 256); // log2(magn(i)) log2 = (int16_t)(((31 - zeros) << 8) + WebRtcNsx_kLogTableFrac[frac]); // log2(magn(i))*log(2) lmagn[i] = (int16_t)WEBRTC_SPL_MUL_16_16_RSFT(log2, log2_const, 15); // + log(2^stages) lmagn[i] += logval; } else { lmagn[i] = logval; } }
// ref, src = [0, 510] - max diff = 16-bits // bwl = {2, 3, 4}, width = {16, 32, 64} int vp9_vector_var_neon(int16_t const *ref, int16_t const *src, const int bwl) { int width = 4 << bwl; int32x4_t sse = vdupq_n_s32(0); int16x8_t total = vdupq_n_s16(0); assert(width >= 8); assert((width % 8) == 0); do { const int16x8_t r = vld1q_s16(ref); const int16x8_t s = vld1q_s16(src); const int16x8_t diff = vsubq_s16(r, s); // [-510, 510], 10 bits. const int16x4_t diff_lo = vget_low_s16(diff); const int16x4_t diff_hi = vget_high_s16(diff); sse = vmlal_s16(sse, diff_lo, diff_lo); // dynamic range 26 bits. sse = vmlal_s16(sse, diff_hi, diff_hi); total = vaddq_s16(total, diff); // dynamic range 16 bits. ref += 8; src += 8; width -= 8; } while (width != 0); { // Note: 'total''s pairwise addition could be implemented similarly to // horizontal_add_u16x8(), but one less vpaddl with 'total' when paired // with the summation of 'sse' performed better on a Cortex-A15. const int32x4_t t0 = vpaddlq_s16(total); // cascading summation of 'total' const int32x2_t t1 = vadd_s32(vget_low_s32(t0), vget_high_s32(t0)); const int32x2_t t2 = vpadd_s32(t1, t1); const int t = vget_lane_s32(t2, 0); const int64x2_t s0 = vpaddlq_s32(sse); // cascading summation of 'sse'. const int32x2_t s1 = vadd_s32(vreinterpret_s32_s64(vget_low_s64(s0)), vreinterpret_s32_s64(vget_high_s64(s0))); const int s = vget_lane_s32(s1, 0); const int shift_factor = bwl + 2; return s - ((t * t) >> shift_factor); } }
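// Scalar sketch of the same computation: the return value is sum(diff^2) - sum(diff)^2 / width,
// with width = 4 << bwl, so the divide is a shift by bwl + 2.
static int vector_var_ref(const int16_t *ref, const int16_t *src, int bwl) {
  const int width = 4 << bwl;
  int sse = 0, sum = 0, i;
  for (i = 0; i < width; i++) {
    const int diff = ref[i] - src[i];
    sum += diff;
    sse += diff * diff;
  }
  return sse - ((sum * sum) >> (bwl + 2));
}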
void vpx_highbd_idct4x4_16_add_neon(const tran_low_t *input, uint16_t *dest, int stride, int bd) { const int16x8_t max = vdupq_n_s16((1 << bd) - 1); int32x4_t c0 = vld1q_s32(input); int32x4_t c1 = vld1q_s32(input + 4); int32x4_t c2 = vld1q_s32(input + 8); int32x4_t c3 = vld1q_s32(input + 12); int16x8_t a0, a1; if (bd == 8) { const int16x4_t cospis = vld1_s16(kCospi); // Rows a0 = vcombine_s16(vmovn_s32(c0), vmovn_s32(c1)); a1 = vcombine_s16(vmovn_s32(c2), vmovn_s32(c3)); idct4x4_16_kernel_bd8(cospis, &a0, &a1); // Columns a1 = vcombine_s16(vget_high_s16(a1), vget_low_s16(a1)); idct4x4_16_kernel_bd8(cospis, &a0, &a1); a0 = vrshrq_n_s16(a0, 4); a1 = vrshrq_n_s16(a1, 4); } else { const int32x4_t cospis = vld1q_s32(kCospi32); if (bd == 10) { idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); idct4x4_16_kernel_bd10(cospis, &c0, &c1, &c2, &c3); } else { idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); idct4x4_16_kernel_bd12(cospis, &c0, &c1, &c2, &c3); } a0 = vcombine_s16(vqrshrn_n_s32(c0, 4), vqrshrn_n_s32(c1, 4)); a1 = vcombine_s16(vqrshrn_n_s32(c3, 4), vqrshrn_n_s32(c2, 4)); } highbd_idct4x4_1_add_kernel1(&dest, stride, a0, max); highbd_idct4x4_1_add_kernel2(&dest, stride, a1, max); }
static void thresh_16s( const Mat& _src, Mat& _dst, short thresh, short maxval, int type ) { int i, j; Size roi = _src.size(); roi.width *= _src.channels(); const short* src = _src.ptr<short>(); short* dst = _dst.ptr<short>(); size_t src_step = _src.step/sizeof(src[0]); size_t dst_step = _dst.step/sizeof(dst[0]); #if CV_SSE2 volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE); #endif if( _src.isContinuous() && _dst.isContinuous() ) { roi.width *= roi.height; roi.height = 1; src_step = dst_step = roi.width; } #ifdef HAVE_TEGRA_OPTIMIZATION if (tegra::thresh_16s(_src, _dst, roi.width, roi.height, thresh, maxval, type)) return; #endif #if defined(HAVE_IPP) CV_IPP_CHECK() { IppiSize sz = { roi.width, roi.height }; CV_SUPPRESS_DEPRECATED_START switch( type ) { case THRESH_TRUNC: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GT_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_GT_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; case THRESH_TOZERO: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_LTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh + 1, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_LTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+1, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; case THRESH_TOZERO_INV: #ifndef HAVE_IPP_ICV_ONLY if (_src.data == _dst.data && ippiThreshold_GTVal_16s_C1IR(dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } #endif if (ippiThreshold_GTVal_16s_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0) >= 0) { CV_IMPL_ADD(CV_IMPL_IPP); return; } setIppErrorStatus(); break; } CV_SUPPRESS_DEPRECATED_END } #endif switch( type ) { case THRESH_BINARY: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_and_si128( v0, maxval8 ); v1 = _mm_and_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); for( ; j <= roi.width - 8; j += 8 ) { uint16x8_t v_mask = vcgtq_s16(vld1q_s16(src + j), v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] > thresh ? 
maxval : 0; } break; case THRESH_BINARY_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh), maxval8 = _mm_set1_epi16(maxval); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_cmpgt_epi16( v0, thresh8 ); v1 = _mm_cmpgt_epi16( v1, thresh8 ); v0 = _mm_andnot_si128( v0, maxval8 ); v1 = _mm_andnot_si128( v1, maxval8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh), v_maxval = vdupq_n_s16(maxval); for( ; j <= roi.width - 8; j += 8 ) { uint16x8_t v_mask = vcleq_s16(vld1q_s16(src + j), v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_maxval)); } #endif for( ; j < roi.width; j++ ) dst[j] = src[j] <= thresh ? maxval : 0; } break; case THRESH_TRUNC: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_min_epi16( v0, thresh8 ); v1 = _mm_min_epi16( v1, thresh8 ); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) vst1q_s16(dst + j, vminq_s16(vld1q_s16(src + j), v_thresh)); #endif for( ; j < roi.width; j++ ) dst[j] = std::min(src[j], thresh); } break; case THRESH_TOZERO: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_and_si128(v0, _mm_cmpgt_epi16(v0, thresh8)); v1 = _mm_and_si128(v1, _mm_cmpgt_epi16(v1, thresh8)); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) { int16x8_t v_src = vld1q_s16(src + j); uint16x8_t v_mask = vcgtq_s16(v_src, v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v > thresh ? v : 0; } } break; case THRESH_TOZERO_INV: for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step ) { j = 0; #if CV_SSE2 if( useSIMD ) { __m128i thresh8 = _mm_set1_epi16(thresh); for( ; j <= roi.width - 16; j += 16 ) { __m128i v0, v1; v0 = _mm_loadu_si128( (const __m128i*)(src + j) ); v1 = _mm_loadu_si128( (const __m128i*)(src + j + 8) ); v0 = _mm_andnot_si128(_mm_cmpgt_epi16(v0, thresh8), v0); v1 = _mm_andnot_si128(_mm_cmpgt_epi16(v1, thresh8), v1); _mm_storeu_si128((__m128i*)(dst + j), v0 ); _mm_storeu_si128((__m128i*)(dst + j + 8), v1 ); } } #elif CV_NEON int16x8_t v_thresh = vdupq_n_s16(thresh); for( ; j <= roi.width - 8; j += 8 ) { int16x8_t v_src = vld1q_s16(src + j); uint16x8_t v_mask = vcleq_s16(v_src, v_thresh); vst1q_s16(dst + j, vandq_s16(vreinterpretq_s16_u16(v_mask), v_src)); } #endif for( ; j < roi.width; j++ ) { short v = src[j]; dst[j] = v <= thresh ? v : 0; } } break; default: return CV_Error( CV_StsBadArg, "" ); } }
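// Every NEON branch above uses the same branch-free idiom: the comparison produces
// all-ones lanes which are then ANDed with the value to keep. A standalone helper showing
// the THRESH_BINARY case (illustrative, not part of OpenCV; assumes <arm_neon.h>).
static inline int16x8_t thresh_binary_s16(int16x8_t src, int16x8_t thresh, int16x8_t maxval) {
  const uint16x8_t mask = vcgtq_s16(src, thresh);  // (src > thresh) ? 0xffff : 0
  return vandq_s16(vreinterpretq_s16_u16(mask), maxval);
}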
rfx_dwt_2d_decode_block_horiz_NEON(INT16 * l, INT16 * h, INT16 * dst, int subband_width) { int y, n; INT16 * l_ptr = l; INT16 * h_ptr = h; INT16 * dst_ptr = dst; for (y = 0; y < subband_width; y++) { /* Even coefficients */ for (n = 0; n < subband_width; n+=8) { // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); int16x8_t l_n = vld1q_s16(l_ptr); int16x8_t h_n = vld1q_s16(h_ptr); int16x8_t h_n_m = vld1q_s16(h_ptr - 1); if (n == 0) { int16_t first = vgetq_lane_s16(h_n_m, 1); h_n_m = vsetq_lane_s16(first, h_n_m, 0); } int16x8_t tmp_n = vaddq_s16(h_n, h_n_m); tmp_n = vaddq_s16(tmp_n, vdupq_n_s16(1)); tmp_n = vshrq_n_s16(tmp_n, 1); int16x8_t dst_n = vsubq_s16(l_n, tmp_n); vst1q_s16(l_ptr, dst_n); l_ptr+=8; h_ptr+=8; } l_ptr -= subband_width; h_ptr -= subband_width; /* Odd coefficients */ for (n = 0; n < subband_width; n+=8) { // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); int16x8_t h_n = vld1q_s16(h_ptr); h_n = vshlq_n_s16(h_n, 1); int16x8x2_t dst_n; dst_n.val[0] = vld1q_s16(l_ptr); int16x8_t dst_n_p = vld1q_s16(l_ptr + 1); if (n == subband_width - 8) { int16_t last = vgetq_lane_s16(dst_n_p, 6); dst_n_p = vsetq_lane_s16(last, dst_n_p, 7); } dst_n.val[1] = vaddq_s16(dst_n_p, dst_n.val[0]); dst_n.val[1] = vshrq_n_s16(dst_n.val[1], 1); dst_n.val[1] = vaddq_s16(dst_n.val[1], h_n); vst2q_s16(dst_ptr, dst_n); l_ptr+=8; h_ptr+=8; dst_ptr+=16; } } }
rfx_dwt_2d_decode_block_vert_NEON(INT16 * l, INT16 * h, INT16 * dst, int subband_width) { int x, n; INT16 * l_ptr = l; INT16 * h_ptr = h; INT16 * dst_ptr = dst; int total_width = subband_width + subband_width; /* Even coefficients */ for (n = 0; n < subband_width; n++) { for (x = 0; x < total_width; x+=8) { // dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); int16x8_t l_n = vld1q_s16(l_ptr); int16x8_t h_n = vld1q_s16(h_ptr); int16x8_t tmp_n = vaddq_s16(h_n, vdupq_n_s16(1));; if (n == 0) tmp_n = vaddq_s16(tmp_n, h_n); else { int16x8_t h_n_m = vld1q_s16((h_ptr - total_width)); tmp_n = vaddq_s16(tmp_n, h_n_m); } tmp_n = vshrq_n_s16(tmp_n, 1); int16x8_t dst_n = vsubq_s16(l_n, tmp_n); vst1q_s16(dst_ptr, dst_n); l_ptr+=8; h_ptr+=8; dst_ptr+=8; } dst_ptr+=total_width; } h_ptr = h; dst_ptr = dst + total_width; /* Odd coefficients */ for (n = 0; n < subband_width; n++) { for (x = 0; x < total_width; x+=8) { // dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); int16x8_t h_n = vld1q_s16(h_ptr); int16x8_t dst_n_m = vld1q_s16(dst_ptr - total_width); h_n = vshlq_n_s16(h_n, 1); int16x8_t tmp_n = dst_n_m; if (n == subband_width - 1) tmp_n = vaddq_s16(tmp_n, dst_n_m); else { int16x8_t dst_n_p = vld1q_s16((dst_ptr + total_width)); tmp_n = vaddq_s16(tmp_n, dst_n_p); } tmp_n = vshrq_n_s16(tmp_n, 1); int16x8_t dst_n = vaddq_s16(tmp_n, h_n); vst1q_s16(dst_ptr, dst_n); h_ptr+=8; dst_ptr+=8; } dst_ptr+=total_width; } }
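// Both DWT blocks implement the inverse lifting step quoted in their comments, with the
// boundary samples duplicated (h[-1] := h[0], dst[2w] := dst[2w - 2]). A scalar sketch of
// one row or column of width w (reference only):
static void idwt_lifting_ref(const INT16 *l, const INT16 *h, INT16 *dst, int w) {
  int n;
  for (n = 0; n < w; n++)    // even: dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1)
    dst[2 * n] = (INT16)(l[n] - ((h[n > 0 ? n - 1 : 0] + h[n] + 1) >> 1));
  for (n = 0; n < w; n++) {  // odd: dst[2n+1] = (h[n] << 1) + ((dst[2n] + dst[2n+2]) >> 1)
    const INT16 next = (n < w - 1) ? dst[2 * n + 2] : dst[2 * n];
    dst[2 * n + 1] = (INT16)((h[n] << 1) + ((dst[2 * n] + next) >> 1));
  }
}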
static INLINE void vp8_loop_filter_simple_horizontal_edge_neon( unsigned char *s, int p, const unsigned char *blimit) { uint8_t *sp; uint8x16_t qblimit, q0u8; uint8x16_t q5u8, q6u8, q7u8, q8u8, q9u8, q10u8, q14u8, q15u8; int16x8_t q2s16, q3s16, q13s16; int8x8_t d8s8, d9s8; int8x16_t q2s8, q3s8, q4s8, q10s8, q11s8, q14s8; qblimit = vdupq_n_u8(*blimit); sp = s - (p << 1); q5u8 = vld1q_u8(sp); sp += p; q6u8 = vld1q_u8(sp); sp += p; q7u8 = vld1q_u8(sp); sp += p; q8u8 = vld1q_u8(sp); q15u8 = vabdq_u8(q6u8, q7u8); q14u8 = vabdq_u8(q5u8, q8u8); q15u8 = vqaddq_u8(q15u8, q15u8); q14u8 = vshrq_n_u8(q14u8, 1); q0u8 = vdupq_n_u8(0x80); q13s16 = vdupq_n_s16(3); q15u8 = vqaddq_u8(q15u8, q14u8); q5u8 = veorq_u8(q5u8, q0u8); q6u8 = veorq_u8(q6u8, q0u8); q7u8 = veorq_u8(q7u8, q0u8); q8u8 = veorq_u8(q8u8, q0u8); q15u8 = vcgeq_u8(qblimit, q15u8); q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q7u8)), vget_low_s8(vreinterpretq_s8_u8(q6u8))); q3s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q7u8)), vget_high_s8(vreinterpretq_s8_u8(q6u8))); q4s8 = vqsubq_s8(vreinterpretq_s8_u8(q5u8), vreinterpretq_s8_u8(q8u8)); q2s16 = vmulq_s16(q2s16, q13s16); q3s16 = vmulq_s16(q3s16, q13s16); q10u8 = vdupq_n_u8(3); q9u8 = vdupq_n_u8(4); q2s16 = vaddw_s8(q2s16, vget_low_s8(q4s8)); q3s16 = vaddw_s8(q3s16, vget_high_s8(q4s8)); d8s8 = vqmovn_s16(q2s16); d9s8 = vqmovn_s16(q3s16); q4s8 = vcombine_s8(d8s8, d9s8); q14s8 = vandq_s8(q4s8, vreinterpretq_s8_u8(q15u8)); q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q10u8)); q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q9u8)); q2s8 = vshrq_n_s8(q2s8, 3); q3s8 = vshrq_n_s8(q3s8, 3); q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q6u8), q2s8); q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q7u8), q3s8); q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); vst1q_u8(s, q7u8); s -= p; vst1q_u8(s, q6u8); return; }
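// Per pixel, after the XOR with 0x80 moves the samples into the signed domain, the code
// above applies the VP8 "simple" filter. A scalar sketch; clamp8() is an illustrative
// helper standing in for the saturating int8 arithmetic.
static int clamp8(int v) { return v < -128 ? -128 : (v > 127 ? 127 : v); }
// p1, p0, q0, q1 are the four pixels across the edge, already biased by -128;
// use_filter is the mask 2*|p0-q0| + (|p1-q1| >> 1) <= blimit computed on the raw pixels.
static void simple_filter_ref(int use_filter, int p1, int *p0, int *q0, int q1) {
  if (!use_filter) return;
  const int f = clamp8(3 * (*q0 - *p0) + clamp8(p1 - q1));
  const int f1 = clamp8(f + 4) >> 3;  // Filter1
  const int f2 = clamp8(f + 3) >> 3;  // Filter2
  *p0 = clamp8(*p0 + f2);
  *q0 = clamp8(*q0 - f1);
}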
int16x8_t test_vdupq_n_s16(int16_t v1) { // CHECK: test_vdupq_n_s16 return vdupq_n_s16(v1); // CHECK: dup {{v[0-9]+}}.8h, {{w[0-9]+}} }
int16x4_t Q3_16x4 = vdup_n_s16(3); int16x8_t WIDTHQ8_16x8 = vdupq_n_s16(WIDTH_Q8); int16x8_t WIDTHFACTOR_16x8 = vdupq_n_s16(width_factor); int16_t factor = FACTOR_Q7; if (inst->blockIndex < END_STARTUP_LONG) factor = FACTOR_Q7_STARTUP; // Loop over simultaneous estimates for (s = 0; s < SIMULT; s++) { offset = s * inst->magnLen; // Get counter values from state counter = inst->noiseEstCounter[s]; assert(counter < 201); countDiv = WebRtcNsx_kCounterDiv[counter]; countProd = (int16_t)WEBRTC_SPL_MUL_16_16(counter, countDiv);
void vp9_quantize_fp_neon(const tran_low_t *coeff_ptr, intptr_t count, int skip_block, const int16_t *zbin_ptr, const int16_t *round_ptr, const int16_t *quant_ptr, const int16_t *quant_shift_ptr, tran_low_t *qcoeff_ptr, tran_low_t *dqcoeff_ptr, const int16_t *dequant_ptr, uint16_t *eob_ptr, const int16_t *scan, const int16_t *iscan) { // TODO(jingning) Decide the need of these arguments after the // quantization process is completed. (void)zbin_ptr; (void)quant_shift_ptr; (void)scan; if (!skip_block) { // Quantization pass: All coefficients with index >= zero_flag are // skippable. Note: zero_flag can be zero. int i; const int16x8_t v_zero = vdupq_n_s16(0); const int16x8_t v_one = vdupq_n_s16(1); int16x8_t v_eobmax_76543210 = vdupq_n_s16(-1); int16x8_t v_round = vmovq_n_s16(round_ptr[1]); int16x8_t v_quant = vmovq_n_s16(quant_ptr[1]); int16x8_t v_dequant = vmovq_n_s16(dequant_ptr[1]); // adjust for dc v_round = vsetq_lane_s16(round_ptr[0], v_round, 0); v_quant = vsetq_lane_s16(quant_ptr[0], v_quant, 0); v_dequant = vsetq_lane_s16(dequant_ptr[0], v_dequant, 0); // process dc and the first seven ac coeffs { const int16x8_t v_iscan = vld1q_s16(&iscan[0]); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); store_s16q_to_tran_low(qcoeff_ptr, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr, v_dqcoeff); v_round = vmovq_n_s16(round_ptr[1]); v_quant = vmovq_n_s16(quant_ptr[1]); v_dequant = vmovq_n_s16(dequant_ptr[1]); } // now process the rest of the ac coeffs for (i = 8; i < count; i += 8) { const int16x8_t v_iscan = vld1q_s16(&iscan[i]); const int16x8_t v_coeff = load_tran_low_to_s16q(coeff_ptr + i); const int16x8_t v_coeff_sign = vshrq_n_s16(v_coeff, 15); const int16x8_t v_tmp = vabaq_s16(v_round, v_coeff, v_zero); const int32x4_t v_tmp_lo = vmull_s16(vget_low_s16(v_tmp), vget_low_s16(v_quant)); const int32x4_t v_tmp_hi = vmull_s16(vget_high_s16(v_tmp), vget_high_s16(v_quant)); const int16x8_t v_tmp2 = vcombine_s16(vshrn_n_s32(v_tmp_lo, 16), vshrn_n_s32(v_tmp_hi, 16)); const uint16x8_t v_nz_mask = vceqq_s16(v_tmp2, v_zero); const int16x8_t v_iscan_plus1 = vaddq_s16(v_iscan, v_one); const int16x8_t v_nz_iscan = vbslq_s16(v_nz_mask, v_zero, v_iscan_plus1); const int16x8_t v_qcoeff_a = veorq_s16(v_tmp2, v_coeff_sign); const int16x8_t v_qcoeff = vsubq_s16(v_qcoeff_a, v_coeff_sign); const int16x8_t v_dqcoeff = vmulq_s16(v_qcoeff, v_dequant); v_eobmax_76543210 = vmaxq_s16(v_eobmax_76543210, v_nz_iscan); store_s16q_to_tran_low(qcoeff_ptr + i, v_qcoeff); store_s16q_to_tran_low(dqcoeff_ptr + i, v_dqcoeff); } { const int16x4_t v_eobmax_3210 = vmax_s16( vget_low_s16(v_eobmax_76543210), vget_high_s16(v_eobmax_76543210)); const int64x1_t v_eobmax_xx32 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_3210), 
32); const int16x4_t v_eobmax_tmp = vmax_s16(v_eobmax_3210, vreinterpret_s16_s64(v_eobmax_xx32)); const int64x1_t v_eobmax_xxx3 = vshr_n_s64(vreinterpret_s64_s16(v_eobmax_tmp), 16); const int16x4_t v_eobmax_final = vmax_s16(v_eobmax_tmp, vreinterpret_s16_s64(v_eobmax_xxx3)); *eob_ptr = (uint16_t)vget_lane_s16(v_eobmax_final, 0); } } else { memset(qcoeff_ptr, 0, count * sizeof(*qcoeff_ptr)); memset(dqcoeff_ptr, 0, count * sizeof(*dqcoeff_ptr)); *eob_ptr = 0; } }
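// Per coefficient, both loops above do the same thing: add the rounding term to the
// magnitude, multiply by the quantizer, keep the top 16 bits of the 32-bit product,
// restore the sign, then de-quantize. A scalar sketch of one coefficient (16-bit
// wraparound of the dequantized value is ignored here).
static int16_t quantize_fp_one(int16_t coeff, int16_t round, int16_t quant,
                               int16_t dequant, tran_low_t *dqcoeff) {
  const int abs_coeff = coeff < 0 ? -coeff : coeff;
  const int tmp = ((abs_coeff + round) * quant) >> 16;
  const int16_t q = (int16_t)(coeff < 0 ? -tmp : tmp);
  *dqcoeff = q * dequant;
  return q;
}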
static INLINE void IADST8X8_1D(int16x8_t *q8s16, int16x8_t *q9s16, int16x8_t *q10s16, int16x8_t *q11s16, int16x8_t *q12s16, int16x8_t *q13s16, int16x8_t *q14s16, int16x8_t *q15s16) { int16x4_t d0s16, d1s16, d2s16, d3s16, d4s16, d5s16, d6s16, d7s16; int16x4_t d8s16, d9s16, d10s16, d11s16, d12s16, d13s16, d14s16, d15s16; int16x4_t d16s16, d17s16, d18s16, d19s16, d20s16, d21s16, d22s16, d23s16; int16x4_t d24s16, d25s16, d26s16, d27s16, d28s16, d29s16, d30s16, d31s16; int16x8_t q2s16, q4s16, q5s16, q6s16; int32x4_t q0s32, q1s32, q2s32, q3s32, q4s32, q5s32, q6s32, q7s32, q8s32; int32x4_t q9s32, q10s32, q11s32, q12s32, q13s32, q14s32, q15s32; d16s16 = vget_low_s16(*q8s16); d17s16 = vget_high_s16(*q8s16); d18s16 = vget_low_s16(*q9s16); d19s16 = vget_high_s16(*q9s16); d20s16 = vget_low_s16(*q10s16); d21s16 = vget_high_s16(*q10s16); d22s16 = vget_low_s16(*q11s16); d23s16 = vget_high_s16(*q11s16); d24s16 = vget_low_s16(*q12s16); d25s16 = vget_high_s16(*q12s16); d26s16 = vget_low_s16(*q13s16); d27s16 = vget_high_s16(*q13s16); d28s16 = vget_low_s16(*q14s16); d29s16 = vget_high_s16(*q14s16); d30s16 = vget_low_s16(*q15s16); d31s16 = vget_high_s16(*q15s16); d14s16 = vdup_n_s16((int16_t)cospi_2_64); d15s16 = vdup_n_s16((int16_t)cospi_30_64); q1s32 = vmull_s16(d30s16, d14s16); q2s32 = vmull_s16(d31s16, d14s16); q3s32 = vmull_s16(d30s16, d15s16); q4s32 = vmull_s16(d31s16, d15s16); d30s16 = vdup_n_s16((int16_t)cospi_18_64); d31s16 = vdup_n_s16((int16_t)cospi_14_64); q1s32 = vmlal_s16(q1s32, d16s16, d15s16); q2s32 = vmlal_s16(q2s32, d17s16, d15s16); q3s32 = vmlsl_s16(q3s32, d16s16, d14s16); q4s32 = vmlsl_s16(q4s32, d17s16, d14s16); q5s32 = vmull_s16(d22s16, d30s16); q6s32 = vmull_s16(d23s16, d30s16); q7s32 = vmull_s16(d22s16, d31s16); q8s32 = vmull_s16(d23s16, d31s16); q5s32 = vmlal_s16(q5s32, d24s16, d31s16); q6s32 = vmlal_s16(q6s32, d25s16, d31s16); q7s32 = vmlsl_s16(q7s32, d24s16, d30s16); q8s32 = vmlsl_s16(q8s32, d25s16, d30s16); q11s32 = vaddq_s32(q1s32, q5s32); q12s32 = vaddq_s32(q2s32, q6s32); q1s32 = vsubq_s32(q1s32, q5s32); q2s32 = vsubq_s32(q2s32, q6s32); d22s16 = vqrshrn_n_s32(q11s32, 14); d23s16 = vqrshrn_n_s32(q12s32, 14); *q11s16 = vcombine_s16(d22s16, d23s16); q12s32 = vaddq_s32(q3s32, q7s32); q15s32 = vaddq_s32(q4s32, q8s32); q3s32 = vsubq_s32(q3s32, q7s32); q4s32 = vsubq_s32(q4s32, q8s32); d2s16 = vqrshrn_n_s32(q1s32, 14); d3s16 = vqrshrn_n_s32(q2s32, 14); d24s16 = vqrshrn_n_s32(q12s32, 14); d25s16 = vqrshrn_n_s32(q15s32, 14); d6s16 = vqrshrn_n_s32(q3s32, 14); d7s16 = vqrshrn_n_s32(q4s32, 14); *q12s16 = vcombine_s16(d24s16, d25s16); d0s16 = vdup_n_s16((int16_t)cospi_10_64); d1s16 = vdup_n_s16((int16_t)cospi_22_64); q4s32 = vmull_s16(d26s16, d0s16); q5s32 = vmull_s16(d27s16, d0s16); q2s32 = vmull_s16(d26s16, d1s16); q6s32 = vmull_s16(d27s16, d1s16); d30s16 = vdup_n_s16((int16_t)cospi_26_64); d31s16 = vdup_n_s16((int16_t)cospi_6_64); q4s32 = vmlal_s16(q4s32, d20s16, d1s16); q5s32 = vmlal_s16(q5s32, d21s16, d1s16); q2s32 = vmlsl_s16(q2s32, d20s16, d0s16); q6s32 = vmlsl_s16(q6s32, d21s16, d0s16); q0s32 = vmull_s16(d18s16, d30s16); q13s32 = vmull_s16(d19s16, d30s16); q0s32 = vmlal_s16(q0s32, d28s16, d31s16); q13s32 = vmlal_s16(q13s32, d29s16, d31s16); q10s32 = vmull_s16(d18s16, d31s16); q9s32 = vmull_s16(d19s16, d31s16); q10s32 = vmlsl_s16(q10s32, d28s16, d30s16); q9s32 = vmlsl_s16(q9s32, d29s16, d30s16); q14s32 = vaddq_s32(q2s32, q10s32); q15s32 = vaddq_s32(q6s32, q9s32); q2s32 = vsubq_s32(q2s32, q10s32); q6s32 = vsubq_s32(q6s32, q9s32); d28s16 = vqrshrn_n_s32(q14s32, 14); d29s16 = 
vqrshrn_n_s32(q15s32, 14); d4s16 = vqrshrn_n_s32(q2s32, 14); d5s16 = vqrshrn_n_s32(q6s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); q9s32 = vaddq_s32(q4s32, q0s32); q10s32 = vaddq_s32(q5s32, q13s32); q4s32 = vsubq_s32(q4s32, q0s32); q5s32 = vsubq_s32(q5s32, q13s32); d30s16 = vdup_n_s16((int16_t)cospi_8_64); d31s16 = vdup_n_s16((int16_t)cospi_24_64); d18s16 = vqrshrn_n_s32(q9s32, 14); d19s16 = vqrshrn_n_s32(q10s32, 14); d8s16 = vqrshrn_n_s32(q4s32, 14); d9s16 = vqrshrn_n_s32(q5s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q5s32 = vmull_s16(d2s16, d30s16); q6s32 = vmull_s16(d3s16, d30s16); q7s32 = vmull_s16(d2s16, d31s16); q0s32 = vmull_s16(d3s16, d31s16); q5s32 = vmlal_s16(q5s32, d6s16, d31s16); q6s32 = vmlal_s16(q6s32, d7s16, d31s16); q7s32 = vmlsl_s16(q7s32, d6s16, d30s16); q0s32 = vmlsl_s16(q0s32, d7s16, d30s16); q1s32 = vmull_s16(d4s16, d30s16); q3s32 = vmull_s16(d5s16, d30s16); q10s32 = vmull_s16(d4s16, d31s16); q2s32 = vmull_s16(d5s16, d31s16); q1s32 = vmlsl_s16(q1s32, d8s16, d31s16); q3s32 = vmlsl_s16(q3s32, d9s16, d31s16); q10s32 = vmlal_s16(q10s32, d8s16, d30s16); q2s32 = vmlal_s16(q2s32, d9s16, d30s16); *q8s16 = vaddq_s16(*q11s16, *q9s16); *q11s16 = vsubq_s16(*q11s16, *q9s16); q4s16 = vaddq_s16(*q12s16, *q14s16); *q12s16 = vsubq_s16(*q12s16, *q14s16); q14s32 = vaddq_s32(q5s32, q1s32); q15s32 = vaddq_s32(q6s32, q3s32); q5s32 = vsubq_s32(q5s32, q1s32); q6s32 = vsubq_s32(q6s32, q3s32); d18s16 = vqrshrn_n_s32(q14s32, 14); d19s16 = vqrshrn_n_s32(q15s32, 14); d10s16 = vqrshrn_n_s32(q5s32, 14); d11s16 = vqrshrn_n_s32(q6s32, 14); *q9s16 = vcombine_s16(d18s16, d19s16); q1s32 = vaddq_s32(q7s32, q10s32); q3s32 = vaddq_s32(q0s32, q2s32); q7s32 = vsubq_s32(q7s32, q10s32); q0s32 = vsubq_s32(q0s32, q2s32); d28s16 = vqrshrn_n_s32(q1s32, 14); d29s16 = vqrshrn_n_s32(q3s32, 14); d14s16 = vqrshrn_n_s32(q7s32, 14); d15s16 = vqrshrn_n_s32(q0s32, 14); *q14s16 = vcombine_s16(d28s16, d29s16); d30s16 = vdup_n_s16((int16_t)cospi_16_64); d22s16 = vget_low_s16(*q11s16); d23s16 = vget_high_s16(*q11s16); q2s32 = vmull_s16(d22s16, d30s16); q3s32 = vmull_s16(d23s16, d30s16); q13s32 = vmull_s16(d22s16, d30s16); q1s32 = vmull_s16(d23s16, d30s16); d24s16 = vget_low_s16(*q12s16); d25s16 = vget_high_s16(*q12s16); q2s32 = vmlal_s16(q2s32, d24s16, d30s16); q3s32 = vmlal_s16(q3s32, d25s16, d30s16); q13s32 = vmlsl_s16(q13s32, d24s16, d30s16); q1s32 = vmlsl_s16(q1s32, d25s16, d30s16); d4s16 = vqrshrn_n_s32(q2s32, 14); d5s16 = vqrshrn_n_s32(q3s32, 14); d24s16 = vqrshrn_n_s32(q13s32, 14); d25s16 = vqrshrn_n_s32(q1s32, 14); q2s16 = vcombine_s16(d4s16, d5s16); *q12s16 = vcombine_s16(d24s16, d25s16); q13s32 = vmull_s16(d10s16, d30s16); q1s32 = vmull_s16(d11s16, d30s16); q11s32 = vmull_s16(d10s16, d30s16); q0s32 = vmull_s16(d11s16, d30s16); q13s32 = vmlal_s16(q13s32, d14s16, d30s16); q1s32 = vmlal_s16(q1s32, d15s16, d30s16); q11s32 = vmlsl_s16(q11s32, d14s16, d30s16); q0s32 = vmlsl_s16(q0s32, d15s16, d30s16); d20s16 = vqrshrn_n_s32(q13s32, 14); d21s16 = vqrshrn_n_s32(q1s32, 14); d12s16 = vqrshrn_n_s32(q11s32, 14); d13s16 = vqrshrn_n_s32(q0s32, 14); *q10s16 = vcombine_s16(d20s16, d21s16); q6s16 = vcombine_s16(d12s16, d13s16); q5s16 = vdupq_n_s16(0); *q9s16 = vsubq_s16(q5s16, *q9s16); *q11s16 = vsubq_s16(q5s16, q2s16); *q13s16 = vsubq_s16(q5s16, q6s16); *q15s16 = vsubq_s16(q5s16, q4s16); return; }
void vpx_idct8x8_12_add_neon( int16_t *input, uint8_t *dest, int dest_stride) { uint8_t *d1, *d2; uint8x8_t d0u8, d1u8, d2u8, d3u8; int16x4_t d10s16, d11s16, d12s16, d13s16, d16s16; int16x4_t d26s16, d27s16, d28s16, d29s16; uint64x1_t d0u64, d1u64, d2u64, d3u64; int16x8_t q0s16, q1s16, q2s16, q3s16, q4s16, q5s16, q6s16, q7s16; int16x8_t q8s16, q9s16, q10s16, q11s16, q12s16, q13s16, q14s16, q15s16; uint16x8_t q8u16, q9u16, q10u16, q11u16; int32x4_t q9s32, q10s32, q11s32, q12s32; q8s16 = vld1q_s16(input); q9s16 = vld1q_s16(input + 8); q10s16 = vld1q_s16(input + 16); q11s16 = vld1q_s16(input + 24); q12s16 = vld1q_s16(input + 32); q13s16 = vld1q_s16(input + 40); q14s16 = vld1q_s16(input + 48); q15s16 = vld1q_s16(input + 56); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); // First transform rows // stage 1 q0s16 = vdupq_n_s16(cospi_28_64 * 2); q1s16 = vdupq_n_s16(cospi_4_64 * 2); q4s16 = vqrdmulhq_s16(q9s16, q0s16); q0s16 = vdupq_n_s16(-cospi_20_64 * 2); q7s16 = vqrdmulhq_s16(q9s16, q1s16); q1s16 = vdupq_n_s16(cospi_12_64 * 2); q5s16 = vqrdmulhq_s16(q11s16, q0s16); q0s16 = vdupq_n_s16(cospi_16_64 * 2); q6s16 = vqrdmulhq_s16(q11s16, q1s16); // stage 2 & stage 3 - even half q1s16 = vdupq_n_s16(cospi_24_64 * 2); q9s16 = vqrdmulhq_s16(q8s16, q0s16); q0s16 = vdupq_n_s16(cospi_8_64 * 2); q13s16 = vqrdmulhq_s16(q10s16, q1s16); q15s16 = vqrdmulhq_s16(q10s16, q0s16); // stage 3 -odd half q0s16 = vaddq_s16(q9s16, q15s16); q1s16 = vaddq_s16(q9s16, q13s16); q2s16 = vsubq_s16(q9s16, q13s16); q3s16 = vsubq_s16(q9s16, q15s16); // stage 2 - odd half q13s16 = vsubq_s16(q4s16, q5s16); q4s16 = vaddq_s16(q4s16, q5s16); q14s16 = vsubq_s16(q7s16, q6s16); q7s16 = vaddq_s16(q7s16, q6s16); d26s16 = vget_low_s16(q13s16); d27s16 = vget_high_s16(q13s16); d28s16 = vget_low_s16(q14s16); d29s16 = vget_high_s16(q14s16); d16s16 = vdup_n_s16(cospi_16_64); q9s32 = vmull_s16(d28s16, d16s16); q10s32 = vmull_s16(d29s16, d16s16); q11s32 = vmull_s16(d28s16, d16s16); q12s32 = vmull_s16(d29s16, d16s16); q9s32 = vmlsl_s16(q9s32, d26s16, d16s16); q10s32 = vmlsl_s16(q10s32, d27s16, d16s16); q11s32 = vmlal_s16(q11s32, d26s16, d16s16); q12s32 = vmlal_s16(q12s32, d27s16, d16s16); d10s16 = vqrshrn_n_s32(q9s32, 14); d11s16 = vqrshrn_n_s32(q10s32, 14); d12s16 = vqrshrn_n_s32(q11s32, 14); d13s16 = vqrshrn_n_s32(q12s32, 14); q5s16 = vcombine_s16(d10s16, d11s16); q6s16 = vcombine_s16(d12s16, d13s16); // stage 4 q8s16 = vaddq_s16(q0s16, q7s16); q9s16 = vaddq_s16(q1s16, q6s16); q10s16 = vaddq_s16(q2s16, q5s16); q11s16 = vaddq_s16(q3s16, q4s16); q12s16 = vsubq_s16(q3s16, q4s16); q13s16 = vsubq_s16(q2s16, q5s16); q14s16 = vsubq_s16(q1s16, q6s16); q15s16 = vsubq_s16(q0s16, q7s16); TRANSPOSE8X8(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); IDCT8x8_1D(&q8s16, &q9s16, &q10s16, &q11s16, &q12s16, &q13s16, &q14s16, &q15s16); q8s16 = vrshrq_n_s16(q8s16, 5); q9s16 = vrshrq_n_s16(q9s16, 5); q10s16 = vrshrq_n_s16(q10s16, 5); q11s16 = vrshrq_n_s16(q11s16, 5); q12s16 = vrshrq_n_s16(q12s16, 5); q13s16 = vrshrq_n_s16(q13s16, 5); q14s16 = vrshrq_n_s16(q14s16, 5); q15s16 = vrshrq_n_s16(q15s16, 5); d1 = d2 = dest; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = 
vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; q8s16 = q12s16; q9s16 = q13s16; q10s16 = q14s16; q11s16 = q15s16; d0u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d1u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d2u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; d3u64 = vld1_u64((uint64_t *)d1); d1 += dest_stride; q8u16 = vaddw_u8(vreinterpretq_u16_s16(q8s16), vreinterpret_u8_u64(d0u64)); q9u16 = vaddw_u8(vreinterpretq_u16_s16(q9s16), vreinterpret_u8_u64(d1u64)); q10u16 = vaddw_u8(vreinterpretq_u16_s16(q10s16), vreinterpret_u8_u64(d2u64)); q11u16 = vaddw_u8(vreinterpretq_u16_s16(q11s16), vreinterpret_u8_u64(d3u64)); d0u8 = vqmovun_s16(vreinterpretq_s16_u16(q8u16)); d1u8 = vqmovun_s16(vreinterpretq_s16_u16(q9u16)); d2u8 = vqmovun_s16(vreinterpretq_s16_u16(q10u16)); d3u8 = vqmovun_s16(vreinterpretq_s16_u16(q11u16)); vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d0u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d1u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d2u8)); d2 += dest_stride; vst1_u64((uint64_t *)d2, vreinterpret_u64_u8(d3u8)); d2 += dest_stride; return; }
static INLINE void vp8_loop_filter_simple_vertical_edge_neon( unsigned char *s, int p, const unsigned char *blimit) { unsigned char *src1; uint8x16_t qblimit, q0u8; uint8x16_t q3u8, q4u8, q5u8, q6u8, q7u8, q11u8, q12u8, q14u8, q15u8; int16x8_t q2s16, q13s16, q11s16; int8x8_t d28s8, d29s8; int8x16_t q2s8, q3s8, q10s8, q11s8, q14s8; uint8x8x4_t d0u8x4; // d6, d7, d8, d9 uint8x8x4_t d1u8x4; // d10, d11, d12, d13 uint8x8x2_t d2u8x2; // d12, d13 uint8x8x2_t d3u8x2; // d14, d15 qblimit = vdupq_n_u8(*blimit); src1 = s - 2; d0u8x4 = read_4x8(src1, p); src1 += p * 8; d1u8x4 = read_4x8(src1, p); q3u8 = vcombine_u8(d0u8x4.val[0], d1u8x4.val[0]); // d6 d10 q4u8 = vcombine_u8(d0u8x4.val[2], d1u8x4.val[2]); // d8 d12 q5u8 = vcombine_u8(d0u8x4.val[1], d1u8x4.val[1]); // d7 d11 q6u8 = vcombine_u8(d0u8x4.val[3], d1u8x4.val[3]); // d9 d13 q15u8 = vabdq_u8(q5u8, q4u8); q14u8 = vabdq_u8(q3u8, q6u8); q15u8 = vqaddq_u8(q15u8, q15u8); q14u8 = vshrq_n_u8(q14u8, 1); q0u8 = vdupq_n_u8(0x80); q11s16 = vdupq_n_s16(3); q15u8 = vqaddq_u8(q15u8, q14u8); q3u8 = veorq_u8(q3u8, q0u8); q4u8 = veorq_u8(q4u8, q0u8); q5u8 = veorq_u8(q5u8, q0u8); q6u8 = veorq_u8(q6u8, q0u8); q15u8 = vcgeq_u8(qblimit, q15u8); q2s16 = vsubl_s8(vget_low_s8(vreinterpretq_s8_u8(q4u8)), vget_low_s8(vreinterpretq_s8_u8(q5u8))); q13s16 = vsubl_s8(vget_high_s8(vreinterpretq_s8_u8(q4u8)), vget_high_s8(vreinterpretq_s8_u8(q5u8))); q14s8 = vqsubq_s8(vreinterpretq_s8_u8(q3u8), vreinterpretq_s8_u8(q6u8)); q2s16 = vmulq_s16(q2s16, q11s16); q13s16 = vmulq_s16(q13s16, q11s16); q11u8 = vdupq_n_u8(3); q12u8 = vdupq_n_u8(4); q2s16 = vaddw_s8(q2s16, vget_low_s8(q14s8)); q13s16 = vaddw_s8(q13s16, vget_high_s8(q14s8)); d28s8 = vqmovn_s16(q2s16); d29s8 = vqmovn_s16(q13s16); q14s8 = vcombine_s8(d28s8, d29s8); q14s8 = vandq_s8(q14s8, vreinterpretq_s8_u8(q15u8)); q2s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q11u8)); q3s8 = vqaddq_s8(q14s8, vreinterpretq_s8_u8(q12u8)); q2s8 = vshrq_n_s8(q2s8, 3); q14s8 = vshrq_n_s8(q3s8, 3); q11s8 = vqaddq_s8(vreinterpretq_s8_u8(q5u8), q2s8); q10s8 = vqsubq_s8(vreinterpretq_s8_u8(q4u8), q14s8); q6u8 = veorq_u8(vreinterpretq_u8_s8(q11s8), q0u8); q7u8 = veorq_u8(vreinterpretq_u8_s8(q10s8), q0u8); d2u8x2.val[0] = vget_low_u8(q6u8); // d12 d2u8x2.val[1] = vget_low_u8(q7u8); // d14 d3u8x2.val[0] = vget_high_u8(q6u8); // d13 d3u8x2.val[1] = vget_high_u8(q7u8); // d15 src1 = s - 1; write_2x8(src1, p, d2u8x2, d3u8x2); }