void checkasm_check_hevc_mc(void)
{
    DECLARE_ALIGNED(16, uint8_t, buf8_0)[BUF_SIZE];
    DECLARE_ALIGNED(16, uint8_t, buf8_1)[BUF_SIZE];
    DECLARE_ALIGNED(16, int16_t, buf16_0)[BUF_SIZE];
    DECLARE_ALIGNED(16, int16_t, buf16_1)[BUF_SIZE];
    DECLARE_ALIGNED(16, int16_t, mcbuffer)[BUF_SIZE];
    HEVCDSPContext h;
    int bit_depth;

    for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
        ff_hevc_dsp_init(&h, bit_depth);
        check_qpel(&h, buf16_0, buf16_1, buf8_0, mcbuffer, bit_depth);
        report("qpel");
        check_epel(&h, buf16_0, buf16_1, buf8_0, mcbuffer, bit_depth);
        report("epel");
        check_unweighted_pred(&h, buf8_0, buf8_1, buf16_0, buf16_1, bit_depth);
        report("unweighted_pred");
        check_weighted_pred(&h, buf8_0, buf8_1, buf16_0, buf16_1, bit_depth);
        report("weighted_pred");
    }
}
unsigned int vp9_satd16x16_c(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride,
                             unsigned int *psatd) {
  int r, c, i;
  unsigned int satd = 0;
  DECLARE_ALIGNED(16, int16_t, diff_in[256]);
  DECLARE_ALIGNED(16, int16_t, diff_out[16]);
  int16_t *in;

  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  in = diff_in;
  for (r = 0; r < 16; r += 4) {
    for (c = 0; c < 16; c += 4) {
      vp9_short_walsh4x4_c(in + c, diff_out, 32);
      for (i = 0; i < 16; i++) satd += abs(diff_out[i]);
    }
    in += 64;
  }

  if (psatd) *psatd = satd;
  return satd;
}
unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *dst, int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
                            bilinear_filters[xoffset]);
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
                            bilinear_filters[yoffset]);
  return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
                             bilinear_filters[yoffset]);
  return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
                             bilinear_filters[yoffset]);
  return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
                             bilinear_filters[yoffset]);
  return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
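/* Editor's sketch, not a library function: the four sub-pixel variance
 * wrappers above all run the same two-pass bilinear prefilter (horizontal
 * with pixel_step == 1, then vertical with pixel_step == row width) before
 * handing the result to a whole-pixel variance kernel. The scalar model below
 * assumes var_filter_block2d_bil_* follows the usual libvpx/libaom bilinear
 * semantics: filter pairs from bilinear_filters[] sum to 128 and each pass
 * rounds back to 8 bits at 7-bit precision. bil_filter_ref is hypothetical. */
static void bil_filter_ref(const uint8_t *src, uint8_t *out, int src_stride,
                           int pixel_step, int out_h, int out_w,
                           const uint8_t *filter) {
  for (int i = 0; i < out_h; ++i) {
    for (int j = 0; j < out_w; ++j) {
      /* filter[0] + filter[1] == 128, so +64 >> 7 is round-to-nearest */
      out[j] = (uint8_t)((src[j] * filter[0] +
                          src[j + pixel_step] * filter[1] + 64) >> 7);
    }
    src += src_stride;
    out += out_w;
  }
}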
void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  /* Given our constraints: w <= 64, h <= 64, taps == 8 we can reduce the
   * maximum buffer size to 64 * 64 + 7 (+ 1 to make it divisible by 4).
   */
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);

  // Account for the vertical phase needing 3 lines prior and 4 lines post
  const int intermediate_height = h + 7;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to the
   * temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal.
   */
  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
                          x_step_q4, filter_y, y_step_q4, w, h);
}
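/* Editor's sketch of where 64 * 72 comes from, restating the comment above
 * (w, h <= 64 and 8-tap filters assumed); temp_rows is hypothetical. */
static int temp_rows(int h) {
  int rows = h + 7;       /* 3 rows above and 4 rows below for the 8 taps */
  return (rows + 3) & ~3; /* NEON writes in groups of 4 rows: 71 -> 72 */
}
/* temp_rows(64) == 72, hence temp[64 * 72]. */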
void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  /* Filter starting 3 lines back. The neon implementation will ignore the
   * given height and filter a multiple of 4 lines. Since this goes in to the
   * temp buffer which has lots of extra room and is subsequently discarded
   * this is safe if somewhat less than ideal.
   */
  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,
                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
                                  intermediate_height, bd);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
                                 dst_stride, filter_x, x_step_q4, filter_y,
                                 y_step_q4, w, h, bd);
}
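/* Editor's sketch of the worst case behind temp[64 * 136], assuming
 * SUBPEL_BITS == 4 and SUBPEL_TAPS == 8 as in libvpx, h == 64, the strongest
 * normative scaling step y_step_q4 == 32, and a maximal phase y0_q4 == 15
 * (the phase bound is an assumption). max_intermediate_height is
 * hypothetical. */
static int max_intermediate_height(void) {
  const int h = 64, y_step_q4 = 32, y0_q4 = 15;
  /* 134 rows in the worst case, comfortably inside the 136-row allocation */
  return (((h - 1) * y_step_q4 + y0_q4) >> 4) + 8;
}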
void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
  int intermediate_height = h + 7;

  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vp9_convolve8_avg_c(src, src_stride, dst, dst_stride, filter_x, x_step_q4,
                        filter_y, y_step_q4, w, h);
    return;
  }

  /* This implementation has the same issues as above. In addition, we only
   * want to average the values after both passes.
   */
  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);
  vp9_convolve8_avg_vert_neon(temp + 64 * 3, 64, dst, dst_stride, filter_x,
                              x_step_q4, filter_y, y_step_q4, w, h);
}
static INLINE void scaledconvolve_horiz_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = (h + 7) & ~7;

  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      uint8x8_t d[8];
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          uint8x8_t s[8];
          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
                      &s[5], &s[6], &s[7]);
          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                           &s[7]);
          d[0] = scale_filter_8(s, filters);
          vst1_u8(&temp[8 * z], d[0]);
        } else {
          // Zero phase: copy the center sample (src was moved back 3 columns)
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                  &d[7]);
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
      x += 8;
    } while (x < w);

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
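/* Editor's sketch: one output sample of the filtered branch above reduces to
 * an 8-tap dot product with libvpx-style rounding (FILTER_BITS == 7 assumed)
 * and clipping to 8 bits, which is exactly what vqrshrun_n_s16(..., 7) fuses
 * in the NEON path. convolve8_sample is illustrative, not a library call. */
static uint8_t convolve8_sample(const uint8_t *src, const int16_t *filter) {
  int k, sum = 0;
  for (k = 0; k < 8; ++k) sum += src[k] * filter[k];
  sum = (sum + 64) >> 7; /* ROUND_POWER_OF_TWO(sum, FILTER_BITS) */
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum)); /* clip_pixel */
}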
void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
                          int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);

  /* column transform */
  for (i = 0; i < 4; ++i) {
    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
                       &tmp_buf_big[0] + (8 * i));
  }

  /* row transform */
  for (i = 0; i < 4; ++i) {
    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
                       out + (8 * i * 32));
  }
}
static void float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                              long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;

    if (channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else if (channels == 2) {
        if (((long)dst) & 15) {
            for (i = 0; i < len - 7; i += 8) {
                d0 = vec_ld(0, dst + i);
                t0 = float_to_int16_one_altivec(src[0] + i);
                d1 = vec_ld(31, dst + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                c0 = vec_mergeh(t0, t1);
                c1 = vec_mergel(t0, t1);
                d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                align = vec_lvsr(0, dst + i);
                d0 = vec_perm(d2, c0, align);
                d1 = vec_perm(c0, c1, align);
                vec_st(d0,  0, dst + i);
                d0 = vec_perm(c1, d2, align);
                vec_st(d1, 15, dst + i);
                vec_st(d0, 31, dst + i);
                dst += 8;
            }
        } else {
            for (i = 0; i < len - 7; i += 8) {
                t0 = float_to_int16_one_altivec(src[0] + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                d0 = vec_mergeh(t0, t1);
                d1 = vec_mergel(t0, t1);
                vec_st(d0,  0, dst + i);
                vec_st(d1, 16, dst + i);
                dst += 8;
            }
        }
    } else {
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++) {
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j += channels)
                dst[j] = tmp[i];
        }
    }
}
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;

  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }
    t_coeff += 8;
  }
}
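/* Editor's sketch of the combining stage above in scalar form: the 16x16
 * Hadamard is built from four 8x8 sub-transforms laid out 64 coefficients
 * apart, joined by a butterfly and a >>1 rescale (the 32x32 kernels further
 * below use the same pattern with >>2). hadamard_combine_16x16 is
 * illustrative; it uses int intermediates where the SIMD code relies on
 * 16-bit headroom. */
static void hadamard_combine_16x16(const int16_t *sub, int16_t *out) {
  int i;
  for (i = 0; i < 64; ++i) {
    /* butterfly across the four 8x8 blocks, then halve to keep 16-bit range */
    const int b0 = (sub[i] + sub[i + 64]) >> 1;
    const int b1 = (sub[i] - sub[i + 64]) >> 1;
    const int b2 = (sub[i + 128] + sub[i + 192]) >> 1;
    const int b3 = (sub[i + 128] - sub[i + 192]) >> 1;
    out[i] = (int16_t)(b0 + b2);
    out[i + 64] = (int16_t)(b1 + b3);
    out[i + 128] = (int16_t)(b0 - b2);
    out[i + 192] = (int16_t)(b1 - b3);
  }
}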
void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);

  /* column transform */
  for (i = 0; i < 4; ++i) {
    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
                       tmp_buf_big + (8 * i));
  }

  /* row transform: first 8-row strip */
  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);

  /* row transform: remaining strips */
  for (i = 1; i < 4; ++i) {
    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
  }
}
void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc,
                         int *p_halfpel_thresh )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    uint8_t *p_fref = m->p_fref[0];
    DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );

    int i, j;
    int dir;
    int costs[6];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];

    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    if( h->mb.i_me_method == X264_ME_UMH )
    {
        /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */
        p_cost_mvx = m->p_cost_mv - x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
        p_cost_mvy = m->p_cost_mv - x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
    }

    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;

    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        COST_MV_PRED( bmx, bmy );
        for( i = 0; i < i_mvc; i++ )
        {
            const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
            const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
            if( mx != bpred_mx || my != bpred_my )
                COST_MV_PRED( mx, my );
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc,
                         int *p_halfpel_thresh )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    uint8_t *p_fref = m->p_fref[0];
    DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );

    int i, j;
    int dir;
    int costs[6];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];

#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )

    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;

    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        COST_MV_HPEL( bmx, bmy );
        for( i = 0; i < i_mvc; i++ )
        {
            int mx = mvc[i][0];
            int my = mvc[i][1];
            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
            {
                mx = x264_clip3( mx, mv_x_min*4, mv_x_max*4 );
                my = x264_clip3( my, mv_y_min*4, mv_y_max*4 );
                COST_MV_HPEL( mx, my );
            }
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4, int w,
                         int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_c(src, src_stride, temp, 64, filter_x, x_step_q4, filter_y,
                  y_step_q4, w, h);
  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
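/* Editor's sketch: this wrapper works because the averaging pass is purely
 * pointwise, so it can be deferred until after the full convolution. The
 * scalar model below reflects my understanding of what vpx_convolve_avg_c
 * does with the temp buffer (rounded average into dst); convolve_avg_ref is
 * illustrative, not the library routine. */
static void convolve_avg_ref(const uint8_t *src, int src_stride, uint8_t *dst,
                             int dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1); /* rounded average */
    src += src_stride;
    dst += dst_stride;
  }
}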
void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When calling in frame scaling function, the smallest scaling factor is
  // x1/4 ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer
  // is still big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}
void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4, int w,
                                int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
                            NULL, 0, NULL, 0, w, h, bd);
}
void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);

  /* column transform */
  for (i = 0; i < 2; ++i) {
    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
  }

  /* row transform */
  for (i = 0; i < 2; ++i) {
    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
  }
}
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;

  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 =
        _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 =
        _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);

    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128),
                          _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192),
                          _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;

  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage. Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;

  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 =
        _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 =
        _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 =
        _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 2);
    b1 = _mm256_srai_epi16(b1, 2);
    b2 = _mm256_srai_epi16(b2, 2);
    b3 = _mm256_srai_epi16(b3, 2);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}
static void dequantize(DCTELEM *block, MscCodecContext *mscContext,
                       int intraMatrix) {
  DECLARE_ALIGNED(16, DCTELEM, tmp)[64];
  DCTELEM firstElemValue;
  uint16_t *qmatrix = intraMatrix ? mscContext->intra_matrix
                                  : mscContext->non_intra_matrix;

  firstElemValue = 8 * block[0];
  block[0] = 0;

  for (int i = 0; i < 64; ++i) {
    const int index = scantab[i];
    tmp[mscContext->scantable.permutated[i]] =
        (block[index] * qmatrix[i]) >> 4;
  }
  tmp[0] = firstElemValue;

  for (int i = 0; i < 64; ++i) {
    block[i] = tmp[i];
  }
}
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h, int bd) {
  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  /* This implementation has the same issues as above. In addition, we only
   * want to average the values after both passes.
   */
  vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
                                  intermediate_height, bd);
  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride,
                                     filter_x, x_step_q4, filter_y, y_step_q4,
                                     w, h, bd);
}
static int do_layer2(struct frame *fr, int outmode)
{
    int clip = 0;
    int i, j;
    int stereo = fr->stereo;
    /* pick_table clears unused subbands */
    DECLARE_ALIGNED(16, real, fraction[2][4][SBLIMIT]);
    unsigned int bit_alloc[64];
    int scale[192];
    int single = fr->single;

    II_select_table(fr);
    fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO)
                      ? (fr->mode_ext << 2) + 4 : fr->II_sblimit;

    if (stereo == 1 || single == 3)
        single = 0;

    II_step_one(bit_alloc, scale, fr);

    for (i = 0; i < SCALE_BLOCK; i++) {
        II_step_two(bit_alloc, fraction, scale, fr, i >> 2);
        for (j = 0; j < 3; j++) {
            if (single >= 0) {
                clip += (fr->synth_mono)(fraction[single][j], pcm_sample,
                                         &pcm_point);
            } else {
                int p1 = pcm_point;
                clip += (fr->synth)(fraction[0][j], 0, pcm_sample, &p1);
                clip += (fr->synth)(fraction[1][j], 1, pcm_sample, &pcm_point);
            }
            /* if (pcm_point >= audiobufsize) audio_flush(outmode, ai); */
        }
    }

    return clip;
}
static int do_layer1(struct frame *fr, int single)
{
    int clip = 0;
    int i, stereo = fr->stereo;
    unsigned int balloc[2 * SBLIMIT];
    unsigned int scale_index[2][SBLIMIT];
    DECLARE_ALIGNED(16, real, fraction[2][SBLIMIT]);

    fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO)
                      ? (fr->mode_ext << 2) + 4 : 32;

    if (stereo == 1 || single == 3)
        single = 0;

    I_step_one(balloc, scale_index, fr);

    for (i = 0; i < SCALE_BLOCK; i++) {
        I_step_two(fraction, balloc, scale_index, fr);
        if (single >= 0) {
            clip += (fr->synth_mono)((real *)fraction[single], pcm_sample,
                                     &pcm_point);
        } else {
            int p1 = pcm_point;
            clip += (fr->synth)((real *)fraction[0], 0, pcm_sample, &p1);
            clip += (fr->synth)((real *)fraction[1], 1, pcm_sample,
                                &pcm_point);
        }
    }

    return clip;
}
void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
                      int32_t tx_type) {
  DECLARE_ALIGNED(32, int16_t, tmp[256]);
  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
  int32_t i;
  int16_t *ptmpbuf = &tmp_buf[0];
  int16_t *trans = &trans_buf[0];
  const int32_t const_arr[29 * 4] = {
    52707308,    52707308,    52707308,    52707308,
    -1072430300, -1072430300, -1072430300, -1072430300,
    795618043,   795618043,   795618043,   795618043,
    -721080468,  -721080468,  -721080468,  -721080468,
    459094491,   459094491,   459094491,   459094491,
    -970646691,  -970646691,  -970646691,  -970646691,
    1010963856,  1010963856,  1010963856,  1010963856,
    -361743294,  -361743294,  -361743294,  -361743294,
    209469125,   209469125,   209469125,   209469125,
    -1053094788, -1053094788, -1053094788, -1053094788,
    1053160324,  1053160324,  1053160324,  1053160324,
    639644520,   639644520,   639644520,   639644520,
    -862444000,  -862444000,  -862444000,  -862444000,
    1062144356,  1062144356,  1062144356,  1062144356,
    -157532337,  -157532337,  -157532337,  -157532337,
    260914709,   260914709,   260914709,   260914709,
    -1041559667, -1041559667, -1041559667, -1041559667,
    920985831,   920985831,   920985831,   920985831,
    -551995675,  -551995675,  -551995675,  -551995675,
    596522295,   596522295,   596522295,   596522295,
    892853362,   892853362,   892853362,   892853362,
    -892787826,  -892787826,  -892787826,  -892787826,
    410925857,   410925857,   410925857,   410925857,
    -992012162,  -992012162,  -992012162,  -992012162,
    992077698,   992077698,   992077698,   992077698,
    759246145,   759246145,   759246145,   759246145,
    -759180609,  -759180609,  -759180609,  -759180609,
    -759222975,  -759222975,  -759222975,  -759222975,
    759288511,   759288511,   759288511,   759288511
  };

  switch (tx_type) {
    case DCT_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case ADST_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case DCT_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    case ADST_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    default: assert(0); break;
  }
}
void av1_highbd_wiener_convolve_add_src_ssse3(
    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h,
    const ConvolveParams *conv_params, int bd) {
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
  (void)x_step_q4;
  (void)y_step_q4;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  conv_params->round_0);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 conv_params->round_0);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        const __m128i maxval = _mm_set1_epi16(
            (WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
                       (1 << (bd + conv_params->round_1 - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);
        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), conv_params->round_1);

        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storeu_si128(p, res_16bit);
      }
    }
  }
}
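/* Editor's sketch: the offset inserted at lane 3 above is the whole "add_src"
 * mechanism. Folding 1 << FILTER_BITS into the center tap of the 8-slot
 * kernel makes an ordinary convolution also pass the source pixel through at
 * full weight. add_center_tap is illustrative (FILTER_BITS == 7 assumed). */
static void add_center_tap(int16_t taps[8]) {
  /* afterwards, sum(taps[k] * src[k]) == filter(src) + (src_center << 7) */
  taps[3] += 1 << 7;
}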
static INLINE void scaledconvolve_horiz_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  y = h;
  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
          uint8x8_t s[8], d;
          int16x8_t ss[4];
          int16x4_t t[8], tt;

          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);

          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}