Example #1
File: hevc_mc.c  Project: AVLeo/libav
void checkasm_check_hevc_mc(void)
{
    DECLARE_ALIGNED(16, uint8_t,  buf8_0)[BUF_SIZE];
    DECLARE_ALIGNED(16, uint8_t,  buf8_1)[BUF_SIZE];

    DECLARE_ALIGNED(16, int16_t, buf16_0)[BUF_SIZE];
    DECLARE_ALIGNED(16, int16_t, buf16_1)[BUF_SIZE];

    DECLARE_ALIGNED(16, int16_t, mcbuffer)[BUF_SIZE];

    HEVCDSPContext h;
    int bit_depth;

    for (bit_depth = 8; bit_depth <= 10; bit_depth++) {
        ff_hevc_dsp_init(&h, bit_depth);

        check_qpel(&h, buf16_0, buf16_1, buf8_0, mcbuffer, bit_depth);
        report("qpel");

        check_epel(&h, buf16_0, buf16_1, buf8_0, mcbuffer, bit_depth);
        report("epel");

        check_unweighted_pred(&h, buf8_0, buf8_1, buf16_0, buf16_1, bit_depth);
        report("unweighted_pred");

        check_weighted_pred(&h, buf8_0, buf8_1, buf16_0, buf16_1, bit_depth);
        report("weighted_pred");
    }
}
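Every example in this listing revolves around the DECLARE_ALIGNED macro. As a point of reference, the sketch below shows how projects in the libav/FFmpeg and libvpx family typically define it for GCC/Clang and MSVC; the authoritative definition lives in each project's own headers, and the argument order varies between projects (old x264, as in examples #15 and #16 below, uses DECLARE_ALIGNED(type, var, alignment) instead).

/* Sketch only -- consult the project's own headers for the real definition.
 * The macro attaches an alignment attribute to a declaration, so that e.g.
 * DECLARE_ALIGNED(16, uint8_t, buf)[BUF_SIZE] is a 16-byte-aligned array
 * suitable for aligned SIMD loads and stores. */
#if defined(__GNUC__) || defined(__clang__)
#define DECLARE_ALIGNED(n, t, v) t __attribute__((aligned(n))) v
#elif defined(_MSC_VER)
#define DECLARE_ALIGNED(n, t, v) __declspec(align(n)) t v
#else
#define DECLARE_ALIGNED(n, t, v) t v
#endif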
Example #2
unsigned int vp9_satd16x16_c(const uint8_t *src_ptr,
                             int  src_stride,
                             const uint8_t *ref_ptr,
                             int  ref_stride,
                             unsigned int *psatd) {
  int r, c, i;
  unsigned int satd = 0;
  DECLARE_ALIGNED(16, int16_t, diff_in[256]);
  DECLARE_ALIGNED(16, int16_t, diff_out[16]);
  int16_t *in;

  for (r = 0; r < 16; r++) {
    for (c = 0; c < 16; c++) {
      diff_in[r * 16 + c] = src_ptr[c] - ref_ptr[c];
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }

  in = diff_in;
  for (r = 0; r < 16; r += 4) {
    for (c = 0; c < 16; c += 4) {
      vp9_short_walsh4x4_c(in + c, diff_out, 32);
      for (i = 0; i < 16; i++)
        satd += abs(diff_out[i]);
    }
    in += 64;
  }

  if (psatd)
    *psatd = satd;

  return satd;
}
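A minimal, hypothetical caller for the routine above, showing the kind of aligned buffer it is normally paired with (the helper name, buffer size and zero reference are illustrative, not from libvpx):

/* Hypothetical usage sketch: SATD of a 16x16 block against a zero reference. */
static unsigned int satd16x16_vs_zero_sketch(const uint8_t *plane, int stride) {
  DECLARE_ALIGNED(16, uint8_t, zero_ref[16 * 16]);
  unsigned int satd_out;
  memset(zero_ref, 0, sizeof(zero_ref));           /* requires <string.h> */
  return vp9_satd16x16_c(plane, stride, zero_ref, 16, &satd_out);
}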
unsigned int aom_sub_pixel_variance8x8_neon(const uint8_t *src, int src_stride,
                                            int xoffset, int yoffset,
                                            const uint8_t *dst, int dst_stride,
                                            unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[8 * 8]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[9 * 8]);

  var_filter_block2d_bil_w8(src, fdata3, src_stride, 1, 9, 8,
                            bilinear_filters[xoffset]);
  var_filter_block2d_bil_w8(fdata3, temp2, 8, 8, 8, 8,
                            bilinear_filters[yoffset]);
  return aom_variance8x8_neon(temp2, 8, dst, dst_stride, sse);
}
unsigned int aom_sub_pixel_variance64x64_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[64 * 64]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[65 * 64]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 65, 64,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 64, 64, 64, 64,
                             bilinear_filters[yoffset]);
  return aom_variance64x64_neon(temp2, 64, dst, dst_stride, sse);
}
unsigned int aom_sub_pixel_variance16x16_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[16 * 16]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[17 * 16]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 17, 16,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 16, 16, 16, 16,
                             bilinear_filters[yoffset]);
  return aom_variance16x16_neon(temp2, 16, dst, dst_stride, sse);
}
unsigned int aom_sub_pixel_variance32x32_neon(const uint8_t *src,
                                              int src_stride, int xoffset,
                                              int yoffset, const uint8_t *dst,
                                              int dst_stride,
                                              unsigned int *sse) {
  DECLARE_ALIGNED(16, uint8_t, temp2[32 * 32]);
  DECLARE_ALIGNED(16, uint8_t, fdata3[33 * 32]);

  var_filter_block2d_bil_w16(src, fdata3, src_stride, 1, 33, 32,
                             bilinear_filters[xoffset]);
  var_filter_block2d_bil_w16(fdata3, temp2, 32, 32, 32, 32,
                             bilinear_filters[yoffset]);
  return aom_variance32x32_neon(temp2, 32, dst, dst_stride, sse);
}
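The wrappers above all share one shape: a horizontal bilinear pass into fdata3 (with one extra row of headroom for the vertical taps), a vertical bilinear pass into temp2, then a full-pel variance call. The scalar sketch below illustrates what the var_filter_block2d_bil_w* helpers compute; the helper name, and the assumption that the bilinear filters are 7-bit and sum to 128, are illustrative rather than the libaom NEON implementation.

/* Scalar sketch of one bilinear pass (illustration only, not libaom code).
 * pixel_step is 1 for the horizontal pass and the source row length for the
 * vertical pass; filter[0] + filter[1] is assumed to be 128 (7-bit filters). */
static void bil_filter_pass_sketch(const uint8_t *src, uint8_t *dst,
                                   int src_stride, int pixel_step,
                                   int out_height, int out_width,
                                   const uint8_t *filter) {
  int r, c;
  for (r = 0; r < out_height; ++r) {
    for (c = 0; c < out_width; ++c) {
      const int sum = src[c] * filter[0] + src[c + pixel_step] * filter[1];
      dst[c] = (uint8_t)((sum + 64) >> 7);  /* round to nearest */
    }
    src += src_stride;
    dst += out_width;
  }
}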
Example #7
void vpx_convolve8_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const int16_t *filter_x,
                        int x_step_q4, const int16_t *filter_y, int y_step_q4,
                        int w, int h) {
  /* Given our constraints: w <= 64, h <= 64, taps == 8, we can reduce the
   * maximum buffer size to 64 * (64 + 7) (+ 1 row to make the height
   * divisible by 4), i.e. 64 * 72.
   */
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);

  // Account for the vertical phase needing 3 lines prior and 4 lines post
  const int intermediate_height = h + 7;

  assert(y_step_q4 == 16);
  assert(x_step_q4 == 16);

  /* Filter starting 3 lines back. The neon implementation will ignore the given
   * height and filter a multiple of 4 lines. Since this goes into the temp
   * buffer which has lots of extra room and is subsequently discarded this is
   * safe if somewhat less than ideal.   */
  vpx_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w, filter_x,
                           x_step_q4, filter_y, y_step_q4, w,
                           intermediate_height);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  vpx_convolve8_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x, x_step_q4,
                          filter_y, y_step_q4, w, h);
}
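A compile-time restatement (a sketch, not part of libvpx) of the sizing argument in the comment above: with h <= 64 the horizontal pass produces at most 64 + 7 intermediate rows, which the NEON code may round up to a multiple of 4, so 72 rows of at most 64 pixels suffice.

/* Sketch: worst-case size of the intermediate buffer used above. */
enum {
  kMaxConvolveWidth = 64,
  kMaxIntermediateRows = ((64 + 7) + 3) & ~3  /* 71 rounded up to 72 */
};
_Static_assert(kMaxConvolveWidth * kMaxIntermediateRows <= 64 * 72,
               "temp[64 * 72] covers the worst case");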
void vpx_highbd_convolve8_neon(const uint8_t *src8, ptrdiff_t src_stride,
                               uint8_t *dst, ptrdiff_t dst_stride,
                               const int16_t *filter_x, int x_step_q4,
                               const int16_t *filter_y, int y_step_q4, int w,
                               int h, int bd) {
  const uint16_t *src = CONVERT_TO_SHORTPTR(src8);
  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  /* Filter starting 3 lines back. The neon implementation will ignore the given
   * height and filter a multiple of 4 lines. Since this goes into the temp
   * buffer which has lots of extra room and is subsequently discarded this is
   * safe if somewhat less than ideal.   */
  vpx_highbd_convolve8_horiz_neon(CONVERT_TO_BYTEPTR(src - src_stride * 3),
                                  src_stride, CONVERT_TO_BYTEPTR(temp), w,
                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
                                  intermediate_height, bd);

  /* Step into the temp buffer 3 lines to get the actual frame data */
  vpx_highbd_convolve8_vert_neon(CONVERT_TO_BYTEPTR(temp + w * 3), w, dst,
                                 dst_stride, filter_x, x_step_q4, filter_y,
                                 y_step_q4, w, h, bd);
}
Example #9
void vp9_convolve8_avg_neon(const uint8_t *src, ptrdiff_t src_stride,
                            uint8_t *dst, ptrdiff_t dst_stride,
                            const int16_t *filter_x, int x_step_q4,
                            const int16_t *filter_y, int y_step_q4,
                            int w, int h) {
  DECLARE_ALIGNED(8, uint8_t, temp[64 * 72]);
  int intermediate_height = h + 7;

  if (x_step_q4 != 16 || y_step_q4 != 16) {
    vp9_convolve8_avg_c(src, src_stride,
                        dst, dst_stride,
                        filter_x, x_step_q4,
                        filter_y, y_step_q4,
                        w, h);
    return;
  }

  /* This implementation has the same issues as above. In addition, we only want
   * to average the values after both passes.
   */
  vp9_convolve8_horiz_neon(src - src_stride * 3, src_stride,
                           temp, 64,
                           filter_x, x_step_q4, filter_y, y_step_q4,
                           w, intermediate_height);
  vp9_convolve8_avg_vert_neon(temp + 64 * 3,
                              64, dst, dst_stride,
                              filter_x, x_step_q4, filter_y, y_step_q4,
                              w, h);
}
static INLINE void scaledconvolve_horiz_w8(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[8 * 8]);
  int x, y, z;
  src -= SUBPEL_TAPS / 2 - 1;

  // This function processes 8x8 areas. The intermediate height is not always
  // a multiple of 8, so force it to be a multiple of 8 here.
  y = (h + 7) & ~7;

  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      uint8x8_t d[8];
      // process 8 src_x steps
      for (z = 0; z < 8; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];

        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          uint8x8_t s[8];
          load_u8_8x8(src_x, src_stride, &s[0], &s[1], &s[2], &s[3], &s[4],
                      &s[5], &s[6], &s[7]);
          transpose_u8_8x8(&s[0], &s[1], &s[2], &s[3], &s[4], &s[5], &s[6],
                           &s[7]);
          d[0] = scale_filter_8(s, filters);
          vst1_u8(&temp[8 * z], d[0]);
        } else {
          int i;
          for (i = 0; i < 8; ++i) {
            temp[z * 8 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 8x8 filtered values back to dst
      load_u8_8x8(temp, 8, &d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6],
                  &d[7]);
      transpose_u8_8x8(&d[0], &d[1], &d[2], &d[3], &d[4], &d[5], &d[6], &d[7]);
      vst1_u8(&dst[x + 0 * dst_stride], d[0]);
      vst1_u8(&dst[x + 1 * dst_stride], d[1]);
      vst1_u8(&dst[x + 2 * dst_stride], d[2]);
      vst1_u8(&dst[x + 3 * dst_stride], d[3]);
      vst1_u8(&dst[x + 4 * dst_stride], d[4]);
      vst1_u8(&dst[x + 5 * dst_stride], d[5]);
      vst1_u8(&dst[x + 6 * dst_stride], d[6]);
      vst1_u8(&dst[x + 7 * dst_stride], d[7]);
      x += 8;
    } while (x < w);

    src += src_stride * 8;
    dst += dst_stride * 8;
  } while (y -= 8);
}
Example #11
void vpx_fdct32x32_rd_msa(const int16_t *input, int16_t *out,
                          int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);

  /* column transform */
  for (i = 0; i < 4; ++i) {
    fdct8x32_1d_column(input + (8 * i), src_stride, &tmp_buf[0],
                       &tmp_buf_big[0] + (8 * i));
  }

  /* row transform */
  for (i = 0; i < 4; ++i) {
    fdct32x8_1d_row_rd(&tmp_buf_big[0] + (8 * i * 32), &tmp_buf[0],
                       out + (8 * i * 32));
  }
}
static void
float_to_int16_interleave_altivec(int16_t *dst, const float **src,
                                  long len, int channels)
{
    int i;
    vector signed short d0, d1, d2, c0, c1, t0, t1;
    vector unsigned char align;
    if(channels == 1)
        float_to_int16_altivec(dst, src[0], len);
    else if (channels == 2)
    {
        if(((long)dst) & 15)
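            /* Unaligned dst: the vec_lvsl/vec_lvsr permute sequence below
             * performs the misaligned stores while preserving the bytes of
             * the neighbouring aligned blocks outside the written range. */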
            for(i = 0; i < len - 7; i += 8)
            {
                d0 = vec_ld(0, dst + i);
                t0 = float_to_int16_one_altivec(src[0] + i);
                d1 = vec_ld(31, dst + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                c0 = vec_mergeh(t0, t1);
                c1 = vec_mergel(t0, t1);
                d2 = vec_perm(d1, d0, vec_lvsl(0, dst + i));
                align = vec_lvsr(0, dst + i);
                d0 = vec_perm(d2, c0, align);
                d1 = vec_perm(c0, c1, align);
                vec_st(d0,  0, dst + i);
                d0 = vec_perm(c1, d2, align);
                vec_st(d1, 15, dst + i);
                vec_st(d0, 31, dst + i);
                dst += 8;
            }
        else
            for(i = 0; i < len - 7; i += 8)
            {
                t0 = float_to_int16_one_altivec(src[0] + i);
                t1 = float_to_int16_one_altivec(src[1] + i);
                d0 = vec_mergeh(t0, t1);
                d1 = vec_mergel(t0, t1);
                vec_st(d0,  0, dst + i);
                vec_st(d1, 16, dst + i);
                dst += 8;
            }
    }
    else
    {
        DECLARE_ALIGNED(16, int16_t, tmp)[len];
        int c, j;
        for (c = 0; c < channels; c++)
        {
            float_to_int16_altivec(tmp, src[c], len);
            for (i = 0, j = c; i < len; i++, j += channels)
            {
                dst[j] = tmp[i];
            }
        }
    }
}
Example #13
static INLINE void hadamard_16x16_sse2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 8 * src_stride + (idx & 0x01) * 8;
    hadamard_8x8_sse2(src_ptr, src_stride, (tran_low_t *)(t_coeff + idx * 64),
                      0);
  }

  for (idx = 0; idx < 64; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 64));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 128));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 192));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 1);
    b1 = _mm_srai_epi16(b1, 1);
    b2 = _mm_srai_epi16(b2, 1);
    b3 = _mm_srai_epi16(b3, 1);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);

    if (is_final) {
      store_tran_low(coeff0, coeff);
      store_tran_low(coeff1, coeff + 64);
      store_tran_low(coeff2, coeff + 128);
      store_tran_low(coeff3, coeff + 192);
      coeff += 8;
    } else {
      _mm_store_si128((__m128i *)coeff16, coeff0);
      _mm_store_si128((__m128i *)(coeff16 + 64), coeff1);
      _mm_store_si128((__m128i *)(coeff16 + 128), coeff2);
      _mm_store_si128((__m128i *)(coeff16 + 192), coeff3);
      coeff16 += 8;
    }

    t_coeff += 8;
  }
}
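For readability, here is a scalar restatement (an illustration, not libvpx/libaom code; the function name is made up) of what the SSE2 loop above computes: each of the four 8x8 Hadamard sub-blocks contributes one coefficient at the same offset, and the four contributions are cross-combined by a 4-point butterfly with a >> 1 normalization. For 9-bit residuals the intermediate sums stay within int16_t range, so the wrap-around semantics of the packed adds do not matter here.

/* Scalar sketch of the second stage above. t_coeff holds four 8x8 Hadamard
 * blocks back to back (64 coefficients each); out receives 256 coefficients. */
static void hadamard_16x16_stage2_sketch(const int16_t *t_coeff, int16_t *out) {
  int i;
  for (i = 0; i < 64; ++i) {
    const int a0 = t_coeff[i];
    const int a1 = t_coeff[i + 64];
    const int a2 = t_coeff[i + 128];
    const int a3 = t_coeff[i + 192];
    const int b0 = (a0 + a1) >> 1;  /* matches _mm_srai_epi16(.., 1) */
    const int b1 = (a0 - a1) >> 1;
    const int b2 = (a2 + a3) >> 1;
    const int b3 = (a2 - a3) >> 1;
    out[i]       = (int16_t)(b0 + b2);
    out[i + 64]  = (int16_t)(b1 + b3);
    out[i + 128] = (int16_t)(b0 - b2);
    out[i + 192] = (int16_t)(b1 - b3);
  }
}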
Example #14
void vpx_fdct32x32_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf_big[1024]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[256]);

  /* column transform */
  for (i = 0; i < 4; ++i) {
    fdct8x32_1d_column(input + (8 * i), src_stride, tmp_buf,
                       tmp_buf_big + (8 * i));
  }

  /* row transform */
  fdct32x8_1d_row_4x(tmp_buf_big, tmp_buf, output);

  /* row transform */
  for (i = 1; i < 4; ++i) {
    fdct32x8_1d_row(tmp_buf_big + (i * 256), tmp_buf, output + (i * 256));
  }
}
Example #15
void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    uint8_t *p_fref = m->p_fref[0];
    DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
    
    int i, j;
    int dir;
    int costs[6];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];

    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    if( h->mb.i_me_method == X264_ME_UMH )
    {
        /* clamp mvp to inside frame+padding, so that we don't have to check it each iteration */
        p_cost_mvx = m->p_cost_mv - x264_clip3( m->mvp[0], h->mb.mv_min_spel[0], h->mb.mv_max_spel[0] );
        p_cost_mvy = m->p_cost_mv - x264_clip3( m->mvp[1], h->mb.mv_min_spel[1], h->mb.mv_max_spel[1] );
    }

    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;

    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        COST_MV_PRED( bmx, bmy );
        for( i = 0; i < i_mvc; i++ )
        {
             const int mx = x264_clip3( mvc[i][0], mv_x_min*4, mv_x_max*4 );
             const int my = x264_clip3( mvc[i][1], mv_y_min*4, mv_y_max*4 );
             if( mx != bpred_mx || my != bpred_my )
                 COST_MV_PRED( mx, my );
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
Example #16
void x264_me_search_ref( x264_t *h, x264_me_t *m, int (*mvc)[2], int i_mvc, int *p_halfpel_thresh )
{
    const int bw = x264_pixel_size[m->i_pixel].w;
    const int bh = x264_pixel_size[m->i_pixel].h;
    const int i_pixel = m->i_pixel;
    int i_me_range = h->param.analyse.i_me_range;
    int bmx, bmy, bcost;
    int bpred_mx = 0, bpred_my = 0, bpred_cost = COST_MAX;
    int omx, omy, pmx, pmy;
    uint8_t *p_fref = m->p_fref[0];
    DECLARE_ALIGNED( uint8_t, pix[16*16], 16 );
    
    int i, j;
    int dir;
    int costs[6];

    int mv_x_min = h->mb.mv_min_fpel[0];
    int mv_y_min = h->mb.mv_min_fpel[1];
    int mv_x_max = h->mb.mv_max_fpel[0];
    int mv_y_max = h->mb.mv_max_fpel[1];

#define CHECK_MVRANGE(mx,my) ( mx >= mv_x_min && mx <= mv_x_max && my >= mv_y_min && my <= mv_y_max )

    const int16_t *p_cost_mvx = m->p_cost_mv - m->mvp[0];
    const int16_t *p_cost_mvy = m->p_cost_mv - m->mvp[1];

    bmx = x264_clip3( m->mvp[0], mv_x_min*4, mv_x_max*4 );
    bmy = x264_clip3( m->mvp[1], mv_y_min*4, mv_y_max*4 );
    pmx = ( bmx + 2 ) >> 2;
    pmy = ( bmy + 2 ) >> 2;
    bcost = COST_MAX;

    /* try extra predictors if provided */
    if( h->mb.i_subpel_refine >= 3 )
    {
        COST_MV_HPEL( bmx, bmy );
        for( i = 0; i < i_mvc; i++ )
        {
            int mx = mvc[i][0];
            int my = mvc[i][1];
            if( (mx | my) && ((mx-bmx) | (my-bmy)) )
            {
                mx = x264_clip3( mx, mv_x_min*4, mv_x_max*4 );
                my = x264_clip3( my, mv_y_min*4, mv_y_max*4 );
                COST_MV_HPEL( mx, my );
            }
        }
        bmx = ( bpred_mx + 2 ) >> 2;
        bmy = ( bpred_my + 2 ) >> 2;
        COST_MV( bmx, bmy );
    }
Example #17
void vpx_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                         uint8_t *dst, ptrdiff_t dst_stride,
                         const int16_t *filter_x, int x_step_q4,
                         const int16_t *filter_y, int y_step_q4,
                         int w, int h) {
  /* Fixed size intermediate buffer places limits on parameters. */
  DECLARE_ALIGNED(16, uint8_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_convolve8_c(src, src_stride, temp, 64,
                  filter_x, x_step_q4, filter_y, y_step_q4, w, h);
  vpx_convolve_avg_c(temp, 64, dst, dst_stride, NULL, 0, NULL, 0, w, h);
}
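The convolve-then-average pattern above leaves the averaging entirely to vpx_convolve_avg_c. Assuming it matches libvpx's reference behaviour, the per-pixel operation of that final pass is just a rounding average, sketched here (the function name is illustrative):

/* Sketch of the averaging pass: dst = (dst + src + 1) >> 1 per pixel. */
static void convolve_avg_sketch(const uint8_t *src, int src_stride,
                                uint8_t *dst, int dst_stride, int w, int h) {
  int x, y;
  for (y = 0; y < h; ++y) {
    for (x = 0; x < w; ++x)
      dst[x] = (uint8_t)((dst[x] + src[x] + 1) >> 1);
    src += src_stride;
    dst += dst_stride;
  }
}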
void vpx_scaled_2d_neon(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst,
                        ptrdiff_t dst_stride, const InterpKernel *filter,
                        int x0_q4, int x_step_q4, int y0_q4, int y_step_q4,
                        int w, int h) {
  // Note: Fixed size intermediate buffer, temp, places limits on parameters.
  // 2d filtering proceeds in 2 steps:
  //   (1) Interpolate horizontally into an intermediate buffer, temp.
  //   (2) Interpolate temp vertically to derive the sub-pixel result.
  // Deriving the maximum number of rows in the temp buffer (135):
  // --Smallest scaling factor is x1/2 ==> y_step_q4 = 32 (Normative).
  // --Largest block size is 64x64 pixels.
  // --64 rows in the downscaled frame span a distance of (64 - 1) * 32 in the
  //   original frame (in 1/16th pixel units).
  // --Must round-up because block may be located at sub-pixel position.
  // --Require an additional SUBPEL_TAPS rows for the 8-tap filter tails.
  // --((64 - 1) * 32 + 15) >> 4 + 8 = 135.
  // --Require an additional 8 rows for the horiz_w8 transpose tail.
  // When calling in frame scaling function, the smallest scaling factor is x1/4
  // ==> y_step_q4 = 64. Since w and h are at most 16, the temp buffer is still
  // big enough.
  DECLARE_ALIGNED(16, uint8_t, temp[(135 + 8) * 64]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  assert(w <= 64);
  assert(h <= 64);
  assert(y_step_q4 <= 32 || (y_step_q4 <= 64 && h <= 32));
  assert(x_step_q4 <= 64);

  if (w >= 8) {
    scaledconvolve_horiz_w8(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  } else {
    scaledconvolve_horiz_w4(src - src_stride * (SUBPEL_TAPS / 2 - 1),
                            src_stride, temp, 64, filter, x0_q4, x_step_q4, w,
                            intermediate_height);
  }

  if (w >= 16) {
    scaledconvolve_vert_w16(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                            dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else if (w == 8) {
    scaledconvolve_vert_w8(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  } else {
    scaledconvolve_vert_w4(temp + 64 * (SUBPEL_TAPS / 2 - 1), 64, dst,
                           dst_stride, filter, y0_q4, y_step_q4, w, h);
  }
}
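A compile-time restatement (a sketch, not libvpx code) of the buffer derivation above: the worst case the asserts permit is h = 64 with y_step_q4 = 32, and assuming the phase y0_q4 never exceeds 15, the intermediate_height formula yields 134 rows, comfortably inside the 135 + 8 rows that temp provides.

/* Sketch: worst-case intermediate_height under the asserts above. */
enum { kWorstScaledIntermediateRows = (((64 - 1) * 32 + 15) >> 4) + 8 /* 134 */ };
_Static_assert(kWorstScaledIntermediateRows <= 135 + 8,
               "temp[(135 + 8) * 64] has enough rows");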
Example #19
void vpx_highbd_convolve8_avg_c(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride,
                                const int16_t *filter_x, int x_step_q4,
                                const int16_t *filter_y, int y_step_q4,
                                int w, int h, int bd) {
  // Fixed size intermediate buffer places limits on parameters.
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 64]);
  assert(w <= 64);
  assert(h <= 64);

  vpx_highbd_convolve8_c(src, src_stride, CONVERT_TO_BYTEPTR(temp), 64,
                         filter_x, x_step_q4, filter_y, y_step_q4, w, h, bd);
  vpx_highbd_convolve_avg_c(CONVERT_TO_BYTEPTR(temp), 64, dst, dst_stride,
                            NULL, 0, NULL, 0, w, h, bd);
}
Example #20
void vpx_fdct16x16_msa(const int16_t *input, int16_t *output,
                       int32_t src_stride) {
  int32_t i;
  DECLARE_ALIGNED(32, int16_t, tmp_buf[16 * 16]);

  /* column transform */
  for (i = 0; i < 2; ++i) {
    fdct8x16_1d_column((input + 8 * i), (&tmp_buf[0] + 8 * i), src_stride);
  }

  /* row transform */
  for (i = 0; i < 2; ++i) {
    fdct16x8_1d_row((&tmp_buf[0] + (128 * i)), (output + (128 * i)));
  }
}
Example #21
static INLINE void hadamard_16x16_avx2(const int16_t *src_diff,
                                       ptrdiff_t src_stride, tran_low_t *coeff,
                                       int is_final) {
#if CONFIG_VP9_HIGHBITDEPTH
  DECLARE_ALIGNED(32, int16_t, temp_coeff[16 * 16]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int16_t *coeff16 = (int16_t *)coeff;
  int idx;
  for (idx = 0; idx < 2; ++idx) {
    const int16_t *src_ptr = src_diff + idx * 8 * src_stride;
    hadamard_8x8x2_avx2(src_ptr, src_stride, t_coeff + (idx * 64 * 2));
  }

  for (idx = 0; idx < 64; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 64));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 128));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 192));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 1);
    b1 = _mm256_srai_epi16(b1, 1);
    b2 = _mm256_srai_epi16(b2, 1);
    b3 = _mm256_srai_epi16(b3, 1);
    if (is_final) {
      store_tran_low(_mm256_add_epi16(b0, b2), coeff);
      store_tran_low(_mm256_add_epi16(b1, b3), coeff + 64);
      store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 128);
      store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 192);
      coeff += 16;
    } else {
      _mm256_storeu_si256((__m256i *)coeff16, _mm256_add_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 64), _mm256_add_epi16(b1, b3));
      _mm256_storeu_si256((__m256i *)(coeff16 + 128), _mm256_sub_epi16(b0, b2));
      _mm256_storeu_si256((__m256i *)(coeff16 + 192), _mm256_sub_epi16(b1, b3));
      coeff16 += 16;
    }
    t_coeff += 16;
  }
}
Example #22
void aom_hadamard_32x32_sse2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_sse2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 8) {
    __m128i coeff0 = _mm_load_si128((const __m128i *)t_coeff);
    __m128i coeff1 = _mm_load_si128((const __m128i *)(t_coeff + 256));
    __m128i coeff2 = _mm_load_si128((const __m128i *)(t_coeff + 512));
    __m128i coeff3 = _mm_load_si128((const __m128i *)(t_coeff + 768));

    __m128i b0 = _mm_add_epi16(coeff0, coeff1);
    __m128i b1 = _mm_sub_epi16(coeff0, coeff1);
    __m128i b2 = _mm_add_epi16(coeff2, coeff3);
    __m128i b3 = _mm_sub_epi16(coeff2, coeff3);

    b0 = _mm_srai_epi16(b0, 2);
    b1 = _mm_srai_epi16(b1, 2);
    b2 = _mm_srai_epi16(b2, 2);
    b3 = _mm_srai_epi16(b3, 2);

    coeff0 = _mm_add_epi16(b0, b2);
    coeff1 = _mm_add_epi16(b1, b3);
    store_tran_low(coeff0, coeff);
    store_tran_low(coeff1, coeff + 256);

    coeff2 = _mm_sub_epi16(b0, b2);
    coeff3 = _mm_sub_epi16(b1, b3);
    store_tran_low(coeff2, coeff + 512);
    store_tran_low(coeff3, coeff + 768);

    coeff += 8;
    t_coeff += 8;
  }
}
Example #23
void vpx_hadamard_32x32_avx2(const int16_t *src_diff, ptrdiff_t src_stride,
                             tran_low_t *coeff) {
#if CONFIG_VP9_HIGHBITDEPTH
  // For high bitdepths, it is unnecessary to store_tran_low
  // (mult/unpack/store), then load_tran_low (load/pack) the same memory in the
  // next stage.  Output to an intermediate buffer first, then store_tran_low()
  // in the final stage.
  DECLARE_ALIGNED(32, int16_t, temp_coeff[32 * 32]);
  int16_t *t_coeff = temp_coeff;
#else
  int16_t *t_coeff = coeff;
#endif
  int idx;
  for (idx = 0; idx < 4; ++idx) {
    // src_diff: 9 bit, dynamic range [-255, 255]
    const int16_t *src_ptr =
        src_diff + (idx >> 1) * 16 * src_stride + (idx & 0x01) * 16;
    hadamard_16x16_avx2(src_ptr, src_stride,
                        (tran_low_t *)(t_coeff + idx * 256), 0);
  }

  for (idx = 0; idx < 256; idx += 16) {
    const __m256i coeff0 = _mm256_loadu_si256((const __m256i *)t_coeff);
    const __m256i coeff1 = _mm256_loadu_si256((const __m256i *)(t_coeff + 256));
    const __m256i coeff2 = _mm256_loadu_si256((const __m256i *)(t_coeff + 512));
    const __m256i coeff3 = _mm256_loadu_si256((const __m256i *)(t_coeff + 768));

    __m256i b0 = _mm256_add_epi16(coeff0, coeff1);
    __m256i b1 = _mm256_sub_epi16(coeff0, coeff1);
    __m256i b2 = _mm256_add_epi16(coeff2, coeff3);
    __m256i b3 = _mm256_sub_epi16(coeff2, coeff3);

    b0 = _mm256_srai_epi16(b0, 2);
    b1 = _mm256_srai_epi16(b1, 2);
    b2 = _mm256_srai_epi16(b2, 2);
    b3 = _mm256_srai_epi16(b3, 2);

    store_tran_low(_mm256_add_epi16(b0, b2), coeff);
    store_tran_low(_mm256_add_epi16(b1, b3), coeff + 256);
    store_tran_low(_mm256_sub_epi16(b0, b2), coeff + 512);
    store_tran_low(_mm256_sub_epi16(b1, b3), coeff + 768);

    coeff += 16;
    t_coeff += 16;
  }
}
Example #24
static void dequantize(DCTELEM *block, MscCodecContext *mscContext, int intraMatrix) {
	DECLARE_ALIGNED(16, DCTELEM, tmp)[64];
	DCTELEM firstElemValue;
	uint16_t *qmatrix = intraMatrix ? mscContext->intra_matrix : mscContext->non_intra_matrix;

	firstElemValue = 8 * block[0];
	block[0] = 0;

	for (int i = 0; i < 64; ++i) {
		const int index	= scantab[i];
		tmp[mscContext->scantable.permutated[i]]= (block[index] * qmatrix[i]) >> 4;
	}

	tmp[0] = firstElemValue;
	for (int i = 0; i < 64; ++i) {
		block[i] = tmp[i];
	}
}
void vpx_highbd_convolve8_avg_neon(const uint16_t *src, ptrdiff_t src_stride,
                                   uint16_t *dst, ptrdiff_t dst_stride,
                                   const int16_t *filter_x, int x_step_q4,
                                   const int16_t *filter_y, int y_step_q4,
                                   int w, int h, int bd) {
  const int y0_q4 = get_filter_offset(filter_y, get_filter_base(filter_y));
  // + 1 to make it divisible by 4
  DECLARE_ALIGNED(16, uint16_t, temp[64 * 136]);
  const int intermediate_height =
      (((h - 1) * y_step_q4 + y0_q4) >> SUBPEL_BITS) + SUBPEL_TAPS;

  /* This implementation has the same issues as above. In addition, we only want
   * to average the values after both passes.
   */
  vpx_highbd_convolve8_horiz_neon(src - src_stride * 3, src_stride, temp, w,
                                  filter_x, x_step_q4, filter_y, y_step_q4, w,
                                  intermediate_height, bd);
  vpx_highbd_convolve8_avg_vert_neon(temp + w * 3, w, dst, dst_stride, filter_x,
                                     x_step_q4, filter_y, y_step_q4, w, h, bd);
}
Example #26
static int do_layer2(struct frame *fr,int outmode)
{
    int clip=0;
    int i,j;
    int stereo = fr->stereo;
    DECLARE_ALIGNED(16, real, fraction[2][4][SBLIMIT]); /* pick_table clears unused subbands */
    unsigned int bit_alloc[64];
    int scale[192];
    int single = fr->single;

    II_select_table(fr);
    fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ?
                  (fr->mode_ext<<2)+4 : fr->II_sblimit;

    if(stereo == 1 || single == 3)
        single = 0;

    II_step_one(bit_alloc, scale, fr);

    for (i=0; i<SCALE_BLOCK; i++)
    {
        II_step_two(bit_alloc,fraction,scale,fr,i>>2);
        for (j=0; j<3; j++)
        {
            if(single >= 0)
            {
                clip += (fr->synth_mono) (fraction[single][j],pcm_sample,&pcm_point);
            }
            else {
                int p1 = pcm_point;
                clip += (fr->synth) (fraction[0][j],0,pcm_sample,&p1);
                clip += (fr->synth) (fraction[1][j],1,pcm_sample,&pcm_point);
            }

//      if(pcm_point >= audiobufsize) audio_flush(outmode,ai);
        }
    }

    return clip;
}
Example #27
static int do_layer1(struct frame *fr,int single)
{
  int clip=0;
  int i,stereo = fr->stereo;
  unsigned int balloc[2*SBLIMIT];
  unsigned int scale_index[2][SBLIMIT];
  DECLARE_ALIGNED(16, real, fraction[2][SBLIMIT]);
//  int single = fr->single;

//  printf("do_layer1(0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X 0x%02X )\n",
//    wordpointer[0],wordpointer[1],wordpointer[2],wordpointer[3],wordpointer[4],wordpointer[5],wordpointer[6],wordpointer[7]);

  fr->jsbound = (fr->mode == MPG_MD_JOINT_STEREO) ? 
                         (fr->mode_ext<<2)+4 : 32;

  if(stereo == 1 || single == 3)
    single = 0;

  I_step_one(balloc,scale_index,fr);

  for (i=0;i<SCALE_BLOCK;i++)
  {
    I_step_two(fraction,balloc,scale_index,fr);

    if(single >= 0)
    {
      clip += (fr->synth_mono)( (real *) fraction[single],pcm_sample,&pcm_point);
    }
    else {
        int p1 = pcm_point;
        clip += (fr->synth)( (real *) fraction[0],0,pcm_sample,&p1);
        clip += (fr->synth)( (real *) fraction[1],1,pcm_sample,&pcm_point);
    }

  }

  return clip;
}
Example #28
void av1_fht16x16_msa(const int16_t *input, int16_t *output, int32_t stride,
                       int32_t tx_type) {
  DECLARE_ALIGNED(32, int16_t, tmp[256]);
  DECLARE_ALIGNED(32, int16_t, trans_buf[256]);
  DECLARE_ALIGNED(32, int16_t, tmp_buf[128]);
  int32_t i;
  int16_t *ptmpbuf = &tmp_buf[0];
  int16_t *trans = &trans_buf[0];
  const int32_t const_arr[29 * 4] = {
    52707308,    52707308,    52707308,    52707308,    -1072430300,
    -1072430300, -1072430300, -1072430300, 795618043,   795618043,
    795618043,   795618043,   -721080468,  -721080468,  -721080468,
    -721080468,  459094491,   459094491,   459094491,   459094491,
    -970646691,  -970646691,  -970646691,  -970646691,  1010963856,
    1010963856,  1010963856,  1010963856,  -361743294,  -361743294,
    -361743294,  -361743294,  209469125,   209469125,   209469125,
    209469125,   -1053094788, -1053094788, -1053094788, -1053094788,
    1053160324,  1053160324,  1053160324,  1053160324,  639644520,
    639644520,   639644520,   639644520,   -862444000,  -862444000,
    -862444000,  -862444000,  1062144356,  1062144356,  1062144356,
    1062144356,  -157532337,  -157532337,  -157532337,  -157532337,
    260914709,   260914709,   260914709,   260914709,   -1041559667,
    -1041559667, -1041559667, -1041559667, 920985831,   920985831,
    920985831,   920985831,   -551995675,  -551995675,  -551995675,
    -551995675,  596522295,   596522295,   596522295,   596522295,
    892853362,   892853362,   892853362,   892853362,   -892787826,
    -892787826,  -892787826,  -892787826,  410925857,   410925857,
    410925857,   410925857,   -992012162,  -992012162,  -992012162,
    -992012162,  992077698,   992077698,   992077698,   992077698,
    759246145,   759246145,   759246145,   759246145,   -759180609,
    -759180609,  -759180609,  -759180609,  -759222975,  -759222975,
    -759222975,  -759222975,  759288511,   759288511,   759288511,
    759288511
  };

  switch (tx_type) {
    case DCT_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case ADST_DCT:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      /* row transform */
      for (i = 0; i < 2; ++i) {
        postproc_fdct16x8_1d_row(tmp + (128 * i), output + (128 * i));
      }
      break;
    case DCT_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fdct8x16_1d_column(input + 8 * i, tmp + 8 * i, stride);
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    case ADST_ADST:
      /* column transform */
      for (i = 0; i < 2; ++i) {
        fadst16_cols_step1_msa(input + (i << 3), stride, const_arr, ptmpbuf);
        fadst16_cols_step2_msa(ptmpbuf, const_arr, tmp + (i << 3));
      }

      fadst16_transpose_postproc_msa(tmp, trans);

      /* row transform */
      for (i = 0; i < 2; ++i) {
        fadst16_rows_step1_msa(trans + (i << 7), const_arr, ptmpbuf);
        fadst16_rows_step2_msa(ptmpbuf, const_arr, tmp + (i << 7));
      }

      fadst16_transpose_msa(tmp, output);
      break;
    default: assert(0); break;
  }
}
void av1_highbd_wiener_convolve_add_src_ssse3(
    const uint8_t *src8, ptrdiff_t src_stride, uint8_t *dst8,
    ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4,
    const int16_t *filter_y, int y_step_q4, int w, int h,
    const ConvolveParams *conv_params, int bd) {
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  assert(bd + FILTER_BITS - conv_params->round_0 + 2 <= 16);
  (void)x_step_q4;
  (void)y_step_q4;

  const uint16_t *const src = CONVERT_TO_SHORTPTR(src8);
  uint16_t *const dst = CONVERT_TO_SHORTPTR(dst8);

  DECLARE_ALIGNED(16, uint16_t,
                  temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]);
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint16_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const = _mm_set1_epi32(
        (1 << (conv_params->round_0 - 1)) + (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);
        const __m128i data2 =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j + 8]);

        // Filter even-index pixels
        const __m128i res_0 = _mm_madd_epi16(data, coeff_01);
        const __m128i res_2 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 4), coeff_23);
        const __m128i res_4 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 8), coeff_45);
        const __m128i res_6 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 12), coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  conv_params->round_0);

        // Filter odd-index pixels
        const __m128i res_1 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 2), coeff_01);
        const __m128i res_3 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 6), coeff_23);
        const __m128i res_5 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 10), coeff_45);
        const __m128i res_7 =
            _mm_madd_epi16(_mm_alignr_epi8(data2, data, 14), coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 conv_params->round_0);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        const __m128i maxval =
            _mm_set1_epi16((WIENER_CLAMP_LIMIT(conv_params->round_0, bd)) - 1);
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero), maxval);
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (conv_params->round_1 - 1)) -
                       (1 << (bd + conv_params->round_1 - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), conv_params->round_1);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), conv_params->round_1);

        const __m128i maxval = _mm_set1_epi16((1 << bd) - 1);
        __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        res_16bit = _mm_min_epi16(_mm_max_epi16(res_16bit, zero), maxval);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storeu_si128(p, res_16bit);
      }
    }
  }
}
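One detail of the Wiener convolution above worth spelling out: _mm_insert_epi16(zero, 1 << FILTER_BITS, 3) biases the centre tap (index 3 of the 8 taps) of both filters, so the multiply-accumulate computes the convolution plus the centre source sample scaled by 1 << FILTER_BITS; that is how the "add_src" behaviour is folded into the filter itself. A scalar restatement of that idea follows (illustration only; the function name is made up, and FILTER_BITS is 7 in libaom).

/* Scalar sketch of the "add_src" centre-tap trick used above. */
static int32_t wiener_add_src_tap_sketch(const uint16_t *src8taps,
                                         const int16_t *filter8taps) {
  int32_t sum = 0;
  int k;
  for (k = 0; k < 8; ++k) {
    int32_t tap = filter8taps[k];
    if (k == 3) tap += 1 << 7;  /* the injected (1 << FILTER_BITS) centre term */
    sum += tap * (int32_t)src8taps[k];
  }
  return sum;  /* caller adds round_const and shifts by conv_params->round_0 */
}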
static INLINE void scaledconvolve_horiz_w4(
    const uint8_t *src, const ptrdiff_t src_stride, uint8_t *dst,
    const ptrdiff_t dst_stride, const InterpKernel *const x_filters,
    const int x0_q4, const int x_step_q4, const int w, const int h) {
  DECLARE_ALIGNED(16, uint8_t, temp[4 * 4]);
  int x, y, z;

  src -= SUBPEL_TAPS / 2 - 1;

  y = h;
  do {
    int x_q4 = x0_q4;
    x = 0;
    do {
      // process 4 src_x steps
      for (z = 0; z < 4; ++z) {
        const uint8_t *const src_x = &src[x_q4 >> SUBPEL_BITS];
        if (x_q4 & SUBPEL_MASK) {
          const int16x8_t filters = vld1q_s16(x_filters[x_q4 & SUBPEL_MASK]);
          const int16x4_t filter3 = vdup_lane_s16(vget_low_s16(filters), 3);
          const int16x4_t filter4 = vdup_lane_s16(vget_high_s16(filters), 0);
          uint8x8_t s[8], d;
          int16x8_t ss[4];
          int16x4_t t[8], tt;

          load_u8_8x4(src_x, src_stride, &s[0], &s[1], &s[2], &s[3]);
          transpose_u8_8x4(&s[0], &s[1], &s[2], &s[3]);

          ss[0] = vreinterpretq_s16_u16(vmovl_u8(s[0]));
          ss[1] = vreinterpretq_s16_u16(vmovl_u8(s[1]));
          ss[2] = vreinterpretq_s16_u16(vmovl_u8(s[2]));
          ss[3] = vreinterpretq_s16_u16(vmovl_u8(s[3]));
          t[0] = vget_low_s16(ss[0]);
          t[1] = vget_low_s16(ss[1]);
          t[2] = vget_low_s16(ss[2]);
          t[3] = vget_low_s16(ss[3]);
          t[4] = vget_high_s16(ss[0]);
          t[5] = vget_high_s16(ss[1]);
          t[6] = vget_high_s16(ss[2]);
          t[7] = vget_high_s16(ss[3]);

          tt = convolve8_4(t[0], t[1], t[2], t[3], t[4], t[5], t[6], t[7],
                           filters, filter3, filter4);
          d = vqrshrun_n_s16(vcombine_s16(tt, tt), 7);
          vst1_lane_u32((uint32_t *)&temp[4 * z], vreinterpret_u32_u8(d), 0);
        } else {
          int i;
          for (i = 0; i < 4; ++i) {
            temp[z * 4 + i] = src_x[i * src_stride + 3];
          }
        }
        x_q4 += x_step_q4;
      }

      // transpose the 4x4 filtered values back to dst
      {
        const uint8x8x4_t d4 = vld4_u8(temp);
        vst1_lane_u32((uint32_t *)&dst[x + 0 * dst_stride],
                      vreinterpret_u32_u8(d4.val[0]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 1 * dst_stride],
                      vreinterpret_u32_u8(d4.val[1]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 2 * dst_stride],
                      vreinterpret_u32_u8(d4.val[2]), 0);
        vst1_lane_u32((uint32_t *)&dst[x + 3 * dst_stride],
                      vreinterpret_u32_u8(d4.val[3]), 0);
      }
      x += 4;
    } while (x < w);

    src += src_stride * 4;
    dst += dst_stride * 4;
    y -= 4;
  } while (y > 0);
}