Example #1
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift right by 7 bits on each 16-bit element
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink each 16-bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
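For orientation, here is a scalar model of what one output pixel of this kernel computes (an illustrative helper, not part of libvpx). _mm_maddubs_epi16 multiplies unsigned source bytes by signed filter bytes and adds adjacent pairs with signed saturation; _mm_mulhrs_epi16(x, 1 << 8) evaluates (x * 256 + (1 << 14)) >> 15, which is exactly the rounding shift (x + 64) >> 7:

#include <stdint.h>

// Scalar reference for an 8-tap convolve with round-and-shift by 7.
// Intermediate 16-bit saturation is ignored here (see the note on the
// min/max ordering at the end of Example #3).
static uint8_t convolve8_px_ref(const uint8_t *src, const int8_t *f) {
  int sum = 0;
  for (int k = 0; k < 8; ++k) sum += src[k] * f[k];
  sum = (sum + 64) >> 7;  // what _mm_mulhrs_epi16 with 1 << 8 achieves
  if (sum < 0) sum = 0;   // _mm_packus_epi16: unsigned saturation
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}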
Example #2
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a
 * more precise version of box-filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i fours = _mm_set1_epi8(4);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeh_epi32(pred_buf_m128i, top);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storel_epi64(pred_buf_m128i, top);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeu_si128(pred_buf_m128i, top);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        top_1 = _mm_maddubs_epi16(top_1, fours);
        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
      }
    }
    input += input_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
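_mm_loadh_epi32 and _mm_storeh_epi32 are not standard intrinsics; libaom defines small helpers with these names in the same file. Below is a sketch consistent with how they are used above, moving 4 bytes through lane 0 of an __m128i. Note also the Q3 arithmetic: maddubs with fours yields (t0 + t1) * 4 per output, which is the two-pixel average scaled by 8 (Q3).

#include <emmintrin.h>

// Assumed helper definitions (not shown in the excerpt above).
static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
  return _mm_cvtsi32_si128(*(const int *)mem_addr);  // 4 bytes into lane 0
}

static inline void _mm_storeh_epi32(__m128i *mem_addr, __m128i a) {
  *(int *)mem_addr = _mm_cvtsi128_si32(a);  // lane 0 out to memory
}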
Example #3
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift right by 7 bits on each 16-bit element
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink each 16-bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only the 8-byte convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
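One detail worth spelling out, since it recurs across these kernels: the partial sums are combined as x0 + x3, then + min(x1, x2), then + max(x1, x2), all with saturating adds. My reading (the source does not explain it) is that this ordering keeps intermediate sums within int16_t range for valid filter coefficients, so _mm_adds_epi16 only saturates where the final clamp is wanted anyway. A scalar sketch of the chain:

#include <stdint.h>

// Saturating 16-bit add, as _mm_adds_epi16 does per lane.
static int16_t sat_add16(int32_t a, int32_t b) {
  int32_t t = a + b;
  if (t > INT16_MAX) t = INT16_MAX;
  if (t < INT16_MIN) t = INT16_MIN;
  return (int16_t)t;
}

static int16_t sum4_minmax_ref(int16_t x0, int16_t x1, int16_t x2,
                               int16_t x3) {
  const int16_t mn = x1 < x2 ? x1 : x2;
  const int16_t mx = x1 < x2 ? x2 : x1;
  int16_t t = sat_add16(x0, x3);  // outer taps first
  t = sat_add16(t, mn);           // smaller middle partial
  return sat_add16(t, mx);        // larger middle partial last
}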
Example #4
static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) values to 8-bit (byte) values and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across the 128-bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across the 128-bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));

  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);

    // round (+32) and shift right by 6 bits on each 16-bit element
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);

    output_ptr += output_pitch;
  }
}
Example #5
/* Test the 128-bit form */
static void
ssse3_test_pmaddubsw128 (int *i1, int *i2, int *r)
{
  /* Assumes incoming pointers are 16-byte aligned */
  __m128i t1 = *(__m128i *) i1;
  __m128i t2 = *(__m128i *) i2;
  *(__m128i *) r = _mm_maddubs_epi16 (t1, t2);
}
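A scalar model the test could be checked against (the helper is illustrative, not from the testsuite): pmaddubsw produces, per 16-bit lane, the signed-saturated sum of two unsigned-byte by signed-byte products.

#include <stdint.h>

static void pmaddubsw_ref(const uint8_t u[16], const int8_t s[16],
                          int16_t r[8]) {
  for (int i = 0; i < 8; ++i) {
    int32_t t = u[2 * i] * s[2 * i] + u[2 * i + 1] * s[2 * i + 1];
    r[i] = (int16_t)(t > 32767 ? 32767 : (t < -32768 ? -32768 : t));
  }
}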
Example #6
void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23, __m128i *data45, __m128i *data67, __m128i *filter, __m128i *dst)
{
  __m128i a, b, c, d;
  __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter));

  a = _mm_maddubs_epi16(*data01, fir);
  b = _mm_maddubs_epi16(*data23, fir);
  a = _mm_hadd_epi16(a, b);

  c = _mm_maddubs_epi16(*data45, fir);
  d = _mm_maddubs_epi16(*data67, fir);
  c = _mm_hadd_epi16(c, d);

  a = _mm_hadd_epi16(a, c);

  _mm_storeu_si128(dst, a);
}
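The data layout here is implied rather than shown: each dataXY register presumably holds the eight taps of pixel X in its low half and of pixel Y in its high half, so maddubs yields four two-tap sums per pixel and the two _mm_hadd_epi16 levels fold them into one 8-tap sum per output word (hence eight filtered pixels in dst). A scalar equivalent for a single pixel, under that assumption:

#include <stdint.h>

// Intermediate 16-bit saturation in maddubs is ignored here.
static int16_t eight_tap_px_ref(const uint8_t px[8], const int8_t fir[8]) {
  int32_t s = 0;
  for (int t = 0; t < 8; ++t) s += px[t] * fir[t];
  return (int16_t)s;
}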
Example #7
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      const __m128i m_inv = _mm_sub_epi8(mask_max, m);

      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m128i data_l = _mm_unpacklo_epi8(a, b);
      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi8(a, b);
      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
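The blend itself is a 6-bit weighted average: interleaving (a, b) against (m, 64 - m) lets one maddubs produce a * m + b * (64 - m) per lane. Assuming xx_roundn_epu16(v, n) computes (v + (1 << (n - 1))) >> n and AOM_BLEND_A64_ROUND_BITS is 6 (so mask_max is 64), one predicted pixel looks like this in scalar form (name illustrative):

#include <stdint.h>

static uint8_t blend64_px_ref(uint8_t a, uint8_t b, uint8_t m) {
  return (uint8_t)((a * m + b * (64 - m) + 32) >> 6);  // m in 0..64
}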
Example #8
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
Example #9
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift right by 7 bits on each 16-bit element
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink each 16-bit value to 8 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only the 8-byte convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
Example #10
void SSE42_blend() {
    size_t n = (size_t)width * height * 4;
    int dummy __attribute__((unused));

    const uint8_t alpha2 = alpha/2;
    __m128i alpha_np = _mm_set1_epi16(alpha2 | ((uint16_t)(alpha2 ^ 0x7f) << 8));

    for (size_t i=0; i < n; i += 32) {
        __m128i A0 = _mm_load_si128((__m128i*)(imgA + i));
        __m128i B0 = _mm_load_si128((__m128i*)(imgB + i));

        __m128i A1 = _mm_load_si128((__m128i*)(imgA + i + 16));
        __m128i B1 = _mm_load_si128((__m128i*)(imgB + i + 16));

        __m128i lo0 = _mm_unpacklo_epi8(A0, B0);
        __m128i hi0 = _mm_unpackhi_epi8(A0, B0);

        __m128i lo1 = _mm_unpacklo_epi8(A1, B1);
        __m128i hi1 = _mm_unpackhi_epi8(A1, B1);

        lo0 = _mm_maddubs_epi16(lo0, alpha_np);
        lo1 = _mm_maddubs_epi16(lo1, alpha_np);
        hi0 = _mm_maddubs_epi16(hi0, alpha_np);
        hi1 = _mm_maddubs_epi16(hi1, alpha_np);

        lo0 = _mm_srli_epi16(lo0, 7);
        lo1 = _mm_srli_epi16(lo1, 7);
        hi0 = _mm_srli_epi16(hi0, 7);
        hi1 = _mm_srli_epi16(hi1, 7);

        __m128i res0 = _mm_packus_epi16(lo0, hi0);
        __m128i res1 = _mm_packus_epi16(lo1, hi1);

        _mm_store_si128((__m128i*)(data + i +  0), res0);
        _mm_store_si128((__m128i*)(data + i + 16), res1);
    }
}
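Despite the SSE42_ prefix, pmaddubsw is an SSSE3 instruction. The weight packing above works around its second operand being signed bytes (-128..127): alpha is halved so both weights fit, and since alpha2 occupies 7 bits, alpha2 ^ 0x7f equals 127 - alpha2, so the weights sum to 127 and the result is shifted by 7. One pixel in scalar form (name illustrative):

#include <stdint.h>

static uint8_t blend127_px_ref(uint8_t a, uint8_t b, uint8_t alpha) {
  const uint8_t alpha2 = alpha / 2;  // fits in a signed byte
  return (uint8_t)((a * alpha2 + b * (127 - alpha2)) >> 7);
}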
Example #11
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a
 * more precise version of box-filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i twos = _mm_set1_epi8(2);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storel_epi64(pred_buf_m128i, sum);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeu_si128(pred_buf_m128i, sum);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        top_1 = _mm_maddubs_epi16(top_1, twos);
        bot_1 = _mm_maddubs_epi16(bot_1, twos);
        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
      }
    }
    input += luma_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
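As with the 4:2:2 version, the Q3 arithmetic is easiest to see in scalar form: each output is a 2x2 sum doubled, which equals the four-pixel average scaled by 8 (a sketch, name illustrative):

#include <stdint.h>

static uint16_t subsample420_px_ref(uint8_t t0, uint8_t t1,
                                    uint8_t b0, uint8_t b1) {
  // (t0 + t1) * 2 + (b0 + b1) * 2 == ((t0 + t1 + b0 + b1) / 4) << 3 in Q3
  return (uint16_t)((t0 + t1 + b0 + b1) * 2);
}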
Example #12
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) values to 8-bit (byte) values and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load one stride of the source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // round (+32) and shift right by 6 bits on each 16-bit element
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
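A scalar model of this 4-tap path (illustrative): the 0x05040302 shuffle picks only taps 2..5 of the 8-tap filter array, and because the taps were pre-shifted right by one bit, rounding uses +32 and a 6-bit shift instead of +64 and 7 bits.

#include <stdint.h>

// s points at the already-offset source (src_ptr was moved back by 3).
static uint8_t convolve4_px_ref(const uint8_t *s, const int16_t *f) {
  int sum = 0;
  for (int k = 2; k <= 5; ++k) sum += s[k] * (int)(f[k] >> 1);
  sum = (sum + 32) >> 6;
  return (uint8_t)(sum < 0 ? 0 : (sum > 255 ? 255 : sum));
}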
Example #13
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example #14
static void vpx_filter_block1d16_h8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pixels_per_line,
                                         uint8_t *output_ptr,
                                         ptrdiff_t output_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) values and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);

  // multiply the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i-=2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line-3)), 1);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // read 2 strides of the next 16 bytes
    // (part of which was read earlier)
    srcReg32b2 = _mm256_castsi128_si256(
                 _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                 _mm_loadu_si128((const __m128i *)
                 (src_ptr+src_pixels_per_line+5)), 1);

    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));

    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);

    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);

    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1,
                       _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));


    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);

    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);

    // shift right by 7 bits on each 16-bit element
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1,
                                           srcRegFilt32b2_1);

    src_ptr+=src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr,
    _mm256_castsi256_si128(srcRegFilt32b1_1));

    // save the next 16 bytes
    _mm_store_si128((__m128i*)(output_ptr+output_pitch),
    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr+=dst_stride;
  }

  // if the number of strides is odd,
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;

    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // read the next 16 bytes
    // (part of which was read earlier)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2,
                    _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt4Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2,
                  _mm256_castsi256_si128(filt3Reg));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2,
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm_max_epi16(srcRegFilt3, srcRegFilt2));


    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                    _mm256_castsi256_si128(addFilterReg64));

    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                    _mm256_castsi256_si128(addFilterReg64));

    // shift right by 7 bits on each 16-bit element
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1);
  }
}
Example #15
void aom_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) values and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift right by 7 bits on each 16-bit element
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink each 16-bit value to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
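The filt1_global..filt4_global shuffle masks are defined outside this excerpt. Their contents are implied by the algorithm; a sketch assuming the usual pattern, where mask k selects overlapping byte pairs starting at offset 2k so that lane i of maddubs(shuffle(src, filtk), pairk) equals src[i + 2k] * f[2k] + src[i + 2k + 1] * f[2k + 1]:

#include <stdint.h>

// Assumed contents of the first two masks; filt3/filt4 would continue
// the same pattern at offsets 4 and 6.
_Alignas(16) static const uint8_t filt1_sketch[16] = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
_Alignas(16) static const uint8_t filt2_sketch[16] = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};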
Example #16
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
			int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    __m128i vx = _mm_set_epi16 (
	- (x + 1), x, - (x + 1), x,
	- (x + ux + 1), x + ux,  - (x + ux + 1), x + ux);
    __m128i vux = _mm_set_epi16 (
	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
	- 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    while ((n -= 2) >= 0)
    {
	__m128i vw, vr, s;

	vrl1 = _mm_loadl_epi64 (
	    (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
	/* vrl1: R1, L1 */

    final_pixel:
	vrl0 = _mm_loadl_epi64 (
	    (__m128i *)(bits + pixman_fixed_to_int (x)));
	/* vrl0: R0, L0 */

	/* The weights are based on vx which is a vector of 
	 *
	 *    - (x + 1), x, - (x + 1), x,
	 *          - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
	 *
	 * so the 16 bit weights end up like this:
	 *
	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
	 *
	 * and after shifting and packing, we get these bytes:
	 *
	 *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
	 *        iw0, w0, iw0, w0, iw1, w1, iw1, w1,
	 *
	 * which means the first and the second input pixel 
	 * have to be interleaved like this:
	 *
	 *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
	 *        lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
	 *
	 * before maddubsw can be used.
	 */

	vw = _mm_add_epi16 (
	    vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
	 */

	vw = _mm_packus_epi16 (vw, vw);
	/* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
	 *         iw0, w0, iw0, w0, iw1, w1, iw1, w1
	 */
	vx = _mm_add_epi16 (vx, vux);

	x += 2 * ux;

	vr = _mm_unpacklo_epi16 (vrl1, vrl0);
	/* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

	s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
	/* s:  lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

	vr = _mm_unpackhi_epi8 (vr, s);
	/* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
	 *         lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
	 */

	vr = _mm_maddubs_epi16 (vr, vw);

	/* When the weight is 0, the inverse weight is
	 * 128 which can't be represented in a signed byte.
	 * As a result maddubsw computes the following:
	 *
	 *     r = l * -128 + r * 0
	 *
	 * rather than the desired
	 *
	 *     r = l * 128 + r * 0
	 *
	 * We fix this by taking the absolute value of the
	 * result.
	 */
	vr = _mm_abs_epi16 (vr);

	/* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
	_mm_store_si128 (b++, vr);
    }

    if (n == -1)
    {
	vrl1 = _mm_setzero_si128();
	goto final_pixel;
    }

    line->y = y;
}
Example #17
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) values and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32,
                  _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32,
                 _mm256_set1_epi16(0x706u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times, each load one src_pitch apart
  srcReg32b1 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
               _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // place each pair of consecutive loads in the same 256-bit register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
               _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
               _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
               _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
               _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
               _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
               _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);


  for (i = output_height; i > 1; i-=2) {
     // load the last 2 rows of 16 bytes and place every two
     // consecutive loads in the same 256-bit register
     srcReg32b8 = _mm256_castsi128_si256(
     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
     srcReg32b7 = _mm256_inserti128_si256(srcReg32b7,
     _mm256_castsi256_si128(srcReg32b8), 1);
     srcReg32b9 = _mm256_castsi128_si256(
     _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
     srcReg32b8 = _mm256_inserti128_si256(srcReg32b8,
     _mm256_castsi256_si128(srcReg32b9), 1);

     // merge every two consecutive registers
     // save
     srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
     srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

     // add and saturate the results together
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
     srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

     // multiply 2 adjacent elements with the filter and add the result
     srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
     srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

     // add and saturate the results together
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_min_epi16(srcReg32b8, srcReg32b12));
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                  _mm256_max_epi16(srcReg32b8, srcReg32b12));

     srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
     srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

     // shift right by 7 bits on each 16-bit element
     srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
     srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

     // shrink each 16-bit value to 8 bits; the first lane contains the first
     // convolve result and the second lane contains the second convolve
     // result
     srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

     src_ptr+=src_stride;

     // save 16 bytes
     _mm_store_si128((__m128i*)output_ptr,
     _mm256_castsi256_si128(srcReg32b1));

     // save the next 16 bytes
     _mm_store_si128((__m128i*)(output_ptr+out_pitch),
     _mm256_extractf128_si256(srcReg32b1, 1));

     output_ptr+=dst_stride;

     // save part of the registers for next strides
     srcReg32b10 = srcReg32b11;
     srcReg32b1 = srcReg32b3;
     srcReg32b11 = srcReg32b2;
     srcReg32b3 = srcReg32b5;
     srcReg32b2 = srcReg32b4;
     srcReg32b5 = srcReg32b7;
     srcReg32b7 = srcReg32b9;
  }
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
                  _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                  _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                  _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                  _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);


    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                  _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                  _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                  _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                  _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm_max_epi16(srcRegFilt5, srcRegFilt7));


    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                  _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                  _mm256_castsi256_si128(addFilterReg64));

    // shift right by 7 bits on each 16-bit element
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);
  }
}
Example #18
static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32;
  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
      srcReg6, srcReg56;
  __m128i srcReg23_34_lo, srcReg45_56_lo;
  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
  __m128i resReglo, resReghi;
  __m128i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) values and replicate
  // the same data in both lanes of the 128-bit register.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));

  // multiply the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // have consecutive loads in the same 128-bit register
  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);

  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);

    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);

    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());

    // round (+32) and shift right by 6 bits on each 16-bit element
    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm_srai_epi16(resReglo, 6);
    resReghi = _mm_srai_epi16(resReghi, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second convolve
    // result
    resReglo = _mm_packus_epi16(resReglo, resReglo);
    resReghi = _mm_packus_epi16(resReghi, resReghi);

    src_ptr += src_stride;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);

    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4 = srcReg6;
  }
}
Example #19
SIMD_INLINE __m128i BinomialSum16(const __m128i & ab, const __m128i & cd)
{
    return _mm_add_epi16(_mm_maddubs_epi16(ab, K8_01_03), _mm_maddubs_epi16(cd, K8_03_01));
}
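K8_01_03 and K8_03_01 come from the Simd library and are not shown here; the names and the maddubs usage imply byte-pair weights (1, 3) and (3, 1), i.e. the binomial row 1-3-3-1 over interleaved inputs. A self-contained sketch under that assumption:

#include <tmmintrin.h>

// dst[i] = 1*a[i] + 3*b[i] + 3*c[i] + 1*d[i] for ab = a0,b0,a1,b1,...
// and cd = c0,d0,c1,d1,... (constant values assumed, not from the source).
static inline __m128i BinomialSum16_sketch(__m128i ab, __m128i cd) {
  const __m128i k01_03 = _mm_set1_epi16(0x0301);  // bytes {0x01, 0x03}
  const __m128i k03_01 = _mm_set1_epi16(0x0103);  // bytes {0x03, 0x01}
  return _mm_add_epi16(_mm_maddubs_epi16(ab, k01_03),
                       _mm_maddubs_epi16(cd, k03_01));
}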
Example #20
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));
    // merge the result together
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);
    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);
    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);
    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);
    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);
    // add and saturate the results together
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift each 16-bit value right by 7 bits
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);
    // shrink each 16-bit value to 8 bits; the low lane holds the first
    // eight output pixels and the high lane the second eight
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;
    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
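Two details above are easy to miss. First, the taps come from an int16 array but are packed to int8 up front, which is safe because subpel filter taps fit in a byte. Second, the partial sums are added in min/max order: x1 and x2 come from the large middle taps, and adding the smaller of the two first keeps the saturating 16-bit additions from clipping an intermediate result that the full sum would not clip. A plain scalar sketch of one output pixel, assuming the usual 8-tap kernel whose taps sum to 128 and using the identity that _mm_mulhrs_epi16(x, 1 << 8) reduces to (x + 64) >> 7:

#include <stddef.h>
#include <stdint.h>

// Scalar reference (sketch) for one output pixel of the vertical 8-tap
// convolve above. Not the library's code; for illustration only.
static uint8_t convolve8_vert_ref(const uint8_t *src, ptrdiff_t pitch,
                                  const int16_t *filter) {
  int sum = 0;
  for (int k = 0; k < 8; ++k)
    sum += src[k * pitch] * filter[k];  // walk down one column
  sum = (sum + 64) >> 7;                // same rounding as mulhrs by 256
  if (sum < 0) sum = 0;                 // saturation performed by packus
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}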
Example #21
static void aom_filter_block1d16_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // halve the 16-bit (short) filter taps, then convert to 8-bit (byte) and
  // duplicate the same data in both lanes of the 128-bit register
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiply the source and destination strides by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  // merge consecutive rows into the same register
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements with the filter and add the result

    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);

    // add and saturate the results together
    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);

    // add the rounding offset (32) and shift each 16-bit value right by 6 bits
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);

    // shrink each 16-bit value to 8 bits; each register now holds the 16
    // pixels of one output row
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);

    src_ptr += src_stride;

    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // carry the overlapping rows over to the next iteration
    srcReg23_lo = srcReg45_lo;
    srcReg34_lo = srcReg56_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
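The 4-tap variant above only ever multiplies by taps 2..5 of the 8-tap filter array, and it halves the taps on load (_mm_srai_epi16(filtersReg, 1)), which is why the rounding constant is 32 and the shift 6 rather than the 64 and 7 used by the full 8-tap kernels. A scalar sketch under those assumptions:

#include <stddef.h>
#include <stdint.h>

// Scalar reference (sketch) for one output pixel of the 4-tap path above;
// src points at the same base row the intrinsic version receives.
static uint8_t convolve4_vert_ref(const uint8_t *src, ptrdiff_t pitch,
                                  const int16_t *filter) {
  int sum = 0;
  for (int k = 2; k < 6; ++k)
    sum += src[k * pitch] * (filter[k] >> 1);  // halved middle taps
  sum = (sum + 32) >> 6;                       // matches +32 then srai by 6
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (uint8_t)sum;
}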
Example #22
void aom_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32(0x00400040);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) and duplicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // loading the local filters
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
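The shuffle masks filt1_4_h8 and filt2_4_h8 are defined elsewhere in the file. Plausible definitions, assuming the layout the arithmetic above requires (the low lane gathers the byte pairs for taps 0-1 and 4-5, the high lane those for taps 2-3 and 6-7, for four adjacent output pixels):

#include <stdint.h>

// Assumed mask values; in the real source they are 16-byte aligned,
// since they are fetched with _mm_load_si128.
static _Alignas(16) const uint8_t filt1_4_h8[16] = {
  0, 1, 1, 2, 2, 3, 3, 4, 2, 3, 3, 4, 4, 5, 5, 6
};
static _Alignas(16) const uint8_t filt2_4_h8[16] = {
  4, 5, 5, 6, 6, 7, 7, 8, 6, 7, 7, 8, 8, 9, 9, 10
};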
Example #23
SIMD_INLINE void Average16(__m128i & a)
{
    a = _mm_srli_epi16(_mm_add_epi16(_mm_maddubs_epi16(a, K8_01), K16_0001), 1);
}
Example #24
SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1)
{
    return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2);
}
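Both Average16 overloads depend on constants defined elsewhere in the Simd library. A self-contained sketch of the two-argument version, assuming K8_01 holds byte weights of 1 (so pmaddubsw sums adjacent byte pairs) and K16_0002 is a rounding offset of 2:

#include <tmmintrin.h>

// Sketch with assumed constants: the average of each 2x2 block of bytes,
// (a + b + c + d + 2) >> 2, one result per 16-bit lane.
static inline __m128i Average16_sketch(__m128i s0, __m128i s1)
{
    const __m128i k8_01 = _mm_set1_epi8(1);
    const __m128i k16_0002 = _mm_set1_epi16(2);
    return _mm_srli_epi16(
        _mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, k8_01),
                                    _mm_maddubs_epi16(s1, k8_01)),
                      k16_0002), 2);
}

The one-argument version is the 1D analogue: (a + b + 1) >> 1 per byte pair, with K16_0001 presumably a rounding offset of 1.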
Example #25
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32(0x00400040);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) and duplicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and fourth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // read the next 16 bytes
    // (they partially overlap the earlier load)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1,
                                   _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink each 16-bit value to 8 bits; the low lane holds the first
    // eight output pixels and the high lane the second eight
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr += src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);

    output_ptr += output_pitch;
  }
}
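filt1_global through filt4_global are shuffle masks defined at file scope. The values below are what the arithmetic above implies, and they are consistent with the libvpx sources: for eight adjacent output pixels, filtN gathers the source byte pairs consumed by taps 2N-2 and 2N-1, each mask being the previous one shifted along by two bytes:

#include <stdint.h>

// Assumed mask values; 16-byte aligned because they are fetched with
// _mm_load_si128.
static _Alignas(16) const uint8_t filt1_global[16] = {
  0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
};
static _Alignas(16) const uint8_t filt2_global[16] = {
  2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
};
static _Alignas(16) const uint8_t filt3_global[16] = {
  4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12
};
static _Alignas(16) const uint8_t filt4_global[16] = {
  6, 7, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 13, 13, 14
};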
Example #26
void foo(__m128i *a, __m128i *b, __m128i c, __m128i d, __m128i e)
{
  *a = _mm_maddubs_epi16(c, d);
  *b = _mm_maddubs_epi16(c, e);
}
Example #27
__m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_maddubs_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
  return _mm_maddubs_epi16(a, b);
}
Example #28
/**
 * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in the lowest 32 bits of the register.
 * \param ref_main      Reference pixels
 * \param delta_pos     Fractional pixel precise position of sample displacement
 * \param x             Sample offset in direction x in ref_main array
 */
static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x) {
  int8_t delta_int = delta_pos >> 5;
  int8_t delta_fract = delta_pos & (32-1);
  __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int]));
  __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1]));
  __m128i pairs = _mm_unpacklo_epi8(sample0, sample1);
  __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) );
  sample0 = _mm_maddubs_epi16(pairs, weight);
  sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16));
  sample0 = _mm_srli_epi16(sample0, 5);
  sample0 = _mm_packus_epi16(sample0, sample0);

  return sample0;
}
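For reference, a scalar sketch of what filter_4x1_avx2 computes per pixel, assuming an 8-bit kvz_pixel: a linear blend of two neighboring reference samples with a 5-bit fraction and round-to-nearest:

#include <stdint.h>

// Scalar equivalent (sketch) for sample i of the 4-pixel group.
static inline uint8_t filter_1px_ref(const uint8_t *ref_main,
                                     int16_t delta_pos, int x, int i) {
  int delta_int = delta_pos >> 5;    // integer part of the displacement
  int delta_fract = delta_pos & 31;  // 5-bit fractional part
  int a = ref_main[x + delta_int + i];      // left neighbor
  int b = ref_main[x + delta_int + i + 1];  // right neighbor
  return (uint8_t)(((32 - delta_fract) * a + delta_fract * b + 16) >> 5);
}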

/**
 * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst.
 * \param dst           Destination buffer
 * \param ref_main      Reference pixels
 * \param sample_disp   Sample displacement per row
 * \param vertical_mode Mode direction, true if vertical
 */
static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode) {
Example #29
void aom_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32(0x00400040);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) and duplicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the eighth row (the last 8 bytes of the sliding window)
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);

    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
Example #30
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32(0x00400040);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16-bit (short) values to 8-bit (byte) and duplicate the same
  // data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the fourth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load rows 0 and 1
    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
    // load rows 6 and 7
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

    // load rows 2 and 3
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));

    // merge the result together
    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);

    // load rows 4 and 5
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));

    // merge the result together
    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16-bit value right by 7 bits
    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink each 16-bit value to 8 bits; the low lane holds the low half
    // of the row and the high lane the high half
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

    src_ptr += src_pitch;

    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);

    output_ptr += out_pitch;
  }
}
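A usage sketch for the routine above (the tap values here are illustrative, not taken from the vp9 filter tables; any 8-tap kernel summing to 128 should behave the same way). The source pointer must sit three rows above the first output row, and the destination must be 16-byte aligned because the kernel stores with _mm_store_si128:

#include <stdint.h>

// Hypothetical wrapper showing the calling convention.
static void example_vert_filter_16xH(unsigned char *src, unsigned int stride,
                                     unsigned char *dst,
                                     unsigned int dst_stride,
                                     unsigned int height) {
  int16_t taps[8] = { -1, 3, -10, 35, 114, -13, 1, -1 };  // sums to 128
  vp9_filter_block1d16_v8_intrin_ssse3(src - 3 * stride, stride,
                                       dst, dst_stride, height, taps);
}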