Example #1
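// Widen one 8-pixel row from each of two prediction buffers and one row of the
// original block to 16-bit lanes, then return both rows' per-pixel differences
// against the same original row packed into a single __m256i (buf1's row in the
// low 128-bit lane, buf2's row in the high lane).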
INLINE static __m256i diff_row_dual_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2, const kvz_pixel *orig)
{
  __m128i temp1 = _mm_loadl_epi64((__m128i*)buf1);
  __m128i temp2 = _mm_loadl_epi64((__m128i*)buf2);
  __m128i temp3 = _mm_loadl_epi64((__m128i*)orig);
  __m256i buf1_row = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(temp1, temp2));
  __m256i buf2_row = _mm256_cvtepu8_epi16(_mm_broadcastq_epi64(temp3));

  return _mm256_sub_epi16(buf1_row, buf2_row);
}
Example #2
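// Compute two 4x4 SATDs in one pass: preds[0] occupies the low 128-bit lane and
// preds[1] the high lane, both diffed against the same broadcast original block,
// followed by a horizontal and vertical Hadamard transform and an absolute-sum
// reduction per lane.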
static void satd_8bit_4x4_dual_avx2(
  const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out) 
{

  __m256i original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)orig)));
  __m256i pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[0]));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)preds[1])), 1);

  __m256i diff_lo = _mm256_sub_epi16(pred, original);

  original = _mm256_broadcastsi128_si256(_mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(orig + 8))));
  pred = _mm256_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[0] + 8)));
  pred = _mm256_inserti128_si256(pred, _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(preds[1] + 8))), 1);

  __m256i diff_hi = _mm256_sub_epi16(pred, original);

  //Hor
  __m256i row0 = _mm256_hadd_epi16(diff_lo, diff_hi);
  __m256i row1 = _mm256_hsub_epi16(diff_lo, diff_hi);

  __m256i row2 = _mm256_hadd_epi16(row0, row1);
  __m256i row3 = _mm256_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm256_hadd_epi16(row2, row3);
  row1 = _mm256_hsub_epi16(row2, row3);

  row2 = _mm256_hadd_epi16(row0, row1);
  row3 = _mm256_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm256_abs_epi16(row2);
  row3 = _mm256_abs_epi16(row3);

  row3 = _mm256_add_epi16(row2, row3);

  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm256_add_epi16(row3, _mm256_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm256_add_epi16(row3, _mm256_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum1 = _mm_extract_epi16(_mm256_castsi256_si128(row3), 0);
  sum1 = (sum1 + 1) >> 1;

  unsigned sum2 = _mm_extract_epi16(_mm256_extracti128_si256(row3, 1), 0);
  sum2 = (sum2 + 1) >> 1;

  satds_out[0] = sum1;
  satds_out[1] = sum2;
}
Example #3
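// Masked SAD for high-bit-depth blocks whose width is a multiple of 16: blend
// a_ptr and b_ptr with the per-pixel mask m and its complement (mask_max - m),
// round, and accumulate the sum of absolute differences against src_ptr.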
static INLINE unsigned int highbd_masked_sad16xh_avx2(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m256i res = _mm256_setzero_si256();
  const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m256i round_const =
      _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m256i one = _mm256_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m256i src = _mm256_lddqu_si256((const __m256i *)&src_ptr[x]);
      const __m256i a = _mm256_lddqu_si256((const __m256i *)&a_ptr[x]);
      const __m256i b = _mm256_lddqu_si256((const __m256i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m256i m =
          _mm256_cvtepu8_epi16(_mm_lddqu_si128((const __m128i *)&m_ptr[x]));
      const __m256i m_inv = _mm256_sub_epi16(mask_max, m);

      const __m256i data_l = _mm256_unpacklo_epi16(a, b);
      const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv);
      __m256i pred_l = _mm256_madd_epi16(data_l, mask_l);
      pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      const __m256i data_r = _mm256_unpackhi_epi16(a, b);
      const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv);
      __m256i pred_r = _mm256_madd_epi16(data_r, mask_r);
      pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const),
                                 AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m256i pred = _mm256_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // a 16-element SAD. We do this by keeping eight 32-bit partial SADs,
      // and accumulating them at the end
      const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src));
      res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have eight 32-bit partial SADs stored in 'res'.
  res = _mm256_hadd_epi32(res, res);
  res = _mm256_hadd_epi32(res, res);
  int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4);
  return (sad + 31) >> 6;
}
Example #4
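// GCC runtime test: zero-extend 16 bytes with _mm256_cvtepu8_epi16 and compare
// the result against the scalar reference compute_movzxbw(); abort() on mismatch.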
static void
avx2_test (void)
{
  union128i_b s;
  union256i_w res;
  short res_ref[16];

  s.x = _mm_set_epi8 (1, 2, 3, 4, 200, 50, 6, 8, 1, 2, 3, 4, 200, 5, 6, 8);

  res.x = _mm256_cvtepu8_epi16 (s.x);

  compute_movzxbw (s.a, res_ref);

  if (check_union256i_w (res, res_ref))
    abort ();
}
Example #5
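// Load 16 bytes (LoadHalf<align> picks the aligned or unaligned 128-bit load)
// and zero-extend them to sixteen 16-bit lanes.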
 template <bool align> SIMD_INLINE __m256i LoadUnpacked(const void * src)
 {
     return _mm256_cvtepu8_epi16(LoadHalf<align>((const __m128i*)src));
 }
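A minimal usage sketch (hypothetical, not taken from the Simd library) of the widening pattern above: zero-extend two rows of 8-bit pixels with LoadUnpacked before doing arithmetic that would overflow a byte. It assumes <immintrin.h>, <cstdint> and the SIMD_INLINE/LoadUnpacked/LoadHalf helpers shown in this example.

#include <immintrin.h>
#include <cstdint>

// Hypothetical helper: absolute difference of two rows of 16 pixels,
// computed in 16-bit lanes after zero-extension via LoadUnpacked (above).
SIMD_INLINE __m256i AbsDiffRow16(const uint8_t * a, const uint8_t * b)
{
    const __m256i wa = LoadUnpacked<false>(a);
    const __m256i wb = LoadUnpacked<false>(b);
    return _mm256_abs_epi16(_mm256_sub_epi16(wa, wb));
}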
Example #6
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
  // CHECK: @llvm.x86.avx2.pmovzxbw
  return _mm256_cvtepu8_epi16(a);
}
Example #7
__m256i test_mm256_cvtepu8_epi16(__m128i a) {
  // CHECK-LABEL: test_mm256_cvtepu8_epi16
  // CHECK: zext <16 x i8> %{{.*}} to <16 x i16>
  return _mm256_cvtepu8_epi16(a);
}
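These two snippets are Clang codegen tests for the same builtin, presumably from different compiler releases: the first matches the @llvm.x86.avx2.pmovzxbw intrinsic call, while the second checks that _mm256_cvtepu8_epi16 is lowered directly to a zext from <16 x i8> to <16 x i16> in LLVM IR.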
Example #8
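// Compound mask prediction for high-bit-depth pixels: blend pred and ref with
// an 8-bit mask widened to 16-bit lanes (via highbd_comp_mask_pred_line_avx2),
// with dedicated paths for block widths 8, 16 and 32; invert_mask swaps which
// source the mask weights.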
void aom_highbd_comp_mask_pred_avx2(uint8_t *comp_pred8, const uint8_t *pred8,
                                    int width, int height, const uint8_t *ref8,
                                    int ref_stride, const uint8_t *mask,
                                    int mask_stride, int invert_mask) {
  int i = 0;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  uint16_t *comp_pred = CONVERT_TO_SHORTPTR(comp_pred8);
  const uint16_t *src0 = invert_mask ? pred : ref;
  const uint16_t *src1 = invert_mask ? ref : pred;
  const int stride0 = invert_mask ? width : ref_stride;
  const int stride1 = invert_mask ? ref_stride : width;
  const __m256i zero = _mm256_setzero_si256();

  if (width == 8) {
    do {
      const __m256i s0 = mm256_loadu2_16(src0 + stride0, src0);
      const __m256i s1 = mm256_loadu2_16(src1 + stride1, src1);

      const __m128i m_l = _mm_loadl_epi64((const __m128i *)mask);
      const __m128i m_h = _mm_loadl_epi64((const __m128i *)(mask + 8));

      __m256i m = _mm256_castsi128_si256(m_l);
      m = _mm256_insertf128_si256(m, m_h, 1);
      const __m256i m_16 = _mm256_unpacklo_epi8(m, zero);

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);

      _mm_storeu_si128((__m128i *)(comp_pred), _mm256_castsi256_si128(comp));

      _mm_storeu_si128((__m128i *)(comp_pred + width),
                       _mm256_extractf128_si256(comp, 1));

      src0 += (stride0 << 1);
      src1 += (stride1 << 1);
      mask += (mask_stride << 1);
      comp_pred += (width << 1);
      i += 2;
    } while (i < height);
  } else if (width == 16) {
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)(src0));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)(src1));
      const __m256i m_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m_16);

      _mm256_storeu_si256((__m256i *)comp_pred, comp);

      src0 += stride0;
      src1 += stride1;
      mask += mask_stride;
      comp_pred += width;
      i += 1;
    } while (i < height);
  } else if (width == 32) {
    do {
      const __m256i s0 = _mm256_loadu_si256((const __m256i *)src0);
      const __m256i s2 = _mm256_loadu_si256((const __m256i *)(src0 + 16));
      const __m256i s1 = _mm256_loadu_si256((const __m256i *)src1);
      const __m256i s3 = _mm256_loadu_si256((const __m256i *)(src1 + 16));

      const __m256i m01_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)mask));
      const __m256i m23_16 =
          _mm256_cvtepu8_epi16(_mm_loadu_si128((const __m128i *)(mask + 16)));

      const __m256i comp = highbd_comp_mask_pred_line_avx2(s0, s1, m01_16);
      const __m256i comp1 = highbd_comp_mask_pred_line_avx2(s2, s3, m23_16);

      _mm256_storeu_si256((__m256i *)comp_pred, comp);
      _mm256_storeu_si256((__m256i *)(comp_pred + 16), comp1);

      src0 += stride0;
      src1 += stride1;
      mask += mask_stride;
      comp_pred += width;
      i += 1;
    } while (i < height);
  }
}
Example #9
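// Build the 8-bit DIFFWTD blend mask from the per-pixel difference of two
// 8-bit sources. calc_mask_avx2() works on 16-bit lanes, so each row is
// zero-extended first; rows are processed four at a time for w == 4 and w == 8,
// two at a time for w == 16, and in 32-pixel chunks per row otherwise.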
void av1_build_compound_diffwtd_mask_avx2(uint8_t *mask,
                                          DIFFWTD_MASK_TYPE mask_type,
                                          const uint8_t *src0, int stride0,
                                          const uint8_t *src1, int stride1,
                                          int h, int w) {
  const int mb = (mask_type == DIFFWTD_38_INV) ? AOM_BLEND_A64_MAX_ALPHA : 0;
  const __m256i y_mask_base = _mm256_set1_epi16(38 - mb);
  int i = 0;
  if (4 == w) {
    do {
      const __m128i s0A = xx_loadl_32(src0);
      const __m128i s0B = xx_loadl_32(src0 + stride0);
      const __m128i s0C = xx_loadl_32(src0 + stride0 * 2);
      const __m128i s0D = xx_loadl_32(src0 + stride0 * 3);
      const __m128i s0AB = _mm_unpacklo_epi32(s0A, s0B);
      const __m128i s0CD = _mm_unpacklo_epi32(s0C, s0D);
      const __m128i s0ABCD = _mm_unpacklo_epi64(s0AB, s0CD);
      const __m256i s0ABCD_w = _mm256_cvtepu8_epi16(s0ABCD);

      const __m128i s1A = xx_loadl_32(src1);
      const __m128i s1B = xx_loadl_32(src1 + stride1);
      const __m128i s1C = xx_loadl_32(src1 + stride1 * 2);
      const __m128i s1D = xx_loadl_32(src1 + stride1 * 3);
      const __m128i s1AB = _mm_unpacklo_epi32(s1A, s1B);
      const __m128i s1CD = _mm_unpacklo_epi32(s1C, s1D);
      const __m128i s1ABCD = _mm_unpacklo_epi64(s1AB, s1CD);
      const __m256i s1ABCD_w = _mm256_cvtepu8_epi16(s1ABCD);
      const __m256i m16 = calc_mask_avx2(y_mask_base, s0ABCD_w, s1ABCD_w);
      const __m256i m8 = _mm256_packus_epi16(m16, _mm256_setzero_si256());
      const __m128i x_m8 =
          _mm256_castsi256_si128(_mm256_permute4x64_epi64(m8, 0xd8));
      xx_storeu_128(mask, x_m8);
      src0 += (stride0 << 2);
      src1 += (stride1 << 2);
      mask += 16;
      i += 4;
    } while (i < h);
  } else if (8 == w) {
    do {
      const __m128i s0A = xx_loadl_64(src0);
      const __m128i s0B = xx_loadl_64(src0 + stride0);
      const __m128i s0C = xx_loadl_64(src0 + stride0 * 2);
      const __m128i s0D = xx_loadl_64(src0 + stride0 * 3);
      const __m256i s0AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0A, s0C));
      const __m256i s0BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s0B, s0D));
      const __m128i s1A = xx_loadl_64(src1);
      const __m128i s1B = xx_loadl_64(src1 + stride1);
      const __m128i s1C = xx_loadl_64(src1 + stride1 * 2);
      const __m128i s1D = xx_loadl_64(src1 + stride1 * 3);
      const __m256i s1AC_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1A, s1C));
      const __m256i s1BD_w = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64(s1B, s1D));
      const __m256i m16AC = calc_mask_avx2(y_mask_base, s0AC_w, s1AC_w);
      const __m256i m16BD = calc_mask_avx2(y_mask_base, s0BD_w, s1BD_w);
      const __m256i m8 = _mm256_packus_epi16(m16AC, m16BD);
      yy_storeu_256(mask, m8);
      src0 += stride0 << 2;
      src1 += stride1 << 2;
      mask += 32;
      i += 4;
    } while (i < h);
  } else if (16 == w) {
    do {
      const __m128i s0A = xx_load_128(src0);
      const __m128i s0B = xx_load_128(src0 + stride0);
      const __m128i s1A = xx_load_128(src1);
      const __m128i s1B = xx_load_128(src1 + stride1);
      const __m256i s0AL = _mm256_cvtepu8_epi16(s0A);
      const __m256i s0BL = _mm256_cvtepu8_epi16(s0B);
      const __m256i s1AL = _mm256_cvtepu8_epi16(s1A);
      const __m256i s1BL = _mm256_cvtepu8_epi16(s1B);

      const __m256i m16AL = calc_mask_avx2(y_mask_base, s0AL, s1AL);
      const __m256i m16BL = calc_mask_avx2(y_mask_base, s0BL, s1BL);

      const __m256i m8 =
          _mm256_permute4x64_epi64(_mm256_packus_epi16(m16AL, m16BL), 0xd8);
      yy_storeu_256(mask, m8);
      src0 += stride0 << 1;
      src1 += stride1 << 1;
      mask += 32;
      i += 2;
    } while (i < h);
  } else {
    do {
      int j = 0;
      do {
        const __m256i s0 = yy_loadu_256(src0 + j);
        const __m256i s1 = yy_loadu_256(src1 + j);
        const __m256i s0L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s0));
        const __m256i s1L = _mm256_cvtepu8_epi16(_mm256_castsi256_si128(s1));
        const __m256i s0H =
            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s0, 1));
        const __m256i s1H =
            _mm256_cvtepu8_epi16(_mm256_extracti128_si256(s1, 1));
        const __m256i m16L = calc_mask_avx2(y_mask_base, s0L, s1L);
        const __m256i m16H = calc_mask_avx2(y_mask_base, s0H, s1H);
        const __m256i m8 =
            _mm256_permute4x64_epi64(_mm256_packus_epi16(m16L, m16H), 0xd8);
        yy_storeu_256(mask + j, m8);
        j += 32;
      } while (j < w);
      src0 += stride0;
      src1 += stride1;
      mask += w;
      i += 1;
    } while (i < h);
  }
}