int64_t vp9_block_error_avx2(const int16_t *coeff,
                             const int16_t *dqcoeff,
                             intptr_t block_size,
                             int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registers to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0; i < block_size; i += 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }
  // save the upper 64 bits of each 128-bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the upper 64 bits to the lower 64 bits
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);

  // add the 64-bit sums from the two 128-bit lanes of the 256-bit register
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));

  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));

  // store the results
  _mm_storel_epi64((__m128i *)&sse, sse_reg128);
  _mm_storel_epi64((__m128i *)ssz, ssz_reg128);
  return sse;
}
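The reductions above rely on _mm256_srli_si256 shifting each 128-bit lane independently: the byte shift folds the upper 64 bits of each lane onto the lower 64 bits, and a separate cast/extract pair then combines the two lanes. A minimal, self-contained sketch of that pattern follows; it is illustrative only and not taken from any of the libraries quoted here.

#include <immintrin.h>
#include <stdio.h>

int main(void) {
  const __m256i v = _mm256_setr_epi64x(1, 2, 3, 4);
  // Shift each 128-bit lane right by 8 bytes: lane 0 becomes {2, 0},
  // lane 1 becomes {4, 0}. The shift never crosses the lane boundary.
  const __m256i hi = _mm256_srli_si256(v, 8);
  const __m256i lane_sums = _mm256_add_epi64(v, hi);
  // Combine the two lanes: cast for the low lane, extract for the high lane.
  const __m128i total = _mm_add_epi64(_mm256_castsi256_si128(lane_sums),
                                      _mm256_extracti128_si256(lane_sums, 1));
  long long out[2];
  _mm_storeu_si128((__m128i *)out, total);
  printf("%lld\n", out[0]);  // prints 10 (= 1 + 2 + 3 + 4)
  return 0;
}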
// Fold each 128-bit lane of the accumulator down to the bitwise OR of its
// four 32-bit elements and return the larger of the two lane results; the
// most significant set bit of the return value is the highest bit set
// anywhere in the accumulator.
static uint32_t maxbitas32int(const __m256i accumulator) {
  // OR the upper 8 bytes of each lane onto the lower 8 bytes...
  const __m256i _tmp1 =
      _mm256_or_si256(_mm256_srli_si256(accumulator, 8), accumulator);
  // ...then fold once more so element 0 of each lane holds the lane-wide OR.
  const __m256i _tmp2 = _mm256_or_si256(_mm256_srli_si256(_tmp1, 4), _tmp1);
  uint32_t ans1 = _mm256_extract_epi32(_tmp2, 0);  // OR of the low lane
  uint32_t ans2 = _mm256_extract_epi32(_tmp2, 4);  // OR of the high lane
  uint32_t ans = ans1 > ans2 ? ans1 : ans2;
  return ans;
}
Example #3
static unsigned int sad_w64_avg_avx2(const uint8_t *src_ptr, int src_stride,
                                     const uint8_t *ref_ptr, int ref_stride,
                                     const int h, const uint8_t *second_pred,
                                     const int second_pred_stride) {
  int i, res;
  __m256i sad1_reg, sad2_reg, ref1_reg, ref2_reg;
  __m256i sum_sad = _mm256_setzero_si256();
  __m256i sum_sad_h;
  __m128i sum_sad128;
  for (i = 0; i < h; i++) {
    ref1_reg = _mm256_loadu_si256((__m256i const *)ref_ptr);
    ref2_reg = _mm256_loadu_si256((__m256i const *)(ref_ptr + 32));
    ref1_reg = _mm256_avg_epu8(
        ref1_reg, _mm256_loadu_si256((__m256i const *)second_pred));
    ref2_reg = _mm256_avg_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(second_pred + 32)));
    sad1_reg =
        _mm256_sad_epu8(ref1_reg, _mm256_loadu_si256((__m256i const *)src_ptr));
    sad2_reg = _mm256_sad_epu8(
        ref2_reg, _mm256_loadu_si256((__m256i const *)(src_ptr + 32)));
    sum_sad = _mm256_add_epi32(sum_sad, _mm256_add_epi32(sad1_reg, sad2_reg));
    ref_ptr += ref_stride;
    src_ptr += src_stride;
    second_pred += second_pred_stride;
  }
  // Horizontal reduction: fold the upper 64 bits of each 128-bit lane onto
  // the lower 64 bits, then add the two lanes together.
  sum_sad_h = _mm256_srli_si256(sum_sad, 8);
  sum_sad = _mm256_add_epi32(sum_sad, sum_sad_h);
  sum_sad128 = _mm256_extracti128_si256(sum_sad, 1);
  sum_sad128 = _mm_add_epi32(_mm256_castsi256_si128(sum_sad), sum_sad128);
  res = _mm_cvtsi128_si32(sum_sad128);

  return res;
}
Example #4
SIMD_INLINE __m256i BgraToGray32(__m256i bgra)
{
    const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF);
    const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF);
    const __m256i weightedSum = _mm256_add_epi32(_mm256_madd_epi16(g0a0, K16_GREEN_0000),
                                                 _mm256_madd_epi16(b0r0, K16_BLUE_RED));
    return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM),
                             Base::BGR_TO_GRAY_AVERAGING_SHIFT);
}
Example #5
void
test8bit (void)
{
    l1 = _mm256_mpsadbw_epu8 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_alignr_epi8 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    i1 = _mm_blend_epi32 (i1, i1, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_blend_epi32 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_blend_epi16(l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_permute2x128_si256 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    e1 = _mm256_permute4x64_pd (e2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_permute4x64_epi64 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shuffle_epi32 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shufflehi_epi16 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shufflelo_epi16 (l2, 256);  /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_slli_si256 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_srli_si256 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
}
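The diagnostics above are emitted because 256 does not fit in an 8-bit immediate; with an in-range compile-time constant the same intrinsic compiles cleanly. A minimal sketch (the wrapper name is made up for illustration):

#include <immintrin.h>

// Valid use: the count is a compile-time constant in the 0..255 range.
// Each 128-bit lane of v is shifted right by 3 bytes and zero-filled.
__m256i shift_right_3_bytes(__m256i v) {
  return _mm256_srli_si256(v, 3);
}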
Example #6
int vpx_highbd_satd_avx2(const tran_low_t *coeff, int length) {
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 8, coeff += 8) {
    const __m256i src_line = _mm256_loadu_si256((const __m256i *)coeff);
    const __m256i abs = _mm256_abs_epi32(src_line);
    accum = _mm256_add_epi32(accum, abs);
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
Example #7
static unsigned int sad32x32(const uint8_t *src_ptr, int src_stride,
                             const uint8_t *ref_ptr, int ref_stride) {
  __m256i s1, s2, r1, r2;
  __m256i sum = _mm256_setzero_si256();
  __m128i sum_i128;
  int i;

  for (i = 0; i < 16; ++i) {
    r1 = _mm256_loadu_si256((__m256i const *)ref_ptr);
    r2 = _mm256_loadu_si256((__m256i const *)(ref_ptr + ref_stride));
    s1 = _mm256_sad_epu8(r1, _mm256_loadu_si256((__m256i const *)src_ptr));
    s2 = _mm256_sad_epu8(
        r2, _mm256_loadu_si256((__m256i const *)(src_ptr + src_stride)));
    sum = _mm256_add_epi32(sum, _mm256_add_epi32(s1, s2));
    ref_ptr += ref_stride << 1;
    src_ptr += src_stride << 1;
  }

  sum = _mm256_add_epi32(sum, _mm256_srli_si256(sum, 8));
  sum_i128 = _mm_add_epi32(_mm256_extracti128_si256(sum, 1),
                           _mm256_castsi256_si128(sum));
  return _mm_cvtsi128_si32(sum_i128);
}
Example #8
int vpx_satd_avx2(const tran_low_t *coeff, int length) {
  const __m256i one = _mm256_set1_epi16(1);
  __m256i accum = _mm256_setzero_si256();
  int i;

  for (i = 0; i < length; i += 16) {
    const __m256i src_line = load_tran_low(coeff);
    const __m256i abs = _mm256_abs_epi16(src_line);
    const __m256i sum = _mm256_madd_epi16(abs, one);
    accum = _mm256_add_epi32(accum, sum);
    coeff += 16;
  }

  {  // 32 bit horizontal add
    const __m256i a = _mm256_srli_si256(accum, 8);
    const __m256i b = _mm256_add_epi32(accum, a);
    const __m256i c = _mm256_srli_epi64(b, 32);
    const __m256i d = _mm256_add_epi32(b, c);
    const __m128i accum_128 = _mm_add_epi32(_mm256_castsi256_si128(d),
                                            _mm256_extractf128_si256(d, 1));
    return _mm_cvtsi128_si32(accum_128);
  }
}
Example #9
char *_base64_encode_avx2(char *out, const unsigned char *in, size_t n, int options)
{
    size_t i;
    size_t o = 0;

    const char (*alphabet)[2] = _base64_alphabet_precombined;
    if (options & Base64UseUrlAlphabet)
        alphabet = _base64url_alphabet_precombined;

    for (i = 0; n - i >= 48; i += 48) {
        // read 48 bytes and duplicate each 16-byte chunk in the high part of the register
        __m256i chunk1 = _mm256_broadcastsi128_si256(* (const __m128i *)&in[i+0]);
        __m256i chunk2 = _mm256_broadcastsi128_si256(* (const __m128i *)&in[i+16]);
        __m256i chunk3 = _mm256_broadcastsi128_si256(* (const __m128i *)&in[i+32]);

        // first chunk of 12 bytes
        do_encode_12bytes(alphabet, out + o, chunk1);
        o += 16;

        // second chunk: 4 bytes left in chunk1
        do_encode_12bytes(alphabet, out + o, _mm256_alignr_epi8(chunk2, chunk1, 12));
        o += 16;

        // third chunk: 8 bytes left in chunk2
        do_encode_12bytes(alphabet, out + o, _mm256_alignr_epi8(chunk3, chunk2, 8));
        o += 16;

        // fourth chunk: 12 final bytes in chunk3
        do_encode_12bytes(alphabet, out + o, _mm256_srli_si256(chunk3, 4));
        o += 16;

        if (options & Base64InsertLineBreaks)
            out[o++] = '\n';
    }

    return _base64_encode_tail(out, o, in, n, options);
}
Example #10
__m256i test_mm256_srli_si256(__m256i a) {
  // CHECK: @llvm.x86.avx2.psrl.dq
  return _mm256_srli_si256(a, 3);
}
Example #11
__m256i test_mm256_srli_si256(__m256i a) {
  // CHECK-LABEL: test_mm256_srli_si256
  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50>
  return _mm256_srli_si256(a, 3);
}
Example #12
void extern
avx2_test (void)
{
  x = _mm256_srli_si256 (x, 13);
}