예제 #1
0
/**
 * Computes the 4x4 SATD of two predictions against one shared original block.
 *
 * The low 128-bit lane of every vector carries preds[0] and the high lane
 * carries preds[1]; the original samples are broadcast into both lanes, so
 * both results are produced by a single pass.
 *
 * preds      Prediction buffers; 8 bytes are loaded at preds[k] and 8 more at
 *            preds[k] + 8 for k = 0 and 1.
 * orig       Original samples; 8 bytes are loaded at orig and 8 more at
 *            orig + 8.
 * num_modes  Not read by this function.
 * satds_out  Receives two values: [0] for preds[0] and [1] for preds[1].
 *
 * NOTE(review): the byte loads assume kvz_pixel is a one-byte type, as the
 * "8bit" in the function name suggests - confirm against its definition.
 */
static void satd_8bit_4x4_dual_avx2(
  const pred_buffer preds, const kvz_pixel * const orig, unsigned num_modes, unsigned *satds_out)
{
  // Original samples, zero-extended to 16 bits and mirrored into both lanes.
  const __m256i orig_lo = _mm256_broadcastsi128_si256(
    _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)orig)));
  const __m256i orig_hi = _mm256_broadcastsi128_si256(
    _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(orig + 8))));

  // First 8 samples of each prediction: preds[0] in lane 0, preds[1] in lane 1.
  __m256i pred_lo = _mm256_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)preds[0]));
  pred_lo = _mm256_inserti128_si256(
    pred_lo, _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)preds[1])), 1);

  // Remaining 8 samples of each prediction, laid out the same way.
  __m256i pred_hi = _mm256_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(preds[0] + 8)));
  pred_hi = _mm256_inserti128_si256(
    pred_hi, _mm_cvtepu8_epi16(_mm_loadl_epi64((const __m128i *)(preds[1] + 8))), 1);

  // Residuals (prediction minus original) as signed 16-bit values.
  __m256i sums = _mm256_sub_epi16(pred_lo, orig_lo);
  __m256i diffs = _mm256_sub_epi16(pred_hi, orig_hi);

  // Four identical add/subtract stages: stages 0-1 are the horizontal pass,
  // stages 2-3 the vertical pass.
  for (int stage = 0; stage < 4; ++stage) {
    const __m256i added = _mm256_hadd_epi16(sums, diffs);
    const __m256i subtracted = _mm256_hsub_epi16(sums, diffs);
    sums = added;
    diffs = subtracted;
  }

  // Absolute values, then fold the 16-bit elements of each lane together.
  // Presumably KVZ_PERMUTE builds the shuffle immediates so that element 0 of
  // each lane ends up holding the lane total - verify against the macro.
  __m256i total = _mm256_add_epi16(_mm256_abs_epi16(sums), _mm256_abs_epi16(diffs));
  total = _mm256_add_epi16(total, _mm256_shuffle_epi32(total, KVZ_PERMUTE(2, 3, 0, 1)));
  total = _mm256_add_epi16(total, _mm256_shuffle_epi32(total, KVZ_PERMUTE(1, 0, 1, 0)));
  total = _mm256_add_epi16(total, _mm256_shufflelo_epi16(total, KVZ_PERMUTE(1, 0, 1, 0)));

  // Element 0 of each lane is taken as the result and halved with rounding.
  const unsigned raw_first = _mm_extract_epi16(_mm256_castsi256_si128(total), 0);
  const unsigned raw_second = _mm_extract_epi16(_mm256_extracti128_si256(total, 1), 0);

  satds_out[0] = (raw_first + 1) >> 1;
  satds_out[1] = (raw_second + 1) >> 1;
}
예제 #2
0
/*
 * Base64-encodes the leading whole 48-byte groups of `in` with AVX2 and hands
 * the remainder to _base64_encode_tail().
 *
 * out      Destination buffer. Each 48-byte group advances the output offset
 *          by 64, plus one '\n' when Base64InsertLineBreaks is set.
 * in       Source bytes; no alignment is required.
 * n        Number of bytes available at `in`.
 * options  Bit flags: Base64UseUrlAlphabet selects the URL alphabet table,
 *          Base64InsertLineBreaks appends '\n' after every 48-byte group.
 *
 * Returns whatever _base64_encode_tail() returns for the current output
 * offset; that helper is not visible here.
 */
char *_base64_encode_avx2(char *out, const unsigned char *in, size_t n, int options)
{
    size_t i;
    size_t o = 0;

    const char (*alphabet)[2] = _base64_alphabet_precombined;
    if (options & Base64UseUrlAlphabet)
        alphabet = _base64url_alphabet_precombined;

    // i never exceeds n here, so the unsigned subtraction cannot wrap.
    for (i = 0; n - i >= 48; i += 48) {
        // Read 48 bytes and duplicate each 16-byte chunk in the high part of
        // the register. `in` is an arbitrary byte pointer, so use explicit
        // unaligned loads; dereferencing it as a __m128i would require
        // 16-byte alignment.
        __m256i chunk1 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)&in[i + 0]));
        __m256i chunk2 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)&in[i + 16]));
        __m256i chunk3 = _mm256_broadcastsi128_si256(_mm_loadu_si128((const __m128i *)&in[i + 32]));

        // Each call below is presumed to consume the first 12 bytes of a lane
        // and to write 16 characters at out + o - confirm against
        // do_encode_12bytes().

        // first chunk of 12 bytes
        do_encode_12bytes(alphabet, out + o, chunk1);
        o += 16;

        // second chunk: 4 bytes left in chunk1, followed by 8 from chunk2
        do_encode_12bytes(alphabet, out + o, _mm256_alignr_epi8(chunk2, chunk1, 12));
        o += 16;

        // third chunk: 8 bytes left in chunk2, followed by 4 from chunk3
        do_encode_12bytes(alphabet, out + o, _mm256_alignr_epi8(chunk3, chunk2, 8));
        o += 16;

        // fourth chunk: 12 final bytes in chunk3
        do_encode_12bytes(alphabet, out + o, _mm256_srli_si256(chunk3, 4));
        o += 16;

        if (options & Base64InsertLineBreaks)
            out[o++] = '\n';
    }

    return _base64_encode_tail(out, o, in, n, options);
}
예제 #3
0
// Compiler test wrapper: forwards its argument straight to
// _mm256_broadcastsi128_si256 and returns the result.
// The directive comments inside the body look like LLVM FileCheck patterns
// (presumably matched against the emitted IR); they expect a shufflevector of
// <2 x i64> operands with the index mask <0, 1, 0, 1>. Keep them verbatim.
__m256i test_mm256_broadcastsi128_si256(__m128i a) {
  // CHECK-LABEL: test_mm256_broadcastsi128_si256
  // CHECK: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
  return _mm256_broadcastsi128_si256(a);
}
예제 #4
0
// Compiler test wrapper: forwards its argument straight to
// _mm256_broadcastsi128_si256 and returns the result.
// The directive comment inside the body looks like an LLVM FileCheck pattern
// (presumably matched against the emitted IR); it expects a reference to
// @llvm.x86.avx2.vbroadcasti128. Keep it verbatim.
__m256i test_mm256_broadcastsi128_si256(__m128i a) {
  // CHECK: @llvm.x86.avx2.vbroadcasti128
  return _mm256_broadcastsi128_si256(a);
}