コード例 #1
0
static inline int16_t _mm256_hmax_epi16_rpl(__m256i a) {
    a = _mm256_max_epi16(a, _mm256_permute2x128_si256(a, a, _MM_SHUFFLE(0,0,0,0)));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 8));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 4));
    a = _mm256_max_epi16(a, _mm256_slli_si256(a, 2));
    return _mm256_extract_epi16_rpl(a, 15);
}
コード例 #2
0
        template <bool align> void Bgr48pToBgra32(const uint8_t * blue, size_t blueStride, size_t width, size_t height,
            const uint8_t * green, size_t greenStride, const uint8_t * red, size_t redStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
        {
            assert(width >= HA);
            if(align)
            {
                assert(Aligned(blue) && Aligned(blueStride));
                assert(Aligned(green) && Aligned(greenStride));
                assert(Aligned(red) && Aligned(redStride));
                assert(Aligned(bgra) && Aligned(bgraStride));
            }

            __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi16(alpha), 1);
            size_t alignedWidth = AlignLo(width, HA);
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0, srcOffset = 0, dstOffset = 0; col < alignedWidth; col += HA, srcOffset += A, dstOffset += DA)
                    Bgr48pToBgra32<align>(bgra + dstOffset, blue, green, red, srcOffset, _alpha);
                if(width != alignedWidth)
                    Bgr48pToBgra32<false>(bgra + (width - HA)*4, blue, green, red, (width - HA)*2, _alpha);
                blue += blueStride;
                green += greenStride;
                red += redStride;
                bgra += bgraStride;
            }
        }
コード例 #3
0
        template <bool align> SIMD_INLINE void Bgr48pToBgra32(uint8_t * bgra, 
            const uint8_t * blue, const uint8_t * green, const uint8_t * red, size_t offset, __m256i alpha)
        {
            __m256i _blue = _mm256_and_si256(LoadPermuted<align>((__m256i*)(blue + offset)), K16_00FF);
            __m256i _green = _mm256_and_si256(LoadPermuted<align>((__m256i*)(green + offset)), K16_00FF);
            __m256i _red = _mm256_and_si256(LoadPermuted<align>((__m256i*)(red + offset)), K16_00FF);

            __m256i bg = _mm256_or_si256(_blue, _mm256_slli_si256(_green, 1));
            __m256i ra = _mm256_or_si256(_red, alpha);

            Store<align>((__m256i*)bgra + 0, _mm256_unpacklo_epi16(bg, ra));
            Store<align>((__m256i*)bgra + 1, _mm256_unpackhi_epi16(bg, ra));
        }
コード例 #4
0
ファイル: testimm-9.c プロジェクト: pjump/gcc
void
test8bit (void)
{
    l1 = _mm256_mpsadbw_epu8 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_alignr_epi8 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    i1 = _mm_blend_epi32 (i1, i1, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_blend_epi32 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_blend_epi16(l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_permute2x128_si256 (l2, l3, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    e1 = _mm256_permute4x64_pd (e2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_permute4x64_epi64 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shuffle_epi32 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shufflehi_epi16 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_shufflelo_epi16 (l2, 256);  /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_slli_si256 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
    l1 = _mm256_srli_si256 (l2, 256); /* { dg-error "the last argument must be an 8-bit immediate" } */
}
コード例 #5
0
        template <bool align> void BgrToBgra(const uint8_t * bgr, size_t width, size_t height, size_t bgrStride, uint8_t * bgra, size_t bgraStride, uint8_t alpha)
        {
            assert(width >= A);
            if(align)
                assert(Aligned(bgra) && Aligned(bgraStride) && Aligned(bgr) && Aligned(bgrStride));

            size_t alignedWidth = AlignLo(width, A);

            __m256i _alpha = _mm256_slli_si256(_mm256_set1_epi32(alpha), 3);

            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += A)
                    BgrToBgra<align>(bgr + 3*col, bgra + 4*col, _alpha);
                if(width != alignedWidth)
                    BgrToBgra<false>(bgr + 3*(width - A), bgra + 4*(width - A), _alpha);
                bgr += bgrStride;
                bgra += bgraStride;
            }
        }
コード例 #6
0
ファイル: avx2-vpslldq-2.c プロジェクト: 0day-ci/gcc
void static
avx2_test (void)
{
  union256i_b s1, res;
  char res_ref[32];
  int i, j;
  int fail = 0;

  for (i = 0; i < 10; i++)
    {
      for (j = 0; j < 32; j++)
	s1.a[j] = j * i;

      res.x = _mm256_slli_si256 (s1.x, N);

      compute_pslldq256 (s1.a, res_ref);

      fail += check_union256i_b (res, res_ref);
    }

  if (fail != 0)
    abort ();
}
コード例 #7
0
ファイル: avx2-builtins.c プロジェクト: mgranberry/clang
__m256i test_mm256_slli_si256(__m256i a) {
  // CHECK: @llvm.x86.avx2.psll.dq
  return _mm256_slli_si256(a, 3);
}
コード例 #8
0
__m256i test_mm256_slli_si256(__m256i a) {
  // CHECK-LABEL: test_mm256_slli_si256
  // CHECK: shufflevector <32 x i8> %{{.*}}, <32 x i8> %{{.*}}, <32 x i32> <i32 13, i32 14, i32 15, i32 32, i32 33, i32 34, i32 35, i32 36, i32 37, i32 38, i32 39, i32 40, i32 41, i32 42, i32 43, i32 44, i32 29, i32 30, i32 31, i32 48, i32 49, i32 50, i32 51, i32 52, i32 53, i32 54, i32 55, i32 56, i32 57, i32 58, i32 59, i32 60>
  return _mm256_slli_si256(a, 3);
}
コード例 #9
0
ファイル: sad4d_avx2.c プロジェクト: jfiguinha/Regards
void aom_sad64x64x4d_avx2(const uint8_t *src, int src_stride,
                          const uint8_t *const ref[4], int ref_stride,
                          uint32_t res[4]) {
  __m256i src_reg, srcnext_reg, ref0_reg, ref0next_reg;
  __m256i ref1_reg, ref1next_reg, ref2_reg, ref2next_reg;
  __m256i ref3_reg, ref3next_reg;
  __m256i sum_ref0, sum_ref1, sum_ref2, sum_ref3;
  __m256i sum_mlow, sum_mhigh;
  int i;
  const uint8_t *ref0, *ref1, *ref2, *ref3;

  ref0 = ref[0];
  ref1 = ref[1];
  ref2 = ref[2];
  ref3 = ref[3];
  sum_ref0 = _mm256_set1_epi16(0);
  sum_ref1 = _mm256_set1_epi16(0);
  sum_ref2 = _mm256_set1_epi16(0);
  sum_ref3 = _mm256_set1_epi16(0);
  for (i = 0; i < 64; i++) {
    // load 64 bytes from src and all refs
    src_reg = _mm256_loadu_si256((const __m256i *)src);
    srcnext_reg = _mm256_loadu_si256((const __m256i *)(src + 32));
    ref0_reg = _mm256_loadu_si256((const __m256i *)ref0);
    ref0next_reg = _mm256_loadu_si256((const __m256i *)(ref0 + 32));
    ref1_reg = _mm256_loadu_si256((const __m256i *)ref1);
    ref1next_reg = _mm256_loadu_si256((const __m256i *)(ref1 + 32));
    ref2_reg = _mm256_loadu_si256((const __m256i *)ref2);
    ref2next_reg = _mm256_loadu_si256((const __m256i *)(ref2 + 32));
    ref3_reg = _mm256_loadu_si256((const __m256i *)ref3);
    ref3next_reg = _mm256_loadu_si256((const __m256i *)(ref3 + 32));
    // sum of the absolute differences between every ref-i to src
    ref0_reg = _mm256_sad_epu8(ref0_reg, src_reg);
    ref1_reg = _mm256_sad_epu8(ref1_reg, src_reg);
    ref2_reg = _mm256_sad_epu8(ref2_reg, src_reg);
    ref3_reg = _mm256_sad_epu8(ref3_reg, src_reg);
    ref0next_reg = _mm256_sad_epu8(ref0next_reg, srcnext_reg);
    ref1next_reg = _mm256_sad_epu8(ref1next_reg, srcnext_reg);
    ref2next_reg = _mm256_sad_epu8(ref2next_reg, srcnext_reg);
    ref3next_reg = _mm256_sad_epu8(ref3next_reg, srcnext_reg);

    // sum every ref-i
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3_reg);
    sum_ref0 = _mm256_add_epi32(sum_ref0, ref0next_reg);
    sum_ref1 = _mm256_add_epi32(sum_ref1, ref1next_reg);
    sum_ref2 = _mm256_add_epi32(sum_ref2, ref2next_reg);
    sum_ref3 = _mm256_add_epi32(sum_ref3, ref3next_reg);
    src += src_stride;
    ref0 += ref_stride;
    ref1 += ref_stride;
    ref2 += ref_stride;
    ref3 += ref_stride;
  }
  {
    __m128i sum;

    // in sum_ref-i the result is saved in the first 4 bytes
    // the other 4 bytes are zeroed.
    // sum_ref1 and sum_ref3 are shifted left by 4 bytes
    sum_ref1 = _mm256_slli_si256(sum_ref1, 4);
    sum_ref3 = _mm256_slli_si256(sum_ref3, 4);

    // merge sum_ref0 and sum_ref1 also sum_ref2 and sum_ref3
    sum_ref0 = _mm256_or_si256(sum_ref0, sum_ref1);
    sum_ref2 = _mm256_or_si256(sum_ref2, sum_ref3);

    // merge every 64 bit from each sum_ref-i
    sum_mlow = _mm256_unpacklo_epi64(sum_ref0, sum_ref2);
    sum_mhigh = _mm256_unpackhi_epi64(sum_ref0, sum_ref2);

    // add the low 64 bit to the high 64 bit
    sum_mlow = _mm256_add_epi32(sum_mlow, sum_mhigh);

    // add the low 128 bit to the high 128 bit
    sum = _mm_add_epi32(_mm256_castsi256_si128(sum_mlow),
                        _mm256_extractf128_si256(sum_mlow, 1));

    _mm_storeu_si128((__m128i *)(res), sum);
  }
  _mm256_zeroupper();
}