예제 #1
0
/* Exercises the unsigned-byte "greater than or equal" mask comparison
   intrinsics at 128-, 256- and 512-bit widths, in both their plain and
   their masked forms.  The operands (x128, x256, x512) and the result
   holders (m16, m32, m64) are objects declared outside this function.  */
extern void
avx512bw_test (void)
{
  /* Plain forms: each operand is compared against itself.  */
  m16 = _mm_cmpge_epu8_mask (x128, x128);
  m32 = _mm256_cmpge_epu8_mask (x256, x256);
  m64 = _mm512_cmpge_epu8_mask (x512, x512);

  /* Masked forms: the constant 3 is supplied as the leading mask argument.  */
  m16 = _mm_mask_cmpge_epu8_mask (3, x128, x128);
  m32 = _mm256_mask_cmpge_epu8_mask (3, x256, x256);
  m64 = _mm512_mask_cmpge_epu8_mask (3, x512, x512);
}
예제 #2
0
 // Builds one 64-byte vector of 8-bit codes and writes it to dst.
 //
 // The vector loaded from src is the reference. Eight further vectors are
 // loaded at fixed offsets from src (combinations of +/-1 and +/-stride), and
 // each is compared byte-wise, as unsigned values, against the reference.
 // Every comparison owns one bit of the output byte: the bit is set where the
 // offset byte is >= the reference byte.
 //
 //   bit 0x01: src - 1 - stride      bit 0x10: src + 1 + stride
 //   bit 0x02: src - stride          bit 0x20: src + stride
 //   bit 0x04: src + 1 - stride      bit 0x40: src - 1 + stride
 //   bit 0x08: src + 1               bit 0x80: src - 1
 //
 // `align`, `mask` and `tail` are forwarded to the Load/Store helpers, which
 // are defined elsewhere.
 // NOTE(review): presumably `tail` selects the active bytes when `mask` is
 // true, and the name suggests a local-binary-pattern code; confirm against
 // those helpers and the callers.
 template <bool align, bool mask> void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst, __mmask64 tail = -1)
 {
     const __m512i center = Load<false, mask>(src, tail);

     // The parentheses around each Load<...>(...) are deliberate: if the compare
     // intrinsic is implemented as a macro, the comma between the template
     // arguments would otherwise be taken as a macro-argument separator.
     const __mmask64 ge01 = _mm512_cmpge_epu8_mask((Load<align, mask>(src - 1 - stride, tail)), center);
     const __mmask64 ge02 = _mm512_cmpge_epu8_mask((Load<false, mask>(src - stride, tail)), center);
     const __mmask64 ge04 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + 1 - stride, tail)), center);
     const __mmask64 ge08 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + 1, tail)), center);
     const __mmask64 ge10 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + 1 + stride, tail)), center);
     const __mmask64 ge20 = _mm512_cmpge_epu8_mask((Load<false, mask>(src + stride, tail)), center);
     const __mmask64 ge40 = _mm512_cmpge_epu8_mask((Load<align, mask>(src - 1 + stride, tail)), center);
     const __mmask64 ge80 = _mm512_cmpge_epu8_mask((Load<align, mask>(src - 1, tail)), center);

     // Turn each comparison mask into its single-bit byte pattern and merge them.
     __m512i code = _mm512_maskz_set1_epi8(ge01, (char)0x01);
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge02, (char)0x02));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge04, (char)0x04));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge08, (char)0x08));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge10, (char)0x10));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge20, (char)0x20));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge40, (char)0x40));
     code = _mm512_or_si512(code, _mm512_maskz_set1_epi8(ge80, (char)0x80));

     Store<false, mask>(dst, code, tail);
 }
예제 #3
0
// Codegen test for _mm512_cmpge_epu8_mask: passes __a and __b straight to the
// intrinsic and returns the resulting 64-bit mask.  The FileCheck directives
// inside the body expect the 512-bit unsigned byte compare intrinsic to be
// emitted with immediate 5 and an all-ones (i64 -1) mask operand.
__mmask64 test_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
  // CHECK-LABEL: @test_mm512_cmpge_epu8_mask
  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i8 5, i64 -1)
  return (__mmask64)_mm512_cmpge_epu8_mask(__a, __b);
}