/* Compile-only coverage for the AVX-512BW unsigned byte >= comparison
   intrinsics: the plain and the write-masked variants at each vector
   width (128/256/512 bits).  The m* result masks and x* vector operands
   are globals declared elsewhere in the test harness.  */
void extern
avx512bw_test (void)
{
  /* Unmasked compares at each width.  */
  m16 = _mm_cmpge_epu8_mask (x128, x128);
  m32 = _mm256_cmpge_epu8_mask (x256, x256);
  m64 = _mm512_cmpge_epu8_mask (x512, x512);

  /* Write-masked compares: only result bits selected by mask 3 survive.  */
  m16 = _mm_mask_cmpge_epu8_mask (3, x128, x128);
  m32 = _mm256_mask_cmpge_epu8_mask (3, x256, x256);
  m64 = _mm512_mask_cmpge_epu8_mask (3, x512, x512);
}
// Computes the 8-neighbor Local Binary Pattern (LBP) code for 64 consecutive
// pixels at once: each neighbor pixel that is >= the center pixel sets one
// bit of the result byte (bit 0 = top-left, proceeding clockwise through
// top, top-right, right, bottom-right, bottom, bottom-left, to bit 7 = left).
//
// src    - pointer to the center pixel of the first of 64 pixels.
// stride - row stride of the image in bytes.
// dst    - destination for the 64 LBP code bytes.
// align  - when true, src and the same-column rows (src +- stride) are
//          treated as 64-byte aligned.  The +-1 neighbor columns are off by
//          one byte and therefore ALWAYS loaded unaligned, regardless of
//          `align`.
// mask   - when true, `tail` restricts which lanes are loaded/stored (used
//          for the final, partial block of a row); default -1 = all lanes.
//
// FIX: the <align> template argument was previously applied to the
// `src - 1 - stride`, `src - 1 + stride` and `src - 1` loads.  Those
// addresses are misaligned by one byte whenever src itself is aligned, so
// an aligned load there faults (UB) for align == true.  Aligned loads are
// only valid for src and the same-column rows src +- stride.
template <bool align, bool mask> void LbpEstimate(const uint8_t * src, ptrdiff_t stride, uint8_t * dst, __mmask64 tail = -1)
{
    // Center pixels: the threshold every neighbor is compared against.
    __m512i threshold = Load<align, mask>(src, tail);
    __m512i lbp = _mm512_setzero_si512();
    // For each neighbor: compare >= threshold into a 64-bit mask, expand the
    // mask to bytes carrying this neighbor's bit value, and OR into the code.
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<false, mask>(src - 1 - stride, tail)), threshold), (char)0x01)); // top-left
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<align, mask>(src - stride, tail)), threshold), (char)0x02)); // top (same column => may be aligned)
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<false, mask>(src + 1 - stride, tail)), threshold), (char)0x04)); // top-right
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<false, mask>(src + 1, tail)), threshold), (char)0x08)); // right
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<false, mask>(src + 1 + stride, tail)), threshold), (char)0x10)); // bottom-right
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<align, mask>(src + stride, tail)), threshold), (char)0x20)); // bottom (same column => may be aligned)
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<false, mask>(src - 1 + stride, tail)), threshold), (char)0x40)); // bottom-left
    lbp = _mm512_or_si512(lbp, _mm512_maskz_set1_epi8(_mm512_cmpge_epu8_mask((Load<false, mask>(src - 1, tail)), threshold), (char)0x80)); // left
    Store<false, mask>(dst, lbp, tail);
}
// FileCheck codegen test: _mm512_cmpge_epu8_mask must lower to the AVX-512BW
// masked unsigned byte compare intrinsic with comparison predicate 5
// (_MM_CMPINT_NLT, i.e. "not less than" == >= for unsigned operands) and an
// all-ones (-1) write mask.  The CHECK lines below are matched by FileCheck
// against the generated IR — do not edit their text.
__mmask64 test_mm512_cmpge_epu8_mask(__m512i __a, __m512i __b) {
  // CHECK-LABEL: @test_mm512_cmpge_epu8_mask
  // CHECK: @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> {{.*}}, <64 x i8> {{.*}}, i8 5, i64 -1)
  return (__mmask64)_mm512_cmpge_epu8_mask(__a, __b);
}