예제 #1
0
// Builds a 512-bit vector in which every one of the 64 byte lanes holds `b`.
//
// Parameters:
//   b - the byte value to replicate.
//
// Returns the vector produced by _mm512_set1_epi8 for that byte.
__m512i broadcast_avx512f__version_2(uint8_t b) {

    // Original author's note on the expected lowering (actual code
    // generation depends on the compiler and the enabled target features):
    //   vpbroadcastb   byte, %ymm0
    //   vinserti64x4   $0x1, %ymm0, %zmm0, %zmm0
    const __m512i replicated = _mm512_set1_epi8(static_cast<char>(b));

    return replicated;
}
// Substring search that filters candidate positions 64 at a time.
//
// For every 64-byte chunk of `string`, byte lanes equal to needle[0] are
// found first; among those, lanes whose byte at offset k - 1 equals
// needle[k - 1] are kept. Each surviving candidate is then confirmed by
// calling memeq_fun(candidate + 1, needle + 1).
//
// Parameters:
//   string    - start of the text to search.
//   n         - number of bytes in `string`; asserted to be > 0.
//   needle    - pattern to look for.
//   memeq_fun - callable invoked with (text pointer, needle pointer); a true
//               result is treated as a match. No length is passed to it here.
//
// Returns the offset from `string` of the first confirmed candidate, or
// size_t(-1) when no candidate is confirmed.
//
// NOTE(review): `k` and `MEMCMP` are not declared inside this block;
// presumably `k` is the needle length supplied by an enclosing template
// header - confirm against the full file.
// NOTE(review): both loads below always fetch 64 bytes and candidate
// positions are not clipped to `n`, so memory past `string + n` may be read
// and an offset whose match would extend beyond `n` may be returned.
// Presumably the caller guarantees readable padding and validates the
// result - verify.
size_t avx512bw_strstr_v3_memcmp(const char* string, size_t n, const char* needle, MEMCMP memeq_fun) {

    assert(n > 0);
    assert(k > 0);

    // First and last needle byte, replicated into all 64 lanes.
    const __m512i head_pattern = _mm512_set1_epi8(needle[0]);
    const __m512i tail_pattern = _mm512_set1_epi8(needle[k - 1]);

    const char* const stop = string + n;

    for (const char* chunk = string; chunk < stop; chunk += 64) {

        // Bit i is set when chunk[i] equals the first needle byte.
        const __m512i head_bytes  = _mm512_loadu_si512(chunk + 0);
        const __mmask64 head_hits = _mm512_cmpeq_epi8_mask(head_bytes, head_pattern);

        if (head_hits != 0) {

            // Only lanes that already matched the first byte are compared
            // against the last needle byte (masked compare).
            const __m512i tail_bytes = _mm512_loadu_si512(chunk + k - 1);
            uint64_t candidates = _mm512_mask_cmpeq_epi8_mask(head_hits, tail_bytes, tail_pattern);

            // Visit the surviving candidates one bit at a time.
            for (/**/; candidates != 0; candidates = bits::clear_leftmost_set(candidates)) {

                const uint64_t offset = bits::get_first_bit_set(candidates);

                if (memeq_fun(chunk + offset + 1, needle + 1)) {
                    return (chunk - string) + offset;
                }
            }
        }
    }

    return size_t(-1);
}
std::uint64_t popcnt_AVX512BW_lookup_original(const uint8_t* data, const size_t n) {

    size_t i = 0;

    const __m512i lookup = _mm512_setr_epi64(
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu
    );

    const __m512i low_mask = _mm512_set1_epi8(0x0f);

    __m512i acc = _mm512_setzero_si512();

    while (i + 64 < n) {

        __m512i local = _mm512_setzero_si512(); 

        for (int k=0; k < 255/8 && i + 64 < n; k++, i += 64) {
            const __m512i vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(data + i));
            const __m512i lo  = _mm512_and_si512(vec, low_mask);
            const __m512i hi  = _mm512_and_si512(_mm512_srli_epi32(vec, 4), low_mask);

            const __m512i popcnt1 = _mm512_shuffle_epi8(lookup, lo);
            const __m512i popcnt2 = _mm512_shuffle_epi8(lookup, hi);

            local = _mm512_add_epi8(local, popcnt1);
            local = _mm512_add_epi8(local, popcnt2);
        }

        acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512()));
    }


    uint64_t result = custom::_mm512_hsum_epi64(acc);

    for (/**/; i < n; i++) {
        result += lookup8bit[data[i]];
    }

    return result;
}
예제 #4
0
/* Calls the byte-broadcast intrinsics (_broadcastb_epi8, which takes a
   vector source, and _set1_epi8, which takes a scalar source) and stores
   each result into x, y or z.

   The operands x, y, z, mx, my, mz and w are declared outside this
   function.  Judging by the intrinsics they are passed to, x/y/z are
   presumably 512-/256-/128-bit integer vectors, mx/my/mz the matching
   write masks and w a byte value -- confirm against their declarations.

   NOTE(review): this looks like a compile-only check that every variant
   is accepted and emitted; keep one statement per intrinsic.  */
void extern
avx512bw_test (void)
{
  /* Broadcast from vector operand z: unmasked 512-bit form, then the
     _mask_ and _maskz_ forms at 512, 256 and 128 bits.  The unmasked
     256-/128-bit forms are not called here.  */
  x = _mm512_broadcastb_epi8 (z);
  x = _mm512_mask_broadcastb_epi8 (x, mx, z);
  x = _mm512_maskz_broadcastb_epi8 (mx, z);
  y = _mm256_mask_broadcastb_epi8 (y, my, z);
  y = _mm256_maskz_broadcastb_epi8 (my, z);
  z = _mm_mask_broadcastb_epi8 (z, mz, z);
  z = _mm_maskz_broadcastb_epi8 (mz, z);

  /* Same set of variants, broadcasting from operand w instead.  */
  x = _mm512_set1_epi8 (w);
  x = _mm512_mask_set1_epi8 (x, mx, w);
  x = _mm512_maskz_set1_epi8 (mx, w);
  y = _mm256_mask_set1_epi8 (y, my, w);
  y = _mm256_maskz_set1_epi8 (my, w);
  z = _mm_mask_set1_epi8 (z, mz, w);
  z = _mm_maskz_set1_epi8 (mz, w);
}