예제 #1
0
void jpeg_zigzag_avx512bw_permute16(const uint8_t* in, uint8_t* out) {
    
    const __m512i v = _mm512_loadu_si512((const __m512i*)in);

    // crosslane 16-bit shuffle #1:
    //   [ 0, 1][ 2, 3][ 4, 5][ 8, 9][10,11][16,17][18,19][24,25] -- lane 0: missing: 32
    //   [ 6, 7][12,13][14,15][20,21][26,27][28,29][32,33][34,35] -- lane 1: missing: 19, 34, 40, 41, 48
    //   [22,23][30,31][36,37][42,43][44,45][48,49][50,51][56,57] -- lane 2: missing: 15, 29, 35
    //   [38,39][46,47][52,53][54,55][58,59][60,61][62,63][62,62] -- lane 3: missing: 31, 45
    const int16_t crosslane_shuffle1[32] __attribute__((aligned(64))) = {
         0,  1,  2,  4,  5,  8,  9, 12,
         3,  6,  7, 10, 13, 14, 16, 17,
        11, 15, 18, 21, 22, 24, 25, 28,
        19, 23, 26, 27, 29, 30, 31, 31
    };

    const __m512i shuf1 = _mm512_permutexvar_epi16(_mm512_load_si512((__m512i*)crosslane_shuffle1), v);

    // in-lane 8-bit shuffle #1
    const int8_t inlane_shuffle1[64] __attribute__((aligned(64))) = {
        /* lane 0 */  0,  1,  6, 10,  7,  2,  3,  8, 11, 14, -1, 15, 12,  9,  4,  5,
        /* lane 1 */  2, -1,  8, 13, -1, -1, -1, 14,  9,  6,  3,  0,  1,  4,  7, 10,
        /* lane 2 */ -1,  6, 11, 14, 15, 12,  7,  4, -1,  0, -1,  1,  2,  5,  8, 13,
        /* lane 3 */  8,  9,  4, -1,  0, -1,  1,  2,  5, 10, 11,  6,  3,  7, 12, 13
    };

    const __m512i t0 = _mm512_shuffle_epi8(shuf1, _mm512_load_si512(inlane_shuffle1));

    // crosslane 16-bit shuffle #2 -- we need to get values not available in step #1
    //   [32,33][32,33][32,33][32,33][32,33][32,33][32,33][32,33] -- lane 0: missing: 32
    //   [18,19][34,35][40,41][48,49][48,49][48,49][48,49][48,49] -- lane 1: missing: 19, 34, 40, 41, 48
    //   [14,15][28,29][34,35][34,35][34,35][34,35][34,35][34,35] -- lane 2: missing: 15, 29, 35
    //   [30,31][44,45][44,45][44,45][44,45][44,45][44,45][44,45] -- lane 2: missing: 31, 45
    const int16_t crosslane_shuffle2[32] __attribute__((aligned(64))) = {
        /* lane 0*/ 16, 16, 16, 16, 16, 16, 16, 16,
        /* lane 1*/  9, 17, 20, 24, 24, 24, 24, 24,
        /* lane 2*/  7, 14, 17, 17, 17, 17, 17, 17,
        /* lane 3*/ 15, 22, 22, 22, 22, 22, 22, 22
    };
    const __m512i shuf2 = _mm512_permutexvar_epi16(_mm512_load_si512((__m512i*)crosslane_shuffle2), v);

    // in-lane 8-bit shuffle #2
    const int8_t inlane_shuffle2[64] __attribute__((aligned(64))) = {
        /* lane 0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1,
        /* lane 1 */ -1,  1, -1, -1,  4,  6,  5,  2, -1, -1, -1, -1, -1, -1, -1, -1,
        /* lane 2 */  5, -1, -1, -1, -1, -1, -1, -1,  3, -1,  1, -1, -1, -1, -1, -1,
        /* lane 3 */ -1, -1, -1,  3, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    const __m512i t1 = _mm512_shuffle_epi8(shuf2, _mm512_load_si512(inlane_shuffle2));

    const __m512i res = _mm512_or_si512(t0, t1);
    _mm512_storeu_si512((__m512i*)out, res);
}
std::uint64_t popcnt_AVX512BW_lookup_original(const uint8_t* data, const size_t n) {

    size_t i = 0;

    const __m512i lookup = _mm512_setr_epi64(
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu,
        0x0302020102010100llu, 0x0403030203020201llu
    );

    const __m512i low_mask = _mm512_set1_epi8(0x0f);

    __m512i acc = _mm512_setzero_si512();

    while (i + 64 < n) {

        __m512i local = _mm512_setzero_si512(); 

        for (int k=0; k < 255/8 && i + 64 < n; k++, i += 64) {
            const __m512i vec = _mm512_loadu_si512(reinterpret_cast<const __m512i*>(data + i));
            const __m512i lo  = _mm512_and_si512(vec, low_mask);
            const __m512i hi  = _mm512_and_si512(_mm512_srli_epi32(vec, 4), low_mask);

            const __m512i popcnt1 = _mm512_shuffle_epi8(lookup, lo);
            const __m512i popcnt2 = _mm512_shuffle_epi8(lookup, hi);

            local = _mm512_add_epi8(local, popcnt1);
            local = _mm512_add_epi8(local, popcnt2);
        }

        acc = _mm512_add_epi64(acc, _mm512_sad_epu8(local, _mm512_setzero_si512()));
    }


    uint64_t result = custom::_mm512_hsum_epi64(acc);

    for (/**/; i < n; i++) {
        result += lookup8bit[data[i]];
    }

    return result;
}
예제 #3
0
__m512i test_mm512_shuffle_epi8(__m512i __A, __m512i __B) {
  // CHECK-LABEL: @test_mm512_shuffle_epi8
  // CHECK: @llvm.x86.avx512.mask.pshuf.b.512
  return _mm512_shuffle_epi8(__A,__B); 
}