示例#1
0
void jpeg_zigzag_avx512bw_permute16(const uint8_t* in, uint8_t* out) {
    
    const __m512i v = _mm512_loadu_si512((const __m512i*)in);

    // crosslane 16-bit shuffle #1:
    //   [ 0, 1][ 2, 3][ 4, 5][ 8, 9][10,11][16,17][18,19][24,25] -- lane 0: missing: 32
    //   [ 6, 7][12,13][14,15][20,21][26,27][28,29][32,33][34,35] -- lane 1: missing: 19, 34, 40, 41, 48
    //   [22,23][30,31][36,37][42,43][44,45][48,49][50,51][56,57] -- lane 2: missing: 15, 29, 35
    //   [38,39][46,47][52,53][54,55][58,59][60,61][62,63][62,62] -- lane 3: missing: 31, 45
    const int16_t crosslane_shuffle1[32] __attribute__((aligned(64))) = {
         0,  1,  2,  4,  5,  8,  9, 12,
         3,  6,  7, 10, 13, 14, 16, 17,
        11, 15, 18, 21, 22, 24, 25, 28,
        19, 23, 26, 27, 29, 30, 31, 31
    };

    const __m512i shuf1 = _mm512_permutexvar_epi16(_mm512_load_si512((__m512i*)crosslane_shuffle1), v);

    // in-lane 8-bit shuffle #1
    const int8_t inlane_shuffle1[64] __attribute__((aligned(64))) = {
        /* lane 0 */  0,  1,  6, 10,  7,  2,  3,  8, 11, 14, -1, 15, 12,  9,  4,  5,
        /* lane 1 */  2, -1,  8, 13, -1, -1, -1, 14,  9,  6,  3,  0,  1,  4,  7, 10,
        /* lane 2 */ -1,  6, 11, 14, 15, 12,  7,  4, -1,  0, -1,  1,  2,  5,  8, 13,
        /* lane 3 */  8,  9,  4, -1,  0, -1,  1,  2,  5, 10, 11,  6,  3,  7, 12, 13
    };

    const __m512i t0 = _mm512_shuffle_epi8(shuf1, _mm512_load_si512(inlane_shuffle1));

    // crosslane 16-bit shuffle #2 -- we need to get values not available in step #1
    //   [32,33][32,33][32,33][32,33][32,33][32,33][32,33][32,33] -- lane 0: missing: 32
    //   [18,19][34,35][40,41][48,49][48,49][48,49][48,49][48,49] -- lane 1: missing: 19, 34, 40, 41, 48
    //   [14,15][28,29][34,35][34,35][34,35][34,35][34,35][34,35] -- lane 2: missing: 15, 29, 35
    //   [30,31][44,45][44,45][44,45][44,45][44,45][44,45][44,45] -- lane 2: missing: 31, 45
    const int16_t crosslane_shuffle2[32] __attribute__((aligned(64))) = {
        /* lane 0*/ 16, 16, 16, 16, 16, 16, 16, 16,
        /* lane 1*/  9, 17, 20, 24, 24, 24, 24, 24,
        /* lane 2*/  7, 14, 17, 17, 17, 17, 17, 17,
        /* lane 3*/ 15, 22, 22, 22, 22, 22, 22, 22
    };
    const __m512i shuf2 = _mm512_permutexvar_epi16(_mm512_load_si512((__m512i*)crosslane_shuffle2), v);

    // in-lane 8-bit shuffle #2
    const int8_t inlane_shuffle2[64] __attribute__((aligned(64))) = {
        /* lane 0 */ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,  0, -1, -1, -1, -1, -1,
        /* lane 1 */ -1,  1, -1, -1,  4,  6,  5,  2, -1, -1, -1, -1, -1, -1, -1, -1,
        /* lane 2 */  5, -1, -1, -1, -1, -1, -1, -1,  3, -1,  1, -1, -1, -1, -1, -1,
        /* lane 3 */ -1, -1, -1,  3, -1,  1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
    };

    const __m512i t1 = _mm512_shuffle_epi8(shuf2, _mm512_load_si512(inlane_shuffle2));

    const __m512i res = _mm512_or_si512(t0, t1);
    _mm512_storeu_si512((__m512i*)out, res);
}
示例#2
0
void extern
avx512bw_test (void)
{
  x1 = _mm512_permutexvar_epi16 (x1, x1);
  x1 = _mm512_maskz_permutexvar_epi16 (m1, x1, x1);
  x1 = _mm512_mask_permutexvar_epi16 (x1, m1, x1, x1);
  x2 = _mm256_permutexvar_epi16 (x2, x2);
  x2 = _mm256_maskz_permutexvar_epi16 (m2, x2, x2);
  x2 = _mm256_mask_permutexvar_epi16 (x2, m2, x2, x2);
  x3 = _mm_permutexvar_epi16 (x3, x3);
  x3 = _mm_maskz_permutexvar_epi16 (m3, x3, x3);
  x3 = _mm_mask_permutexvar_epi16 (x3, m3, x3, x3);
}