/*
 * Reorder a block of 64 16-bit values using two AVX-512BW two-source word
 * permutes.
 *
 * in   Points to 64 contiguous uint16_t values; read with unaligned loads.
 * out  Receives 64 uint16_t values; written with unaligned stores.
 *
 * The resulting order is determined entirely by the index table
 * zigzag_shuffle, which is defined outside this function; 64 16-bit index
 * entries are read from it.
 * NOTE(review): presumably this is the JPEG zigzag scan order - confirm
 * against the table definition.
 */
void jpeg_zigzag_avx512bw(const uint16_t* in, uint16_t* out) {
    /* The two halves of the index table, one vector per 32 output words. */
    const __m512i idx_lo = _mm512_loadu_si512((const __m512i *)&zigzag_shuffle[0]);
    const __m512i idx_hi = _mm512_loadu_si512((const __m512i *)&zigzag_shuffle[32]);

    /* The two halves of the input block: words 0..31 and words 32..63. */
    const __m512i src_lo = _mm512_loadu_si512((const __m512i *)&in[0]);
    const __m512i src_hi = _mm512_loadu_si512((const __m512i *)&in[32]);

    /* Each output word is picked from src_lo or src_hi by its index entry. */
    const __m512i dst_lo = _mm512_permutex2var_epi16(src_lo, idx_lo, src_hi);
    const __m512i dst_hi = _mm512_permutex2var_epi16(src_lo, idx_hi, src_hi);

    /* Every load above has completed before the first store is issued. */
    _mm512_storeu_si512((__m512i *)&out[0], dst_lo);
    _mm512_storeu_si512((__m512i *)&out[32], dst_hi);
}
/*
 * Invokes the plain, merge-masked (_mask_) and zero-masked (_maskz_) forms
 * of the two-source 16-bit permute intrinsic at 512-, 256- and 128-bit
 * vector widths, storing every result back into the data operand.
 *
 * All operands (x1/x2/x3 data, x/y/z indices, m1/m2/m3 masks) are objects
 * declared outside this function.
 * NOTE(review): this looks like a compile-only intrinsic coverage test, so
 * the order and number of statements are kept exactly as they were - confirm
 * before reordering or merging anything here.
 */
void extern avx512bw_test (void)
{
  /* 512-bit vectors: data x3, indices z, mask m3. */
  x3 = _mm512_permutex2var_epi16 (x3, z, x3);
  x3 = _mm512_mask_permutex2var_epi16 (x3, m3, z, x3);
  x3 = _mm512_maskz_permutex2var_epi16 (m3, x3, z, x3);

  /* 256-bit vectors: data x2, indices y, mask m2. */
  x2 = _mm256_permutex2var_epi16 (x2, y, x2);
  x2 = _mm256_mask_permutex2var_epi16 (x2, m2, y, x2);
  x2 = _mm256_maskz_permutex2var_epi16 (m2, x2, y, x2);

  /* 128-bit vectors: data x1, indices x, mask m1. */
  x1 = _mm_permutex2var_epi16 (x1, x, x1);
  x1 = _mm_mask_permutex2var_epi16 (x1, m1, x, x1);
  x1 = _mm_maskz_permutex2var_epi16 (m1, x1, x, x1);
}
/*
 * Thin wrapper that forwards its three vector arguments, in order, to
 * _mm512_permutex2var_epi16 and returns the result.
 *
 * The two line comments inside the body are FileCheck directives. Each one
 * must sit on its own line: a line comment runs to the end of the line, so
 * anything placed after a directive on the same line (a second directive,
 * the return statement, the closing brace) would become part of the comment
 * and of the directive's pattern instead of being compiled.
 */
__m512i test_mm512_permutex2var_epi16(__m512i __A, __m512i __I, __m512i __B) {
  // CHECK-LABEL: @test_mm512_permutex2var_epi16
  // CHECK: @llvm.x86.avx512.mask.vpermt2var.hi.512
  return _mm512_permutex2var_epi16(__A,__I,__B);
}