// Reduces a 512-bit integer vector to a single 64-bit value by splitting it
// into its lower (index 0) and upper (index 1) 256-bit halves, reducing each
// half with _mm256_hsum_epi64, and adding the two partial results.
//
// NOTE(review): _mm256_hsum_epi64 is defined elsewhere; judging by its name it
// presumably sums the 64-bit lanes of its argument - confirm against its
// definition.
std::uint64_t _mm512_hsum_epi64(__m512i v)
{
    return _mm256_hsum_epi64(_mm512_extracti64x4_epi64(v, 0))
         + _mm256_hsum_epi64(_mm512_extracti64x4_epi64(v, 1));
}
static __m256i popcount(const __m512i v) { const __m256i lo = _mm512_extracti64x4_epi64(v, 0); const __m256i hi = _mm512_extracti64x4_epi64(v, 1); const __m256i s = _mm256_add_epi8(avx2_popcount(lo), avx2_popcount(hi)); return _mm256_sad_epu8(s, _mm256_setzero_si256()); }
/*
 * Applies maxbitas32int to the lower and to the upper 256-bit half of
 * `accumulator`, keeps the larger of the two results, and returns that value
 * passed through bits().
 *
 * The function performs no output; it only computes and returns a value.
 *
 * NOTE(review): maxbitas32int and bits are defined elsewhere; presumably the
 * former reduces a 256-bit accumulator to one 32-bit value and the latter
 * returns the number of bits needed to represent it - confirm against their
 * definitions.
 */
static uint32_t avx512maxbitas32int(const __m512i accumulator)
{
    /* The cast yields the low 256 bits; the extract with index 1 yields the high 256 bits. */
    const uint32_t low_half = maxbitas32int(_mm512_castsi512_si256(accumulator));
    const uint32_t high_half = maxbitas32int(_mm512_extracti64x4_epi64(accumulator, 1));
    const uint32_t larger = low_half > high_half ? low_half : high_half;

    return bits(larger);
}
/*
 * Exercises the three forms of the upper-half 256-bit integer extract
 * (_mm512_extracti64x4_epi64) on the variables `x` (source) and `y`
 * (destination), both declared outside this function.
 * Every call selects half 1 of `x`; the masked forms use mask value 2.
 */
extern void
avx512f_test (void)
{
  /* Plain form: no write mask.  */
  y = _mm512_extracti64x4_epi64 (x, 1);

  /* Merge-masking form: `y` is also the pass-through operand.  */
  y = _mm512_mask_extracti64x4_epi64 (y, 2, x, 1);

  /* Zero-masking form.  */
  y = _mm512_maskz_extracti64x4_epi64 (2, x, 1);
}
/*
 * Negative test for the 256-bit extract/insert intrinsics: every call passes
 * 256 as its last (immediate) argument, and every statement carries a
 * dg-error annotation naming the diagnostic expected for it
 * ("the last argument must be a 1-bit immediate").
 *
 * Covers the plain, mask and maskz forms of:
 *   _mm512_extractf64x4_pd, _mm512_extracti64x4_epi64,
 *   _mm512_insertf64x4,     _mm512_inserti64x4.
 *
 * The operands (m256d, m256i, m512d, m512i, mmask8) are declared outside
 * this function.  Each annotation is kept on the same line as the statement
 * it belongs to.
 */
void test1bit (void) {
  /* 256-bit extracts, double-precision floating point.  */
  m256d = _mm512_extractf64x4_pd (m512d, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m256d = _mm512_mask_extractf64x4_pd (m256d, mmask8, m512d, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m256d = _mm512_maskz_extractf64x4_pd (mmask8, m512d, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  /* 256-bit extracts, integer.  */
  m256i = _mm512_extracti64x4_epi64 (m512i, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m256i = _mm512_mask_extracti64x4_epi64 (m256i, mmask8, m512i, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m256i = _mm512_maskz_extracti64x4_epi64 (mmask8, m512i, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  /* 256-bit inserts, double-precision floating point.  */
  m512d = _mm512_insertf64x4 (m512d, m256d, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m512d = _mm512_mask_insertf64x4 (m512d, mmask8, m512d, m256d, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m512d = _mm512_maskz_insertf64x4 (mmask8, m512d, m256d, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  /* 256-bit inserts, integer.  */
  m512i = _mm512_inserti64x4 (m512i, m256i, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m512i = _mm512_mask_inserti64x4 (m512i, mmask8, m512i, m256i, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
  m512i = _mm512_maskz_inserti64x4 (mmask8, m512i, m256i, 256); /* { dg-error "the last argument must be a 1-bit immediate" } */
}
/*
 * Mandelbrot-set benchmark over a square grid.
 *
 * Usage: prog [side]
 *   side  - width and height of the grid in pixels; must be a positive
 *           decimal integer. Defaults to 32000 when omitted.
 *
 * For every pixel the escape iteration (at most 50 steps, escape radius 2.0)
 * is evaluated in scalar double precision. The results are packed eight
 * pixels per byte, most significant bit first, and the packed bytes are
 * accumulated into a checksum rather than written out: only the "P4" header
 * line is printed, not the bitmap itself.
 *
 * When USEAVX512 is defined, one 512-bit integer operation is additionally
 * executed per row (a multiply when USEHEAVYAVX512 is defined, otherwise an
 * add), and one 32-bit element of its final value is printed at the end.
 *
 * Returns the checksum converted to int, so the process exit status is
 * derived from the computed data and is not a success/failure indicator.
 * An invalid `side` argument prints a usage message to stderr and returns
 * EXIT_FAILURE.
 */
int main(int argc, char **argv)
{
    long w, h;
    int bit_num = 0;
    /* Plain char: its signedness is implementation-defined, so the checksum
     * can differ between platforms. */
    char byte_acc = 0;
    long byte_total = 0;
    int i, iter = 50;
    double x, y, limit = 2.0;
    double Zr, Zi, Cr, Ci, Tr, Ti;

    w = h = 32000;
    if (argc > 1) {
        /* strtol instead of atoi: trailing garbage, empty input and
         * non-positive sizes are rejected rather than silently turned into
         * an empty run. Values beyond the range of long saturate at
         * strtol's limit. */
        char *end;
        long side = strtol(argv[1], &end, 10);

        if (end == argv[1] || *end != '\0' || side <= 0) {
            fprintf(stderr, "usage: %s [side]  (side must be a positive integer)\n",
                    argv[0]);
            return EXIT_FAILURE;
        }
        w = h = side;
    }

    printf("P4\n%ld %ld\n", w, h);

#ifdef USEAVX512
    __m512i a = _mm512_set1_epi32(0);
    __m512i b = _mm512_set1_epi32(1);
    __m512i t;
#endif

    for (y = 0; y < h; ++y) {
#ifdef USEAVX512
        /* Fibonacci-style recurrence on (a, b): one 512-bit operation per row. */
        t = a;
        a = b;
#ifdef USEHEAVYAVX512
        b = _mm512_mul_epi32(b, t);
#else
        b = _mm512_add_epi32(b, t);
#endif
#endif

        for (x = 0; x < w; ++x) {
            Zr = Zi = Tr = Ti = 0.0;
            Cr = (2.0 * x / w - 1.5);
            Ci = (2.0 * y / h - 1.0);

            /* Tr and Ti cache Zr^2 and Zi^2 between iterations. */
            for (i = 0; i < iter && (Tr + Ti <= limit * limit); ++i) {
                Zi = 2.0 * Zr * Zi + Ci;
                Zr = Tr - Ti + Cr;
                Tr = Zr * Zr;
                Ti = Zi * Zi;
            }

            /* Shift in one bit per pixel: 1 if the point did not escape. */
            byte_acc <<= 1;
            if (Tr + Ti <= limit * limit)
                byte_acc |= 0x01;

            ++bit_num;

            if (bit_num == 8) {
                /* Full byte: fold it into the checksum (output is disabled). */
                byte_total += byte_acc;
                byte_acc = 0;
                bit_num = 0;
            } else if (x == w - 1) {
                /* Row ended mid-byte: left-align the partial byte before
                 * folding it in, so each row starts on a fresh byte. */
                byte_acc <<= (8 - w % 8);
                byte_total += byte_acc;
                byte_acc = 0;
                bit_num = 0;
            }
        }
    }

#ifdef USEAVX512
    /* Print the top 32-bit element of the upper 256-bit half of b. */
    printf("we used avx512 %d \n",
           _mm256_extract_epi32(_mm512_extracti64x4_epi64(b, 1), 7));
#else
    printf("we did not use avx512\n");
#endif

    return byte_total;
}