/* SHIFTAMOUNT is assumed to be a compile-time constant defined elsewhere in
 * the original source; the shiftamount parameter is kept for the scalar API
 * but is not used by this vectorized version. */
size_t vectorshift_unrolled(uint32_t *array, size_t length, int shiftamount) {
    size_t k = 0;
    __m256i *a = (__m256i *) array;
    /* Main loop: four 256-bit blocks (32 values) per iteration. */
    for (; k + 3 < length / 8; k += 4, a += 4) {
        __m256i v1 = _mm256_loadu_si256(a);
        __m256i v2 = _mm256_loadu_si256(a + 1);
        __m256i v3 = _mm256_loadu_si256(a + 2);
        __m256i v4 = _mm256_loadu_si256(a + 3);
        v1 = _mm256_srli_epi32(v1, SHIFTAMOUNT);
        v2 = _mm256_srli_epi32(v2, SHIFTAMOUNT);
        v3 = _mm256_srli_epi32(v3, SHIFTAMOUNT);
        v4 = _mm256_srli_epi32(v4, SHIFTAMOUNT);
        _mm256_storeu_si256(a, v1);
        _mm256_storeu_si256(a + 1, v2);
        _mm256_storeu_si256(a + 2, v3);
        _mm256_storeu_si256(a + 3, v4);
    }
    /* Remaining full blocks, one 256-bit block at a time. */
    for (; k < length / 8; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srli_epi32(v, SHIFTAMOUNT);
        _mm256_storeu_si256(a, v);
    }
    /* Scalar tail for the last length % 8 values. */
    k *= 8;
    for (; k < length; ++k) {
        array[k] = array[k] >> SHIFTAMOUNT;
    }
    return 0;
}
static inline void
blend_unorm8_argb(struct reg *src, __m256i dst_argb)
{
    if (gt.blend.enable) {
        const __m256i mask = _mm256_set1_epi32(0xff);
        const __m256 scale = _mm256_set1_ps(1.0f / 255.0f);
        struct reg dst[4];

        /* Convert to float */
        dst[2].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[1].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[0].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);
        dst_argb = _mm256_srli_epi32(dst_argb, 8);
        dst[3].reg = _mm256_mul_ps(_mm256_cvtepi32_ps(_mm256_and_si256(dst_argb, mask)), scale);

        /* Blend, assuming src BLENDFACTOR_SRC_ALPHA, dst
         * BLENDFACTOR_INV_SRC_ALPHA, and BLENDFUNCTION_ADD. */
        const __m256 inv_alpha = _mm256_sub_ps(_mm256_set1_ps(1.0f), src[3].reg);
        src[0].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[0].reg),
                                   _mm256_mul_ps(inv_alpha, dst[0].reg));
        src[1].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[1].reg),
                                   _mm256_mul_ps(inv_alpha, dst[1].reg));
        src[2].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[2].reg),
                                   _mm256_mul_ps(inv_alpha, dst[2].reg));
        src[3].reg = _mm256_add_ps(_mm256_mul_ps(src[3].reg, src[3].reg),
                                   _mm256_mul_ps(inv_alpha, dst[3].reg));
    }
}
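For reference, a minimal scalar sketch of the same unpack-and-blend for one pixel, assuming channel indices 0 = red, 1 = green, 2 = blue, 3 = alpha (to match the low-to-high byte order the vector code unpacks); blend_one_pixel and its plain float[4] interface are illustrative, not part of the original code.

#include <stdint.h>

/* Hypothetical scalar reference: src holds the float channels of one source
 * pixel, dst_argb one packed ARGB pixel (A in the top byte, B in the low byte). */
static void blend_one_pixel(float src[4], uint32_t dst_argb)
{
    float dst[4];
    dst[2] = (float)( dst_argb        & 0xff) / 255.0f;  /* blue  */
    dst[1] = (float)((dst_argb >>  8) & 0xff) / 255.0f;  /* green */
    dst[0] = (float)((dst_argb >> 16) & 0xff) / 255.0f;  /* red   */
    dst[3] = (float)((dst_argb >> 24) & 0xff) / 255.0f;  /* alpha */

    /* src * SRC_ALPHA + dst * INV_SRC_ALPHA, added per channel. */
    const float inv_alpha = 1.0f - src[3];
    for (int i = 0; i < 4; i++)
        src[i] = src[3] * src[i] + inv_alpha * dst[i];
}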
SIMD_INLINE __m256i BgraToGray32(__m256i bgra)
{
    const __m256i g0a0 = _mm256_and_si256(_mm256_srli_si256(bgra, 1), K16_00FF);
    const __m256i b0r0 = _mm256_and_si256(bgra, K16_00FF);
    const __m256i weightedSum = _mm256_add_epi32(
        _mm256_madd_epi16(g0a0, K16_GREEN_0000),
        _mm256_madd_epi16(b0r0, K16_BLUE_RED));
    return _mm256_srli_epi32(_mm256_add_epi32(weightedSum, K32_ROUND_TERM),
                             Base::BGR_TO_GRAY_AVERAGING_SHIFT);
}
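The same fixed-point weighted average written out per pixel may clarify what the madd/add/shift sequence computes. The real weights, rounding term and shift are the Simd library constants named above (K16_BLUE_RED, K16_GREEN_0000, K32_ROUND_TERM, Base::BGR_TO_GRAY_AVERAGING_SHIFT); the parameters below are placeholders, and the rounding term is taken to be half the divisor.

#include <stdint.h>

/* Placeholder weights; the real values come from the library constants. */
static uint32_t BgraToGray32Scalar(uint8_t b, uint8_t g, uint8_t r,
                                   uint32_t blue_w, uint32_t green_w,
                                   uint32_t red_w, int shift)
{
    const uint32_t round_term = 1u << (shift - 1);   /* rounds to nearest */
    return (blue_w * b + green_w * g + red_w * r + round_term) >> shift;
}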
__m256i branchfree_search8_avx(int *source, size_t n, __m256i target) {
    __m256i offsets = _mm256_setzero_si256();
    if (n == 0) return offsets;
    __m256i ha = _mm256_set1_epi32(n >> 1);
    while (n > 1) {
        n -= n >> 1;
        __m256i offsetsplushalf = _mm256_add_epi32(offsets, ha);
        ha = _mm256_sub_epi32(ha, _mm256_srli_epi32(ha, 1));
        __m256i keys = _mm256_i32gather_epi32(source, offsetsplushalf, 4);
        __m256i lt = _mm256_cmpgt_epi32(target, keys);
        offsets = _mm256_blendv_epi8(offsets, offsetsplushalf, lt);
    }
    __m256i lastkeys = _mm256_i32gather_epi32(source, offsets, 4);
    __m256i lastlt = _mm256_cmpgt_epi32(target, lastkeys);
    __m256i oneswhereneeded = _mm256_srli_epi32(lastlt, 31);
    __m256i answer = _mm256_add_epi32(offsets, oneswhereneeded);
    return answer;
}
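Each of the eight lanes runs the same branch-free binary search over a sorted array. A scalar transcription of the per-lane arithmetic (branchfree_search_scalar is an illustrative name) may make the gather/compare/blend sequence easier to follow; the final srli by 31 simply turns the all-ones compare mask into a 0/1 increment.

#include <stddef.h>

static size_t branchfree_search_scalar(const int *source, size_t n, int target)
{
    if (n == 0)
        return 0;
    size_t offset = 0;
    size_t half = n >> 1;                 /* mirrors ha */
    while (n > 1) {
        n -= n >> 1;
        size_t probe = offset + half;     /* offsetsplushalf */
        half -= half >> 1;
        if (target > source[probe])       /* cmpgt + blendv */
            offset = probe;
    }
    /* final gather and compare: step past the last key if it is still smaller */
    return offset + (target > source[offset] ? 1 : 0);
}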
static inline __m256i
dec_reshuffle (__m256i in)
{
    // Shuffle bytes to 32-bit bigendian:
    in = _mm256_bswap_epi32(in);

    // Mask in a single byte per shift:
    __m256i mask = _mm256_set1_epi32(0x3F000000);

    // Pack bytes together:
    __m256i out = _mm256_slli_epi32(_mm256_and_si256(in, mask), 2);
    mask = _mm256_srli_epi32(mask, 8);

    out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 4));
    mask = _mm256_srli_epi32(mask, 8);

    out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 6));
    mask = _mm256_srli_epi32(mask, 8);

    out = _mm256_or_si256(out, _mm256_slli_epi32(_mm256_and_si256(in, mask), 8));

    // Pack bytes together within 32-bit words, discarding words 3 and 7:
    out = _mm256_shuffle_epi8(out, _mm256_setr_epi8(
         3,  2,  1,  7,  6,  5, 11, 10,  9, 15, 14, 13, -1, -1, -1, -1,
         3,  2,  1,  7,  6,  5, 11, 10,  9, 15, 14, 13, -1, -1, -1, -1));

    // Pack 32-bit words together, squashing empty words 3 and 7:
    return _mm256_permutevar8x32_epi32(out, _mm256_setr_epi32(
        0, 1, 2, 4, 5, 6, -1, -1));
}
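Stripped of the byte-order and lane shuffling, the shift/mask cascade above implements the usual base64 repacking of four decoded 6-bit values into three output bytes. A scalar sketch of that repacking for one group (dec_pack4 is an illustrative name):

#include <stdint.h>

static void dec_pack4(const uint8_t v[4], uint8_t out[3])
{
    /* v[0] is the most significant 6-bit group. */
    uint32_t triple = ((uint32_t)v[0] << 18) | ((uint32_t)v[1] << 12)
                    | ((uint32_t)v[2] <<  6) |  (uint32_t)v[3];
    out[0] = (uint8_t)(triple >> 16);
    out[1] = (uint8_t)(triple >>  8);
    out[2] = (uint8_t) triple;
}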
size_t vectorshift(uint32_t *array, size_t length, int shiftamount) {
    size_t k = 0;
    __m256i *a = (__m256i *) array;
    for (; k < length / 8; k++, a++) {
        __m256i v = _mm256_loadu_si256(a);
        v = _mm256_srli_epi32(v, SHIFTAMOUNT);
        _mm256_storeu_si256(a, v);
    }
    k *= 8;
    for (; k < length; ++k) {
        array[k] = array[k] >> SHIFTAMOUNT;
    }
    return 0;
}
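A minimal usage sketch for the two shift routines, assuming SHIFTAMOUNT is a compile-time constant defined before they are compiled (as it appears to be in the original benchmark); the driver below is illustrative only.

#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define SHIFTAMOUNT 3
/* ... vectorshift / vectorshift_unrolled from above ... */

int main(void)
{
    uint32_t data[100];
    for (size_t i = 0; i < 100; i++)
        data[i] = (uint32_t)(i * 1000);
    vectorshift(data, 100, SHIFTAMOUNT);   /* 12 vector blocks + 4 scalar tail */
    printf("%u\n", data[99]);              /* prints 12375, i.e. 99000 >> 3 */
    return 0;
}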
void static
avx2_test (void)
{
  union256i_d s1, res;
  int res_ref[8];
  int i, j;
  int fail = 0;

  for (i = 0; i < 10; i++)
    {
      for (j = 0; j < 8; j++)
        s1.a[j] = j * i;

      res.x = _mm256_srli_epi32 (s1.x, N);

      compute_psrldi256 (s1.a, res_ref);

      fail += check_union256i_d (res, res_ref);
    }

  if (fail != 0)
    abort ();
}
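compute_psrldi256 and check_union256i_d are reference helpers defined elsewhere in the GCC testsuite. A sketch of what the reference presumably computes, using the same immediate N as the test: a per-lane logical right shift, with counts above 31 producing zero (the documented psrld behaviour).

static void compute_psrldi256_sketch(const int *s, int *r)
{
    for (int i = 0; i < 8; i++)
        r[i] = (N > 31) ? 0 : (int)((unsigned int)s[i] >> N);
}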
__m256i test_mm256_srli_epi32(__m256i a) {
  // CHECK: @llvm.x86.avx2.psrli.d
  return _mm256_srli_epi32(a, 3);
}
__m256i test_mm256_srli_epi32(__m256i a) {
  // CHECK-LABEL: test_mm256_srli_epi32
  // CHECK: call <8 x i32> @llvm.x86.avx2.psrli.d(<8 x i32> %{{.*}}, i32 %{{.*}})
  return _mm256_srli_epi32(a, 3);
}
void TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	// Constants
	static const int      mant_size = 23;
	static const int      exp_bias  = 127;
	static const uint32_t base      = (exp_bias + LOGLUT_MIN_L2) << mant_size;
	static const float    val_min   = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
//	static const float    val_max   = float (int64_t (1) << LOGLUT_MAX_L2);
	static const int      frac_size = mant_size - LOGLUT_RES_L2;
	static const uint32_t frac_mask = (1 << frac_size) - 1;

	const __m256   zero_f     = _mm256_setzero_ps ();
	const __m256   one_f      = _mm256_set1_ps (1);
	const __m256   frac_mul   = _mm256_set1_ps (1.0f / (1 << frac_size));
	const __m256   mul_eps    = _mm256_set1_ps (1.0f / val_min);
	const __m256   mask_abs_f = _mm256_load_ps (
		reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs)
	);

	const __m256i  zero_i          = _mm256_setzero_si256 ();
	const __m256i  mask_abs_epi32  = _mm256_set1_epi32 (0x7FFFFFFF);
	const __m256i  one_epi32       = _mm256_set1_epi32 (1);
	const __m256i  base_epi32      = _mm256_set1_epi32 (int (base));
	const __m256i  frac_mask_epi32 = _mm256_set1_epi32 (frac_mask);
	const __m256i  val_min_epi32   = _mm256_set1_epi32 ((LOGLUT_MIN_L2 + exp_bias) << mant_size);
	const __m256i  val_max_epi32   = _mm256_set1_epi32 ((LOGLUT_MAX_L2 + exp_bias) << mant_size);
	const __m256i  index_max_epi32 = _mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2);
	const __m256i  hsize_epi32     = _mm256_set1_epi32 (LOGLUT_HSIZE);
	const __m256i  mirror_epi32    = _mm256_set1_epi32 (LOGLUT_HSIZE - 1);

	// It really starts here
	const __m256   val_f = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256   val_a = _mm256_and_ps (val_f, mask_abs_f);
	const __m256i  val_i = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr));
	const __m256i  val_u = _mm256_and_si256 (val_i, mask_abs_epi32);

	// Standard path
	__m256i        index_std = _mm256_sub_epi32 (val_u, base_epi32);
	index_std = _mm256_srli_epi32 (index_std, frac_size);
	index_std = _mm256_add_epi32 (index_std, one_epi32);
	__m256i        frac_stdi = _mm256_and_si256 (val_u, frac_mask_epi32);
	__m256         frac_std  = _mm256_cvtepi32_ps (frac_stdi);
	frac_std = _mm256_mul_ps (frac_std, frac_mul);

	// Epsilon path
	__m256         frac_eps = _mm256_max_ps (val_a, zero_f);
	frac_eps = _mm256_mul_ps (frac_eps, mul_eps);

	// Range cases
	const __m256i  eps_flag_i = _mm256_cmpgt_epi32 (val_min_epi32, val_u);
	const __m256i  std_flag_i = _mm256_cmpgt_epi32 (val_max_epi32, val_u);
	const __m256   eps_flag_f = _mm256_castsi256_ps (eps_flag_i);
	const __m256   std_flag_f = _mm256_castsi256_ps (std_flag_i);
	__m256i        index_tmp  = fstb::ToolsAvx2::select (std_flag_i, index_std, index_max_epi32);
	__m256         frac_tmp   = fstb::ToolsAvx2::select (std_flag_f, frac_std, one_f);
	index_tmp = fstb::ToolsAvx2::select (eps_flag_i, zero_i, index_tmp);
	frac_tmp  = fstb::ToolsAvx2::select (eps_flag_f, frac_eps, frac_tmp);

	// Sign cases
	const __m256i  neg_flag_i = _mm256_srai_epi32 (val_i, 31);
	const __m256   neg_flag_f = _mm256_castsi256_ps (neg_flag_i);
	const __m256i  index_neg  = _mm256_sub_epi32 (mirror_epi32, index_tmp);
	const __m256i  index_pos  = _mm256_add_epi32 (hsize_epi32, index_tmp);
	const __m256   frac_neg   = _mm256_sub_ps (one_f, frac_tmp);
	index = fstb::ToolsAvx2::select (neg_flag_i, index_neg, index_pos);
	frac  = fstb::ToolsAvx2::select (neg_flag_f, frac_neg, frac_tmp);
}
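The "standard path" above derives the table index directly from the IEEE-754 bit pattern: subtract the bit pattern of the smallest table entry, then shift right by the number of mantissa bits that fall below the table resolution. A scalar sketch of just that path, reusing the same LOGLUT_* constants (find_index_std_scalar is an illustrative name; the epsilon, range-clamp and sign handling are omitted):

#include <stdint.h>
#include <string.h>

static void find_index_std_scalar(float val, int32_t *index, float *frac)
{
    const int      mant_size = 23;
    const int      exp_bias  = 127;
    const uint32_t base      = (uint32_t)(exp_bias + LOGLUT_MIN_L2) << mant_size;
    const int      frac_size = mant_size - LOGLUT_RES_L2;
    const uint32_t frac_mask = (1u << frac_size) - 1;

    uint32_t u;
    memcpy(&u, &val, sizeof u);   /* reinterpret the float bits */
    u &= 0x7FFFFFFFu;             /* drop the sign, as the vector code does */

    *index = (int32_t)((u - base) >> frac_size) + 1;
    *frac  = (float)(u & frac_mask) * (1.0f / (float)(1 << frac_size));
}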
	l0 = _mm_shuffle_epi8(l0, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9));

	l1 = _mm_loadu_si128((__m128i *)&c[12]);
	l1 = _mm_shuffle_epi8(l1, _mm_setr_epi8(2, 2, 1, 0, 5, 5, 4, 3, 8, 8, 7, 6, 11, 11, 10, 9));

	/* Combine into a single 256-bit register: */
	str = _mm256_castsi128_si256(l0);
	str = _mm256_insertf128_si256(str, l1, 1);

	/* Mask to pass through only the lower 6 bits of one byte: */
	mask = _mm256_set1_epi32(0x3F000000);

	/* Shift bits by 2, mask in only the first byte: */
	res = _mm256_and_si256(_mm256_srli_epi32(str, 2), mask);
	mask = _mm256_srli_epi32(mask, 8);

	/* Shift bits by 4, mask in only the second byte: */
	res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 4), mask));
	mask = _mm256_srli_epi32(mask, 8);

	/* Shift bits by 6, mask in only the third byte: */
	res = _mm256_or_si256(res, _mm256_and_si256(_mm256_srli_epi32(str, 6), mask));
	mask = _mm256_srli_epi32(mask, 8);

	/* No shift necessary for the fourth byte because we duplicated
	 * the third byte to this position; just mask: */
	res = _mm256_or_si256(res, _mm256_and_si256(str, mask));

	/* Reorder to 32-bit little-endian: */
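This fragment is the encoding direction: every group of three input bytes is split into four 6-bit indices, one per byte of a 32-bit lane, which is what the staggered shifts and the narrowing mask accomplish. The scalar equivalent of that split for one group (enc_split3 is an illustrative name):

#include <stdint.h>

static void enc_split3(const uint8_t in[3], uint8_t idx[4])
{
    uint32_t triple = ((uint32_t)in[0] << 16) | ((uint32_t)in[1] << 8) | in[2];
    idx[0] = (triple >> 18) & 0x3F;
    idx[1] = (triple >> 12) & 0x3F;
    idx[2] = (triple >>  6) & 0x3F;
    idx[3] =  triple        & 0x3F;
}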
__m256i inline ShR(__m256i x, int n) { return _mm256_srli_epi32(x, n); }
static void
mshabal256_compress(mshabal256_context *sc,
                    const unsigned char *buf0, const unsigned char *buf1,
                    const unsigned char *buf2, const unsigned char *buf3,
                    const unsigned char *buf4, const unsigned char *buf5,
                    const unsigned char *buf6, const unsigned char *buf7,
                    size_t num)
{
  union {
    u32 words[64 * MSHABAL256_FACTOR];
    __m256i data[16];
  } u;
  size_t j;
  __m256i A[12], B[16], C[16];
  __m256i one;

  for (j = 0; j < 12; j++)
    A[j] = _mm256_loadu_si256((__m256i *)sc->state + j);
  for (j = 0; j < 16; j++) {
    B[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 12);
    C[j] = _mm256_loadu_si256((__m256i *)sc->state + j + 28);
  }
  one = _mm256_set1_epi32(C32(0xFFFFFFFF));

#define M(i) _mm256_load_si256(u.data + (i))

  while (num-- > 0) {

    for (j = 0; j < 64 * MSHABAL256_FACTOR; j += 4 * MSHABAL256_FACTOR) {
      size_t o = j / MSHABAL256_FACTOR;
      u.words[j + 0] = *(u32 *)(buf0 + o);
      u.words[j + 1] = *(u32 *)(buf1 + o);
      u.words[j + 2] = *(u32 *)(buf2 + o);
      u.words[j + 3] = *(u32 *)(buf3 + o);
      u.words[j + 4] = *(u32 *)(buf4 + o);
      u.words[j + 5] = *(u32 *)(buf5 + o);
      u.words[j + 6] = *(u32 *)(buf6 + o);
      u.words[j + 7] = *(u32 *)(buf7 + o);
    }

    for (j = 0; j < 16; j++)
      B[j] = _mm256_add_epi32(B[j], M(j));

    A[0] = _mm256_xor_si256(A[0], _mm256_set1_epi32(sc->Wlow));
    A[1] = _mm256_xor_si256(A[1], _mm256_set1_epi32(sc->Whigh));

    for (j = 0; j < 16; j++)
      B[j] = _mm256_or_si256(_mm256_slli_epi32(B[j], 17),
                             _mm256_srli_epi32(B[j], 15));

#define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \
    __m256i tt; \
    tt = _mm256_or_si256(_mm256_slli_epi32(xa1, 15), \
                         _mm256_srli_epi32(xa1, 17)); \
    tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 2), tt); \
    tt = _mm256_xor_si256(_mm256_xor_si256(xa0, tt), xc); \
    tt = _mm256_add_epi32(_mm256_slli_epi32(tt, 1), tt); \
    tt = _mm256_xor_si256(\
      _mm256_xor_si256(tt, xb1), \
      _mm256_xor_si256(_mm256_andnot_si256(xb3, xb2), xm)); \
    xa0 = tt; \
    tt = xb0; \
    tt = _mm256_or_si256(_mm256_slli_epi32(tt, 1), \
                         _mm256_srli_epi32(tt, 31)); \
    xb0 = _mm256_xor_si256(tt, _mm256_xor_si256(xa0, one)); \
  } while (0)

    PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0));
    PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1));
    PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2));
    PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3));
    PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4));
    PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5));
    PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6));
    PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7));
    PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8));
    PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9));
    PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA));
    PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB));
    PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC));
    PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD));
    PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE));
    PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF));

    PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0));
    PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1));
    PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2));
    PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3));
    PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4));
    PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5));
    PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6));
    PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7));
    PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8));
    PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9));
    PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA));
    PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB));
    PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC));
    PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD));
    PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE));
    PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF));

    PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0));
    PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1));
    PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2));
    PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3));
    PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4));
    PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5));
    PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6));
    PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7));
    PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8));
    PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9));
    PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA));
    PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB));
    PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC));
    PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD));
    PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE));
    PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF));

    A[0xB] = _mm256_add_epi32(A[0xB], C[0x6]);
    A[0xA] = _mm256_add_epi32(A[0xA], C[0x5]);
    A[0x9] = _mm256_add_epi32(A[0x9], C[0x4]);
    A[0x8] = _mm256_add_epi32(A[0x8], C[0x3]);
    A[0x7] = _mm256_add_epi32(A[0x7], C[0x2]);
    A[0x6] = _mm256_add_epi32(A[0x6], C[0x1]);
    A[0x5] = _mm256_add_epi32(A[0x5], C[0x0]);
    A[0x4] = _mm256_add_epi32(A[0x4], C[0xF]);
    A[0x3] = _mm256_add_epi32(A[0x3], C[0xE]);
    A[0x2] = _mm256_add_epi32(A[0x2], C[0xD]);
    A[0x1] = _mm256_add_epi32(A[0x1], C[0xC]);
    A[0x0] = _mm256_add_epi32(A[0x0], C[0xB]);
    A[0xB] = _mm256_add_epi32(A[0xB], C[0xA]);
    A[0xA] = _mm256_add_epi32(A[0xA], C[0x9]);
    A[0x9] = _mm256_add_epi32(A[0x9], C[0x8]);
    A[0x8] = _mm256_add_epi32(A[0x8], C[0x7]);
    A[0x7] = _mm256_add_epi32(A[0x7], C[0x6]);
    A[0x6] = _mm256_add_epi32(A[0x6], C[0x5]);
    A[0x5] = _mm256_add_epi32(A[0x5], C[0x4]);
    A[0x4] = _mm256_add_epi32(A[0x4], C[0x3]);
    A[0x3] = _mm256_add_epi32(A[0x3], C[0x2]);
    A[0x2] = _mm256_add_epi32(A[0x2], C[0x1]);
    A[0x1] = _mm256_add_epi32(A[0x1], C[0x0]);
    A[0x0] = _mm256_add_epi32(A[0x0], C[0xF]);
    A[0xB] = _mm256_add_epi32(A[0xB], C[0xE]);
    A[0xA] = _mm256_add_epi32(A[0xA], C[0xD]);
    A[0x9] = _mm256_add_epi32(A[0x9], C[0xC]);
    A[0x8] = _mm256_add_epi32(A[0x8], C[0xB]);
    A[0x7] = _mm256_add_epi32(A[0x7], C[0xA]);
    A[0x6] = _mm256_add_epi32(A[0x6], C[0x9]);
    A[0x5] = _mm256_add_epi32(A[0x5], C[0x8]);
    A[0x4] = _mm256_add_epi32(A[0x4], C[0x7]);
    A[0x3] = _mm256_add_epi32(A[0x3], C[0x6]);
    A[0x2] = _mm256_add_epi32(A[0x2], C[0x5]);
    A[0x1] = _mm256_add_epi32(A[0x1], C[0x4]);
    A[0x0] = _mm256_add_epi32(A[0x0], C[0x3]);

#define SWAP_AND_SUB(xb, xc, xm) do { \
    __m256i tmp; \
    tmp = xb; \
    xb = _mm256_sub_epi32(xc, xm); \
    xc = tmp; \
  } while (0)

    SWAP_AND_SUB(B[0x0], C[0x0], M(0x0));
    SWAP_AND_SUB(B[0x1], C[0x1], M(0x1));
    SWAP_AND_SUB(B[0x2], C[0x2], M(0x2));
    SWAP_AND_SUB(B[0x3], C[0x3], M(0x3));
    SWAP_AND_SUB(B[0x4], C[0x4], M(0x4));
    SWAP_AND_SUB(B[0x5], C[0x5], M(0x5));
    SWAP_AND_SUB(B[0x6], C[0x6], M(0x6));
    SWAP_AND_SUB(B[0x7], C[0x7], M(0x7));
    SWAP_AND_SUB(B[0x8], C[0x8], M(0x8));
    SWAP_AND_SUB(B[0x9], C[0x9], M(0x9));
    SWAP_AND_SUB(B[0xA], C[0xA], M(0xA));
    SWAP_AND_SUB(B[0xB], C[0xB], M(0xB));
    SWAP_AND_SUB(B[0xC], C[0xC], M(0xC));
    SWAP_AND_SUB(B[0xD], C[0xD], M(0xD));
    SWAP_AND_SUB(B[0xE], C[0xE], M(0xE));
    SWAP_AND_SUB(B[0xF], C[0xF], M(0xF));

    buf0 += 64;
    buf1 += 64;
    buf2 += 64;
    buf3 += 64;
    buf4 += 64;
    buf5 += 64;
    buf6 += 64;
    buf7 += 64;
    if (++sc->Wlow == 0)
      sc->Whigh++;
  }

  for (j = 0; j < 12; j++)
    _mm256_storeu_si256((__m256i *)sc->state + j, A[j]);
  for (j = 0; j < 16; j++) {
    _mm256_storeu_si256((__m256i *)sc->state + j + 12, B[j]);
    _mm256_storeu_si256((__m256i *)sc->state + j + 28, C[j]);
  }

#undef M
}
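The slli/srli/or pairs inside the compression function are 32-bit rotates (AVX2 has no packed rotate instruction, so this is the usual substitute). A small helper naming the idiom, assuming <immintrin.h> and a count with 0 < n < 32:

#include <immintrin.h>

static inline __m256i mm256_rotl_epi32(__m256i x, int n)
{
    /* rotate each 32-bit lane left by n bits */
    return _mm256_or_si256(_mm256_slli_epi32(x, n),
                           _mm256_srli_epi32(x, 32 - n));
}
/* e.g. B[j] = mm256_rotl_epi32(B[j], 17) matches the slli-17/srli-15 pair above */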
/* natural logarithm computed for 8 simultaneous float
   return NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else {
       x = x - 1.0;
     }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}
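The two integer steps in log256_ps (shift the float bit pattern right by 23, then subtract 0x7f) extract the unbiased binary exponent of each lane. A scalar sketch for a single float (extract_exponent is an illustrative name; it assumes the value is positive, as it is after the max_ps clamp in the function above):

#include <stdint.h>
#include <string.h>

static float extract_exponent(float x)
{
    uint32_t bits;
    memcpy(&bits, &x, sizeof bits);      /* reinterpret the float bits */
    int e = (int)(bits >> 23) - 0x7f;    /* biased exponent minus 127 */
    return (float)e;
}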