/*
Computes, for 8 packed float values, the integer table position and the
fractional interpolation position used with the logarithmic mapper.

val_arr: 8 values, read both as floats and as raw 32-bit integers. They are
         fetched with aligned 256-bit loads.
index:   output, one 32-bit table position per lane.
frac:    output, one float interpolation factor per lane.

Per lane, working on the magnitude (sign bit cleared):
- bit pattern below that of 2^LOGLUT_MIN_L2: position 0, fraction is the
  magnitude divided by 2^LOGLUT_MIN_L2.
- bit pattern at or above that of 2^LOGLUT_MAX_L2: position
  (LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2, fraction 1.
- otherwise: position and fraction are taken directly from the exponent and
  mantissa bits.
Lanes whose sign bit is set are then mirrored around LOGLUT_HSIZE.

NOTE(review): the case description above assumes fstb::ToolsAvx2::select
(mask, a, b) yields a in lanes where the mask is set and b elsewhere --
confirm against its definition.
*/
void TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	// Bit-layout parameters of a 32-bit float, and the values derived from
	// them that drive the integer manipulations below.
	static const int      mantissa_bits  = 23;
	static const int      exponent_bias  = 127;
	static const uint32_t lut_base_bits  = (exponent_bias + LOGLUT_MIN_L2) << mantissa_bits;
	static const float    smallest_mag   = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);
//	static const float    largest_mag    = float (int64_t (1) << LOGLUT_MAX_L2);
	static const int      frac_bits      = mantissa_bits - LOGLUT_RES_L2;
	static const uint32_t frac_bits_mask = (1 << frac_bits) - 1;

	const __m256   ones_f = _mm256_set1_ps (1);

	// Input seen as floats and as raw integers, with and without sign.
	const __m256   in_f     = _mm256_load_ps (
		reinterpret_cast <const float *> (val_arr)
	);
	const __m256   abs_mask_f = _mm256_load_ps (
		reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs)
	);
	const __m256   mag_f    = _mm256_and_ps (in_f, abs_mask_f);
	const __m256i  raw_bits = _mm256_load_si256 (
		reinterpret_cast <const __m256i *> (val_arr)
	);
	const __m256i  mag_bits = _mm256_and_si256 (
		raw_bits, _mm256_set1_epi32 (0x7FFFFFFF)
	);

	// In-range case: the bits above the low frac_bits, rebased on the lower
	// bound, give the position (offset by one); the low frac_bits give the
	// fraction once scaled by 2^-frac_bits.
	const __m256i  pos_std = _mm256_add_epi32 (
		_mm256_srli_epi32 (
			_mm256_sub_epi32 (mag_bits, _mm256_set1_epi32 (int (lut_base_bits))),
			frac_bits
		),
		_mm256_set1_epi32 (1)
	);
	const __m256   frac_std = _mm256_mul_ps (
		_mm256_cvtepi32_ps (
			_mm256_and_si256 (mag_bits, _mm256_set1_epi32 (frac_bits_mask))
		),
		_mm256_set1_ps (1.0f / (1 << frac_bits))
	);

	// Below-range case: the fraction is the magnitude rescaled by the
	// reciprocal of the lower bound.
	const __m256   frac_low = _mm256_mul_ps (
		_mm256_max_ps (mag_f, _mm256_setzero_ps ()),
		_mm256_set1_ps (1.0f / smallest_mag)
	);

	// Range classification, done with signed integer compares on the
	// sign-stripped bit patterns.
	const __m256i  under_min_i = _mm256_cmpgt_epi32 (
		_mm256_set1_epi32 ((LOGLUT_MIN_L2 + exponent_bias) << mantissa_bits),
		mag_bits
	);
	const __m256i  under_max_i = _mm256_cmpgt_epi32 (
		_mm256_set1_epi32 ((LOGLUT_MAX_L2 + exponent_bias) << mantissa_bits),
		mag_bits
	);
	const __m256   under_min_f = _mm256_castsi256_ps (under_min_i);
	const __m256   under_max_f = _mm256_castsi256_ps (under_max_i);

	// First resolve in-range against the upper bound, then let the
	// below-range case take over where it applies.
	const __m256i  pos_capped = fstb::ToolsAvx2::select (
		under_max_i,
		pos_std,
		_mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2)
	);
	const __m256   frac_capped = fstb::ToolsAvx2::select (
		under_max_f, frac_std, ones_f
	);
	const __m256i  pos_mag = fstb::ToolsAvx2::select (
		under_min_i, _mm256_setzero_si256 (), pos_capped
	);
	const __m256   frac_mag = fstb::ToolsAvx2::select (
		under_min_f, frac_low, frac_capped
	);

	// Sign handling: an arithmetic shift by 31 spreads the sign bit over the
	// whole lane, giving an all-ones mask for lanes with the sign bit set.
	const __m256i  sign_i = _mm256_srai_epi32 (raw_bits, 31);
	const __m256   sign_f = _mm256_castsi256_ps (sign_i);

	index = fstb::ToolsAvx2::select (
		sign_i,
		_mm256_sub_epi32 (_mm256_set1_epi32 (LOGLUT_HSIZE - 1), pos_mag),
		_mm256_add_epi32 (_mm256_set1_epi32 (LOGLUT_HSIZE), pos_mag)
	);
	frac = fstb::ToolsAvx2::select (
		sign_f,
		_mm256_sub_ps (ones_f, frac_mag),
		frac_mag
	);
}
// Codegen test wrapper around the signed 32-bit greater-than comparison
// intrinsic. The two directive comments inside the body are read by the
// FileCheck tool and must stay exactly as written.
__m256i test_mm256_cmpgt_epi32(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_cmpgt_epi32
  // CHECK: icmp sgt <8 x i32>
  const __m256i gt_mask = _mm256_cmpgt_epi32(a, b);
  return gt_mask;
}
/* Applies the signed 32-bit greater-than comparison intrinsic to x against
   itself and stores the result back into x. x is declared outside this
   snippet; the statement is kept as a single expression so the number of
   accesses to x is unchanged. */
extern void
avx2_test (void)
{
  x = _mm256_cmpgt_epi32 (x, x);
}