/*
 * Builds a per-lane mask value from two vectors of unsigned 16-bit samples.
 *
 * For every 16-bit lane the steps are:
 *   1. absolute difference of *data_src0 and *data_src1 (unsigned),
 *   2. saturating unsigned add of *round_const, then logical right shift by
 *      `round`,
 *   3. a further logical right shift by DIFF_FACTOR_LOG2,
 *   4. saturating signed add of *mask_base_16,
 *   5. signed minimum against *clip_diff.
 *
 * All vector arguments are read through their pointers and never written.
 * NOTE(review): *round_const is presumably the rounding bias matching
 * `round` -- confirm against the caller.
 *
 * Returns the clamped result of step 5.
 */
static INLINE __m256i calc_mask_d16_avx2(const __m256i *data_src0,
                                         const __m256i *data_src1,
                                         const __m256i *round_const,
                                         const __m256i *mask_base_16,
                                         const __m256i *clip_diff, int round) {
  const __m256i src0 = *data_src0;
  const __m256i src1 = *data_src1;

  /* Saturating unsigned subtraction yields zero in whichever direction would
   * go negative, so the lane-wise max of both directions is |src0 - src1|. */
  const __m256i abs_diff = _mm256_max_epu16(_mm256_subs_epu16(src0, src1),
                                            _mm256_subs_epu16(src1, src0));

  /* Add the rounding term (saturating), then scale down by `round` bits. */
  const __m256i biased = _mm256_adds_epu16(abs_diff, *round_const);
  const __m256i rounded = _mm256_srli_epi16(biased, round);

  /* Scale down again by the fixed difference factor. */
  const __m256i scaled = _mm256_srli_epi16(rounded, DIFF_FACTOR_LOG2);

  /* Offset by the mask base (signed saturating add) and cap at clip_diff. */
  const __m256i offset_mask = _mm256_adds_epi16(scaled, *mask_base_16);
  return _mm256_min_epi16(offset_mask, *clip_diff);
}
// Thin wrapper that forwards both operands to _mm256_subs_epu16 and returns
// its result. The directive comment inside the body is kept verbatim on its
// own line so that it no longer comments out the return statement and the
// closing brace.
// NOTE(review): the directive looks like an LLVM FileCheck pattern matched
// against the emitted IR -- confirm against this file's RUN line.
__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
  // CHECK: @llvm.x86.avx2.psubus.w
  return _mm256_subs_epu16(a, b);
}
// Thin wrapper that forwards both operands to _mm256_subs_epu16 and returns
// its result. The two directive comments inside the body are kept verbatim,
// one per line, so that they no longer run together or comment out the
// return statement and the closing brace.
// NOTE(review): the directives look like LLVM FileCheck patterns matched
// against the emitted IR -- confirm against this file's RUN line.
__m256i test_mm256_subs_epu16(__m256i a, __m256i b) {
  // CHECK-LABEL: test_mm256_subs_epu16
  // CHECK: call <16 x i16> @llvm.x86.avx2.psubus.w(<16 x i16> %{{.*}}, <16 x i16> %{{.*}})
  return _mm256_subs_epu16(a, b);
}