// Converts two registers of eight unsigned 16-bit sums each into weighted
// modifiers, overwriting *sum_0_u16 and *sum_1_u16 with the results.
//
// Per 16-bit lane: keep the high half of sum * mul_constant, add `rounding`
// with unsigned saturation, logically shift right by `strength`, clamp to at
// most 16, subtract the result from 16, and multiply by `weight` (low 16 bits
// of the product are kept). mul_constants_0 is applied to *sum_0_u16 and
// mul_constants_1 to *sum_1_u16.
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
                       const __m128i mul_constants_0,
                       const __m128i mul_constants_1, const int strength,
                       const int rounding, const int weight) {
  // The variable-count shift reads its count from the low 64 bits of the
  // count register, so only the lowest 32-bit element is populated.
  const __m128i shift_count = _mm_cvtsi32_si128(strength);
  const __m128i round_vec = _mm_set1_epi16(rounding);
  const __m128i weight_vec = _mm_set1_epi16(weight);
  const __m128i cap = _mm_set1_epi16(16);

  // Both inputs are read before either output is written.
  __m128i lanes_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
  __m128i lanes_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);

  // Round and scale down.
  lanes_0 = _mm_srl_epi16(_mm_adds_epu16(lanes_0, round_vec), shift_count);
  lanes_1 = _mm_srl_epi16(_mm_adds_epu16(lanes_1, round_vec), shift_count);

  // Clamp with an unsigned minimum so lanes with the top bit set are treated
  // as large values, then invert relative to 16.
  lanes_0 = _mm_sub_epi16(cap, _mm_min_epu16(lanes_0, cap));
  lanes_1 = _mm_sub_epi16(cap, _mm_min_epu16(lanes_1, cap));

  *sum_0_u16 = _mm_mullo_epi16(lanes_0, weight_vec);
  *sum_1_u16 = _mm_mullo_epi16(lanes_1, weight_vec);
}
// Codegen test wrapper for _mm_srl_epi16: returns A with every 16-bit lane
// logically shifted right by the count held in the low 64 bits of B.
//
// The comment lines inside the body look like FileCheck directives consumed
// by the test harness (confirm against the test's RUN lines); they are kept
// verbatim and one per line. Each must stay on its own line: a `//` comment
// runs to the end of the physical line, so joining them with the code would
// comment out the return statement and the closing brace.
__m128i test_mm_srl_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_srl_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.psrl.w
  //
  // ASM-LABEL: test_mm_srl_epi16
  // ASM: psrlw
  return _mm_srl_epi16(A, B);
}
// Converts eight unsigned 16-bit sums into weighted modifiers.
//
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values); the
// caller selects the matching multiplier via mul_constants.
//
// Then add in the rounding factor and shift right by strength, clamp to 16,
// invert (16 - value), and multiply by weight.
//
// Returns the eight 16-bit results; the input register is passed by value and
// is not modified for the caller.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
                         const int strength, const int rounding,
                         const int weight) {
  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);

  // modifier * 3 / index, computed as the high half of sum * mul_constants.
  sum = _mm_mulhi_epu16(sum, mul_constants);

  // Saturating add so the rounding term cannot wrap a large lane to a small
  // one before the shift.
  sum = _mm_adds_epu16(sum, rounding_u16);
  sum = _mm_srl_epi16(sum, strength_u128);

  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
  // >> 16 (also NEIGHBOR_CONSTANT_4 - 1) which is 49151 / 0xbfff / -16385.
  // That is negative when read as signed, so this needs to use the epu16
  // version, which did not come until SSE4.
  sum = _mm_min_epu16(sum, sixteen);

  sum = _mm_sub_epi16(sixteen, sum);

  return _mm_mullo_epi16(sum, weight_u16);
}
/* Runs a fixed sequence of SSE2 shift and shuffle intrinsics.  Each result is
   stored in c128.v and then handed, together with buf and a label string, to
   the dump128_* helper matching the element width.

   SHIFT, s128, buf, c128, the m128_* operands and the dump128_* helpers are
   defined outside this block.  The *i_* intrinsics take SHIFT as an immediate
   count; the others take the count from the vector s128.

   NOTE(review): no return type is visible for this definition in this
   excerpt -- confirm against the full file.
   NOTE(review): the dump helpers presumably append formatted text to buf, in
   which case statement order and the exact label strings are observable
   output -- confirm before changing either.  */
sse2_tests (void) {
  /* psraw: arithmetic right shift of 16-bit elements.  */
  c128.v = _mm_srai_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_srai_epi16", c128);
  c128.v = _mm_sra_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_sra_epi16", c128);

  /* psrad: arithmetic right shift of 32-bit elements.  */
  c128.v = _mm_srai_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_srai_epi32", c128);
  c128.v = _mm_sra_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_sra_epi32", c128);

  /* psrlw: logical right shift of 16-bit elements.  */
  c128.v = _mm_srli_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_srli_epi16", c128);
  c128.v = _mm_srl_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_srl_epi16", c128);

  /* psrld: logical right shift of 32-bit elements.  */
  c128.v = _mm_srli_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_srli_epi32", c128);
  c128.v = _mm_srl_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_srl_epi32", c128);

  /* psrlq: logical right shift of 64-bit elements.  */
  c128.v = _mm_srli_epi64 (m128_64, SHIFT);
  dump128_64 (buf, "_mm_srli_epi64", c128);
  c128.v = _mm_srl_epi64 (m128_64, s128);
  dump128_64 (buf, "_mm_srl_epi64", c128);

  /* psrldq: right shift of the whole 128-bit value; the count is in bytes.
     The label's trailing space is part of the emitted string.  */
  c128.v = _mm_srli_si128 (m128_128, SHIFT);
  dump128_128 (buf, "_mm_srli_si128 (byte shift) ", c128);

  /* psllw: left shift of 16-bit elements.  */
  c128.v = _mm_slli_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_slli_epi16", c128);
  c128.v = _mm_sll_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_sll_epi16", c128);

  /* pslld: left shift of 32-bit elements.  */
  c128.v = _mm_slli_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_slli_epi32", c128);
  c128.v = _mm_sll_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_sll_epi32", c128);

  /* psllq: left shift of 64-bit elements.  */
  c128.v = _mm_slli_epi64 (m128_64, SHIFT);
  dump128_64 (buf, "_mm_slli_epi64", c128);
  c128.v = _mm_sll_epi64 (m128_64, s128);
  dump128_64 (buf, "_mm_sll_epi64", c128);

  /* pslldq: left shift of the whole 128-bit value; the count is in bytes.
     NOTE(review): the label reads "_mm_sll_si128" although the intrinsic
     called is _mm_slli_si128; left unchanged because it is emitted text.  */
  c128.v = _mm_slli_si128 (m128_128, SHIFT);
  dump128_128 (buf, "_mm_sll_si128 (byte shift)", c128);

  /* Shuffle constant 0x1b == 0b_00_01_10_11, i.e. the four 2-bit selectors
     pick elements in reverse order, e.g. swap words: ABCD => DCBA.  */

  /* pshufd: shuffle the four 32-bit elements.  */
  c128.v = _mm_shuffle_epi32 (m128_128, 0x1b);
  dump128_32 (buf, "_mm_shuffle_epi32", c128);

  /* pshuflw: shuffle the four 16-bit elements of the low 64 bits.
     NOTE(review): the label is spelled "_mm_shuffelo_epi16" (one 'l'); left
     unchanged because it is emitted text.  */
  c128.v = _mm_shufflelo_epi16 (m128_128, 0x1b);
  dump128_16 (buf, "_mm_shuffelo_epi16", c128);

  /* pshufhw: shuffle the four 16-bit elements of the high 64 bits.
     NOTE(review): the label is spelled "_mm_shuffehi_epi16" (one 'l'); left
     unchanged because it is emitted text.  */
  c128.v = _mm_shufflehi_epi16 (m128_128, 0x1b);
  dump128_16 (buf, "_mm_shuffehi_epi16", c128);
}