static void aom_filter_block1d4_h4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1; unsigned int i; src_ptr -= 3; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); filt1Reg = _mm_load_si128((__m128i const *)(filtd4)); for (i = output_height; i > 0; i -= 1) { // load the 2 strides of source srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); // filter the source buffer srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg); // multiply 4 adjacent elements with the filter and add the result srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters); srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128()); src_ptr += src_pixels_per_line; *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1); output_ptr += output_pitch; } }
// Passes both operands straight to _mm_hadds_epi16 and returns the result.
// NOTE(review): the two directive comments inside the body look like
// output-matching patterns for a compiler IR test (label = this function's
// name, then the expected intrinsic call); keep their text exactly as written
// and confirm which harness consumes them.
__m128i test_mm_hadds_epi16(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_hadds_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.phadd.sw.128(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  return _mm_hadds_epi16(a, b);
}
/*
 * Vertical 4-tap convolution for a 4-pixel-wide column (SSSE3).
 *
 * src_ptr       top of the source window; output row r is computed from
 *               source rows r+2 .. r+5 relative to this pointer
 * src_pitch     distance in bytes between consecutive source rows
 * output_ptr    destination; 4 bytes are written per row
 * out_pitch     distance in bytes between consecutive destination rows
 * output_height number of rows requested; rows are produced two at a time,
 *               so with an odd height the last row is left unwritten
 * filter        8 coefficients (16 bytes are loaded); only filter[2..5]
 *               take part in the sum
 *
 * Coefficients are halved (>> 1) and packed to signed bytes, so each sum is
 * rounded with +32 and shifted right by 6 before being clamped to 8 bits.
 * Every source row is fetched with an 8-byte load although only its low
 * 4 bytes contribute to the result.
 */
static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i rounding = _mm_set1_epi16(32);

  // Two rows are consumed and produced per iteration.
  const ptrdiff_t src_step = src_pitch * 2;
  const ptrdiff_t dst_step = out_pitch * 2;

  // Halve the 16-bit coefficients, narrow them to signed bytes, then
  // replicate bytes 2..5 (the four active taps) across the whole register.
  __m128i taps = _mm_loadu_si128((const __m128i *)filter);
  taps = _mm_srai_epi16(taps, 1);
  taps = _mm_packs_epi16(taps, taps);
  const __m128i taps_2345 =
      _mm_shuffle_epi8(taps, _mm_set1_epi32(0x5040302u));

  // Prime the pipeline with rows 2, 3 and 4.
  const __m128i row2 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i row3 =
      _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  __m128i row4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // Low 8 bytes: rows 2/3 interleaved per column; high 8 bytes: rows 3/4.
  __m128i rows23_34 = _mm_unpacklo_epi8(_mm_unpacklo_epi32(row2, row3),
                                        _mm_unpacklo_epi32(row3, row4));

  for (uint32_t remaining = output_height; remaining > 1; remaining -= 2) {
    const __m128i row5 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i row6 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

    // Low 8 bytes: rows 4/5 interleaved per column; high 8 bytes: rows 5/6.
    const __m128i rows45_56 = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(row4, row5), _mm_unpacklo_epi32(row5, row6));

    // Group the four source bytes of every column next to each other:
    // rows 2..5 feed the first output row, rows 3..6 feed the second.
    const __m128i top_in = _mm_unpacklo_epi16(rows23_34, rows45_56);
    const __m128i bottom_in = _mm_unpackhi_epi16(rows23_34, rows45_56);

    // Unsigned pixel * signed tap summed over adjacent byte pairs, then
    // adjacent 16-bit pairs are summed again to complete each 4-tap result.
    __m128i top = _mm_hadds_epi16(_mm_maddubs_epi16(top_in, taps_2345), zero);
    __m128i bottom =
        _mm_hadds_epi16(_mm_maddubs_epi16(bottom_in, taps_2345), zero);

    // Round and scale back: (sum + 32) >> 6.
    top = _mm_srai_epi16(_mm_adds_epi16(top, rounding), 6);
    bottom = _mm_srai_epi16(_mm_adds_epi16(bottom, rounding), 6);

    // Clamp each 16-bit value to an unsigned byte.
    top = _mm_packus_epi16(top, top);
    bottom = _mm_packus_epi16(bottom, bottom);

    // NOTE(review): these write the low 4 bytes through a uint32_t lvalue
    // into a uint8_t buffer; that relies on the target tolerating an
    // unaligned, type-punned store.
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(top);
    *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(bottom);

    src_ptr += src_step;
    output_ptr += dst_step;

    // Rows 4/5/6 of this pass become rows 2/3/4 of the next one.
    rows23_34 = rows45_56;
    row4 = row6;
  }
}