// Horizontal 8-tap convolution producing 4 output pixels, one per source row.
// Reads 8 consecutive bytes from each of 4 rows, transposes them into
// tap-pair order, multiplies adjacent byte pairs by the packed filter taps
// (pmaddubsw), saturating-adds the partial sums, rounds/shifts by 7 and
// stores 4 result bytes.
// NOTE(review): only the low byte of each 16-bit filter tap is used, so taps
// are assumed to fit in 8 bits -- confirm against the filter tables in use.
static void filter_horiz_w4_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter) {
  // With _mm_mulhrs_epi16, multiplying by 256 implements round-then-shift
  // right by 7: (x * 256 + (1 << 14)) >> 15 == (x + 64) >> 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  // TRANSPOSE...
  // 00 01 02 03 04 05 06 07
  // 10 11 12 13 14 15 16 17
  // 20 21 22 23 24 25 26 27
  // 30 31 32 33 34 35 36 37
  //
  // TO
  //
  // 00 10 20 30
  // 01 11 21 31
  // 02 12 22 32
  // 03 13 23 33
  // 04 14 24 34
  // 05 15 25 35
  // 06 16 26 36
  // 07 17 27 37
  //
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i s1s0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i s5s4 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 02 03 12 13 22 23 32 33
  const __m128i s3s2 = _mm_srli_si128(s1s0, 8);
  // 06 07 16 17 26 27 36 37
  const __m128i s7s6 = _mm_srli_si128(s5s4, 8);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  // Summing min(x1, x2) before max(x1, x2) keeps intermediate 16-bit
  // saturation from changing the final sum.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 4 bytes
  // NOTE(review): type-punned 4-byte store; assumes the target tolerates a
  // possibly unaligned int store through uint8_t* -- verify aliasing policy.
  *(int *)dst = _mm_cvtsi128_si32(temp);
}
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  // pmaddubsw with weight 4 on both bytes of each pair computes
  // (a + b) * 4, i.e. the 2x1 box sum scaled into Q3.
  const __m128i fours = _mm_set1_epi8(4);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    // Width selects the load/store width; 4/8/16 luma pixels per row, with an
    // extra 16 for width == 32.
    // NOTE(review): _mm_loadh_epi32/_mm_storeh_epi32 appear to be
    // project-local helpers for 4-byte load/store -- not standard intrinsics.
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeh_epi32(pred_buf_m128i, top);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storel_epi64(pred_buf_m128i, top);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeu_si128(pred_buf_m128i, top);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        top_1 = _mm_maddubs_epi16(top_1, fours);
        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
      }
    }
    input += input_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
// Horizontal 8-tap convolution producing 8 output pixels, one per source row.
// Reads 8 consecutive bytes from each of 8 rows, transposes into tap-pair
// order, multiplies adjacent byte pairs by the packed taps (pmaddubsw),
// saturating-adds the partial sums, rounds/shifts by 7 and stores 8 bytes.
// NOTE(review): only the low byte of each 16-bit filter tap is used, so taps
// are assumed to fit in 8 bits -- confirm against the filter tables in use.
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  // With _mm_mulhrs_epi16, multiplying by 256 rounds and shifts right by 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 8x8 byte transpose in three unpack stages (16-, 32-, then 64-bit).
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  // Summing min(x1, x2) before max(x1, x2) keeps intermediate 16-bit
  // saturation from changing the final sum.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
// Horizontal 4-tap filter over an 8-pixel-wide block: only the middle four
// taps of the 8-tap filter (bytes 2..5 after packing) are applied, one output
// row per iteration. Taps are pre-shifted right by 1 so the per-pixel
// accumulation fits 16 bits; the final shift is therefore 6 (not 7) with a
// +32 rounding bias.
static void aom_filter_block1d8_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt2Reg, filt3Reg;
  __m128i secondFilters, thirdFilters;
  __m128i srcRegFilt32b1_1, srcRegFilt32b2, srcRegFilt32b3;
  __m128i srcReg32b1;
  unsigned int i;
  // Center the 8-tap window on the output pixel.
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps (see header comment re: headroom).
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // Byte-shuffle patterns from the project-local filt_h4 table.
  filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32));
  filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2));
  for (i = output_height; i > 0; i -= 1) {
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
    // filter the source buffer
    srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg);
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2);
    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    // shrink to 8 bit each 16 bits
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
    src_ptr += src_pixels_per_line;
    _mm_storel_epi64((__m128i *)output_ptr, srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
/* Test the 128-bit form */ static void ssse3_test_pmaddubsw128 (int *i1, int *i2, int *r) { /* Assumes incoming pointers are 16-byte aligned */ __m128i t1 = *(__m128i *) i1; __m128i t2 = *(__m128i *) i2; *(__m128i *) r = _mm_maddubs_epi16 (t1, t2); }
void kvz_eight_tap_filter_x8_and_flip(__m128i *data01, __m128i *data23, __m128i *data45, __m128i *data67, __m128i *filter, __m128i *dst) { __m128i a, b, c, d; __m128i fir = _mm_broadcastq_epi64(_mm_loadl_epi64(filter)); a = _mm_maddubs_epi16(*data01, fir); b = _mm_maddubs_epi16(*data23, fir); a = _mm_hadd_epi16(a, b); c = _mm_maddubs_epi16(*data45, fir); d = _mm_maddubs_epi16(*data67, fir); c = _mm_hadd_epi16(c, d); a = _mm_hadd_epi16(a, c); _mm_storeu_si128(dst, a); }
// Masked SAD over a width x height block (width a multiple of 16): blends
// a_ptr/b_ptr per-pixel with mask m (weights m and mask_max - m, normalized
// by AOM_BLEND_A64_ROUND_BITS), then accumulates SAD of the blended
// prediction against src_ptr. Returns the SAD rounded into 64ths.
static INLINE unsigned int masked_sad_ssse3(const uint8_t *src_ptr,
                                            int src_stride,
                                            const uint8_t *a_ptr, int a_stride,
                                            const uint8_t *b_ptr, int b_stride,
                                            const uint8_t *m_ptr, int m_stride,
                                            int width, int height) {
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 16) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      const __m128i m = _mm_loadu_si128((const __m128i *)&m_ptr[x]);
      const __m128i m_inv = _mm_sub_epi8(mask_max, m);
      // Calculate 16 predicted pixels.
      // Note that the maximum value of any entry of 'pred_l' or 'pred_r'
      // is 64 * 255, so we have plenty of space to add rounding constants.
      const __m128i data_l = _mm_unpacklo_epi8(a, b);
      const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
      __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
      pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
      const __m128i data_r = _mm_unpackhi_epi8(a, b);
      const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
      __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
      pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
      const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
      // psadbw produces two 16-bit partial sums (lanes 0 and 2 as 32-bit).
      res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have two 32-bit partial SADs in lanes 0 and 2 of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
// Masked SAD for 8-wide blocks, processing two rows per iteration. Rows 0 and
// 1 are packed into the low/high 64-bit halves of each register, so the
// "left" unpack covers row 0 and the "right" unpack covers row 1 (hence
// unpacklo on a1/b1 paired with unpackhi on the packed mask -- intentional).
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  // NOTE(review): assumes height is even -- confirm at call sites.
  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
    // Row 0: data interleaved with row-0 mask weights (low half of m).
    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);
    // Row 1: data interleaved with row-1 mask weights (high half of m).
    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);
    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // Two 32-bit partial SADs live in lanes 0 and 2 of 'res'.
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
// Vertical 8-tap convolution producing one 8-pixel output row. Loads 8 bytes
// from each of 8 consecutive rows, interleaves adjacent row pairs so that
// pmaddubsw multiplies each pixel by its pair of taps, saturating-adds the
// four partial sums, rounds/shifts by 7, and stores 8 bytes.
// NOTE(review): only the low byte of each 16-bit filter tap is used, so taps
// are assumed to fit in 8 bits -- confirm against the filter tables in use.
static void filter_vert_w8_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                 uint8_t *dst, const int16_t *filter) {
  // With _mm_mulhrs_epi16, multiplying by 256 rounds and shifts right by 7.
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_ptr);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));
  // Interleave row pairs: vertical filtering needs no transpose, just byte
  // interleaving of consecutive rows.
  const __m128i s1s0 = _mm_unpacklo_epi8(A, B);
  const __m128i s3s2 = _mm_unpacklo_epi8(C, D);
  const __m128i s5s4 = _mm_unpacklo_epi8(E, F);
  const __m128i s7s6 = _mm_unpacklo_epi8(G, H);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  // Summing min(x1, x2) before max(x1, x2) keeps intermediate 16-bit
  // saturation from changing the final sum.
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
void SSE42_blend() { int n = width * height * 4; int dummy __attribute__((unused)); const uint8_t alpha2 = alpha/2; __m128i alpha_np = _mm_set1_epi16(alpha2 | ((uint16_t)(alpha2 ^ 0x7f) << 8)); for (size_t i=0; i < n; i += 32) { __m128i A0 = _mm_load_si128((__m128i*)(imgA + i)); __m128i B0 = _mm_load_si128((__m128i*)(imgB + i)); __m128i A1 = _mm_load_si128((__m128i*)(imgA + i + 16)); __m128i B1 = _mm_load_si128((__m128i*)(imgB + i + 16)); __m128i lo0 = _mm_unpacklo_epi8(A0, B0); __m128i hi0 = _mm_unpackhi_epi8(A0, B0); __m128i lo1 = _mm_unpacklo_epi8(A1, B1); __m128i hi1 = _mm_unpackhi_epi8(A1, B1); lo0 = _mm_maddubs_epi16(lo0, alpha_np); lo1 = _mm_maddubs_epi16(lo1, alpha_np); hi0 = _mm_maddubs_epi16(hi0, alpha_np); hi1 = _mm_maddubs_epi16(hi1, alpha_np); lo0 = _mm_srli_epi16(lo0, 7); lo1 = _mm_srli_epi16(lo1, 7); hi0 = _mm_srli_epi16(hi0, 7); hi1 = _mm_srli_epi16(hi1, 7); __m128i res0 = _mm_packus_epi16(lo0, hi0); __m128i res1 = _mm_packus_epi16(lo1, hi1); _mm_store_si128((__m128i*)(data + i + 0), res0); _mm_store_si128((__m128i*)(data + i + 16), res1); } }
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  // pmaddubsw with weight 2 on both bytes gives 2*(a + b) per row; adding the
  // top and bottom rows yields the 2x2 box sum scaled into Q3.
  const __m128i twos = _mm_set1_epi8(2);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  // Two luma rows are consumed per output row.
  const int luma_stride = input_stride << 1;
  do {
    // NOTE(review): _mm_loadh_epi32/_mm_storeh_epi32 appear to be
    // project-local helpers for 4-byte load/store -- not standard intrinsics.
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storel_epi64(pred_buf_m128i, sum);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeu_si128(pred_buf_m128i, sum);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        __m128i bot_1 = _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        top_1 = _mm_maddubs_epi16(top_1, twos);
        bot_1 = _mm_maddubs_epi16(bot_1, twos);
        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
      }
    }
    input += luma_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
// Horizontal 4-tap filter over a 4-pixel-wide block: applies the middle four
// taps (bytes 2..5 after packing) via one pmaddubsw of 4-adjacent bytes plus
// a horizontal saturating add. Taps are pre-shifted right by 1 for 16-bit
// headroom, so the final shift is 6 with a +32 rounding bias.
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  // Center the 8-tap window on the output pixel.
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the taps (see header comment re: headroom).
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // Broadcast taps 2..5 (bytes 0x02,0x03,0x04,0x05) to every 32-bit lane.
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  // Byte-shuffle pattern from the project-local filtd4 table.
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));
  for (i = output_height; i > 0; i -= 1) {
    // load the 2 strides of source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);
    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);
    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);
    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());
    src_ptr += src_pixels_per_line;
    // NOTE(review): type-punned 4-byte store; assumes output_ptr alignment /
    // unaligned-store tolerance on the target.
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
// Masked SAD for 4-wide blocks, two rows per iteration: blends a/b with the
// mask weights (m, mask_max - m), rounds by AOM_BLEND_A64_ROUND_BITS, and
// accumulates SAD against src. Returns the SAD rounded into 64ths.
// NOTE(review): rows are loaded with type-punned uint32_t reads; assumes the
// target tolerates unaligned 4-byte loads through uint8_t* pointers.
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  // NOTE(review): assumes height is even -- confirm at call sites.
  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);
    // Interleave data and mask so pmaddubsw blends each (a, b) byte pair.
    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);
    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));
    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
// Horizontal 8-tap filter over a 16-pixel-wide block using AVX2: processes
// two rows per loop iteration (one per 128-bit lane of each 256-bit
// register), with a scalar 128-bit tail for odd heights. Each 16-wide row
// needs two overlapping 16-byte loads (at -3 and +5 relative to the output
// position). Rounds with +64 and shifts right by 7.
static void vpx_filter_block1d16_h8_avx2(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64, filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m256i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3;
  __m256i srcReg32b1, srcReg32b2, filtersReg32;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;
  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);
  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));
  // Byte-shuffle patterns from project-local tables.
  filt1Reg = _mm256_load_si256((__m256i const *)filt1_global_avx2);
  filt2Reg = _mm256_load_si256((__m256i const *)filt2_global_avx2);
  filt3Reg = _mm256_load_si256((__m256i const *)filt3_global_avx2);
  filt4Reg = _mm256_load_si256((__m256i const *)filt4_global_avx2);
  // multiple the size of the source and destination stride by two
  src_stride = src_pixels_per_line << 1;
  dst_stride = output_pitch << 1;
  for (i = output_height; i > 1; i -= 2) {
    // load the 2 strides of source
    srcReg32b1 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr - 3)));
    srcReg32b1 = _mm256_inserti128_si256(
        srcReg32b1,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line - 3)),
        1);
    // filter the source buffer
    srcRegFilt32b1_1 = _mm256_shuffle_epi8(srcReg32b1, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt4Reg);
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm256_maddubs_epi16(srcRegFilt32b1_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, srcRegFilt32b2);
    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b1, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b1, filt3Reg);
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    // add and saturate the results together
    // (min before max guards against intermediate 16-bit saturation)
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    // reading 2 strides of the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg32b2 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + 5)));
    srcReg32b2 = _mm256_inserti128_si256(
        srcReg32b2,
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pixels_per_line + 5)),
        1);
    // add and saturate the results together
    srcRegFilt32b1_1 = _mm256_adds_epi16(
        srcRegFilt32b1_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
    // filter the source buffer
    srcRegFilt32b2_1 = _mm256_shuffle_epi8(srcReg32b2, filt1Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt4Reg);
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b2_1 = _mm256_maddubs_epi16(srcRegFilt32b2_1, firstFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, forthFilters);
    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, srcRegFilt32b2);
    // filter the source buffer
    srcRegFilt32b3 = _mm256_shuffle_epi8(srcReg32b2, filt2Reg);
    srcRegFilt32b2 = _mm256_shuffle_epi8(srcReg32b2, filt3Reg);
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt32b3 = _mm256_maddubs_epi16(srcRegFilt32b3, secondFilters);
    srcRegFilt32b2 = _mm256_maddubs_epi16(srcRegFilt32b2, thirdFilters);
    // add and saturate the results together
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_min_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b2_1 = _mm256_adds_epi16(
        srcRegFilt32b2_1, _mm256_max_epi16(srcRegFilt32b3, srcRegFilt32b2));
    srcRegFilt32b1_1 = _mm256_adds_epi16(srcRegFilt32b1_1, addFilterReg64);
    srcRegFilt32b2_1 = _mm256_adds_epi16(srcRegFilt32b2_1, addFilterReg64);
    // shift by 7 bit each 16 bit
    srcRegFilt32b1_1 = _mm256_srai_epi16(srcRegFilt32b1_1, 7);
    srcRegFilt32b2_1 = _mm256_srai_epi16(srcRegFilt32b2_1, 7);
    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt32b1_1 = _mm256_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1);
    src_ptr += src_stride;
    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcRegFilt32b1_1));
    // save the next 16 bits
    _mm_store_si128((__m128i *)(output_ptr + output_pitch),
                    _mm256_extractf128_si256(srcRegFilt32b1_1, 1));
    output_ptr += dst_stride;
  }
  // if the number of strides is odd.
  // process only 16 bytes
  if (i > 0) {
    __m128i srcReg1, srcReg2, srcRegFilt1_1, srcRegFilt2_1;
    __m128i srcRegFilt2, srcRegFilt3;
    srcReg1 = _mm_loadu_si128((const __m128i *)(src_ptr - 3));
    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt4Reg));
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 =
        _mm_maddubs_epi16(srcRegFilt1_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);
    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, _mm256_castsi256_si128(filt3Reg));
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    // reading the next 16 bytes
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + 5));
    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt1Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt4Reg));
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 =
        _mm_maddubs_epi16(srcRegFilt2_1, _mm256_castsi256_si128(firstFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(forthFilters));
    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);
    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt2Reg));
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, _mm256_castsi256_si128(filt3Reg));
    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 =
        _mm_maddubs_epi16(srcRegFilt3, _mm256_castsi256_si128(secondFilters));
    srcRegFilt2 =
        _mm_maddubs_epi16(srcRegFilt2, _mm256_castsi256_si128(thirdFilters));
    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm256_castsi256_si128(addFilterReg64));
    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);
    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);
    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);
  }
}
// Horizontal 8-tap FIR filter over an 8-pixel-wide block (SSSE3).
// For each of output_height rows, loads 16 bytes starting 3 pixels to the
// left of src_ptr, convolves each of the 8 output pixels with the 8-tap
// filter via pmaddubsw, rounds (add 64, arithmetic shift right by 7),
// saturates to 8 bits and stores 8 bytes.
//
// src_ptr             - source pixels; must be readable from src_ptr - 3
// src_pixels_per_line - source stride in bytes
// output_ptr          - destination pixels
// output_pitch        - destination stride in bytes
// output_height       - number of rows to filter
// filter              - the 8 filter taps as 16-bit values
void aom_filter_block1d8_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // byte-shuffle masks declared elsewhere in this file; they pair each
  // output pixel with the source byte needed by the corresponding tap pair.
  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg);
    srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters);
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters);

    // add and saturate all the results together
    // NOTE(review): adding the min of the two middle partial sums before the
    // max appears intended to make the saturating accumulation order
    // deterministic - confirm against the scalar reference before changing.
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);

    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 8 bytes
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += output_pitch;
  }
}
/* Fetch one scanline of horizontally interpolated pixels (SSSE3, pixman).
 *
 * Walks the source row at fixed-point position x, advancing by ux per output
 * pixel, and for each pair of output pixels loads the left/right source
 * pixels, builds pmaddubsw-ready weight bytes from vx, and stores the
 * interleaved A/R and G/B 16-bit results into line->buffer.  Pixels are
 * processed two at a time; an odd trailing pixel re-enters the loop body at
 * final_pixel with the second pixel's sources zeroed.
 *
 * image - source image (bits + rowstride are read)
 * line  - output line buffer; line->y is set to y on completion
 * y     - source row index
 * x     - fixed-point start position within the row
 * ux    - fixed-point step per output pixel
 * n     - number of output pixels to produce
 */
static void
ssse3_fetch_horizontal (bits_image_t *image, line_t *line,
                        int y, pixman_fixed_t x, pixman_fixed_t ux, int n)
{
    uint32_t *bits = image->bits + y * image->rowstride;
    /* Per-pixel weight accumulator: pairs of (-(pos + 1), pos) so that a
     * right shift yields (inverse weight, weight) in each half.  Note the
     * pixman_fixed_t values are narrowed to 16 bits by _mm_set_epi16. */
    __m128i vx = _mm_set_epi16 (
        - (x + 1), x, - (x + 1), x,
        - (x + ux + 1), x + ux, - (x + ux + 1), x + ux);
    __m128i vux = _mm_set_epi16 (
        - 2 * ux, 2 * ux, - 2 * ux, 2 * ux,
        - 2 * ux, 2 * ux, - 2 * ux, 2 * ux);
    __m128i vaddc = _mm_set_epi16 (1, 0, 1, 0, 1, 0, 1, 0);
    __m128i *b = (__m128i *)line->buffer;
    __m128i vrl0, vrl1;

    while ((n -= 2) >= 0)
    {
        __m128i vw, vr, s;

        vrl1 = _mm_loadl_epi64 (
            (__m128i *)(bits + pixman_fixed_to_int (x + ux)));
        /* vrl1: R1, L1 */

    final_pixel:
        vrl0 = _mm_loadl_epi64 (
            (__m128i *)(bits + pixman_fixed_to_int (x)));
        /* vrl0: R0, L0 */

        /* The weights are based on vx which is a vector of
         *
         *    - (x + 1), x, - (x + 1), x,
         *    - (x + ux + 1), x + ux, - (x + ux + 1), x + ux
         *
         * so the 16 bit weights end up like this:
         *
         *    iw0, w0, iw0, w0, iw1, w1, iw1, w1
         *
         * and after shifting and packing, we get these bytes:
         *
         *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *    iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *
         * which means the first and the second input pixel
         * have to be interleaved like this:
         *
         *    la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
         *    lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
         *
         * before maddubsw can be used.
         */

        vw = _mm_add_epi16 (
            vaddc, _mm_srli_epi16 (vx, 16 - BILINEAR_INTERPOLATION_BITS));
        /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1
         */

        vw = _mm_packus_epi16 (vw, vw);
        /* vw: iw0, w0, iw0, w0, iw1, w1, iw1, w1,
         *     iw0, w0, iw0, w0, iw1, w1, iw1, w1
         */
        vx = _mm_add_epi16 (vx, vux);

        x += 2 * ux;

        vr = _mm_unpacklo_epi16 (vrl1, vrl0);
        /* vr: rar0, rar1, rgb0, rgb1, lar0, lar1, lgb0, lgb1 */

        s = _mm_shuffle_epi32 (vr, _MM_SHUFFLE (1, 0, 3, 2));
        /* s: lar0, lar1, lgb0, lgb1, rar0, rar1, rgb0, rgb1 */

        vr = _mm_unpackhi_epi8 (vr, s);
        /* vr: la0, ra0, lr0, rr0, la1, ra1, lr1, rr1,
         *     lg0, rg0, lb0, rb0, lg1, rg1, lb1, rb1
         */

        vr = _mm_maddubs_epi16 (vr, vw);

        /* When the weight is 0, the inverse weight is
         * 128 which can't be represented in a signed byte.
         * As a result maddubsw computes the following:
         *
         *     r = l * -128 + r * 0
         *
         * rather than the desired
         *
         *     r = l * 128 + r * 0
         *
         * We fix this by taking the absolute value of the
         * result.
         */
        vr = _mm_abs_epi16 (vr);

        /* vr: A0, R0, A1, R1, G0, B0, G1, B1 */
        _mm_store_si128 (b++, vr);
    }

    if (n == -1)
    {
        /* Odd pixel count: process the final pixel with the second source
         * pair forced to zero; the loop condition then terminates since n
         * becomes -3 after the decrement. */
        vrl1 = _mm_setzero_si128();

        goto final_pixel;
    }

    line->y = y;
}
// Vertical 8-tap FIR filter over a 16-pixel-wide block, two rows per
// iteration (AVX2).  Seven source rows are preloaded and pairwise
// byte-interleaved; each loop iteration loads two more rows, convolves with
// the 8 taps via pmaddubsw, rounds (add 64, arithmetic shift right 7),
// saturates to 8 bits and stores two 16-byte output rows.  An odd trailing
// row is handled by the 128-bit tail after the loop.
//
// src_ptr       - top-left source pixel of the 8-tap window
// src_pitch     - source stride in bytes
// output_ptr    - destination pixels
// out_pitch     - destination stride in bytes
// output_height - number of rows to produce
// filter        - the 8 filter taps as 16-bit values
static void vpx_filter_block1d16_v8_avx2(const uint8_t *src_ptr,
                                         ptrdiff_t src_pitch,
                                         uint8_t *output_ptr,
                                         ptrdiff_t out_pitch,
                                         uint32_t output_height,
                                         const int16_t *filter) {
  __m128i filtersReg;
  __m256i addFilterReg64;
  __m256i srcReg32b1, srcReg32b2, srcReg32b3, srcReg32b4, srcReg32b5;
  __m256i srcReg32b6, srcReg32b7, srcReg32b8, srcReg32b9, srcReg32b10;
  __m256i srcReg32b11, srcReg32b12, filtersReg32;
  __m256i firstFilters, secondFilters, thirdFilters, forthFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm256_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);
  // have the same data in both lanes of a 256 bit register
  // (MM256_BROADCASTSI128_SI256 is a project-local broadcast macro)
  filtersReg32 = MM256_BROADCASTSI128_SI256(filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 256 bit register
  firstFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 256 bit register
  secondFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 256 bit register
  thirdFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 256 bit register
  forthFilters = _mm256_shuffle_epi8(filtersReg32, _mm256_set1_epi16(0x706u));

  // multiple the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  // load 16 bytes 7 times in stride of src_pitch
  srcReg32b1 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr)));
  srcReg32b2 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch)));
  srcReg32b3 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)));
  srcReg32b4 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)));
  srcReg32b5 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)));
  srcReg32b6 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)));
  srcReg32b7 = _mm256_castsi128_si256(
      _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)));

  // have each consecutive loads on the same 256 register
  srcReg32b1 = _mm256_inserti128_si256(srcReg32b1,
                                       _mm256_castsi256_si128(srcReg32b2), 1);
  srcReg32b2 = _mm256_inserti128_si256(srcReg32b2,
                                       _mm256_castsi256_si128(srcReg32b3), 1);
  srcReg32b3 = _mm256_inserti128_si256(srcReg32b3,
                                       _mm256_castsi256_si128(srcReg32b4), 1);
  srcReg32b4 = _mm256_inserti128_si256(srcReg32b4,
                                       _mm256_castsi256_si128(srcReg32b5), 1);
  srcReg32b5 = _mm256_inserti128_si256(srcReg32b5,
                                       _mm256_castsi256_si128(srcReg32b6), 1);
  srcReg32b6 = _mm256_inserti128_si256(srcReg32b6,
                                       _mm256_castsi256_si128(srcReg32b7), 1);

  // merge every two consecutive registers except the last one
  srcReg32b10 = _mm256_unpacklo_epi8(srcReg32b1, srcReg32b2);
  srcReg32b1 = _mm256_unpackhi_epi8(srcReg32b1, srcReg32b2);

  // save
  srcReg32b11 = _mm256_unpacklo_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b3 = _mm256_unpackhi_epi8(srcReg32b3, srcReg32b4);

  // save
  srcReg32b2 = _mm256_unpacklo_epi8(srcReg32b5, srcReg32b6);

  // save
  srcReg32b5 = _mm256_unpackhi_epi8(srcReg32b5, srcReg32b6);

  for (i = output_height; i > 1; i -= 2) {
    // load the last 2 loads of 16 bytes and have every two
    // consecutive loads in the same 256 bit register
    srcReg32b8 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7)));
    srcReg32b7 = _mm256_inserti128_si256(
        srcReg32b7, _mm256_castsi256_si128(srcReg32b8), 1);
    srcReg32b9 = _mm256_castsi128_si256(
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 8)));
    srcReg32b8 = _mm256_inserti128_si256(
        srcReg32b8, _mm256_castsi256_si128(srcReg32b9), 1);

    // merge every two consecutive registers
    // save
    srcReg32b4 = _mm256_unpacklo_epi8(srcReg32b7, srcReg32b8);
    srcReg32b7 = _mm256_unpackhi_epi8(srcReg32b7, srcReg32b8);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b10 = _mm256_maddubs_epi16(srcReg32b10, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b4, forthFilters);

    // add and saturate the results together
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b11, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b2, thirdFilters);

    // add and saturate the results together
    // (min before max keeps the saturating accumulation order deterministic)
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b10 = _mm256_adds_epi16(srcReg32b10,
                                    _mm256_max_epi16(srcReg32b8, srcReg32b12));

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b1 = _mm256_maddubs_epi16(srcReg32b1, firstFilters);
    srcReg32b6 = _mm256_maddubs_epi16(srcReg32b7, forthFilters);

    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, srcReg32b6);

    // multiply 2 adjacent elements with the filter and add the result
    srcReg32b8 = _mm256_maddubs_epi16(srcReg32b3, secondFilters);
    srcReg32b12 = _mm256_maddubs_epi16(srcReg32b5, thirdFilters);

    // add and saturate the results together
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_min_epi16(srcReg32b8, srcReg32b12));
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1,
                                   _mm256_max_epi16(srcReg32b8, srcReg32b12));

    srcReg32b10 = _mm256_adds_epi16(srcReg32b10, addFilterReg64);
    srcReg32b1 = _mm256_adds_epi16(srcReg32b1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcReg32b10 = _mm256_srai_epi16(srcReg32b10, 7);
    srcReg32b1 = _mm256_srai_epi16(srcReg32b1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcReg32b1 = _mm256_packus_epi16(srcReg32b10, srcReg32b1);

    src_ptr += src_stride;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr,
                    _mm256_castsi256_si128(srcReg32b1));

    // save the next 16 bits
    _mm_store_si128((__m128i *)(output_ptr + out_pitch),
                    _mm256_extractf128_si256(srcReg32b1, 1));
    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg32b10 = srcReg32b11;
    srcReg32b1 = srcReg32b3;
    srcReg32b11 = srcReg32b2;
    srcReg32b3 = srcReg32b5;
    srcReg32b2 = srcReg32b4;
    srcReg32b5 = srcReg32b7;
    srcReg32b7 = srcReg32b9;
  }
  // tail: one remaining output row, processed with 128-bit operations
  if (i > 0) {
    __m128i srcRegFilt1, srcRegFilt3, srcRegFilt4, srcRegFilt5;
    __m128i srcRegFilt6, srcRegFilt7, srcRegFilt8;
    // load the last 16 bytes
    srcRegFilt8 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the last 2 results together
    srcRegFilt4 = _mm_unpacklo_epi8(
        _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);
    srcRegFilt7 = _mm_unpackhi_epi8(
        _mm256_castsi256_si128(srcReg32b7), srcRegFilt8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b10),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4,
                                    _mm256_castsi256_si128(forthFilters));
    srcRegFilt3 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b1),
                                    _mm256_castsi256_si128(firstFilters));
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7,
                                    _mm256_castsi256_si128(forthFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3, srcRegFilt7);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b11),
                                    _mm256_castsi256_si128(secondFilters));
    srcRegFilt5 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b3),
                                    _mm256_castsi256_si128(secondFilters));

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt6 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b2),
                                    _mm256_castsi256_si128(thirdFilters));
    srcRegFilt7 = _mm_maddubs_epi16(_mm256_castsi256_si128(srcReg32b5),
                                    _mm256_castsi256_si128(thirdFilters));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_min_epi16(srcRegFilt5, srcRegFilt7));

    // add and saturate the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt6));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm_max_epi16(srcRegFilt5, srcRegFilt7));

    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm256_castsi256_si128(addFilterReg64));
    srcRegFilt3 = _mm_adds_epi16(srcRegFilt3,
                                 _mm256_castsi256_si128(addFilterReg64));

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);
    srcRegFilt3 = _mm_srai_epi16(srcRegFilt3, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt3);

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);
  }
}
// Vertical 4-tap FIR filter over a 4-pixel-wide block, two rows per
// iteration (SSSE3).  Only the middle four taps of the 8-tap filter array
// are used (bytes 2..5 selected by the 0x5040302 shuffle).  Taps are halved
// up front (srai by 1) so rounding adds 32 and shifts right by 6.
//
// src_ptr       - source pixels; rows src_pitch*2 .. src_pitch*6 are read
// src_pitch     - source stride in bytes
// output_ptr    - destination pixels (4 bytes written per row)
// out_pitch     - destination stride in bytes
// output_height - number of rows to produce (processed two at a time)
// filter        - 8 filter taps as 16-bit values; only taps 2..5 are used
static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32;
  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
      srcReg6, srcReg56;
  __m128i srcReg23_34_lo, srcReg45_56_lo;
  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
  __m128i resReglo, resReghi;
  __m128i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  // Taps are pre-halved so the later rounding shift is 6 instead of 7.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));

  // multiple the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // have consecutive loads on the same 256 register
  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);

  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);

    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);

    // horizontally fold the two tap-pair partial sums per pixel
    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());

    // shift by 6 bit each 16 bit
    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm_srai_epi16(resReglo, 6);
    resReghi = _mm_srai_epi16(resReghi, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    resReglo = _mm_packus_epi16(resReglo, resReglo);
    resReghi = _mm_packus_epi16(resReghi, resReghi);

    src_ptr += src_stride;

    // NOTE(review): 4-byte stores via uint32_t* casts assume the platform
    // tolerates unaligned/type-punned stores - matches the surrounding code,
    // but flag for strict-aliasing review.
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);

    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4 = srcReg6;
  }
}
SIMD_INLINE __m128i BinomialSum16(const __m128i & ab, const __m128i & cd) { return _mm_add_epi16(_mm_maddubs_epi16(ab, K8_01_03), _mm_maddubs_epi16(cd, K8_03_01)); }
// Vertical 8-tap FIR filter for one output row of width w (multiple of 16),
// SSSE3.  For each 16-pixel column group, loads the 8 source rows, pairs
// them by tap (rows 0&1, 2&3, 4&5, 6&7) via byte interleave, convolves with
// pmaddubsw, and rounds with _mm_mulhrs_epi16 against 1<<8 (equivalent to a
// round-and-shift by 7).
//
// src_ptr   - top row of the 8-row source window
// src_pitch - source stride in bytes
// dst       - destination row (16-byte-aligned stores)
// filter    - 8 filter taps as 16-bit values
// w         - output width in pixels, stepped 16 at a time
static void filter_vert_w16_ssse3(const uint8_t *src_ptr, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *filter, int w) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  int i;

  for (i = 0; i < w; i += 16) {
    const __m128i A = _mm_loadu_si128((const __m128i *)src_ptr);
    const __m128i B = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch));
    const __m128i C =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
    const __m128i D =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
    const __m128i E =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));
    const __m128i F =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));
    const __m128i G =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));
    const __m128i H =
        _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the result together
    const __m128i s1s0_lo = _mm_unpacklo_epi8(A, B);
    const __m128i s7s6_lo = _mm_unpacklo_epi8(G, H);
    const __m128i s1s0_hi = _mm_unpackhi_epi8(A, B);
    const __m128i s7s6_hi = _mm_unpackhi_epi8(G, H);

    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x0_lo = _mm_maddubs_epi16(s1s0_lo, f1f0);
    const __m128i x3_lo = _mm_maddubs_epi16(s7s6_lo, f7f6);
    const __m128i x0_hi = _mm_maddubs_epi16(s1s0_hi, f1f0);
    const __m128i x3_hi = _mm_maddubs_epi16(s7s6_hi, f7f6);

    // add and saturate the results together
    const __m128i x3x0_lo = _mm_adds_epi16(x0_lo, x3_lo);
    const __m128i x3x0_hi = _mm_adds_epi16(x0_hi, x3_hi);

    // merge the result together
    const __m128i s3s2_lo = _mm_unpacklo_epi8(C, D);
    const __m128i s3s2_hi = _mm_unpackhi_epi8(C, D);

    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x1_lo = _mm_maddubs_epi16(s3s2_lo, f3f2);
    const __m128i x1_hi = _mm_maddubs_epi16(s3s2_hi, f3f2);

    // merge the result together
    const __m128i s5s4_lo = _mm_unpacklo_epi8(E, F);
    const __m128i s5s4_hi = _mm_unpackhi_epi8(E, F);

    // multiply 2 adjacent elements with the filter and add the result
    const __m128i x2_lo = _mm_maddubs_epi16(s5s4_lo, f5f4);
    const __m128i x2_hi = _mm_maddubs_epi16(s5s4_hi, f5f4);

    // add and saturate the results together
    // (min before max keeps the saturating accumulation order deterministic)
    __m128i temp_lo = _mm_adds_epi16(x3x0_lo, _mm_min_epi16(x1_lo, x2_lo));
    __m128i temp_hi = _mm_adds_epi16(x3x0_hi, _mm_min_epi16(x1_hi, x2_hi));

    // add and saturate the results together
    temp_lo = _mm_adds_epi16(temp_lo, _mm_max_epi16(x1_lo, x2_lo));
    temp_hi = _mm_adds_epi16(temp_hi, _mm_max_epi16(x1_hi, x2_hi));
    // round and shift by 7 bit each 16 bit
    temp_lo = _mm_mulhrs_epi16(temp_lo, k_256);
    temp_hi = _mm_mulhrs_epi16(temp_hi, k_256);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    temp_hi = _mm_packus_epi16(temp_lo, temp_hi);
    src_ptr += 16;

    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)&dst[i], temp_hi);
  }
}
// Vertical 4-tap FIR filter over a 16-pixel-wide block, two rows per
// iteration (SSSE3).  Only the middle four taps of the 8-tap filter array
// are used (shuffle indices 0x302 and 0x504 select bytes 2..5).  Taps are
// halved up front (srai by 1) so rounding adds 32 and shifts right by 6.
//
// src_ptr       - source pixels; rows src_pitch*2 .. src_pitch*6 are read
// src_pitch     - source stride in bytes
// output_ptr    - destination pixels (16 bytes per row, aligned stores)
// out_pitch     - destination stride in bytes
// output_height - number of rows to produce (processed two at a time)
// filter        - 8 filter taps as 16-bit values; only taps 2..5 are used
static void aom_filter_block1d16_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6;
  __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi;
  __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi;
  __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo;
  __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi;
  __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi;
  __m128i resReg23_45, resReg34_56;
  __m128i addFilterReg32, secondFilters, thirdFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the
  // same data in both lanes of 128 bit register.
  // Taps are pre-halved so the later rounding shift is 6 instead of 7.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the second 16 bits (third and forth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));

  // multiple the size of the source and destination stride by two
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3);
  srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3);

  srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4));

  // have consecutive loads on the same 256 register
  srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4);
  srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5));

    srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5);
    srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5);

    srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6));

    srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters);
    resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters);
    resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters);
    resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters);

    // add and saturate the results together
    resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo);
    resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters);
    resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters);
    resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters);
    resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters);

    // add and saturate the results together
    resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi);
    resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi);

    // shift by 6 bit each 16 bit
    resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32);
    resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32);
    resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32);
    resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32);
    resReg23_45_lo = _mm_srai_epi16(resReg23_45_lo, 6);
    resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6);
    resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6);
    resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi);
    resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi);

    src_ptr += src_stride;

    _mm_store_si128((__m128i *)output_ptr, (resReg23_45));
    _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56));

    output_ptr += dst_stride;

    // save part of the registers for next strides
    srcReg23_lo = srcReg45_lo;
    srcReg34_lo = srcReg56_lo;
    srcReg23_hi = srcReg45_hi;
    srcReg34_hi = srcReg56_hi;
    srcReg4 = srcReg6;
  }
}
// Horizontal 8-tap FIR filter over a 4-pixel-wide block (SSSE3).
// For each row, loads 16 bytes starting 3 pixels to the left of src_ptr,
// convolves 4 output pixels with the 8 taps (taps split across the two
// 64-bit halves of each register), rounds (add 64, arithmetic shift right
// by 7), saturates to 8 bits and stores 4 bytes.
//
// src_ptr             - source pixels; must be readable from src_ptr - 3
// src_pixels_per_line - source stride in bytes
// output_ptr          - destination pixels
// output_pitch        - destination stride in bytes
// output_height       - number of rows to filter
// filter              - the 8 filter taps as 16-bit values
void aom_filter_block1d4_h8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bit in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the seconds 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the forth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // loading the local filters (byte-shuffle masks declared elsewhere
  // in this file)
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    // (min before max keeps the saturating accumulation order deterministic)
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);
    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
SIMD_INLINE void Average16(__m128i & a) { a = _mm_srli_epi16(_mm_add_epi16(_mm_maddubs_epi16(a, K8_01), K16_0001), 1); }
SIMD_INLINE __m128i Average16(const __m128i & s0, const __m128i & s1) { return _mm_srli_epi16(_mm_add_epi16(_mm_add_epi16(_mm_maddubs_epi16(s0, K8_01), _mm_maddubs_epi16(s1, K8_01)), K16_0002), 2); }
// Horizontal 8-tap FIR filter over a 16-pixel-wide block (SSSE3, libvpx).
// Each row is filtered as two 8-pixel convolutions: one from the 16 bytes
// at src_ptr - 3 and one from the 16 bytes at src_ptr + 5.  Results are
// rounded (add 64, arithmetic shift right by 7), saturated to 8 bits,
// packed into one register and stored as 16 bytes.
//
// src_ptr             - source pixels; must be readable from src_ptr - 3
// src_pixels_per_line - source stride in bytes
// output_ptr          - destination pixels (16-byte-aligned stores)
// output_pitch        - destination stride in bytes
// output_height       - number of rows to filter
// filter              - the 8 filter taps as 16-bit values
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pixels_per_line,
                                          unsigned char *output_ptr,
                                          unsigned int output_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcReg1, srcReg2;
  __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits (first and second byte)
  // across 128 bit register
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits (third and forth byte)
  // across 128 bit register
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits (fifth and sixth byte)
  // across 128 bit register
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits (seventh and eighth byte)
  // across 128 bit register
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // byte-shuffle masks declared elsewhere in this file
  filt1Reg = _mm_load_si128((__m128i const *)filt1_global);
  filt2Reg = _mm_load_si128((__m128i const *)filt2_global);
  filt3Reg = _mm_load_si128((__m128i const *)filt3_global);
  filt4Reg = _mm_load_si128((__m128i const *)filt4_global);

  for (i = 0; i < output_height; i++) {
    srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1_1 = _mm_shuffle_epi8(srcReg1, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg1, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg1, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    // (min before max keeps the saturating accumulation order deterministic)
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));

    // reading the next 16 bytes.
    // (part of it was being read by earlier read)
    srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + 5));

    // add and saturate the results together
    srcRegFilt1_1 =
        _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    // filter the source buffer
    srcRegFilt2_1 = _mm_shuffle_epi8(srcReg2, filt1Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt4Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters);

    // add and saturate the results together
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2);

    // filter the source buffer
    srcRegFilt3 = _mm_shuffle_epi8(srcReg2, filt2Reg);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg2, filt3Reg);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);

    // add and saturate the results together
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2));
    srcRegFilt2_1 =
        _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2));

    srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64);
    srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7);
    srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1);

    src_ptr += src_pixels_per_line;

    // save 16 bytes
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1_1);

    output_ptr += output_pitch;
  }
}
// Computes two SSSE3 multiply-accumulate results from the same source
// vector and writes them through the two output pointers.
//
// a  receives maddubs(c, d): unsigned bytes of c times signed bytes of d,
//    adjacent 16-bit products summed with saturation
// b  receives maddubs(c, e): same operation against the second coefficient
//    vector e
void foo(__m128i *a, __m128i *b, __m128i c, __m128i d, __m128i e) {
  // Evaluate both products before storing so the result is identical even
  // when a and b alias.
  const __m128i prod_d = _mm_maddubs_epi16(c, d);
  const __m128i prod_e = _mm_maddubs_epi16(c, e);
  *a = prod_d;
  *b = prod_e;
}
// Clang IR-generation test for the SSSE3 _mm_maddubs_epi16 intrinsic.
// The CHECK lines below are FileCheck patterns matched against the emitted
// LLVM IR; they are part of the test and must not be reworded.
__m128i test_mm_maddubs_epi16(__m128i a, __m128i b) {
  // CHECK-LABEL: test_mm_maddubs_epi16
  // CHECK: call <8 x i16> @llvm.x86.ssse3.pmadd.ub.sw.128(<16 x i8> %{{.*}}, <16 x i8> %{{.*}})
  return _mm_maddubs_epi16(a, b);
}
/**
 * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register.
 * \param ref_main Reference pixels
 * \param delta_pos Fractional pixel precise position of sample displacement
 * \param x Sample offset in direction x in ref_main array
 */
static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){
  // Split the 1/32-pel position into integer and fractional parts.
  // NOTE(review): delta_pos >> 5 is narrowed to int8_t — this only holds if
  // |delta_pos| stays within 32*127; confirm against callers' angle tables.
  int8_t delta_int = delta_pos >> 5;
  int8_t delta_fract = delta_pos & (32-1);
  // Load 4 reference pixels and the 4 pixels one step further along.
  // NOTE(review): the uint32_t cast type-puns kvz_pixel storage — works on
  // the targeted compilers but is a strict-aliasing/alignment concern.
  __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int]));
  __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1]));
  // Interleave so each 16-bit lane holds a (left, right) neighbor pair.
  __m128i pairs = _mm_unpacklo_epi8(sample0, sample1);
  // Per-pair weights: low byte (32 - frac) for the left sample, high byte
  // frac for the right sample; maddubs computes the weighted sum per lane.
  __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) );
  sample0 = _mm_maddubs_epi16(pairs, weight);
  // Round (+16) and divide by 32, then narrow back to bytes.
  sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16));
  sample0 = _mm_srli_epi16(sample0, 5);
  sample0 = _mm_packus_epi16(sample0, sample0);
  return sample0;
}

/**
 * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst.
 * \param dst Destination buffer
 * \param ref_main Reference pixels
 * \param sample_disp Sample displacement per row
 * \param vertical_mode Mode direction, true if vertical
 */
static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){
// 8-tap vertical convolution of an 8-pixel-wide column using SSSE3.
// For each output row, eight vertically adjacent source rows are weighted by
// the 8 filter taps, the sum is rounded (+64) and shifted right by 7, then
// saturated to 8 bits.
//
//   src_ptr        top row of the source block (7 rows are preloaded; the
//                  8th is fetched inside the loop)
//   src_pitch      byte stride between source rows
//   output_ptr     destination buffer
//   out_pitch      byte stride between destination rows
//   output_height  number of output rows to produce
//   filter         8 signed 16-bit filter taps
void aom_filter_block1d8_v8_intrin_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i addFilterReg64, filtersReg, minReg;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt5;
  __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7;
  __m128i srcReg8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (0x0400040 == 0x00400040: each 16-bit lane holds the rounding bias 64
  // that is added before the >> 7 shift below)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter (taps 0 and 1)
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter (taps 2 and 3)
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter (taps 4 and 5)
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits in the filter (taps 6 and 7)
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  // load the first 7 rows of 8 bytes
  srcReg1 = _mm_loadl_epi64((const __m128i *)src_ptr);
  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));
  srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
  srcReg7 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));

  for (i = 0; i < output_height; i++) {
    // load the last 8 bytes (row 7 of the current 8-row window)
    srcReg8 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 7));

    // merge the rows pairwise so maddubs can multiply vertically adjacent
    // pixels by the paired filter taps
    srcRegFilt1 = _mm_unpacklo_epi8(srcReg1, srcReg2);
    srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4);
    // merge the result together
    srcRegFilt2 = _mm_unpacklo_epi8(srcReg5, srcReg6);
    srcRegFilt5 = _mm_unpacklo_epi8(srcReg7, srcReg8);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters);
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, forthFilters);

    // add and saturate the results together; adding the smaller of the two
    // middle partial sums first keeps the saturating adds from clipping an
    // intermediate value that the full sum would not clip
    minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt5);
    srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pitch;

    // shift down a row: rotate the 8-row window instead of reloading it
    srcReg1 = srcReg2;
    srcReg2 = srcReg3;
    srcReg3 = srcReg4;
    srcReg4 = srcReg5;
    srcReg5 = srcReg6;
    srcReg6 = srcReg7;
    srcReg7 = srcReg8;

    // save only 8 bytes convolve result
    _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1);

    output_ptr += out_pitch;
  }
}
// 8-tap vertical convolution of a 16-pixel-wide column using SSSE3.
// Each output row combines eight vertically adjacent source rows with the
// 8 filter taps. The 16 pixels are processed as two 8-pixel halves (the
// low halves via unpacklo, the high halves via unpackhi), then rounded
// (+64), shifted right by 7, and packed back to 16 unsigned bytes.
//
//   src_ptr        top row of the 8-row source window for the first output
//   src_pitch      byte stride between source rows
//   output_ptr     destination buffer (16-byte aligned: _mm_store_si128)
//   out_pitch      byte stride between destination rows
//   output_height  number of output rows to produce
//   filter         8 signed 16-bit filter taps
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  // (rounding bias 64 in every 16-bit lane, applied before the >> 7 shift)
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter (taps 0 and 1)
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter (taps 2 and 3)
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter (taps 4 and 5)
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits in the filter (taps 6 and 7)
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load the first 16 bytes (row 0)
    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
    // load the next 16 bytes in stride of src_pitch (rows 1, 6 and 7)
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6));
    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7));

    // merge the rows pairwise: unpacklo handles pixels 0-7, unpackhi 8-15
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

    // load the next 16 bytes in stride of two/three src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3));

    // merge the result together
    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);

    // load the next 16 bytes in stride of four/five src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5));

    // merge the result together
    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);

    // add and saturate the results together; summing the min of the two
    // middle partial sums first bounds intermediate saturation
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));

    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

    src_ptr += src_pitch;

    // save 16 bytes convolve result
    _mm_store_si128((__m128i *)output_ptr, srcRegFilt1);

    output_ptr += out_pitch;
  }
}