void left_shift_b2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint8_t *src_p = static_cast<const uint8_t *>(src);
	uint8_t *dst_p = static_cast<uint8_t *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	__m128i count = _mm_set1_epi64x(shift);

	// Left edge: process a full vector, but store only the bytes from index left % 16 upward.
	if (left != vec_left) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
		x = mm_sll_epi8(x, count);
		mm_store_idxhi_epi8((__m128i *)(dst_p + vec_left - 16), x, left % 16);
	}

	// Main loop: 16 aligned bytes per iteration.
	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + j));
		x = mm_sll_epi8(x, count);
		_mm_store_si128((__m128i *)(dst_p + j), x);
	}

	// Right edge: store only the bytes below index right % 16.
	if (right != vec_right) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
		x = mm_sll_epi8(x, count);
		mm_store_idxlo_epi8((__m128i *)(dst_p + vec_right), x, right % 16);
	}
}
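// Reference sketch, not part of the original source: SSE2 provides no per-byte
// shift instruction, so mm_sll_epi8() used above must be a helper defined
// elsewhere. Assuming the shift amount is also available as a plain unsigned
// (as it is in the callers in this file), one way to emulate the byte shift is
// to shift whole 16-bit lanes and mask off the bits that crossed a byte
// boundary. The name sll_epi8_sketch is hypothetical.
static inline __m128i sll_epi8_sketch(__m128i x, unsigned shift)
{
	// Per-byte mask with the low 'shift' bits cleared, e.g. 0xF8 for shift == 3.
	__m128i mask = _mm_set1_epi8(static_cast<char>(0xFFU << shift));
	// Shift 16-bit lanes, then drop the bits that leaked in from the lower byte of each lane.
	__m128i count = _mm_cvtsi32_si128(static_cast<int>(shift));
	return _mm_and_si128(_mm_sll_epi16(x, count), mask);
}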
void left_shift_w2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint16_t *src_p = static_cast<const uint16_t *>(src);
	uint16_t *dst_p = static_cast<uint16_t *>(dst);

	unsigned vec_left = ceil_n(left, 8);
	unsigned vec_right = floor_n(right, 8);

	__m128i count = _mm_set1_epi64x(shift);

	// Left edge: store only the last vec_left - left words of the partial vector.
	if (left != vec_left) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 8));
		x = _mm_sll_epi16(x, count);
		mm_store_left_epi16(dst_p + vec_left - 8, x, vec_left - left);
	}

	// Main loop: 8 aligned words per iteration.
	for (unsigned j = vec_left; j < vec_right; j += 8) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + j));
		x = _mm_sll_epi16(x, count);
		_mm_store_si128((__m128i *)(dst_p + j), x);
	}

	// Right edge: store only the first right - vec_right words of the partial vector.
	if (right != vec_right) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
		x = _mm_sll_epi16(x, count);
		mm_store_right_epi16(dst_p + vec_right, x, right - vec_right);
	}
}
void left_shift_b2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint8_t *src_p = static_cast<const uint8_t *>(src);
	uint16_t *dst_p = static_cast<uint16_t *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	__m128i count = _mm_set1_epi64x(shift);

	// Left edge: 16 source bytes expand to two vectors of 8 words, so the
	// partial store may touch one or both of them.
	if (left != vec_left) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
		__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
		__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);

		if (vec_left - left > 8) {
			mm_store_left_epi16(dst_p + vec_left - 16, lo, (vec_left - left) % 8);
			_mm_store_si128((__m128i *)(dst_p + vec_left - 8), hi);
		} else {
			mm_store_left_epi16(dst_p + vec_left - 8, hi, vec_left - left);
		}
	}

	// Main loop: zero-extend 16 bytes to 16 words, shift, and store both halves.
	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + j));
		__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
		__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);

		_mm_store_si128((__m128i *)(dst_p + j + 0), lo);
		_mm_store_si128((__m128i *)(dst_p + j + 8), hi);
	}

	// Right edge: same as the left edge, mirrored.
	if (right != vec_right) {
		__m128i x = _mm_load_si128((const __m128i *)(src_p + vec_right));
		__m128i lo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
		__m128i hi = _mm_unpackhi_epi8(x, _mm_setzero_si128());

		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);

		if (right - vec_right > 8) {
			_mm_store_si128((__m128i *)(dst_p + vec_right), lo);
			mm_store_right_epi16(dst_p + vec_right + 8, hi, (right - vec_right) % 8);
		} else {
			mm_store_right_epi16(dst_p + vec_right, lo, right - vec_right);
		}
	}
}
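// Reference sketch, not part of the original source: the scalar operation that
// left_shift_b2w_sse2() vectorizes is simply a zero-extend followed by a left
// shift. A minimal scalar version over the same [left, right) range, with a
// hypothetical name:
static inline void left_shift_b2w_scalar_sketch(const uint8_t *src, uint16_t *dst, unsigned shift, unsigned left, unsigned right)
{
	for (unsigned j = left; j < right; ++j) {
		// Widen first so the bits shifted past bit 7 are kept.
		dst[j] = static_cast<uint16_t>(static_cast<unsigned>(src[j]) << shift);
	}
}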
void depth_convert_w2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const uint16_t *src_p = static_cast<const uint16_t *>(src);
	float *dst_p = static_cast<float *>(dst);

	unsigned vec_left = ceil_n(left, 8);
	unsigned vec_right = floor_n(right, 8);

	const __m128 scale_ps = _mm_set_ps1(scale);
	const __m128 offset_ps = _mm_set_ps1(offset);

	__m128 lo, hi;

#define XITER depth_convert_w2f_sse2_xiter
#define XARGS src_p, scale_ps, offset_ps, lo, hi
	// Left edge: 8 source words become two float vectors; store only the part past 'left'.
	if (left != vec_left) {
		XITER(vec_left - 8, XARGS);

		if (vec_left - left > 4) {
			mm_store_left(dst_p + vec_left - 8, lo, vec_left - left - 4);
			_mm_store_ps(dst_p + vec_left - 4, hi);
		} else {
			mm_store_left(dst_p + vec_left - 4, hi, vec_left - left);
		}
	}

	// Main loop: 8 words converted to 8 floats per iteration.
	for (unsigned j = vec_left; j < vec_right; j += 8) {
		XITER(j, XARGS);

		_mm_store_ps(dst_p + j + 0, lo);
		_mm_store_ps(dst_p + j + 4, hi);
	}

	// Right edge: store only the part before 'right'.
	if (right != vec_right) {
		XITER(vec_right, XARGS);

		if (right - vec_right > 4) {
			_mm_store_ps(dst_p + vec_right + 0, lo);
			mm_store_right(dst_p + vec_right + 4, hi, right - vec_right - 4);
		} else {
			mm_store_right(dst_p + vec_right, lo, right - vec_right);
		}
	}
#undef XITER
#undef XARGS
}
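// Sketch of the per-iteration step hidden behind the XITER macro above; the
// real depth_convert_w2f_sse2_xiter is defined elsewhere, so the name and
// signature below are illustrative only. With plain SSE2 the step amounts to:
// load 8 words, zero-extend to 32-bit integers, convert to float, then apply
// the affine transform x * scale + offset.
static inline void w2f_xiter_sketch(unsigned j, const uint16_t *src_p, __m128 scale_ps, __m128 offset_ps, __m128 &lo, __m128 &hi)
{
	__m128i x = _mm_load_si128((const __m128i *)(src_p + j));

	// Zero-extend the 8 uint16 values into two vectors of 4 int32 each.
	__m128i lo_dw = _mm_unpacklo_epi16(x, _mm_setzero_si128());
	__m128i hi_dw = _mm_unpackhi_epi16(x, _mm_setzero_si128());

	// Convert to float and rescale.
	lo = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(lo_dw), scale_ps), offset_ps);
	hi = _mm_add_ps(_mm_mul_ps(_mm_cvtepi32_ps(hi_dw), scale_ps), offset_ps);
}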
void left_shift_w2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint16_t *src_p = static_cast<const uint16_t *>(src);
	uint8_t *dst_p = static_cast<uint8_t *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	__m128i count = _mm_set1_epi64x(shift);

	// Left edge: shift two vectors of 8 words, narrow to 16 bytes with unsigned
	// saturation, and store only the last vec_left - left bytes.
	if (left != vec_left) {
		__m128i lo = _mm_load_si128((const __m128i *)(src_p + vec_left - 16));
		__m128i hi = _mm_load_si128((const __m128i *)(src_p + vec_left - 8));

		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);
		lo = _mm_packus_epi16(lo, hi);

		mm_store_left_epi8(dst_p + vec_left - 16, lo, vec_left - left);
	}

	// Main loop: 16 words in, 16 bytes out per iteration.
	for (unsigned j = vec_left; j < vec_right; j += 16) {
		__m128i lo = _mm_load_si128((const __m128i *)(src_p + j + 0));
		__m128i hi = _mm_load_si128((const __m128i *)(src_p + j + 8));

		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);
		lo = _mm_packus_epi16(lo, hi);

		_mm_store_si128((__m128i *)(dst_p + j), lo);
	}

	// Right edge: store only the first right - vec_right bytes.
	if (right != vec_right) {
		__m128i lo = _mm_load_si128((const __m128i *)(src_p + vec_right + 0));
		__m128i hi = _mm_load_si128((const __m128i *)(src_p + vec_right + 8));

		lo = _mm_sll_epi16(lo, count);
		hi = _mm_sll_epi16(hi, count);
		lo = _mm_packus_epi16(lo, hi);

		mm_store_right_epi8(dst_p + vec_right, lo, right - vec_right);
	}
}
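// Reference sketch, not part of the original source: left_shift_w2b_sse2()
// shifts in 16-bit lanes and narrows with _mm_packus_epi16, which saturates the
// signed 16-bit lane value to [0, 255]. A scalar equivalent of that lane
// behaviour, with a hypothetical name:
static inline void left_shift_w2b_scalar_sketch(const uint16_t *src, uint8_t *dst, unsigned shift, unsigned left, unsigned right)
{
	for (unsigned j = left; j < right; ++j) {
		// 16-bit lane result of the shift, reinterpreted as signed (as packus sees it).
		int16_t v = static_cast<int16_t>(src[j] << shift);
		// Unsigned saturation to a byte, matching _mm_packus_epi16.
		dst[j] = static_cast<uint8_t>(v < 0 ? 0 : (v > 255 ? 255 : v));
	}
}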
void depth_convert_b2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const uint8_t *src_p = static_cast<const uint8_t *>(src);
	float *dst_p = static_cast<float *>(dst);

	unsigned vec_left = ceil_n(left, 16);
	unsigned vec_right = floor_n(right, 16);

	const __m128 scale_ps = _mm_set_ps1(scale);
	const __m128 offset_ps = _mm_set_ps1(offset);

	__m128 lolo, lohi, hilo, hihi;

#define XITER depth_convert_b2f_sse2_xiter
#define XARGS src_p, scale_ps, offset_ps, lolo, lohi, hilo, hihi
	// Left edge: 16 source bytes become four float vectors (lolo..hihi); store
	// full vectors where possible and a partial store for the first one touched.
	if (left != vec_left) {
		XITER(vec_left - 16, XARGS);

		if (vec_left - left > 12) {
			mm_store_idxhi_ps(dst_p + vec_left - 16, lolo, left % 4);
			_mm_store_ps(dst_p + vec_left - 12, lohi);
			_mm_store_ps(dst_p + vec_left - 8, hilo);
			_mm_store_ps(dst_p + vec_left - 4, hihi);
		} else if (vec_left - left > 8) {
			mm_store_idxhi_ps(dst_p + vec_left - 12, lohi, left % 4);
			_mm_store_ps(dst_p + vec_left - 8, hilo);
			_mm_store_ps(dst_p + vec_left - 4, hihi);
		} else if (vec_left - left > 4) {
			mm_store_idxhi_ps(dst_p + vec_left - 8, hilo, left % 4);
			_mm_store_ps(dst_p + vec_left - 4, hihi);
		} else {
			mm_store_idxhi_ps(dst_p + vec_left - 4, hihi, left % 4);
		}
	}

	// Main loop: 16 bytes converted to 16 floats per iteration.
	for (unsigned j = vec_left; j < vec_right; j += 16) {
		XITER(j, XARGS);

		_mm_store_ps(dst_p + j + 0, lolo);
		_mm_store_ps(dst_p + j + 4, lohi);
		_mm_store_ps(dst_p + j + 8, hilo);
		_mm_store_ps(dst_p + j + 12, hihi);
	}

	// Right edge: mirror of the left edge, with the partial store on the last vector touched.
	if (right != vec_right) {
		XITER(vec_right, XARGS);

		if (right - vec_right >= 12) {
			_mm_store_ps(dst_p + vec_right + 0, lolo);
			_mm_store_ps(dst_p + vec_right + 4, lohi);
			_mm_store_ps(dst_p + vec_right + 8, hilo);
			mm_store_idxlo_ps(dst_p + vec_right + 12, hihi, right % 4);
		} else if (right - vec_right >= 8) {
			_mm_store_ps(dst_p + vec_right + 0, lolo);
			_mm_store_ps(dst_p + vec_right + 4, lohi);
			mm_store_idxlo_ps(dst_p + vec_right + 8, hilo, right % 4);
		} else if (right - vec_right >= 4) {
			_mm_store_ps(dst_p + vec_right + 0, lolo);
			mm_store_idxlo_ps(dst_p + vec_right + 4, lohi, right % 4);
		} else {
			mm_store_idxlo_ps(dst_p + vec_right, lolo, right % 4);
		}
	}
#undef XITER
#undef XARGS
}
		x = _mm_mul_ps(c, x);
		accum1 = _mm_add_ps(accum1, x);
	}

	if (!FWidth || FWidth >= 2)
		accum0 = _mm_add_ps(accum0, accum1);

	return accum0;
}

template <unsigned FWidth, unsigned Tail>
void resize_line4_h_f32_sse(const unsigned * RESTRICT filter_left, const float * RESTRICT filter_data, unsigned filter_stride, unsigned filter_width,
                            const float * RESTRICT src, float * const * RESTRICT dst, unsigned src_base, unsigned left, unsigned right)
{
	unsigned vec_left = ceil_n(left, 4);
	unsigned vec_right = floor_n(right, 4);

	float *dst_p0 = dst[0];
	float *dst_p1 = dst[1];
	float *dst_p2 = dst[2];
	float *dst_p3 = dst[3];
#define XITER resize_line4_h_f32_sse_xiter<FWidth, Tail>
#define XARGS filter_left, filter_data, filter_stride, filter_width, src, src_base
	// Scalar prologue: columns before the first 4-aligned column are scattered
	// to the four output rows one at a time.
	for (unsigned j = left; j < vec_left; ++j) {
		__m128 x = XITER(j, XARGS);
		mm_scatter_ps(dst_p0 + j, dst_p1 + j, dst_p2 + j, dst_p3 + j, x);
	}

	// Vector body: four adjacent output columns per iteration.
	for (unsigned j = vec_left; j < vec_right; j += 4) {
		__m128 x0, x1, x2, x3;
template <bool DoLoop, unsigned Tail>
__m128i resize_line8_h_u16_sse2_xiter(unsigned j, const unsigned *filter_left, const int16_t * RESTRICT filter_data, unsigned filter_stride, unsigned filter_width,
                                      const uint16_t * RESTRICT src_ptr, unsigned src_base, uint16_t limit)
{
	const __m128i i16_min = _mm_set1_epi16(INT16_MIN);
	const __m128i lim = _mm_set1_epi16(limit + INT16_MIN);

	const int16_t *filter_coeffs = filter_data + j * filter_stride;
	const uint16_t *src_p = src_ptr + (filter_left[j] - src_base) * 8;

	__m128i accum_lo = _mm_setzero_si128();
	__m128i accum_hi = _mm_setzero_si128();
	__m128i x0, x1, xl, xh, c, coeffs;

	unsigned k_end = DoLoop ? floor_n(filter_width, 8) : 0;

	for (unsigned k = 0; k < k_end; k += 8) {
		coeffs = _mm_load_si128((const __m128i *)(filter_coeffs + k));

		// Broadcast the first pair of adjacent filter taps to all four dwords.
		c = _mm_shuffle_epi32(coeffs, _MM_SHUFFLE(0, 0, 0, 0));
		x0 = _mm_load_si128((const __m128i *)(src_p + (k + 0) * 8));
		x1 = _mm_load_si128((const __m128i *)(src_p + (k + 1) * 8));
		// Bias the unsigned samples by INT16_MIN so they can be treated as
		// signed words by _mm_madd_epi16.
		x0 = _mm_add_epi16(x0, i16_min);
		x1 = _mm_add_epi16(x1, i16_min);

		// Interleave the two sample vectors so each dword holds one sample from
		// each tap, then multiply-accumulate both taps at once.
		xl = _mm_unpacklo_epi16(x0, x1);
		xh = _mm_unpackhi_epi16(x0, x1);
		xl = _mm_madd_epi16(c, xl);
		xh = _mm_madd_epi16(c, xh);