Пример #1
0
// Shifts 8-bit samples from src left by `shift` bits and writes them to dst,
// working on 16 samples per 128-bit vector.
//
// The main loop covers the columns between ceil_n(left, 16) and
// floor_n(right, 16) with aligned loads and stores, so both buffers are
// expected to be 16-byte aligned. The vectors straddling `left` and `right`
// are written through index-based partial-store helpers.
// NOTE(review): the helpers are assumed to write only lanes at/above
// (idxhi) or below (idxlo) the given lane index -- confirm against their
// definitions.
void left_shift_b2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint8_t *in = static_cast<const uint8_t *>(src);
	uint8_t *out = static_cast<uint8_t *>(dst);

	const unsigned body_begin = ceil_n(left, 16);
	const unsigned body_end = floor_n(right, 16);

	// Same shift amount replicated into both 64-bit halves of the register.
	__m128i shift_count = _mm_set1_epi64x(shift);

	// Leading vector that only partially belongs to the requested range.
	if (body_begin != left) {
		const unsigned head = body_begin - 16;
		__m128i v = _mm_load_si128(reinterpret_cast<const __m128i *>(in + head));
		v = mm_sll_epi8(v, shift_count);
		mm_store_idxhi_epi8(reinterpret_cast<__m128i *>(out + head), v, left % 16);
	}

	// Whole vectors.
	for (unsigned col = body_begin; col < body_end; col += 16) {
		__m128i v = _mm_load_si128(reinterpret_cast<const __m128i *>(in + col));
		v = mm_sll_epi8(v, shift_count);
		_mm_store_si128(reinterpret_cast<__m128i *>(out + col), v);
	}

	// Trailing vector that only partially belongs to the requested range.
	if (body_end != right) {
		__m128i v = _mm_load_si128(reinterpret_cast<const __m128i *>(in + body_end));
		v = mm_sll_epi8(v, shift_count);
		mm_store_idxlo_epi8(reinterpret_cast<__m128i *>(out + body_end), v, right % 16);
	}
}
Пример #2
0
// Shifts 16-bit samples from src left by `shift` bits and writes them to dst,
// working on 8 samples per 128-bit vector.
//
// Columns between ceil_n(left, 8) and floor_n(right, 8) go through aligned
// loads and stores, so both buffers are expected to be 16-byte aligned. The
// vectors straddling `left` and `right` are written with count-based
// partial-store helpers.
// NOTE(review): mm_store_left_epi16 / mm_store_right_epi16 are assumed to
// write the last / first `count` words of the vector -- confirm against
// their definitions.
void left_shift_w2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint16_t *in = static_cast<const uint16_t *>(src);
	uint16_t *out = static_cast<uint16_t *>(dst);

	const unsigned body_begin = ceil_n(left, 8);
	const unsigned body_end = floor_n(right, 8);

	// _mm_sll_epi16 takes its shift amount from the low 64 bits of this register.
	__m128i shift_count = _mm_set1_epi64x(shift);

	// Leading partial vector: body_begin - left samples are in range.
	if (body_begin != left) {
		const unsigned head = body_begin - 8;
		__m128i v = _mm_sll_epi16(_mm_load_si128(reinterpret_cast<const __m128i *>(in + head)), shift_count);
		mm_store_left_epi16(out + head, v, body_begin - left);
	}

	// Whole vectors.
	for (unsigned col = body_begin; col < body_end; col += 8) {
		__m128i v = _mm_sll_epi16(_mm_load_si128(reinterpret_cast<const __m128i *>(in + col)), shift_count);
		_mm_store_si128(reinterpret_cast<__m128i *>(out + col), v);
	}

	// Trailing partial vector: right - body_end samples are in range.
	if (body_end != right) {
		__m128i v = _mm_sll_epi16(_mm_load_si128(reinterpret_cast<const __m128i *>(in + body_end)), shift_count);
		mm_store_right_epi16(out + body_end, v, right - body_end);
	}
}
Пример #3
0
// Widens 8-bit samples from src to 16 bits (zero-extended), shifts them left
// by `shift` bits and writes them to dst.
//
// Each iteration consumes one aligned 16-byte load (16 input samples) and
// produces two 8-word output vectors. Edge vectors at `left` and `right` are
// written with count-based partial-store helpers.
// NOTE(review): mm_store_left_epi16 / mm_store_right_epi16 are assumed to
// write the last / first `count` words of the vector -- confirm against
// their definitions.
void left_shift_b2w_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint8_t *in = static_cast<const uint8_t *>(src);
	uint16_t *out = static_cast<uint16_t *>(dst);

	const unsigned body_begin = ceil_n(left, 16);
	const unsigned body_end = floor_n(right, 16);

	__m128i shift_count = _mm_set1_epi64x(shift);

	// Loads 16 bytes at `col`, zero-extends them to words and applies the shift.
	// `lo` receives samples col..col+7, `hi` receives samples col+8..col+15.
	auto widen_and_shift = [&](unsigned col, __m128i &lo, __m128i &hi) {
		__m128i bytes = _mm_load_si128(reinterpret_cast<const __m128i *>(in + col));
		lo = _mm_sll_epi16(_mm_unpacklo_epi8(bytes, _mm_setzero_si128()), shift_count);
		hi = _mm_sll_epi16(_mm_unpackhi_epi8(bytes, _mm_setzero_si128()), shift_count);
	};

	// Leading partial block.
	if (body_begin != left) {
		const unsigned pending = body_begin - left;
		uint16_t *block = out + body_begin - 16;
		__m128i lo, hi;

		widen_and_shift(body_begin - 16, lo, hi);

		if (pending <= 8) {
			// Only the upper output vector intersects the range.
			mm_store_left_epi16(block + 8, hi, pending);
		} else {
			// Lower vector is partial, upper vector is written whole.
			mm_store_left_epi16(block, lo, pending % 8);
			_mm_store_si128(reinterpret_cast<__m128i *>(block + 8), hi);
		}
	}

	// Whole blocks.
	for (unsigned col = body_begin; col < body_end; col += 16) {
		__m128i lo, hi;

		widen_and_shift(col, lo, hi);

		_mm_store_si128(reinterpret_cast<__m128i *>(out + col + 0), lo);
		_mm_store_si128(reinterpret_cast<__m128i *>(out + col + 8), hi);
	}

	// Trailing partial block.
	if (body_end != right) {
		const unsigned pending = right - body_end;
		uint16_t *block = out + body_end;
		__m128i lo, hi;

		widen_and_shift(body_end, lo, hi);

		if (pending <= 8) {
			// Only the lower output vector intersects the range.
			mm_store_right_epi16(block, lo, pending);
		} else {
			// Lower vector is written whole, upper vector is partial.
			_mm_store_si128(reinterpret_cast<__m128i *>(block), lo);
			mm_store_right_epi16(block + 8, hi, pending % 8);
		}
	}
}
Пример #4
0
// Converts 16-bit samples from src to floats in dst, 8 samples per iteration.
//
// The per-vector conversion is delegated to depth_convert_w2f_sse2_xiter,
// which receives the broadcast `scale` and `offset` and fills two 4-float
// vectors (presumably sample * scale + offset -- confirm in the helper).
// Whole results are written with aligned stores; the blocks straddling
// `left` and `right` use the count-based partial-store helpers.
// NOTE(review): mm_store_left / mm_store_right are assumed to write the
// last / first `count` floats of the vector -- confirm against their
// definitions.
void depth_convert_w2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const uint16_t *in = static_cast<const uint16_t *>(src);
	float *out = static_cast<float *>(dst);

	const unsigned body_begin = ceil_n(left, 8);
	const unsigned body_end = floor_n(right, 8);

	const __m128 scale_v = _mm_set_ps1(scale);
	const __m128 offset_v = _mm_set_ps1(offset);

	// Outputs of the most recent conversion: samples 0..3 and 4..7 of the block.
	__m128 lo, hi;

	// Converts the 8 samples starting at `col` into `lo` and `hi`.
	auto convert_at = [&](unsigned col) {
		depth_convert_w2f_sse2_xiter(col, in, scale_v, offset_v, lo, hi);
	};

	// Leading partial block.
	if (body_begin != left) {
		const unsigned pending = body_begin - left;
		float *block = out + body_begin - 8;

		convert_at(body_begin - 8);

		if (pending <= 4) {
			// Only the upper vector intersects the range.
			mm_store_left(block + 4, hi, pending);
		} else {
			mm_store_left(block, lo, pending - 4);
			_mm_store_ps(block + 4, hi);
		}
	}

	// Whole blocks.
	for (unsigned col = body_begin; col < body_end; col += 8) {
		convert_at(col);

		_mm_store_ps(out + col + 0, lo);
		_mm_store_ps(out + col + 4, hi);
	}

	// Trailing partial block.
	if (body_end != right) {
		const unsigned pending = right - body_end;
		float *block = out + body_end;

		convert_at(body_end);

		if (pending <= 4) {
			// Only the lower vector intersects the range.
			mm_store_right(block, lo, pending);
		} else {
			_mm_store_ps(block + 0, lo);
			mm_store_right(block + 4, hi, pending - 4);
		}
	}
}
Пример #5
0
// Shifts 16-bit samples from src left by `shift` bits and narrows them to
// 8 bits in dst, producing 16 output bytes per iteration.
//
// Two aligned 8-word loads are shifted and then combined with
// _mm_packus_epi16, which saturates each signed 16-bit value to the unsigned
// 8-bit range. Edge vectors at `left` and `right` are written with
// count-based partial-store helpers.
// NOTE(review): mm_store_left_epi8 / mm_store_right_epi8 are assumed to
// write the last / first `count` bytes of the vector -- confirm against
// their definitions.
void left_shift_w2b_sse2(const void *src, void *dst, unsigned shift, unsigned left, unsigned right)
{
	const uint16_t *in = static_cast<const uint16_t *>(src);
	uint8_t *out = static_cast<uint8_t *>(dst);

	const unsigned body_begin = ceil_n(left, 16);
	const unsigned body_end = floor_n(right, 16);

	__m128i shift_count = _mm_set1_epi64x(shift);

	// Loads the 16 words starting at `col`, shifts them and packs them to bytes.
	auto shift_and_pack = [&](unsigned col) -> __m128i {
		__m128i first = _mm_load_si128(reinterpret_cast<const __m128i *>(in + col + 0));
		__m128i second = _mm_load_si128(reinterpret_cast<const __m128i *>(in + col + 8));
		first = _mm_sll_epi16(first, shift_count);
		second = _mm_sll_epi16(second, shift_count);
		return _mm_packus_epi16(first, second);
	};

	// Leading partial vector: body_begin - left samples are in range.
	if (body_begin != left) {
		const unsigned head = body_begin - 16;
		__m128i packed = shift_and_pack(head);
		mm_store_left_epi8(out + head, packed, body_begin - left);
	}

	// Whole vectors.
	for (unsigned col = body_begin; col < body_end; col += 16) {
		__m128i packed = shift_and_pack(col);
		_mm_store_si128(reinterpret_cast<__m128i *>(out + col), packed);
	}

	// Trailing partial vector: right - body_end samples are in range.
	if (body_end != right) {
		__m128i packed = shift_and_pack(body_end);
		mm_store_right_epi8(out + body_end, packed, right - body_end);
	}
}
Пример #6
0
// Converts 8-bit samples from src to floats in dst, 16 samples per iteration.
//
// The per-block conversion is delegated to depth_convert_b2f_sse2_xiter,
// which receives the broadcast `scale` and `offset` and fills four 4-float
// vectors (presumably sample * scale + offset -- confirm in the helper).
// Whole results are written with aligned stores. In the blocks straddling
// `left` and `right`, exactly one of the four vectors is written through an
// index-based partial-store helper and the vectors fully inside the range
// are written whole.
// NOTE(review): mm_store_idxhi_ps / mm_store_idxlo_ps are assumed to write
// only lanes at/above or below the given lane index -- confirm against
// their definitions.
void depth_convert_b2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const uint8_t *in = static_cast<const uint8_t *>(src);
	float *out = static_cast<float *>(dst);

	const unsigned body_begin = ceil_n(left, 16);
	const unsigned body_end = floor_n(right, 16);

	const __m128 scale_v = _mm_set_ps1(scale);
	const __m128 offset_v = _mm_set_ps1(offset);

	// Outputs of the most recent conversion, in ascending sample order:
	// q0 = samples 0..3, q1 = 4..7, q2 = 8..11, q3 = 12..15 of the block.
	__m128 q0, q1, q2, q3;

	// Converts the 16 samples starting at `col` into q0..q3.
	auto convert_at = [&](unsigned col) {
		depth_convert_b2f_sse2_xiter(col, in, scale_v, offset_v, q0, q1, q2, q3);
	};

	// Leading partial block.
	if (body_begin != left) {
		const unsigned pending = body_begin - left;
		const unsigned lane = left % 4;
		float *block = out + body_begin - 16;

		convert_at(body_begin - 16);

		// The partial vector is the first one that intersects the range ...
		if (pending > 12)
			mm_store_idxhi_ps(block + 0, q0, lane);
		else if (pending > 8)
			mm_store_idxhi_ps(block + 4, q1, lane);
		else if (pending > 4)
			mm_store_idxhi_ps(block + 8, q2, lane);
		else
			mm_store_idxhi_ps(block + 12, q3, lane);

		// ... and every vector after it is written whole.
		if (pending > 12)
			_mm_store_ps(block + 4, q1);
		if (pending > 8)
			_mm_store_ps(block + 8, q2);
		if (pending > 4)
			_mm_store_ps(block + 12, q3);
	}

	// Whole blocks.
	for (unsigned col = body_begin; col < body_end; col += 16) {
		convert_at(col);

		_mm_store_ps(out + col + 0, q0);
		_mm_store_ps(out + col + 4, q1);
		_mm_store_ps(out + col + 8, q2);
		_mm_store_ps(out + col + 12, q3);
	}

	// Trailing partial block.
	if (body_end != right) {
		const unsigned pending = right - body_end;
		const unsigned lane = right % 4;
		float *block = out + body_end;

		convert_at(body_end);

		// Vectors lying entirely inside the range are written whole ...
		if (pending >= 4)
			_mm_store_ps(block + 0, q0);
		if (pending >= 8)
			_mm_store_ps(block + 4, q1);
		if (pending >= 12)
			_mm_store_ps(block + 8, q2);

		// ... followed by the single partial vector that contains `right`.
		if (pending >= 12)
			mm_store_idxlo_ps(block + 12, q3, lane);
		else if (pending >= 8)
			mm_store_idxlo_ps(block + 8, q2, lane);
		else if (pending >= 4)
			mm_store_idxlo_ps(block + 4, q1, lane);
		else
			mm_store_idxlo_ps(block, q0, lane);
	}
}
Пример #7
0
		x = _mm_mul_ps(c, x);
		accum1 = _mm_add_ps(accum1, x);
	}

	if (!FWidth || FWidth >= 2)
		accum0 = _mm_add_ps(accum0, accum1);

	return accum0;
}

template <unsigned FWidth, unsigned Tail>
void resize_line4_h_f32_sse(const unsigned * RESTRICT filter_left, const float * RESTRICT filter_data, unsigned filter_stride, unsigned filter_width,
                            const float * RESTRICT src, float * const * RESTRICT dst, unsigned src_base, unsigned left, unsigned right)
{
	unsigned vec_left = ceil_n(left, 4);
	unsigned vec_right = floor_n(right, 4);

	float *dst_p0 = dst[0];
	float *dst_p1 = dst[1];
	float *dst_p2 = dst[2];
	float *dst_p3 = dst[3];

#define XITER resize_line4_h_f32_sse_xiter<FWidth, Tail>
#define XARGS filter_left, filter_data, filter_stride, filter_width, src, src_base
	for (unsigned j = left; j < vec_left; ++j) {
		__m128 x = XITER(j, XARGS);
		mm_scatter_ps(dst_p0 + j, dst_p1 + j, dst_p2 + j, dst_p3 + j, x);
	}

	for (unsigned j = vec_left; j < vec_right; j += 4) {
		__m128 x0, x1, x2, x3;
Пример #8
0
template <bool DoLoop, unsigned Tail>
__m128i resize_line8_h_u16_sse2_xiter(unsigned j,
                                      const unsigned *filter_left, const int16_t * RESTRICT filter_data, unsigned filter_stride, unsigned filter_width,
                                      const uint16_t * RESTRICT src_ptr, unsigned src_base, uint16_t limit)
{
	const __m128i i16_min = _mm_set1_epi16(INT16_MIN);
	const __m128i lim = _mm_set1_epi16(limit + INT16_MIN);

	const int16_t *filter_coeffs = filter_data + j * filter_stride;
	const uint16_t *src_p = src_ptr + (filter_left[j] - src_base) * 8;

	__m128i accum_lo = _mm_setzero_si128();
	__m128i accum_hi = _mm_setzero_si128();
	__m128i x0, x1, xl, xh, c, coeffs;

	unsigned k_end = DoLoop ? floor_n(filter_width, 8) : 0;

	for (unsigned k = 0; k < k_end; k += 8) {
		coeffs = _mm_load_si128((const __m128i *)(filter_coeffs + k));

		c = _mm_shuffle_epi32(coeffs, _MM_SHUFFLE(0, 0, 0, 0));
		x0 = _mm_load_si128((const __m128i *)(src_p + (k + 0) * 8));
		x1 = _mm_load_si128((const __m128i *)(src_p + (k + 1) * 8));
		x0 = _mm_add_epi16(x0, i16_min);
		x1 = _mm_add_epi16(x1, i16_min);

		xl = _mm_unpacklo_epi16(x0, x1);
		xh = _mm_unpackhi_epi16(x0, x1);
		xl = _mm_madd_epi16(c, xl);
		xh = _mm_madd_epi16(c, xh);