Esempio n. 1
0
// Converts the 16-bit unsigned samples in columns [left, right) of src to
// float and writes the results to the same columns of dst.
//
// The conversion itself is delegated to depth_convert_w2f_sse2_xiter (defined
// elsewhere), which is invoked once per group of eight columns with the
// broadcast scale and offset and fills two 4-lane vectors. Presumably it
// computes sample * scale + offset -- confirm against the kernel.
//
// Groups that lie entirely inside the range are written with two aligned
// 16-byte stores, so dst must be 16-byte aligned. A partial group at either
// end is converted as a whole group and then written with the partial-store
// helpers mm_store_left / mm_store_right (defined elsewhere; presumably they
// write the last / first `count` lanes of the vector -- confirm).
//
// NOTE(review): assuming ceil_n / floor_n round to a multiple of 8, when left
// and right fall inside the same group (e.g. left = 1, right = 7) both edge
// branches run and their full-vector stores touch columns outside
// [left, right). Confirm that callers tolerate this.
void depth_convert_w2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const uint16_t *in = static_cast<const uint16_t *>(src);
	float *out = static_cast<float *>(dst);

	// Bounds of the run of complete 8-column groups.
	const unsigned body_begin = ceil_n(left, 8);
	const unsigned body_end = floor_n(right, 8);

	const __m128 scale_v = _mm_set_ps1(scale);
	const __m128 offset_v = _mm_set_ps1(offset);

	// Results of one kernel call: first4 covers columns +0..+3 of the group,
	// last4 covers columns +4..+7.
	__m128 first4, last4;

	// Leading partial group: columns [left, body_begin).
	if (left != body_begin) {
		const unsigned head = body_begin - left;
		float *group = out + body_begin - 8;

		depth_convert_w2f_sse2_xiter(body_begin - 8, in, scale_v, offset_v, first4, last4);

		if (head <= 4) {
			// The range begins inside the upper vector.
			mm_store_left(group + 4, last4, head);
		} else {
			// The range begins inside the lower vector; the upper one is complete.
			mm_store_left(group, first4, head - 4);
			_mm_store_ps(group + 4, last4);
		}
	}

	// Complete groups.
	for (unsigned col = body_begin; col < body_end; col += 8) {
		float *group = out + col;

		depth_convert_w2f_sse2_xiter(col, in, scale_v, offset_v, first4, last4);

		_mm_store_ps(group, first4);
		_mm_store_ps(group + 4, last4);
	}

	// Trailing partial group: columns [body_end, right).
	if (right != body_end) {
		const unsigned tail = right - body_end;
		float *group = out + body_end;

		depth_convert_w2f_sse2_xiter(body_end, in, scale_v, offset_v, first4, last4);

		if (tail <= 4) {
			// The range ends inside the lower vector.
			mm_store_right(group, first4, tail);
		} else {
			// The lower vector is complete; the range ends inside the upper one.
			_mm_store_ps(group, first4);
			mm_store_right(group + 4, last4, tail - 4);
		}
	}
}
Esempio n. 2
0
// Converts the 8-bit unsigned samples in columns [left, right) of src to
// float and writes the results to the same columns of dst.
//
// The conversion itself is delegated to depth_convert_b2f_sse2_xiter (defined
// elsewhere), which is invoked once per group of sixteen columns with the
// broadcast scale and offset and fills four 4-lane vectors. Presumably it
// computes sample * scale + offset -- confirm against the kernel.
//
// Groups that lie entirely inside the range are written with four aligned
// 16-byte stores, so dst must be 16-byte aligned. A partial group at either
// end is converted as a whole group; the vector containing the range boundary
// is written with mm_store_idxhi_ps / mm_store_idxlo_ps (defined elsewhere;
// presumably they write lanes [idx, 4) and [0, idx) respectively -- confirm),
// and the vectors on the in-range side of it are written in full.
//
// NOTE(review): assuming ceil_n / floor_n round to a multiple of 16, when
// left and right fall inside the same group (e.g. left = 1, right = 15) both
// edge branches run and their full-vector stores touch columns outside
// [left, right). Confirm that callers tolerate this.
void depth_convert_b2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right)
{
	const uint8_t *in = static_cast<const uint8_t *>(src);
	float *out = static_cast<float *>(dst);

	// Bounds of the run of complete 16-column groups.
	const unsigned body_begin = ceil_n(left, 16);
	const unsigned body_end = floor_n(right, 16);

	const __m128 scale_v = _mm_set_ps1(scale);
	const __m128 offset_v = _mm_set_ps1(offset);

	// Results of one kernel call: q0..q3 cover columns +0, +4, +8 and +12 of
	// the group, four columns each.
	__m128 q0, q1, q2, q3;

	// Leading partial group: columns [left, body_begin).
	if (left != body_begin) {
		const unsigned head = body_begin - left;
		const unsigned left_lane = left % 4;
		float *group = out + body_begin - 16;

		depth_convert_b2f_sse2_xiter(body_begin - 16, in, scale_v, offset_v, q0, q1, q2, q3);

		if (head <= 4) {
			mm_store_idxhi_ps(group + 12, q3, left_lane);
		} else if (head <= 8) {
			mm_store_idxhi_ps(group + 8, q2, left_lane);
			_mm_store_ps(group + 12, q3);
		} else if (head <= 12) {
			mm_store_idxhi_ps(group + 4, q1, left_lane);
			_mm_store_ps(group + 8, q2);
			_mm_store_ps(group + 12, q3);
		} else {
			mm_store_idxhi_ps(group, q0, left_lane);
			_mm_store_ps(group + 4, q1);
			_mm_store_ps(group + 8, q2);
			_mm_store_ps(group + 12, q3);
		}
	}

	// Complete groups.
	for (unsigned col = body_begin; col < body_end; col += 16) {
		float *group = out + col;

		depth_convert_b2f_sse2_xiter(col, in, scale_v, offset_v, q0, q1, q2, q3);

		_mm_store_ps(group, q0);
		_mm_store_ps(group + 4, q1);
		_mm_store_ps(group + 8, q2);
		_mm_store_ps(group + 12, q3);
	}

	// Trailing partial group: columns [body_end, right).
	if (right != body_end) {
		const unsigned tail = right - body_end;
		const unsigned right_lane = right % 4;
		float *group = out + body_end;

		depth_convert_b2f_sse2_xiter(body_end, in, scale_v, offset_v, q0, q1, q2, q3);

		if (tail < 4) {
			mm_store_idxlo_ps(group, q0, right_lane);
		} else if (tail < 8) {
			_mm_store_ps(group, q0);
			mm_store_idxlo_ps(group + 4, q1, right_lane);
		} else if (tail < 12) {
			_mm_store_ps(group, q0);
			_mm_store_ps(group + 4, q1);
			mm_store_idxlo_ps(group + 8, q2, right_lane);
		} else {
			_mm_store_ps(group, q0);
			_mm_store_ps(group + 4, q1);
			_mm_store_ps(group + 8, q2);
			mm_store_idxlo_ps(group + 12, q3, right_lane);
		}
	}
}