void depth_convert_w2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right) { const uint16_t *src_p = static_cast<const uint16_t *>(src); float *dst_p = static_cast<float *>(dst); unsigned vec_left = ceil_n(left, 8); unsigned vec_right = floor_n(right, 8); const __m128 scale_ps = _mm_set_ps1(scale); const __m128 offset_ps = _mm_set_ps1(offset); __m128 lo, hi; #define XITER depth_convert_w2f_sse2_xiter #define XARGS src_p, scale_ps, offset_ps, lo, hi if (left != vec_left) { XITER(vec_left - 8, XARGS); if (vec_left - left > 4) { mm_store_left(dst_p + vec_left - 8, lo, vec_left - left - 4); _mm_store_ps(dst_p + vec_left - 4, hi); } else { mm_store_left(dst_p + vec_left - 4, hi, vec_left - left); } } for (unsigned j = vec_left; j < vec_right; j += 8) { XITER(j, XARGS); _mm_store_ps(dst_p + j + 0, lo); _mm_store_ps(dst_p + j + 4, hi); } if (right != vec_right) { XITER(vec_right, XARGS); if (right - vec_right > 4) { _mm_store_ps(dst_p + vec_right + 0, lo); mm_store_right(dst_p + vec_right + 4, hi, right - vec_right - 4); } else { mm_store_right(dst_p + vec_right, lo, right - vec_right); } } #undef XITER #undef XARGS }
void depth_convert_b2f_sse2(const void *src, void *dst, float scale, float offset, unsigned left, unsigned right) { const uint8_t *src_p = static_cast<const uint8_t *>(src); float *dst_p = static_cast<float *>(dst); unsigned vec_left = ceil_n(left, 16); unsigned vec_right = floor_n(right, 16); const __m128 scale_ps = _mm_set_ps1(scale); const __m128 offset_ps = _mm_set_ps1(offset); __m128 lolo, lohi, hilo, hihi; #define XITER depth_convert_b2f_sse2_xiter #define XARGS src_p, scale_ps, offset_ps, lolo, lohi, hilo, hihi if (left != vec_left) { XITER(vec_left - 16, XARGS); if (vec_left - left > 12) { mm_store_idxhi_ps(dst_p + vec_left - 16, lolo, left % 4); _mm_store_ps(dst_p + vec_left - 12, lohi); _mm_store_ps(dst_p + vec_left - 8, hilo); _mm_store_ps(dst_p + vec_left - 4, hihi); } else if (vec_left - left > 8) { mm_store_idxhi_ps(dst_p + vec_left - 12, lohi, left % 4); _mm_store_ps(dst_p + vec_left - 8, hilo); _mm_store_ps(dst_p + vec_left - 4, hihi); } else if (vec_left - left > 4) { mm_store_idxhi_ps(dst_p + vec_left - 8, hilo, left % 4); _mm_store_ps(dst_p + vec_left - 4, hihi); } else { mm_store_idxhi_ps(dst_p + vec_left - 4, hihi, left % 4); } } for (unsigned j = vec_left; j < vec_right; j += 16) { XITER(j, XARGS); _mm_store_ps(dst_p + j + 0, lolo); _mm_store_ps(dst_p + j + 4, lohi); _mm_store_ps(dst_p + j + 8, hilo); _mm_store_ps(dst_p + j + 12, hihi); } if (right != vec_right) { XITER(vec_right, XARGS); if (right - vec_right >= 12) { _mm_store_ps(dst_p + vec_right + 0, lolo); _mm_store_ps(dst_p + vec_right + 4, lohi); _mm_store_ps(dst_p + vec_right + 8, hilo); mm_store_idxlo_ps(dst_p + vec_right + 12, hihi, right % 4); } else if (right - vec_right >= 8) { _mm_store_ps(dst_p + vec_right + 0, lolo); _mm_store_ps(dst_p + vec_right + 4, lohi); mm_store_idxlo_ps(dst_p + vec_right + 8, hilo, right % 4); } else if (right - vec_right >= 4) { _mm_store_ps(dst_p + vec_right + 0, lolo); mm_store_idxlo_ps(dst_p + vec_right + 4, lohi, right % 4); } else { mm_store_idxlo_ps(dst_p + vec_right, lolo, right % 4); } } #undef XITER #undef XARGS }