/*
 * Convert a float plane to a 9..16-bit integer plane, 8 pixels per step.
 * Each float is rounded to int32, clamped to the plane maximum
 * (1 << bits) - 1, then packed down to uint16 and stored.
 * The `th` parameter is unused in this 16-bit path.
 *
 * NOTE(review): _mm_packs_epi32 saturates as *signed* 16-bit; values above
 * 0x7FFF would be mangled if bits == 16 — presumably callers guarantee
 * bits <= 15 here; TODO confirm.
 */
static void GF_FUNC_ALIGN VS_CC
float_to_dst_gb_16bit(const float *srcp, uint8_t *d, int width, int height,
                      int src_stride, int dst_stride, float th, int bits)
{
    uint16_t *dstp = (uint16_t *)d;
    dst_stride /= 2;  /* byte stride -> uint16_t element stride */

    const __m128i clamp = _mm_set1_epi32((1 << bits) - 1);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            /* round-convert two groups of four floats to int32 */
            __m128i lo = _mm_cvtps_epi32(_mm_load_ps(srcp + x));
            __m128i hi = _mm_cvtps_epi32(_mm_load_ps(srcp + x + 4));
            /* clamp to the plane maximum, then pack 2x4x32 -> 8x16 */
            lo = mm_min_epi32(clamp, lo);
            hi = mm_min_epi32(clamp, hi);
            _mm_store_si128((__m128i *)(dstp + x), _mm_packs_epi32(lo, hi));
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
/*
 * Software float -> IEEE binary16 (half) conversion using only SSE2
 * (no F16C instruction set).  Converts the four floats in `x`; the four
 * 16-bit results end up in the low 64 bits of the returned vector, the
 * high 64 bits are zeroed by the final pack.
 */
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x)
{
    /* 2^-112 as a float (exponent field 15): multiplying by it re-biases
       the exponent from the binary32 bias (127) toward the binary16 bias (15). */
    __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23));
    __m128i inf = _mm_set1_epi32((uint32_t)255UL << 23);    /* binary32 Inf bit pattern (exponent all ones) */
    __m128i f16inf = _mm_set1_epi32((uint32_t)31UL << 23);  /* binary16 Inf, expressed in binary32 layout */
    __m128i sign_mask = _mm_set1_epi32(0x80000000UL);
    __m128i round_mask = _mm_set1_epi32(~0x0FFFU);          /* clears low mantissa bits before the rebias multiply */
    __m128i ret_0x7E00 = _mm_set1_epi32(0x7E00);            /* binary16 quiet NaN */
    __m128i ret_0x7C00 = _mm_set1_epi32(0x7C00);            /* binary16 +Inf */
    __m128i f, sign, ge_inf, eq_inf;
    f = _mm_castps_si128(x);
    /* split off the sign and work on the magnitude only */
    sign = _mm_and_si128(f, sign_mask);
    f = _mm_xor_si128(f, sign);
    ge_inf = _mm_cmpgt_epi32(f, inf);  /* magnitude > Inf pattern => input was NaN */
    eq_inf = _mm_cmpeq_epi32(f, inf);  /* input was +/-Inf */
    /* magic-multiply rebias: mask low mantissa bits, scale by 2^-112,
       then undo the mask subtraction to fold in rounding */
    f = _mm_and_si128(f, round_mask);
    f = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(f), magic));
    f = _mm_sub_epi32(f, round_mask);
    f = mm_min_epi32(f, f16inf);       /* clamp finite overflow to half Inf */
    f = _mm_srli_epi32(f, 13);         /* align exponent/mantissa to the 16-bit layout */
    /* NOTE(review): mm_blendv_ps is a project helper used here on integer
       vectors; this appears to substitute 0x7E00 for NaN lanes and 0x7C00 for
       Inf lanes — exact operand/mask semantics depend on its definition
       elsewhere in the project; confirm against that helper. */
    f = mm_blendv_ps(ret_0x7E00, f, ge_inf);
    f = mm_blendv_ps(ret_0x7C00, f, eq_inf);
    /* reattach the sign in its 16-bit position */
    sign = _mm_srli_epi32(sign, 16);
    f = _mm_or_si128(f, sign);
    /* pack the four 32-bit lanes down to 16-bit lanes in the low half */
    f = mm_packus_epi32(f, _mm_setzero_si128());
    return f;
}
/*
 * Edge-magnitude detection for a 9..16-bit plane (SSE2).
 * Computes horizontal/vertical gradients from the 4+4 taps in ar_mulxf /
 * ar_mulyf, approximates the gradient magnitude, right-shifts by eh->rshift,
 * and applies two-level thresholding: lanes >= eh->max become 0xFFFF,
 * lanes <= eh->min become 0, others keep the computed magnitude.
 *
 * NOTE(review): plane_max is unused in this function — TODO confirm intended.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* Ring of five padded line buffers; the +8 elements leave headroom for
       the x-2 / x-1 taps at the left edge (line_copy16 presumably pads
       2 px on each side — confirm against its definition). */
    uint16_t* p0 = (uint16_t *)buff + 8;
    uint16_t* p1 = p0 + bstride;
    uint16_t* p2 = p1 + bstride;
    uint16_t* p3 = p2 + bstride;
    uint16_t* p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* Prime the ring with the top rows mirrored: p2 = row 0, p1/p3 = row 1,
       p0 = row 2. */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    /* alpha*max(|gx|,|gy|) + beta*min(|gx|,|gy|) approximates
       sqrt(gx^2 + gy^2) without a square root ("alpha max plus beta min"). */
    __m128 alpha = _mm_set1_ps((float)0.96043387);
    __m128 beta = _mm_set1_ps((float)0.39782473);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);  /* low threshold */
    __m128i max = _mm_set1_epi16((int16_t)eh->max);  /* high threshold */

    for (int y = 0; y < height; y++) {
        /* advance the source row, mirroring at the bottom edge */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};  /* horizontal taps */
        uint16_t* posv[] = {p0, p1, p3, p4};                  /* vertical taps */
        for (int x = 0; x < width; x += 8) {
            /* [0] = low 4 pixels, [1] = high 4 pixels of the 8-pixel group */
            __m128 sumx[2] = {(__m128)zero, (__m128)zero};
            __m128 sumy[2] = {(__m128)zero, (__m128)zero};
            for (int i = 0; i < 4; i++) {
                /* horizontal gradient accumulation (unaligned: x +/- 1,2) */
                __m128 xmul = _mm_load_ps(ar_mulxf[i]);
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
                /* vertical gradient accumulation (aligned: same x, other rows) */
                xmul = _mm_load_ps(ar_mulyf[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
            }
            __m128i out[2];
            for (int i = 0; i < 2; i++) {
                sumx[i] = mm_abs_ps(sumx[i]);
                sumy[i] = mm_abs_ps(sumy[i]);
                __m128 t0 = _mm_max_ps(sumx[i], sumy[i]);
                __m128 t1 = _mm_min_ps(sumx[i], sumy[i]);
                t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1));
                /* scale down and clamp to 16-bit range */
                out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift);
                out[i] = mm_min_epi32(out[i], pmax);
            }
            /* pack the two 4x32 vectors into one 8x16 vector (project helper) */
            out[0] = mm_cast_epi32(out[0], out[1]);
            /* lanes >= max: cmpeq(min(out,max), max) yields all-ones there;
               OR forces those lanes to 0xFFFF */
            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);
            /* lanes <= min: cmpeq(max(out,min), min) yields all-ones there;
               ANDNOT zeroes those lanes */
            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);
            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }
        dstp += stride;
        /* rotate the line ring: p4 wraps back to the first buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
/*
 * 5x5 convolution for a 9..16-bit plane (SSE2).
 * For each output pixel: sum the 25 weighted neighbors (weights ch->m[0..24]),
 * multiply by ch->rdiv, add ch->bias; if ch->saturate is false take the
 * absolute value; finally clamp the result into [0, 0xFFFF] and store.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;    /* byte strides -> uint16_t element strides */
    bstride /= 2;

    /* Ring of five padded line buffers; the +8 elements leave headroom for
       the x-2 / x-1 taps at the left edge (line_copy16 presumably pads
       2 px on each side — confirm against its definition). */
    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* Prime the ring with the top rows mirrored: p2 = row 0, p1/p3 = row 1,
       p0 = row 2. */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);    /* reciprocal of the kernel divisor */
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);
    /* broadcast each of the 25 kernel weights once, outside the pixel loops */
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        /* advance the source row, mirroring at the bottom edge */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        /* the 25 tap pointers for this row, in kernel order (row-major) */
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };
        for (int x = 0; x < width; x += 8) {
            /* [0] = low 4 pixels, [1] = high 4 pixels of the 8-pixel group */
            __m128 sum[2] = {(__m128)zero, (__m128)zero};
            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                /* widen uint16 -> int32 -> float before weighting */
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }
            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i] = _mm_mul_ps(sum[i], rdiv);
                sum[i] = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    /* unsaturated mode keeps the magnitude of negative sums */
                    sum[i] = mm_abs_ps(sum[i]);
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                sumi[i] = mm_min_epi32(sumi[i], max);
                /* zero out lanes that are <= 0 after rounding */
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask);
            }
            /* pack the two 4x32 vectors into one 8x16 vector (project helper) */
            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]);
            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }
        dstp += stride;
        /* rotate the line ring: p4 wraps back to the first buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}