// Sums all bytes of `array` by zero-extending groups of four bytes to 32-bit
// lanes with _mm_cvtepu8_epi32 and accumulating in four 32-bit counters.
// Assumes `size` is a multiple of 16.
uint32_t sse_sumbytes(uint8_t* array, size_t size) {
    __m128i accumulator = _mm_setzero_si128();

    for (size_t i = 0; i < size; i += 16) {
        const __m128i v      = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i v0_3   = v;
        const __m128i v4_7   = _mm_bsrli_si128(v, 1*4);
        const __m128i v8_11  = _mm_bsrli_si128(v, 2*4);
        const __m128i v12_15 = _mm_bsrli_si128(v, 3*4);

        const __m128i t0 = _mm_cvtepu8_epi32(v0_3);
        const __m128i t1 = _mm_cvtepu8_epi32(v4_7);
        const __m128i t2 = _mm_cvtepu8_epi32(v8_11);
        const __m128i t3 = _mm_cvtepu8_epi32(v12_15);

        const __m128i t01 = _mm_add_epi32(t0, t1);
        const __m128i t23 = _mm_add_epi32(t2, t3);

        accumulator = _mm_add_epi32(accumulator, t01);
        accumulator = _mm_add_epi32(accumulator, t23);
    }

    return uint32_t(_mm_extract_epi32(accumulator, 0)) +
           uint32_t(_mm_extract_epi32(accumulator, 1)) +
           uint32_t(_mm_extract_epi32(accumulator, 2)) +
           uint32_t(_mm_extract_epi32(accumulator, 3));
}
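// For reference, a minimal scalar equivalent of sse_sumbytes (a sketch, not
// from the original source), assuming the same precondition that `size` is a
// multiple of 16; useful as a check against the SIMD version above.
uint32_t scalar_sumbytes(const uint8_t* array, size_t size) {
    uint32_t sum = 0;
    for (size_t i = 0; i < size; i++)
        sum += array[i];  // each byte zero-extended, as pmovzxbd does per 32-bit lane
    return sum;
}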
// Converts three planar 8-bit (or stacked 16-bit) source planes to packed
// 4-channel half-float via an intermediate float buffer; the fourth channel is
// left at zero. STACK16 selects the stacked-16-bit path (declared here as a
// template parameter so the snippet is self-consistent); `loadl` (an 8-byte
// load helper) and convert_float_to_half are defined elsewhere in the plugin.
template <bool STACK16>
static inline void yuv_to_packed_shader_3(uint8_t** dstp, const uint8_t** srcp,
        const int dpitch, const int spitch, const int width, const int height,
        void* _buff) noexcept
{
    const uint8_t* sr = srcp[0];
    const uint8_t* sg = srcp[1];
    const uint8_t* sb = srcp[2];
    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const uint8_t *rlsb, *glsb, *blsb;
    if (STACK16) {
        rlsb = sr + height * spitch;
        glsb = sg + height * spitch;
        blsb = sb + height * spitch;
    }

    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 4) {
            __m128i r, g, b;
            if (!STACK16) {
                r = _mm_cvtepu8_epi32(loadl(sr + x));
                g = _mm_cvtepu8_epi32(loadl(sg + x));
                b = _mm_cvtepu8_epi32(loadl(sb + x));
            } else {
                r = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(rlsb + x), loadl(sr + x)), zero);
                g = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(glsb + x), loadl(sg + x)), zero);
                b = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(blsb + x), loadl(sb + x)), zero);
            }

            __m128i rg = _mm_unpacklo_epi32(r, g);
            __m128i ba = _mm_unpacklo_epi32(b, zero);
            __m128 rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x + 0, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 4, _mm_mul_ps(rgba1, rcp));

            rg = _mm_unpackhi_epi32(r, g);
            ba = _mm_unpackhi_epi32(b, zero);
            rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba));
            rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba));
            _mm_store_ps(buff + 4 * x + 8, _mm_mul_ps(rgba0, rcp));
            _mm_store_ps(buff + 4 * x + 12, _mm_mul_ps(rgba1, rcp));
        }
        convert_float_to_half(d, buff, width * 4);
        d += dpitch;
        sr += spitch;
        sg += spitch;
        sb += spitch;
        if (STACK16) {
            rlsb += spitch;
            glsb += spitch;
            blsb += spitch;
        }
    }
}
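// A minimal scalar sketch (not from the original source) of the per-pixel math
// above for the 8-bit path only, to make the normalization explicit;
// convert_float_to_half is assumed to pack the float buffer to half precision
// afterwards.
static inline void yuv_to_packed_scalar_ref(float* buff, const uint8_t* sr,
        const uint8_t* sg, const uint8_t* sb, int width)
{
    for (int x = 0; x < width; ++x) {
        buff[4 * x + 0] = sr[x] / 255.0f;  // channel 0
        buff[4 * x + 1] = sg[x] / 255.0f;  // channel 1
        buff[4 * x + 2] = sb[x] / 255.0f;  // channel 2
        buff[4 * x + 3] = 0.0f;            // fourth channel left at zero
    }
}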
static INLINE unsigned int obmc_sad_w8n(const uint8_t *pre,
                                        const int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask,
                                        const int width, const int height) {
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_b = xx_loadl_32(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_b = xx_loadl_32(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu8_epi32(v_p0_b);
    const __m128i v_p1_d = _mm_cvtepu8_epi32(v_p1_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
static INLINE unsigned int obmc_sad_w4(const uint8_t *pre,
                                       const int pre_stride,
                                       const int32_t *wsrc,
                                       const int32_t *mask, const int height) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
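// A hedged scalar sketch (not from the original source) of what the two OBMC
// SAD kernels above compute, assuming xx_roundn_epu32(v, 12) performs a
// round-to-nearest unsigned right shift by 12, as its name and the "Rounded
// absolute difference" comment suggest. Needs <stdlib.h> for abs().
#define ROUND_POWER_OF_TWO(value, n) (((value) + (1 << ((n)-1))) >> (n))

static unsigned int obmc_sad_scalar_ref(const uint8_t *pre, int pre_stride,
                                        const int32_t *wsrc,
                                        const int32_t *mask, int width,
                                        int height) {
  unsigned int sad = 0;
  for (int i = 0; i < height; i++) {
    for (int j = 0; j < width; j++)
      sad += ROUND_POWER_OF_TWO(abs(wsrc[j] - pre[j] * mask[j]), 12);
    pre += pre_stride;
    wsrc += width;
    mask += width;
  }
  return sad;
}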
inline __m128i load_aligned_int32(const uint8_t* src)
{
    __m128i tmp = _mm_loadl_epi64((const __m128i*)src);
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
    __m128i res = _mm_cvtepu8_epi32(tmp);
#else
    __m128i tmp2 = _mm_unpacklo_epi8(tmp, _mm_set1_epi8(0));
    __m128i res = _mm_unpacklo_epi16(tmp2, _mm_set1_epi16(0));
#endif
    return res;
}
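// A minimal usage sketch (hypothetical test, not part of the library): both
// the SSE4.1 path and the SSE2 fallback above zero-extend the low four bytes
// of src to four 32-bit lanes.
#include <smmintrin.h>
#include <cstdint>
#include <cstdio>

int main()
{
    alignas(16) uint8_t src[8] = { 1, 2, 250, 255, 0, 0, 0, 0 };
    __m128i v = load_aligned_int32(src);

    alignas(16) int32_t out[4];
    _mm_store_si128(reinterpret_cast<__m128i*>(out), v);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]);  // 1 2 250 255
    return 0;
}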
// Converts packed 8-bit RGB32 to packed half-float: each 4-byte pixel is
// reordered with pshufb (swapping channels 0 and 2, e.g. BGRA to RGBA),
// zero-extended with _mm_cvtepu8_epi32, normalized to [0,1], then packed to
// half precision by convert_float_to_half. Rows are traversed from the last
// row upward.
void __stdcall rgb32_to_packed_shader_3_f16c(uint8_t** dstp, const uint8_t** srcp,
        const int dpitch, const int spitch, const int width, const int height,
        void* _buff) noexcept
{
    const uint8_t* s = srcp[0] + (height - 1) * spitch;
    uint8_t* d = dstp[0];
    float* buff = reinterpret_cast<float*>(_buff);

    const __m128i zero = _mm_setzero_si128();
    const __m128 rcp = _mm_set1_ps(1.0f / 255);
    const __m128i order = _mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15);

    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += 4) {
            __m128i sx = _mm_load_si128(reinterpret_cast<const __m128i*>(s + 4 * x));
            sx = _mm_shuffle_epi8(sx, order);

            __m128i s0 = _mm_cvtepu8_epi32(sx);
            __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(s0), rcp);
            _mm_store_ps(buff + 4 * x + 0, f0);

            s0 = _mm_cvtepu8_epi32(_mm_srli_si128(sx, 4));
            f0 = _mm_mul_ps(_mm_cvtepi32_ps(s0), rcp);
            _mm_store_ps(buff + 4 * x + 4, f0);

            s0 = _mm_cvtepu8_epi32(_mm_srli_si128(sx, 8));
            f0 = _mm_mul_ps(_mm_cvtepi32_ps(s0), rcp);
            _mm_store_ps(buff + 4 * x + 8, f0);

            s0 = _mm_cvtepu8_epi32(_mm_srli_si128(sx, 12));
            f0 = _mm_mul_ps(_mm_cvtepi32_ps(s0), rcp);
            _mm_store_ps(buff + 4 * x + 12, f0);
        }
        convert_float_to_half(d, buff, width * 4);
        d += dpitch;
        s -= spitch;
    }
}
// Converts unsigned bytes to floats in [-1, 1): out[i] = (in[i] - 128) / 128.
// Note: each iteration loads 16 bytes but only uses the low four, so the final
// iterations read past the bytes actually needed; num is assumed to be a
// multiple of 4 and the buffer readable accordingly.
void convertCSSE(int num, uint8_t *in, float *out) {
    int i;
    __m128 sub = _mm_set1_ps(128.0);
    __m128 mul = _mm_set1_ps(1/128.0);
    for (i = 0; i < num; i += 4) {
        __m128i val  = _mm_loadu_si128((__m128i *)(in + i));  /* 16-byte load; low 4 bytes used */
        __m128i ints = _mm_cvtepu8_epi32(val);                /* zero-extend u8 -> i32 */
        __m128  cvtd = _mm_cvtepi32_ps(ints);                 /* i32 -> float */
        __m128  res  = _mm_mul_ps(_mm_sub_ps(cvtd, sub), mul);
        _mm_storeu_ps(out + i, res);
    }
}
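// A hedged variant (not from the original source) that loads only the four
// bytes actually consumed per iteration, avoiding the read past in + num:
#include <smmintrin.h>
#include <stdint.h>
#include <string.h>

void convertCSSE_narrow_load(int num, const uint8_t *in, float *out) {
    const __m128 sub = _mm_set1_ps(128.0f);
    const __m128 mul = _mm_set1_ps(1.0f / 128.0f);
    for (int i = 0; i < num; i += 4) {
        uint32_t bytes;
        memcpy(&bytes, in + i, sizeof(bytes));                 /* exactly 4 bytes */
        __m128i val  = _mm_cvtsi32_si128((int)bytes);
        __m128  cvtd = _mm_cvtepi32_ps(_mm_cvtepu8_epi32(val));
        _mm_storeu_ps(out + i, _mm_mul_ps(_mm_sub_ps(cvtd, sub), mul));
    }
}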
__m128i test_mm_cvtepu8_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu8_epi32
  // CHECK: call <4 x i32> @llvm.x86.sse41.pmovzxbd(<16 x i8> {{.*}})
  // CHECK-ASM: pmovzxbd %xmm{{.*}}, %xmm{{.*}}
  return _mm_cvtepu8_epi32(a);
}
__m128i test_mm_cvtepu8_epi32(__m128i a) {
  // CHECK-LABEL: test_mm_cvtepu8_epi32
  // CHECK: shufflevector <16 x i8> {{.*}}, <16 x i8> {{.*}}, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
  // CHECK: zext <4 x i8> {{.*}} to <4 x i32>
  return _mm_cvtepu8_epi32(a);
}