static inline __m128i local_abs_epi32(__m128i val) {
    // Branchless 32-bit abs: mask is 0 for non-negative lanes and -1 for negative
    // lanes, so (val ^ mask) - mask negates exactly the negative lanes.
    __m128i mask = _mm_srai_epi32(val, 31);
    val = _mm_xor_si128(val, mask);
    val = _mm_sub_epi32(val, mask);
    return val;
}
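A minimal standalone check of the sign-mask abs identity used above; the harness, helper name, and test values are illustrative, not from the original source.

#include <emmintrin.h>
#include <cstdio>
#include <cstdlib>

// Same trick as local_abs_epi32: per lane, (v ^ mask) - mask == |v|
// (INT_MIN wraps, as with any two's-complement abs).
static inline __m128i abs_epi32_sse2(__m128i v) {
    const __m128i mask = _mm_srai_epi32(v, 31);
    return _mm_sub_epi32(_mm_xor_si128(v, mask), mask);
}

int main() {
    const int in[4] = { -7, 0, 123456, -123456 };
    int out[4];
    _mm_storeu_si128((__m128i*)out, abs_epi32_sse2(_mm_loadu_si128((const __m128i*)in)));
    for (int i = 0; i < 4; ++i)
        std::printf("%d -> %d (expect %d)\n", in[i], out[i], std::abs(in[i]));
    return 0;
}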
static __m128i xor_modeproc_SSE2(const __m128i& src, const __m128i& dst) {
    // Porter-Duff XOR: alpha = sa + da - 2*sa*da/255;
    // each color channel is (src*(255 - da) + dst*(255 - sa)) / 255.
    __m128i sa = SkGetPackedA32_SSE2(src);
    __m128i da = SkGetPackedA32_SSE2(dst);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);

    __m128i a1 = _mm_add_epi32(sa, da);
    __m128i a2 = SkAlphaMulAlpha_SSE2(sa, da);
    a2 = _mm_slli_epi32(a2, 1);
    __m128i a = _mm_sub_epi32(a1, a2);

    __m128i r1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedR32_SSE2(src));
    __m128i r2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedR32_SSE2(dst));
    __m128i r = _mm_add_epi32(r1, r2);

    __m128i g1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedG32_SSE2(src));
    __m128i g2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedG32_SSE2(dst));
    __m128i g = _mm_add_epi32(g1, g2);

    __m128i b1 = SkAlphaMulAlpha_SSE2(ida, SkGetPackedB32_SSE2(src));
    __m128i b2 = SkAlphaMulAlpha_SSE2(isa, SkGetPackedB32_SSE2(dst));
    __m128i b = _mm_add_epi32(b1, b2);

    return SkPackARGB32_SSE2(a, r, g, b);
}
static void sub(const RView& X, const RView& Y, Result& result) {
    const int * x = X.data();
    const int * y = Y.data();
    int * z = result.data();
    __m128i px, py, pz, px1, py1, pz1;
    // Process eight ints per iteration (two 128-bit lanes); 16-byte alignment is assumed.
    for (int i = 0; i < DIM_N - (DIM_N & 0x7); i += 8) {
        px = _mm_load_si128((const __m128i *)x);
        py = _mm_load_si128((const __m128i *)y);
        pz = _mm_sub_epi32(px, py);
        px1 = _mm_load_si128((const __m128i *)(x + 4));
        py1 = _mm_load_si128((const __m128i *)(y + 4));
        _mm_store_si128((__m128i *)z, pz);
        pz1 = _mm_sub_epi32(px1, py1);
        _mm_store_si128((__m128i *)(z + 4), pz1);
        x += 8;
        y += 8;
        z += 8;
    }
    // Scalar tail for the remaining (DIM_N mod 8) elements.
    for (int i = DIM_N - (DIM_N & 0x7); i < DIM_N; ++i) {
        result[i] = X[i] - Y[i];
    }
}
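If the buffers are not guaranteed to be 16-byte aligned, the aligned loads and stores above will fault. A minimal unaligned-safe variant of one four-element step (a hypothetical helper, not part of the original code) could look like this:

#include <emmintrin.h>

// Subtract four ints with unaligned loads/stores; one 128-bit step of sub().
static inline void sub4_unaligned(const int* x, const int* y, int* z) {
    const __m128i px = _mm_loadu_si128((const __m128i*)x);
    const __m128i py = _mm_loadu_si128((const __m128i*)y);
    _mm_storeu_si128((__m128i*)z, _mm_sub_epi32(px, py));
}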
static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    // if (2 * sc <= sa)
    __m128i tmp1 = _mm_slli_epi32(sc, 1);
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    __m128i rc1 = _mm_mullo_epi16(sc, dc);   // sc * dc
    rc1 = _mm_slli_epi32(rc1, 1);            // 2 * sc * dc
    rc1 = _mm_andnot_si128(cmp1, rc1);

    // else
    tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc), _mm_sub_epi32(sa, sc));
    tmp2 = _mm_slli_epi32(tmp2, 1);
    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
    rc2 = _mm_and_si128(cmp1, rc2);

    __m128i rc = _mm_or_si128(rc1, rc2);

    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    tmp2 = _mm_mullo_epi16(dc, isa);
    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
// For each 4x32 block __m128i in[32],
// input with index 2, 6
// output pixels: 8-15 in __m128i out[32]
static INLINE void highbd_idct32_34_4x32_quarter_2(const __m128i *in /*in[32]*/,
                                                   __m128i *out /*out[16]*/) {
  __m128i step1[32], step2[32];

  // stage 2
  highbd_partial_butterfly_sse2(in[2], cospi_30_64, cospi_2_64, &step2[8],
                                &step2[15]);
  highbd_partial_butterfly_neg_sse2(in[6], cospi_6_64, cospi_26_64, &step2[11],
                                    &step2[12]);

  // stage 3
  step1[8] = step2[8];
  step1[9] = step2[8];
  step1[14] = step2[15];
  step1[15] = step2[15];
  step1[10] = step2[11];
  step1[11] = step2[11];
  step1[12] = step2[12];
  step1[13] = step2[12];

  step1[10] =
      _mm_sub_epi32(_mm_setzero_si128(), step1[10]);  // step1[10] = -step1[10]
  step1[13] =
      _mm_sub_epi32(_mm_setzero_si128(), step1[13]);  // step1[13] = -step1[13]

  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
}
// For each 4x32 block __m128i in[32],
// input with index 2, 6, 10, 14, 18, 22, 26, 30
// output pixels: 8-15 in __m128i out[32]
static INLINE void highbd_idct32_1024_4x32_quarter_2(
    const __m128i *in /*in[32]*/, __m128i *out /*out[16]*/) {
  __m128i step1[32], step2[32];

  // stage 2
  highbd_butterfly_sse2(in[2], in[30], cospi_30_64, cospi_2_64, &step2[8],
                        &step2[15]);
  highbd_butterfly_sse2(in[18], in[14], cospi_14_64, cospi_18_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(in[10], in[22], cospi_22_64, cospi_10_64, &step2[10],
                        &step2[13]);
  highbd_butterfly_sse2(in[26], in[6], cospi_6_64, cospi_26_64, &step2[11],
                        &step2[12]);

  // stage 3
  step1[8] = _mm_add_epi32(step2[8], step2[9]);
  step1[9] = _mm_sub_epi32(step2[8], step2[9]);
  step1[14] = _mm_sub_epi32(step2[15], step2[14]);
  step1[15] = _mm_add_epi32(step2[15], step2[14]);
  step1[10] = _mm_sub_epi32(step2[10], step2[11]);  // step1[10] = -step1[10]
  step1[11] = _mm_add_epi32(step2[10], step2[11]);
  step1[12] = _mm_add_epi32(step2[13], step2[12]);
  step1[13] = _mm_sub_epi32(step2[13], step2[12]);  // step1[13] = -step1[13]

  highbd_idct32_4x32_quarter_2_stage_4_to_6(step1, out);
}
static INLINE void highbd_idct32_4x32_quarter_2_stage_4_to_6(
    __m128i *const step1 /*step1[16]*/, __m128i *const out /*out[16]*/) {
  __m128i step2[32];

  // stage 4
  step2[8] = step1[8];
  step2[15] = step1[15];
  highbd_butterfly_sse2(step1[14], step1[9], cospi_24_64, cospi_8_64, &step2[9],
                        &step2[14]);
  highbd_butterfly_sse2(step1[10], step1[13], cospi_8_64, cospi_24_64,
                        &step2[13], &step2[10]);
  step2[11] = step1[11];
  step2[12] = step1[12];

  // stage 5
  step1[8] = _mm_add_epi32(step2[8], step2[11]);
  step1[9] = _mm_add_epi32(step2[9], step2[10]);
  step1[10] = _mm_sub_epi32(step2[9], step2[10]);
  step1[11] = _mm_sub_epi32(step2[8], step2[11]);
  step1[12] = _mm_sub_epi32(step2[15], step2[12]);
  step1[13] = _mm_sub_epi32(step2[14], step2[13]);
  step1[14] = _mm_add_epi32(step2[14], step2[13]);
  step1[15] = _mm_add_epi32(step2[15], step2[12]);

  // stage 6
  out[8] = step1[8];
  out[9] = step1[9];
  highbd_butterfly_sse2(step1[13], step1[10], cospi_16_64, cospi_16_64,
                        &out[10], &out[13]);
  highbd_butterfly_sse2(step1[12], step1[11], cospi_16_64, cospi_16_64,
                        &out[11], &out[12]);
  out[14] = step1[14];
  out[15] = step1[15];
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  // First (scalar) pass: a 4-point transform over each group of four input
  // coefficients, strided 16 apart within a block and 64 between blocks.
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);
    const int a1 = (in[1 * 16] + in[3 * 16]);
    const int a2 = (in[1 * 16] - in[3 * 16]);
    const int a3 = (in[0 * 16] - in[2 * 16]);
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  // Second (SSE2) pass across the rows of tmp, with a final >> 1, packed back
  // to 16 bits.
  {
    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
    const __m128i a0 = _mm_add_epi32(src0, src2);
    const __m128i a1 = _mm_add_epi32(src1, src3);
    const __m128i a2 = _mm_sub_epi32(src1, src3);
    const __m128i a3 = _mm_sub_epi32(src0, src2);
    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
    const __m128i out0 = _mm_packs_epi32(b0, b1);
    const __m128i out1 = _mm_packs_epi32(b2, b3);
    _mm_storeu_si128((__m128i*)&out[0], out0);
    _mm_storeu_si128((__m128i*)&out[8], out1);
  }
}
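The second pass above applies, per column, a 4-point Walsh-Hadamard step followed by a halving shift. A scalar sketch of that per-column step (not from the original source):

#include <cstdint>

// One 4-point Walsh-Hadamard step with the final >> 1, as applied per column above.
static inline void wht4(const int32_t s[4], int32_t d[4]) {
    const int32_t a0 = s[0] + s[2];
    const int32_t a1 = s[1] + s[3];
    const int32_t a2 = s[1] - s[3];
    const int32_t a3 = s[0] - s[2];
    d[0] = (a0 + a1) >> 1;
    d[1] = (a3 + a2) >> 1;
    d[2] = (a3 - a2) >> 1;
    d[3] = (a0 - a1) >> 1;
}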
SIMD_INLINE __m128i Sum32ip(uint32_t * const ptr[4], size_t offset) {
    __m128i s0 = _mm_loadu_si128((__m128i*)(ptr[0] + offset));
    __m128i s1 = _mm_loadu_si128((__m128i*)(ptr[1] + offset));
    __m128i s2 = _mm_loadu_si128((__m128i*)(ptr[2] + offset));
    __m128i s3 = _mm_loadu_si128((__m128i*)(ptr[3] + offset));
    // (s0 - s1) - (s2 - s3) == s0 - s1 - s2 + s3
    return _mm_sub_epi32(_mm_sub_epi32(s0, s1), _mm_sub_epi32(s2, s3));
}
int32_t sse_sadbw_unrolled4_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16*4) {
        const __m128i v0 = _mm_loadu_si128((__m128i*)(array + i + 0*16));
        const __m128i v1 = _mm_loadu_si128((__m128i*)(array + i + 1*16));
        const __m128i v2 = _mm_loadu_si128((__m128i*)(array + i + 2*16));
        const __m128i v3 = _mm_loadu_si128((__m128i*)(array + i + 3*16));
        {
            const __m128i v  = v0;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
        {
            const __m128i v  = v1;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
        {
            const __m128i v  = v2;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
        {
            const __m128i v  = v3;
            const __m128i m  = _mm_cmplt_epi8(v, zero);
            const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);
            const __m128i va = _mm_abs_epi8(v);
            const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

            positive = _mm_add_epi32(positive, t0);
            negative = _mm_sub_epi32(negative, t1);
        }
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
SIMD_INLINE __m128 WeightedSum32f(const WeightedRect & rect, size_t offset) {
    __m128i s0 = _mm_loadu_si128((__m128i*)(rect.p0 + offset));
    __m128i s1 = _mm_loadu_si128((__m128i*)(rect.p1 + offset));
    __m128i s2 = _mm_loadu_si128((__m128i*)(rect.p2 + offset));
    __m128i s3 = _mm_loadu_si128((__m128i*)(rect.p3 + offset));
    // (s0 - s1) - (s2 - s3) == s0 - s1 - s2 + s3, then scale by the rectangle weight.
    __m128i sum = _mm_sub_epi32(_mm_sub_epi32(s0, s1), _mm_sub_epi32(s2, s3));
    return _mm_mul_ps(_mm_cvtepi32_ps(sum), _mm_set1_ps(rect.weight));
}
template <bool align> SIMD_INLINE void HogDirectionHistograms(const __m128i & t, const __m128i & l,
    const __m128i & r, const __m128i & b, Buffer & buffer, size_t col)
{
    // Zero-extend the 16-bit gradients to 32 bits, convert to float, and process
    // the low and high halves separately: dx = r - l, dy = b - t.
    HogDirectionHistograms<align>(
        _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(r, K_ZERO), _mm_unpacklo_epi16(l, K_ZERO))),
        _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(b, K_ZERO), _mm_unpacklo_epi16(t, K_ZERO))),
        buffer, col + 0);
    HogDirectionHistograms<align>(
        _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(r, K_ZERO), _mm_unpackhi_epi16(l, K_ZERO))),
        _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(b, K_ZERO), _mm_unpackhi_epi16(t, K_ZERO))),
        buffer, col + 4);
}
static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                       const __m128i& sa, const __m128i& da) {
    __m128i sd = _mm_mullo_epi16(sc, da);
    __m128i ds = _mm_mullo_epi16(dc, sa);

    __m128i cmp = _mm_cmplt_epi32(sd, ds);

    __m128i tmp = _mm_add_epi32(sc, dc);
    __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
                               _mm_andnot_si128(cmp, ret2));
    return ret;
}
static INLINE unsigned int hbd_obmc_sad_w8n(const uint8_t *pre8,
                                            const int pre_stride,
                                            const int32_t *wsrc,
                                            const int32_t *mask,
                                            const int width, const int height) {
  const uint16_t *pre = CONVERT_TO_SHORTPTR(pre8);
  const int pre_step = pre_stride - width;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  assert(width >= 8);
  assert(IS_POWER_OF_TWO(width));

  do {
    const __m128i v_p1_w = xx_loadl_64(pre + n + 4);
    const __m128i v_m1_d = xx_load_128(mask + n + 4);
    const __m128i v_w1_d = xx_load_128(wsrc + n + 4);
    const __m128i v_p0_w = xx_loadl_64(pre + n);
    const __m128i v_m0_d = xx_load_128(mask + n);
    const __m128i v_w0_d = xx_load_128(wsrc + n);

    const __m128i v_p0_d = _mm_cvtepu16_epi32(v_p0_w);
    const __m128i v_p1_d = _mm_cvtepu16_epi32(v_p1_w);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm0_d = _mm_madd_epi16(v_p0_d, v_m0_d);
    const __m128i v_pm1_d = _mm_madd_epi16(v_p1_d, v_m1_d);

    const __m128i v_diff0_d = _mm_sub_epi32(v_w0_d, v_pm0_d);
    const __m128i v_diff1_d = _mm_sub_epi32(v_w1_d, v_pm1_d);
    const __m128i v_absdiff0_d = _mm_abs_epi32(v_diff0_d);
    const __m128i v_absdiff1_d = _mm_abs_epi32(v_diff1_d);

    // Rounded absolute difference
    const __m128i v_rad0_d = xx_roundn_epu32(v_absdiff0_d, 12);
    const __m128i v_rad1_d = xx_roundn_epu32(v_absdiff1_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad0_d);
    v_sad_d = _mm_add_epi32(v_sad_d, v_rad1_d);

    n += 8;

    if (n % width == 0) pre += pre_step;
  } while (n < width * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
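The pmaddwd-for-pmulld substitution described in the comment above works because both operands are non-negative and below 2^15, so the high halfword of every 32-bit lane is zero. A small standalone illustration (values chosen for the example, not from the original source):

#include <emmintrin.h>
#include <cstdio>

int main() {
    // Each 32-bit lane holds a value < 2^15, so its high 16 bits are zero and
    // _mm_madd_epi16 computes lo*lo + 0*0, i.e. the full per-lane product.
    const __m128i a = _mm_setr_epi32(100, 2000, 30000, 7);
    const __m128i b = _mm_setr_epi32(9, 123, 2, 32000);
    const __m128i prod = _mm_madd_epi16(a, b);
    int out[4];
    _mm_storeu_si128((__m128i*)out, prod);
    std::printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 900 246000 60000 224000
    return 0;
}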
int32_t sse_sadbw_sumsignedbytes(int8_t* array, size_t size) {

    const __m128i zero = _mm_setzero_si128();
    __m128i positive = zero;
    __m128i negative = zero;

    for (size_t i=0; i < size; i += 16) {
        const __m128i v  = _mm_loadu_si128((__m128i*)(array + i));
        const __m128i m  = _mm_cmplt_epi8(v, zero);
        const __m128i va = _mm_abs_epi8(v);

        // sum just positive numbers
        const __m128i t0 = _mm_sad_epu8(_mm_andnot_si128(m, v), zero);

        // sum just negative numbers
        const __m128i t1 = _mm_sad_epu8(_mm_and_si128(m, va), zero);

        positive = _mm_add_epi32(positive, t0);
        negative = _mm_sub_epi32(negative, t1);
    }

    const __m128i accumulator = _mm_add_epi32(positive, negative);

    return int32_t(_mm_extract_epi32(accumulator, 0)) +
           int32_t(_mm_extract_epi32(accumulator, 2));
}
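A plain scalar reference for the psadbw-based sums above, useful for sanity-checking results (a sketch, not from the original source):

#include <cstdint>
#include <cstddef>

// Straightforward signed-byte sum over the same range the vector loops cover.
int32_t scalar_sumsignedbytes(const int8_t* array, size_t size) {
    int32_t sum = 0;
    for (size_t i = 0; i < size; ++i)
        sum += array[i];
    return sum;
}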
EvalSum& operator -= (const EvalSum& rhs) {
#if defined USE_AVX2_EVAL
    mm = _mm256_sub_epi32(mm, rhs.mm);
#elif defined USE_SSE_EVAL
    m[0] = _mm_sub_epi32(m[0], rhs.m[0]);
    m[1] = _mm_sub_epi32(m[1], rhs.m[1]);
#else
    m_p[0][0] -= rhs.m_p[0][0];
    m_p[0][1] -= rhs.m_p[0][1];
    m_p[1][0] -= rhs.m_p[1][0];
    m_p[1][1] -= rhs.m_p[1][1];
    m_p[2][0] -= rhs.m_p[2][0];
    m_p[2][1] -= rhs.m_p[2][1];
#endif
    return *this;
}
static inline __m128 log2f4(__m128 x) {
    __m128i exp = _mm_load_si128((__m128i*)_exp_mask);
    __m128i mant = _mm_load_si128((__m128i*)_mantissa_mask);
    __m128 one = _mm_load_ps(_ones_ps);
    __m128i i = _mm_castps_si128(x);
    // e = unbiased exponent of x, as a float.
    __m128 e = _mm_cvtepi32_ps(_mm_sub_epi32(_mm_srli_epi32(_mm_and_si128(i, exp), 23),
                                             _mm_load_si128((__m128i*)_one27)));
    // m = mantissa of x, forced into [1, 2).
    __m128 m = _mm_or_ps(_mm_castsi128_ps(_mm_and_si128(i, mant)), one);
    __m128 p;

    /* Minimax polynomial fit of log2(x)/(x - 1), for x in range [1, 2[ */
#if LOG_POLY_DEGREE == 6
    p = POLY5(m, log_p5_0, log_p5_1, log_p5_2, log_p5_3, log_p5_4, log_p5_5);
#elif LOG_POLY_DEGREE == 5
    p = POLY4(m, log_p4_0, log_p4_1, log_p4_2, log_p4_3, log_p4_4);
#elif LOG_POLY_DEGREE == 4
    p = POLY3(m, log_p3_0, log_p3_1, log_p3_2, log_p3_3);
#elif LOG_POLY_DEGREE == 3
    p = POLY2(m, log_p2_0, log_p2_1, log_p2_2);
#else
#error
#endif

    /* This effectively increases the polynomial degree by one, but ensures that log2(1) == 0 */
    p = _mm_mul_ps(p, _mm_sub_ps(m, one));

    return _mm_add_ps(p, e);
}
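A scalar analogue of the decomposition log2f4 performs: split x into exponent and mantissa so that log2(x) = e + log2(m) with m in [1, 2). This sketch uses the standard library in place of the minimax polynomial and is not part of the original source.

#include <cmath>

float log2_scalar(float x) {
    int e;
    float m = std::frexp(x, &e); // x = m * 2^e with m in [0.5, 1)
    m *= 2.0f;                   // renormalize to m in [1, 2) ...
    e -= 1;                      // ... and adjust the exponent to match
    return (float)e + std::log2(m);
}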
// Unary Ops
SIMDValue SIMDInt32x4Operation::OpAbs(const SIMDValue& value)
{
    SIMDValue result;

    X86SIMDValue x86Result;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    if (AutoSystemInfo::Data.SSE3Available())
    {
        x86Result.m128i_value = _mm_abs_epi32(v.m128i_value); // only available after SSE3
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else if (AutoSystemInfo::Data.SSE2Available())
    {
        X86SIMDValue temp, SIGNMASK;
        SIGNMASK.m128i_value = _mm_srai_epi32(v.m128i_value, 31);                      // mask = value >> 31
        temp.m128i_value = _mm_xor_si128(v.m128i_value, SIGNMASK.m128i_value);         // temp = value ^ mask
        x86Result.m128i_value = _mm_sub_epi32(temp.m128i_value, SIGNMASK.m128i_value); // temp - mask
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else
    {
        result.i32[SIMD_X] = (value.i32[SIMD_X] < 0) ? -1 * value.i32[SIMD_X] : value.i32[SIMD_X];
        result.i32[SIMD_Y] = (value.i32[SIMD_Y] < 0) ? -1 * value.i32[SIMD_Y] : value.i32[SIMD_Y];
        result.i32[SIMD_Z] = (value.i32[SIMD_Z] < 0) ? -1 * value.i32[SIMD_Z] : value.i32[SIMD_Z];
        result.i32[SIMD_W] = (value.i32[SIMD_W] < 0) ? -1 * value.i32[SIMD_W] : value.i32[SIMD_W];
    }

    return result;
}
void ColorModelView::paintEvent(QPaintEvent *)
{
    QPainter p(this);

    auto mainBounds = mainAreaBounds();
    auto sideBounds = sideAreaBounds();

    if (mainImage_.isNull()) {
        // FIXME: support other color model?
        QImage img(256, 256, QImage::Format_RGB32);
        auto *pixels = reinterpret_cast<quint32 *>(img.bits());
        auto basecolor = QColor::fromHsv(value_.hsvHue(), 255, 255);
        auto basecolorMM = _mm_setr_epi32(basecolor.blue(), basecolor.green(), basecolor.red(), 0);
        basecolorMM = _mm_add_epi32(basecolorMM, _mm_srli_epi32(basecolorMM, 7)); // map [0, 255] to [0, 256]
        auto white = _mm_set1_epi32(256 * 255);
        auto dX = _mm_sub_epi32(basecolorMM, _mm_set1_epi32(256));
        for (int y = 0; y < 256; ++y) {
            auto brightness = _mm_set1_epi32(256 - y - (y >> 7));
            auto col = white; // [0, 256 * 255]
            for (int x = 0; x < 256; ++x) {
                auto c = _mm_mullo_epi16(_mm_srli_epi32(col, 8), brightness);
                c = _mm_srli_epi16(c, 8); // [0, 255]
                c = _mm_packs_epi32(c, c);
                c = _mm_packus_epi16(c, c);
                _mm_store_ss(reinterpret_cast<float *>(&pixels[x + y * 256]), _mm_castsi128_ps(c));
                col = _mm_add_epi32(col, dX);
            }
        }
        mainImage_ = QPixmap::fromImage(img);
    }
    // (remainder of paintEvent not shown in this snippet)
static INLINE unsigned int obmc_sad_w4(const uint8_t *pre, const int pre_stride,
                                       const int32_t *wsrc, const int32_t *mask,
                                       const int height) {
  const int pre_step = pre_stride - 4;
  int n = 0;
  __m128i v_sad_d = _mm_setzero_si128();

  do {
    const __m128i v_p_b = xx_loadl_32(pre + n);
    const __m128i v_m_d = xx_load_128(mask + n);
    const __m128i v_w_d = xx_load_128(wsrc + n);

    const __m128i v_p_d = _mm_cvtepu8_epi32(v_p_b);

    // Values in both pre and mask fit in 15 bits, and are packed at 32 bit
    // boundaries. We use pmaddwd, as it has lower latency on Haswell
    // than pmulld but produces the same result with these inputs.
    const __m128i v_pm_d = _mm_madd_epi16(v_p_d, v_m_d);

    const __m128i v_diff_d = _mm_sub_epi32(v_w_d, v_pm_d);
    const __m128i v_absdiff_d = _mm_abs_epi32(v_diff_d);

    // Rounded absolute difference
    const __m128i v_rad_d = xx_roundn_epu32(v_absdiff_d, 12);

    v_sad_d = _mm_add_epi32(v_sad_d, v_rad_d);

    n += 4;

    if (n % 4 == 0) pre += pre_step;
  } while (n < 4 * height);

  return xx_hsum_epi32_si32(v_sad_d);
}
static inline __m128i v4_ialpha_sse2(__m128i c)
{
    // Inverse alpha per pixel: 255 - A, where A is the top byte of each 32-bit pixel.
    __m128i a = _mm_srli_epi32(c, 24);
    return _mm_sub_epi32(_mm_set1_epi32(0xff), a);
}
inline __m128i Convert8DigitsSSE2(uint32_t value) {
    assert(value <= 99999999);

    // abcd, efgh = abcdefgh divmod 10000
    const __m128i abcdefgh = _mm_cvtsi32_si128(value);
    const __m128i abcd = _mm_srli_epi64(
        _mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
    const __m128i efgh = _mm_sub_epi32(
        abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

    // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

    // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1a = _mm_slli_epi64(v1, 2);

    // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
    const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
    const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

    // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
    const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
    const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

    // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
    const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

    // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
    const __m128i v6 = _mm_slli_epi64(v5, 16);

    // v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
    const __m128i v7 = _mm_sub_epi16(v4, v6);

    return v7;
}
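A possible way to use Convert8DigitsSSE2 (assuming the function above and its constant tables are in scope): pack the eight 16-bit digit values down to bytes and add '0' to get an ASCII string. The helper name is illustrative.

#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

void print8digits(uint32_t value) {                              // value <= 99999999
    const __m128i digits = Convert8DigitsSSE2(value);            // { a,b,c,d,e,f,g,h } as uint16
    const __m128i bytes  = _mm_packus_epi16(digits, digits);     // narrow to 8-bit lanes
    const __m128i ascii  = _mm_add_epi8(bytes, _mm_set1_epi8('0'));
    char buf[16];
    _mm_storeu_si128((__m128i*)buf, ascii);
    std::printf("%.8s\n", buf);                                  // zero-padded 8-digit string
}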
JL_DLLEXPORT __m128i test_m128i(__m128i a, __m128i b, __m128i c, __m128i d)
{
    // 64-bit x86 has only level 2 SSE, which does not have a <4 x int32> multiplication,
    // so we use floating-point instead, and assume caller knows about the hack.
    return _mm_add_epi32(a, _mm_cvtps_epi32(_mm_mul_ps(_mm_cvtepi32_ps(b),
                                                       _mm_cvtepi32_ps(_mm_sub_epi32(c, d)))));
}
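For reference, an exact SSE2-only 32-bit lane multiply is possible without the float round-trip, at the cost of a few shuffles. A sketch (not part of the Julia test above):

#include <emmintrin.h>

// Low 32 bits of a 32x32-bit multiply per lane, SSE2 only:
// multiply even and odd lanes with _mm_mul_epu32, then recombine the low halves.
static inline __m128i mullo_epi32_sse2(__m128i a, __m128i b) {
    const __m128i even = _mm_mul_epu32(a, b);                                       // products of lanes 0 and 2
    const __m128i odd  = _mm_mul_epu32(_mm_srli_si128(a, 4), _mm_srli_si128(b, 4)); // products of lanes 1 and 3
    return _mm_unpacklo_epi32(_mm_shuffle_epi32(even, _MM_SHUFFLE(0, 0, 2, 0)),
                              _mm_shuffle_epi32(odd,  _MM_SHUFFLE(0, 0, 2, 0)));
}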
__m128i test_mm_sub_epi32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_sub_epi32
  // DAG: sub <4 x i32>
  //
  // ASM-LABEL: test_mm_sub_epi32
  // ASM: psubd
  return _mm_sub_epi32(A, B);
}
static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                                   const __m128i& sa, const __m128i& da) {
    // sc * (255 - da)
    __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da);
    ret1 = _mm_mullo_epi16(sc, ret1);

    // dc * (255 - sa)
    __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    ret2 = _mm_mullo_epi16(dc, ret2);

    // sc * dc
    __m128i ret3 = _mm_mullo_epi16(sc, dc);

    __m128i ret = _mm_add_epi32(ret1, ret2);
    ret = _mm_add_epi32(ret, ret3);

    return clamp_div255round_SSE2(ret);
}
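In scalar terms, the channel computed above is (sc*(255 - da) + dc*(255 - sa) + sc*dc) / 255, rounded and clamped. A scalar sketch of that formula for 8-bit premultiplied components (not from the original source; the rounding here is the plain (v + 128) / 255, which approximates the bit-twiddling version used by clamp_div255round_SSE2):

#include <cstdint>
#include <algorithm>

static inline uint8_t multiply_byte_scalar(int sc, int dc, int sa, int da) {
    const int v = sc * (255 - da) + dc * (255 - sa) + sc * dc;
    const int rounded = (v + 128) / 255;       // divide by 255 with rounding
    return (uint8_t)std::min(rounded, 255);    // clamp to the byte range
}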
__SIMDi _SIMD_sub_epi32(__SIMDi a, __SIMDi b)
{
#ifdef USE_SSE
    return _mm_sub_epi32(a, b);
#elif defined USE_AVX
    return _mm256_sub_epi32(a, b); // 256-bit integer subtract; requires AVX2
#elif defined USE_IBM
    return vec_sub(a, b);
#endif
}
SIMDValue SIMDInt32x4Operation::OpSub(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_sub_epi32(tmpaValue.m128i_value, tmpbValue.m128i_value); // a - b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int32_t(void *_idata,
                                                const int istride,
                                                const char *odata,
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight)
{
    int32_t *idata = (int32_t *)_idata;
    const int skip = 1;
    const __m128i ONE = _mm_set1_epi32(1);
    const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));
    (void)iwidth;
    (void)iheight;

    for (int y = ooffset_y; y < ooffset_y + oheight; y += skip) {
        for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
            __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
            __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

            // De-interleave even and odd coefficients.
            __m128i A0 = _mm_unpacklo_epi32(D0, D4);
            __m128i A2 = _mm_unpackhi_epi32(D0, D4);
            __m128i E0 = _mm_unpacklo_epi32(A0, A2);
            __m128i O1 = _mm_unpackhi_epi32(A0, A2);

            // Inverse Haar lifting: X0 = E0 - ((O1 + 1) >> 1); X1 = O1 + X0.
            __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
            __m128i X1 = _mm_add_epi32(O1, X0);

            // Re-interleave the reconstructed samples.
            __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
            __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

            if (shift != 0) {
                Z0 = _mm_add_epi32(Z0, ONE);
                Z4 = _mm_add_epi32(Z4, ONE);
                Z0 = _mm_srai_epi32(Z0, shift);
                Z4 = _mm_srai_epi32(Z4, shift);
            }

            Z0 = _mm_add_epi32(Z0, OFFSET);
            Z4 = _mm_add_epi32(Z4, OFFSET);

            Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
            Z4 = _mm_slli_epi32(Z4, (16 - active_bits));
            __m128i R = _mm_packus_epi32(Z0, Z4);
            R = _mm_srli_epi16(R, (16 - active_bits));

            _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
        }
    }
}
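The core of the loop above is a pair of inverse lifting steps. In scalar form, with the same rounding as the vector code (a sketch, not part of the original source):

#include <cstdint>

// Reconstruct one (even, odd) sample pair from the stored (E, O) coefficients.
static inline void haar_inverse_pair(int32_t E, int32_t O, int32_t* x0, int32_t* x1) {
    *x0 = E - ((O + 1) >> 1); // undo the update step (rounded halving)
    *x1 = O + *x0;            // undo the predict step
}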
static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i&, __m128i&) {
    __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc
    __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc
    tmp1 = _mm_add_epi32(tmp1, tmp2);
    tmp2 = _mm_mullo_epi16(sc, dc); // sc * dc
    tmp2 = _mm_slli_epi32(tmp2, 1); // 2 * sc * dc

    __m128i r = _mm_sub_epi32(tmp1, tmp2);
    return clamp_div255round_SSE2(r);
}
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {  // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}