void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        t0 = _mm_unpacklo_epi16(t0, t0);

        t1 = _mm_unpacklo_epi16(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
        t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t2 = _mm_unpackhi_epi16(t0, t0);
        t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
        t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t1 = _mm_packus_epi16(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);
  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);
  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);

  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);

  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
  *sum_0 = sum_u16;

  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);

  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
  *sum_1 = sum_u16;
}
int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
{
    int x = 0;

    if( useSIMD )
    {
        __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i dx = _mm_set1_epi16(8);
        __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

        for( ; x <= len - 8; x += 8 )
        {
            __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
            __m128i sx = _mm_mullo_epi16(qx, qx);

            qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
            qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
            qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
            qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

            qx = _mm_add_epi16(qx, dx);
        }

        _mm_store_si128((__m128i*)buf, qx0);
        x0 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx1);
        x1 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx2);
        x2 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx3);
        x3 = buf[0] + buf[1] + buf[2] + buf[3];
    }
    return x;
}
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
                       const __m128i mul_constants_0,
                       const __m128i mul_constants_1, const int strength,
                       const int rounding, const int weight) {
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);
  __m128i input_0, input_1;

  input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
  input_0 = _mm_adds_epu16(input_0, rounding_u16);

  input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
  input_1 = _mm_adds_epu16(input_1, rounding_u16);

  input_0 = _mm_srl_epi16(input_0, strength_u128);
  input_1 = _mm_srl_epi16(input_1, strength_u128);

  input_0 = _mm_min_epu16(input_0, sixteen);
  input_1 = _mm_min_epu16(input_1, sixteen);
  input_0 = _mm_sub_epi16(sixteen, input_0);
  input_1 = _mm_sub_epi16(sixteen, input_1);

  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
static inline __m128i hardlight_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    // if (2 * sc <= sa)
    __m128i tmp1 = _mm_slli_epi32(sc, 1);
    __m128i cmp1 = _mm_cmpgt_epi32(tmp1, sa);
    __m128i rc1 = _mm_mullo_epi16(sc, dc);   // sc * dc;
    rc1 = _mm_slli_epi32(rc1, 1);            // 2 * sc * dc
    rc1 = _mm_andnot_si128(cmp1, rc1);

    // else
    tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = Multiply32_SSE2(_mm_sub_epi32(da, dc), _mm_sub_epi32(sa, sc));
    tmp2 = _mm_slli_epi32(tmp2, 1);
    __m128i rc2 = _mm_sub_epi32(tmp1, tmp2);
    rc2 = _mm_and_si128(cmp1, rc2);

    __m128i rc = _mm_or_si128(rc1, rc2);

    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    tmp1 = _mm_mullo_epi16(sc, ida);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    tmp2 = _mm_mullo_epi16(dc, isa);

    rc = _mm_add_epi32(rc, tmp1);
    rc = _mm_add_epi32(rc, tmp2);
    return clamp_div255round_SSE2(rc);
}
template <bool align> SIMD_INLINE void VectorProduct(const __m128i & vertical, const uint8_t * horizontal, uint8_t * dst)
{
    __m128i _horizontal = Load<align>((__m128i*)horizontal);
    __m128i lo = DivideI16By255(_mm_mullo_epi16(vertical, _mm_unpacklo_epi8(_horizontal, K_ZERO)));
    __m128i hi = DivideI16By255(_mm_mullo_epi16(vertical, _mm_unpackhi_epi8(_horizontal, K_ZERO)));
    Store<align>((__m128i*)dst, _mm_packus_epi16(lo, hi));
}
void imageFilterBlend_SSE2(Uint32 *dst_buffer, Uint32 *src_buffer,
                           Uint8 *alphap, int alpha, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst_buffer
    while( (((long)dst_buffer & 0xF) > 0) && (n > 0) ) {
        BLEND_PIXEL();
        --n; ++dst_buffer; ++src_buffer;
    }

    // Do bulk of processing using SSE2 (process 4 32bit (BGRA) pixels)
    // create basic bitmasks 0x00FF00FF, 0x000000FF
    __m128i bmask2 = _mm_set1_epi32(0x00FF00FF);
    __m128i bmask = _mm_srli_epi32(bmask2, 16);
    while(n >= 4) {
        // alpha1 = ((src_argb >> 24) * alpha) >> 8
        __m128i a = _mm_set1_epi32(alpha);
        __m128i buf = _mm_loadu_si128((__m128i*)src_buffer);
        __m128i tmp = _mm_srli_epi32(buf, 24);
        a = _mm_mullo_epi16(a, tmp);
        a = _mm_srli_epi32(a, 8);
        // double-up alpha1 (0x000000vv -> 0x00vv00vv)
        tmp = _mm_slli_epi32(a, 16);
        a = _mm_or_si128(a, tmp);
        // rb = (src_argb & bmask2) * alpha1
        tmp = _mm_and_si128(buf, bmask2);
        __m128i rb = _mm_mullo_epi16(a, tmp);
        // g = ((src_argb >> 8) & bmask) * alpha1
        buf = _mm_srli_epi32(buf, 8);
        tmp = _mm_and_si128(buf, bmask);
        __m128i g = _mm_mullo_epi16(a, tmp);
        // alpha2 = alpha1 ^ bmask2
        a = _mm_xor_si128(a, bmask2);
        buf = _mm_load_si128((__m128i*)dst_buffer);
        // rb += (dst_argb & bmask2) * alpha2
        tmp = _mm_and_si128(buf, bmask2);
        tmp = _mm_mullo_epi16(a, tmp);
        rb = _mm_add_epi32(rb, tmp);
        // rb = (rb >> 8) & bmask2
        tmp = _mm_srli_epi32(rb, 8);
        rb = _mm_and_si128(tmp, bmask2);
        // g += ((dst_argb >> 8) & bmask) * alpha2
        buf = _mm_srli_epi32(buf, 8);
        tmp = _mm_and_si128(buf, bmask);
        tmp = _mm_mullo_epi16(a, tmp);
        g = _mm_add_epi32(g, tmp);
        // g = g & (bmask << 8)
        tmp = _mm_slli_epi32(bmask, 8);
        g = _mm_and_si128(g, tmp);
        // dst_argb = rb | g
        tmp = _mm_or_si128(rb, g);
        _mm_store_si128((__m128i*)dst_buffer, tmp);

        n -= 4; src_buffer += 4; dst_buffer += 4; alphap += 16;
    }

    // If any pixels are left over, deal with them individually
    ++n;
    BASIC_BLEND();
}
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size/stride; k < length; ++k)
    {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
        // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

        for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);        // AABBGGRR
            d = _mm_load_si128(source128_2);        // AABBGGRR

            srb = _mm_and_si128(lomask, s);         // 00BB00RR     // unpack
            sga = _mm_srli_epi16(s, 8);             // AA00GG00     // unpack
            drb = _mm_and_si128(lomask, d);         // 00BB00RR     // unpack
            dga = _mm_srli_epi16(d, 8);             // AA00GG00     // unpack

            srb = _mm_sub_epi16(srb, drb);          // BBBBRRRR     // sub
            srb = _mm_mullo_epi16(srb, a);          // BBBBRRRR     // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);          // AAAAGGGG     // sub
            sga = _mm_mullo_epi16(sga, a);          // AAAAGGGG     // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);           // 00BB00RR     // prepack and div
            sga = _mm_andnot_si128(lomask, sga);    // AA00GG00     // prepack and div

            srb = _mm_or_si128(srb, sga);           // AABBGGRR     // pack

            srb = _mm_add_epi8(srb, d);             // AABBGGRR     // add  there is no overflow(R.N)

            _mm_store_si128(dest128, srb);
        }
    }
}
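// A minimal scalar sketch (an assumption, not part of the original source) of the
// per-channel blend that Lerp_SSE2 vectorizes: r = d + (((s - d) * a + 128) >> 8),
// with a = round(alpha * 256). Handy as a reference when validating the SIMD path.
#include <cstdint>

static inline uint8_t lerp_channel_scalar(uint8_t s, uint8_t d, float alpha)
{
    const int a = static_cast<int>(alpha * 256.0f + 0.5f);    // 8.8 fixed-point alpha, kept as int
    const int diff = static_cast<int>(s) - static_cast<int>(d);
    return static_cast<uint8_t>(d + ((diff * a + 128) >> 8)); // rounded, stays within [0, 255]
}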
void PreOver_FastSSE2(void* dest, const void* source1, const void* source2, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i d, s, a, rb, ag;

    // TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
    for(int k = 0, length = size/stride; k < length; ++k)
    {
        // TODO: put prefetch between calculations?(R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
            // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
            s = _mm_load_si128(source128_1);        // AABBGGRR
            d = _mm_load_si128(source128_2);        // AABBGGRR

            // set alpha to lo16 from dest_
            rb = _mm_srli_epi32(d, 24);             // 000000AA
            a = _mm_slli_epi32(rb, 16);             // 00AA0000
            a = _mm_or_si128(rb, a);                // 00AA00AA

            // fix alpha: a = a > 127 ? a+1 : a
            // NOTE: If removed an *overflow* will occur with large values (R.N)
            rb = _mm_srli_epi16(a, 7);
            a = _mm_add_epi16(a, rb);

            rb = _mm_and_si128(lomask, s);          // 00BB00RR     // unpack
            rb = _mm_mullo_epi16(rb, a);            // BBBBRRRR     // mul (D[A]*S)
            rb = _mm_srli_epi16(rb, 8);             // 00BB00RR     // prepack and div [(D[A]*S)]/255

            ag = _mm_srli_epi16(s, 8);              // 00AA00GG     // unpack
            ag = _mm_mullo_epi16(ag, a);            // AAAAGGGG     // mul (D[A]*S)
            ag = _mm_andnot_si128(lomask, ag);      // AA00GG00     // prepack and div [(D[A]*S)]/255

            rb = _mm_or_si128(rb, ag);              // AABBGGRR     // pack

            rb = _mm_sub_epi8(s, rb);               // sub S-[(D[A]*S)/255]
            d = _mm_add_epi8(d, rb);                // add D+[S-(D[A]*S)/255]

            _mm_store_si128(dest128, d);
        }
    }
}
static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i&, __m128i&) {
    __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc);  // 255 * sc
    __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc);  // 255 * dc
    tmp1 = _mm_add_epi32(tmp1, tmp2);

    tmp2 = _mm_mullo_epi16(sc, dc);  // sc * dc
    tmp2 = _mm_slli_epi32(tmp2, 1);  // 2 * sc * dc

    __m128i r = _mm_sub_epi32(tmp1, tmp2);
    return clamp_div255round_SSE2(r);
}
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
void Coefs(unsigned char *current_part_ptr, int current_part_stride,
           unsigned char *ref_part_ptr, int ref_part_stride,
           unsigned char *coef_buf, int n)
{
    static const unsigned short c_32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
    int i;
    __m128i v_row0_0, v_row0_1;
    __m128i v_temp_0, v_temp_1;
    __m128i v_result;
    __m128i vZero;

    vZero = _mm_setzero_si128();
    __m128i v_32 = _mm_loadu_si128((__m128i*)c_32);
    __m128i* coef_ptr = (__m128i*) coef_buf;

    v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
    v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
    v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr+8), 3);
    ref_part_ptr += ref_part_stride;

    // row0: 0 1 2 3 4 5 6 7
    // row1: 2 3 4 5 6 7 8 9
    v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
    v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

    for ( i = 0; i < n; i++ )
    {
        v_row0_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[0]);
        v_row0_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[1]);

        v_result = v_32;
        v_result = _mm_add_epi16(v_result, v_row0_0);
        v_result = _mm_add_epi16(v_result, v_row0_1);

        v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
        v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
        v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr+8), 3);
        ref_part_ptr += ref_part_stride;

        v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
        v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

        v_temp_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[2]);
        v_temp_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[3]);
        v_result = _mm_add_epi16(v_result, v_temp_0);
        v_result = _mm_add_epi16(v_result, v_temp_1);

        v_result = _mm_srli_epi16(v_result, 6);
        _mm_store_si128((__m128i*)(current_part_ptr), v_result);
        current_part_ptr += current_part_stride;
    }
}
static inline __m128i
v4_interpolate_color_sse2(__m128i a, __m128i c0, __m128i c1)
{
   const __m128i rb_mask = _mm_set1_epi32(0xFF00FF00);
   const __m128i zero = _mm_setzero_si128();

   __m128i a_l = a;
   __m128i a_h = a;
   a_l = _mm_unpacklo_epi16(a_l, a_l);
   a_h = _mm_unpackhi_epi16(a_h, a_h);

   __m128i a_t = _mm_slli_epi64(a_l, 32);
   __m128i a_t0 = _mm_slli_epi64(a_h, 32);

   a_l = _mm_add_epi32(a_l, a_t);
   a_h = _mm_add_epi32(a_h, a_t0);

   __m128i c0_l = c0;
   __m128i c0_h = c0;
   c0_l = _mm_unpacklo_epi8(c0_l, zero);
   c0_h = _mm_unpackhi_epi8(c0_h, zero);

   __m128i c1_l = c1;
   __m128i c1_h = c1;
   c1_l = _mm_unpacklo_epi8(c1_l, zero);
   c1_h = _mm_unpackhi_epi8(c1_h, zero);

   __m128i cl_sub = _mm_sub_epi16(c0_l, c1_l);
   __m128i ch_sub = _mm_sub_epi16(c0_h, c1_h);

   cl_sub = _mm_mullo_epi16(cl_sub, a_l);
   ch_sub = _mm_mullo_epi16(ch_sub, a_h);

   __m128i c1ls = _mm_slli_epi16(c1_l, 8);
   __m128i c1hs = _mm_slli_epi16(c1_h, 8);

   cl_sub = _mm_add_epi16(cl_sub, c1ls);
   ch_sub = _mm_add_epi16(ch_sub, c1hs);

   cl_sub = _mm_and_si128(cl_sub, rb_mask);
   ch_sub = _mm_and_si128(ch_sub, rb_mask);

   cl_sub = _mm_srli_epi64(cl_sub, 8);
   ch_sub = _mm_srli_epi64(ch_sub, 8);

   cl_sub = _mm_packus_epi16(cl_sub, cl_sub);
   ch_sub = _mm_packus_epi16(ch_sub, ch_sub);

   return (__m128i) _mm_shuffle_ps( (__m128)cl_sub, (__m128)ch_sub, 0x44);
}
static inline __m128i difference_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    __m128i tmp1 = _mm_mullo_epi16(sc, da);
    __m128i tmp2 = _mm_mullo_epi16(dc, sa);
    __m128i tmp = SkMin32_SSE2(tmp1, tmp2);

    __m128i ret1 = _mm_add_epi32(sc, dc);
    __m128i ret2 = _mm_slli_epi32(SkDiv255Round_SSE2(tmp), 1);
    __m128i ret = _mm_sub_epi32(ret1, ret2);

    ret = clamp_signed_byte_SSE2(ret);
    return ret;
}
static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                       const __m128i& sa, const __m128i& da) {
    __m128i sd = _mm_mullo_epi16(sc, da);
    __m128i ds = _mm_mullo_epi16(dc, sa);

    __m128i cmp = _mm_cmplt_epi32(sd, ds);

    __m128i tmp = _mm_add_epi32(sc, dc);
    __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));

    __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
                               _mm_andnot_si128(cmp, ret2));
    return ret;
}
inline COLORREF MakeColor2(COLORREF a, COLORREF b, int alpha)
{
#ifdef USE_SSE2
    // (a * alpha + b * (256 - alpha)) / 256 -> ((a - b) * alpha) / 256 + b
    __m128i xmm0, xmm1, xmm2, xmm3;
    COLORREF color;

    xmm0 = _mm_setzero_si128();
    xmm1 = _mm_cvtsi32_si128( a );
    xmm2 = _mm_cvtsi32_si128( b );
    xmm3 = _mm_cvtsi32_si128( alpha );

    xmm1 = _mm_unpacklo_epi8( xmm1, xmm0 );    // a:a:a:a
    xmm2 = _mm_unpacklo_epi8( xmm2, xmm0 );    // b:b:b:b
    xmm3 = _mm_shufflelo_epi16( xmm3, 0 );     // alpha:alpha:alpha:alpha

    xmm1 = _mm_sub_epi16( xmm1, xmm2 );        // (a - b)
    xmm1 = _mm_mullo_epi16( xmm1, xmm3 );      // (a - b) * alpha
    xmm1 = _mm_srli_epi16( xmm1, 8 );          // ((a - b) * alpha) / 256
    xmm1 = _mm_add_epi8( xmm1, xmm2 );         // ((a - b) * alpha) / 256 + b

    xmm1 = _mm_packus_epi16( xmm1, xmm0 );
    color = _mm_cvtsi128_si32( xmm1 );

    return color;
#else
    const int ap = alpha;
    const int bp = 256 - ap;

    BYTE valR = (BYTE)((GetRValue(a) * ap + GetRValue(b) * bp) / 256);
    BYTE valG = (BYTE)((GetGValue(a) * ap + GetGValue(b) * bp) / 256);
    BYTE valB = (BYTE)((GetBValue(a) * ap + GetBValue(b) * bp) / 256);

    return RGB(valR, valG, valB);
#endif
}
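// Hedged usage sketch, not from the original source: blending two COLORREF values with
// MakeColor2 above. Assumes <windows.h> (or equivalent COLORREF/RGB definitions), which
// the function itself already requires.
void ExampleMakeColor2()
{
    const COLORREF red  = RGB(255, 0, 0);
    const COLORREF blue = RGB(0, 0, 255);
    // alpha is on a 0..256 scale: 0 returns b, 256 returns a, 128 is an even mix.
    const COLORREF mid = MakeColor2(red, blue, 128);   // roughly RGB(127, 0, 127)
    (void)mid;
}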
void ColorModelView::paintEvent(QPaintEvent *)
{
    QPainter p(this);

    auto mainBounds = mainAreaBounds();
    auto sideBounds = sideAreaBounds();

    if (mainImage_.isNull()) {
        // FIXME: support other color model?
        QImage img(256, 256, QImage::Format_RGB32);
        auto *pixels = reinterpret_cast<quint32 *>(img.bits());
        auto basecolor = QColor::fromHsv(value_.hsvHue(), 255, 255);
        auto basecolorMM = _mm_setr_epi32(basecolor.blue(), basecolor.green(), basecolor.red(), 0);
        basecolorMM = _mm_add_epi32(basecolorMM, _mm_srli_epi32(basecolorMM, 7)); // map [0, 255] to [0, 256]
        auto white = _mm_set1_epi32(256 * 255);
        auto dX = _mm_sub_epi32(basecolorMM, _mm_set1_epi32(256));

        for (int y = 0; y < 256; ++y) {
            auto brightness = _mm_set1_epi32(256 - y - (y >> 7));
            auto col = white; // [0, 256 * 255]
            for (int x = 0; x < 256; ++x) {
                auto c = _mm_mullo_epi16(_mm_srli_epi32(col, 8), brightness);
                c = _mm_srli_epi16(c, 8); // [0, 255]
                c = _mm_packs_epi32(c, c);
                c = _mm_packus_epi16(c, c);
                _mm_store_ss(reinterpret_cast<float *>(&pixels[x + y * 256]),
                             _mm_castsi128_ps(c));
                col = _mm_add_epi32(col, dX);
            }
        }

        mainImage_ = QPixmap::fromImage(img);
    }
inline __m128i Convert8DigitsSSE2(uint32_t value) {
    assert(value <= 99999999);

    // abcd, efgh = abcdefgh divmod 10000
    const __m128i abcdefgh = _mm_cvtsi32_si128(value);
    const __m128i abcd = _mm_srli_epi64(
        _mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
    const __m128i efgh = _mm_sub_epi32(
        abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

    // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

    // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1a = _mm_slli_epi64(v1, 2);

    // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
    const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
    const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

    // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
    const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
    const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

    // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
    const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

    // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
    const __m128i v6 = _mm_slli_epi64(v5, 16);

    // v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
    const __m128i v7 = _mm_sub_epi16(v4, v6);

    return v7;
}
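// A possible follow-up step, shown only as a hedged sketch (the helper name WriteDigits8
// is illustrative and not from the original source): pack the eight 16-bit digits
// returned by Convert8DigitsSSE2 into bytes, add '0', and store the 8 ASCII characters
// (zero-padded, no terminator).
#include <emmintrin.h>
#include <stdint.h>

static inline void WriteDigits8(uint32_t value, char* out)
{
    const __m128i digits = Convert8DigitsSSE2(value);              // [a..h] as 16-bit lanes, each 0..9
    const __m128i bytes  = _mm_packus_epi16(digits, _mm_setzero_si128());
    const __m128i ascii  = _mm_add_epi8(bytes, _mm_set1_epi8('0'));
    _mm_storel_epi64(reinterpret_cast<__m128i*>(out), ascii);      // writes exactly 8 chars
}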
void srslte_vec_prod_sss_simd(short *x, short *y, short *z, uint32_t len)
{
#ifdef LV_HAVE_SSE
  unsigned int number = 0;
  const unsigned int points = len / 8;

  const __m128i* xPtr = (const __m128i*) x;
  const __m128i* yPtr = (const __m128i*) y;
  __m128i* zPtr = (__m128i*) z;

  __m128i xVal, yVal, zVal;
  for(;number < points; number++){
    xVal = _mm_load_si128(xPtr);
    yVal = _mm_load_si128(yPtr);

    zVal = _mm_mullo_epi16(xVal, yVal);

    _mm_store_si128(zPtr, zVal);

    xPtr ++;
    yPtr ++;
    zPtr ++;
  }

  number = points * 8;
  for(;number < len; number++){
    z[number] = x[number] * y[number];
  }
#endif
}
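// Hedged usage sketch (buffer names and sizes are illustrative, not from srsLTE):
// element-wise product of two int16 vectors. The SSE path above uses aligned
// loads/stores, so the buffers are kept 16-byte aligned here.
#include <stdint.h>

void example_prod_sss(void)
{
    alignas(16) short a[16];
    alignas(16) short b[16];
    alignas(16) short c[16];
    for (int i = 0; i < 16; i++) { a[i] = (short)i; b[i] = 3; }
    srslte_vec_prod_sss_simd(a, b, c, 16);   // c[i] = a[i] * b[i] (low 16 bits of the product)
}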
inline static long
sse2_dot_prod (const uint16_t *p1, const uint16_t *p2, const size_t size)
{
  unsigned long d2 = 0;
  unsigned int i;
  __m128i* mp1 = (__m128i *)p1;
  __m128i* mp2 = (__m128i *)p2;

  for (i = 0; i < size; i += 8)
    {
      uint16_t res[8];
      __m128i *pmres;
      __m128i mtmp = _mm_mullo_epi16 (_mm_loadu_si128 (mp1),
                                      _mm_loadu_si128 (mp2));
      pmres = (__m128i*)res;
      _mm_storeu_si128 (pmres, mtmp);

      d2 += res[0]+res[1]+res[2]+res[3]+res[4]+res[5]+res[6]+res[7];

      mp1++;
      mp2++;
    }

  return d2;
}
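// Scalar equivalent of sse2_dot_prod, shown as a reference sketch (not from the original
// source). Note that both versions accumulate only the *low 16 bits* of each product
// (the pmullw semantics), so for large inputs each term is taken modulo 65536 rather
// than being an exact 32-bit product.
#include <stdint.h>
#include <stddef.h>

static long dot_prod_scalar(const uint16_t *p1, const uint16_t *p2, size_t size)
{
    unsigned long d2 = 0;
    for (size_t i = 0; i < size; ++i)
        d2 += (uint16_t)(p1[i] * p2[i]);   // truncated to 16 bits, matching _mm_mullo_epi16
    return (long)d2;
}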
void freq_equalization(LTE_DL_FRAME_PARMS *frame_parms,
                       int32_t **rxdataF_comp,
                       int32_t **ul_ch_mag,
                       int32_t **ul_ch_magb,
                       uint8_t symbol,
                       uint16_t Msc_RS,
                       uint8_t Qm)
{
  uint16_t re;
  int16_t amp;
  __m128i *ul_ch_mag128, *ul_ch_magb128, *rxdataF_comp128;

  rxdataF_comp128 = (__m128i *)&rxdataF_comp[0][symbol*frame_parms->N_RB_DL*12];
  ul_ch_mag128    = (__m128i *)&ul_ch_mag[0][symbol*frame_parms->N_RB_DL*12];
  ul_ch_magb128   = (__m128i *)&ul_ch_magb[0][symbol*frame_parms->N_RB_DL*12];

  for (re=0; re<(Msc_RS>>2); re++) {
    amp = (*((int16_t*)&ul_ch_mag128[re]));

    if (amp>255)
      amp=255;

    // printf("freq_eq: symbol %d re %d => %d,%d,%d, (%d) (%d,%d) => ",symbol,re,*((int16_t*)(&ul_ch_mag128[re])),amp,inv_ch[8*amp],*((int16_t*)(&ul_ch_mag128[re]))*inv_ch[8*amp],*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re])));
    rxdataF_comp128[re] = _mm_mullo_epi16(rxdataF_comp128[re],*((__m128i *)&inv_ch[8*amp]));

    if (Qm==4)
      ul_ch_mag128[re] = _mm_set1_epi16(324);  // this is 512*2/sqrt(10)
    else {
      ul_ch_mag128[re]  = _mm_set1_epi16(316); // this is 512*4/sqrt(42)
      ul_ch_magb128[re] = _mm_set1_epi16(158); // this is 512*2/sqrt(42)
    }

    // printf("(%d,%d)\n",*(int16_t*)&(rxdataF_comp128[re]),*(1+(int16_t*)&(rxdataF_comp128[re])));
  }
}
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 8;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound = _mm_set1_epi16(1 << 7);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
      const __m128i v4 = _mm_adds_epu16(v2, v3);
      const __m128i v5 = _mm_adds_epu16(v4, kRound);
      const __m128i v6 = _mm_srli_epi16(v5, 8);
      const __m128i v7 = _mm_packus_epi16(v6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v7);
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in 'sum' as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);

  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);

  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);

  // Shift all the values one place to the left/right so we can efficiently sum
  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);

  // It becomes necessary to treat the values as unsigned at this point. The
  // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this
  // point forward since the filter is only applied to smooth small pixel
  // changes. Once the value has saturated to uint16_t it is well outside the
  // useful range.
  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum = sum_u16;
}
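// Scalar reference for sum_8, included only as a sketch to make the neighbor-sum
// explicit (not part of the original source): out[i] collects the squared differences
// at positions i-1, i and i+1, with positions outside the 8-pixel row treated as 0,
// saturating at 65535 like the SIMD adds.
#include <stdint.h>

static void sum_8_scalar(const uint8_t *a, const uint8_t *b, uint16_t *out /* 8 values */)
{
    uint32_t sq[8];
    for (int i = 0; i < 8; ++i) {
        const int d = (int)a[i] - (int)b[i];
        sq[i] = (uint32_t)(d * d);
    }
    for (int i = 0; i < 8; ++i) {
        uint32_t s = sq[i];
        if (i > 0) s += sq[i - 1];
        if (i < 7) s += sq[i + 1];
        out[i] = (uint16_t)(s > 65535 ? 65535 : s);   // mirrors the saturating adds
    }
}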
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
  __m128i pred_0_u32, pred_1_u32;
  __m128i accum_0_u32, accum_1_u32;

  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
  _mm_storeu_si128((__m128i *)count, count_u16);

  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);

  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
__m128i test_mm_mullo_epi16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_mullo_epi16
  // DAG: mul <8 x i16> %{{.*}}, %{{.*}}
  //
  // ASM-LABEL: test_mm_mullo_epi16
  // ASM: pmullw
  return _mm_mullo_epi16(A, B);
}
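// For reference, a sketch that is not part of the test file above: _mm_mullo_epi16
// (pmullw) keeps only the low 16 bits of each 16x16-bit lane product, so the result is
// identical whether the inputs are interpreted as signed or unsigned.
#include <stdint.h>

static inline void mullo_epi16_scalar(const int16_t a[8], const int16_t b[8], int16_t r[8])
{
    for (int i = 0; i < 8; ++i)
        r[i] = (int16_t)((int32_t)a[i] * (int32_t)b[i]);   // truncation to 16 bits == one pmullw lane
}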
// Author: Niclas P Andersson
void Lerp_OLD(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    __m128i ps1, ps2, pd1, pd2, m0, m1, pr1, pr2;
    __m128i* pSource = (__m128i*)source1;
    __m128i* pDest = (__m128i*)source2;
    __m128i* pResult = (__m128i*)dest;
    __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    m0 = _mm_setzero_si128();
    int count = size/4;
    for ( int i = 0; i < count; i+=4 )
    {
        ps1 = _mm_load_si128(pSource);      // load 4 pixels from source
        pd1 = _mm_load_si128(pDest);        // load 4 pixels from dest
        ps2 = _mm_unpackhi_epi64(ps1, m0);  // move the 2 high pixels from source
        pd2 = _mm_unpackhi_epi64(pd1, m0);  // move the 2 high pixels from dest

        // compute the 2 "lower" pixels
        ps1 = _mm_unpacklo_epi8(ps1, m0);   // unpack the 2 low pixels from source (bytes -> words)
        pd1 = _mm_unpacklo_epi8(pd1, m0);   // unpack the 2 low pixels from dest (bytes -> words)

        pr1 = _mm_sub_epi16(ps1, pd1);      // x = src - dest
        pr1 = _mm_mullo_epi16(pr1, a);      // y = x*alpha
        pr1 = _mm_srli_epi16(pr1, 8);       // w = y/256
        pr1 = _mm_add_epi8(pr1, pd1);       // z = w + dest

        // same thing for the 2 "high" pixels
        ps2 = _mm_unpacklo_epi8(ps2, m0);
        pd2 = _mm_unpacklo_epi8(pd2, m0);

        pr2 = _mm_sub_epi16(ps2, pd2);      // x = src - dest
        pr2 = _mm_mullo_epi16(pr2, a);      // y = x*alpha
        pr2 = _mm_srli_epi16(pr2, 8);       // w = y/256
        pr2 = _mm_add_epi8(pr2, pd2);       // z = w + dest

        m1 = _mm_packus_epi16(pr1, pr2);    // pack all 4 together again (words -> bytes)
        _mm_store_si128(pResult, m1);

        pSource++;
        pDest++;
        pResult++;
    }
}
static inline __m128i blendfunc_multiply_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                                   const __m128i& sa, const __m128i& da) {
    // sc * (255 - da)
    __m128i ret1 = _mm_sub_epi32(_mm_set1_epi32(255), da);
    ret1 = _mm_mullo_epi16(sc, ret1);

    // dc * (255 - sa)
    __m128i ret2 = _mm_sub_epi32(_mm_set1_epi32(255), sa);
    ret2 = _mm_mullo_epi16(dc, ret2);

    // sc * dc
    __m128i ret3 = _mm_mullo_epi16(sc, dc);

    __m128i ret = _mm_add_epi32(ret1, ret2);
    ret = _mm_add_epi32(ret, ret3);

    return clamp_div255round_SSE2(ret);
}
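// Scalar form of the premultiplied "multiply" blend evaluated above, included as a
// reference sketch only (clamp_div255round_SSE2 is assumed, from its name, to perform a
// rounded divide-by-255 followed by a clamp to [0, 255]):
//   r = clamp(div255round(sc*(255 - da) + dc*(255 - sa) + sc*dc))
static inline int multiply_blend_scalar(int sc, int dc, int sa, int da)
{
    int prod = sc * (255 - da) + dc * (255 - sa) + sc * dc;
    prod += 128;                            // rounding bias
    int r = (prod + (prod >> 8)) >> 8;      // standard rounded division by 255
    return r < 0 ? 0 : (r > 255 ? 255 : r); // clamp to byte range
}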
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
    const int stride = img->width;
    const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

    // Load the data (2 pixels in one load)
    __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
    __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

    __m128 weight = CalcWeights(x, y);

    // extend to 16bit
    p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
    p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

    // convert floating point weights to 16bit integer
    weight = _mm_mul_ps(weight, CONST_256);
    __m128i weighti = _mm_cvtps_epi32(weight);                // w4 w3 w2 w1
    weighti = _mm_packs_epi32(weighti, _mm_setzero_si128());  // 32->16bit

    // prepare the weights
    __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
    __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
    w12 = _mm_unpacklo_epi16(w12, w12);  // w2 w2 w2 w2 w1 w1 w1 w1
    w34 = _mm_unpacklo_epi16(w34, w34);  // w4 w4 w4 w4 w3 w3 w3 w3

    // multiply each pixel with its weight (2 pixel per SSE mul)
    __m128i L12 = _mm_mullo_epi16(p12, w12);
    __m128i L34 = _mm_mullo_epi16(p34, w34);

    // sum the results
    __m128i L1234 = _mm_add_epi16(L12, L34);
    __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
    __m128i L = _mm_add_epi16(L1234, Lhi);

    // convert back to 8bit
    __m128i L8 = _mm_srli_epi16(L, 8); // divide by 256
    L8 = _mm_packus_epi16(L8, _mm_setzero_si128());

    // return
    return _mm_cvtsi128_si32(L8);
}
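// CalcWeights is not shown above; as a hedged reference (an assumption about its
// contract, not the original implementation), the four weights for bilinear sampling
// at (x, y) are the usual products of the fractional offsets, later scaled by 256:
//   w1 = (1-fx)*(1-fy), w2 = fx*(1-fy), w3 = (1-fx)*fy, w4 = fx*fy
#include <cmath>

static inline void calc_weights_scalar(float x, float y, float w[4])
{
    const float fx = x - std::floor(x);
    const float fy = y - std::floor(y);
    w[0] = (1.0f - fx) * (1.0f - fy);   // top-left     (p1)
    w[1] = fx * (1.0f - fy);            // top-right    (p2)
    w[2] = (1.0f - fx) * fy;            // bottom-left  (p3)
    w[3] = fx * fy;                     // bottom-right (p4)
}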
__m64 interpolvline_1( unsigned char* image, int PicWidthInPix)
{
    __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
    __m64 ret;

    xmm7 = _mm_setzero_si128();

    xmm0 = _mm_movpi64_epi64(*((__m64*)(image - 2*PicWidthInPix)));
    xmm0 = _mm_unpacklo_epi8(xmm0,xmm7);
    xmm1 = _mm_movpi64_epi64(*((__m64*)(image - 1*PicWidthInPix)));
    xmm1 = _mm_unpacklo_epi8(xmm1,xmm7);
    xmm2 = _mm_movpi64_epi64(*((__m64*)(image - 0*PicWidthInPix)));
    xmm2 = _mm_unpacklo_epi8(xmm2,xmm7);
    xmm3 = _mm_movpi64_epi64(*((__m64*)(image + 1*PicWidthInPix)));
    xmm3 = _mm_unpacklo_epi8(xmm3,xmm7);
    xmm4 = _mm_movpi64_epi64(*((__m64*)(image + 2*PicWidthInPix)));
    xmm4 = _mm_unpacklo_epi8(xmm4,xmm7);
    xmm5 = _mm_movpi64_epi64(*((__m64*)(image + 3*PicWidthInPix)));
    xmm5 = _mm_unpacklo_epi8(xmm5,xmm7);

    // filter on 8 values
    xmm6 = _mm_add_epi16(xmm2,xmm3);
    xmm6 = _mm_slli_epi16(xmm6,2);
    xmm6 = _mm_sub_epi16(xmm6,xmm1);
    xmm6 = _mm_sub_epi16(xmm6,xmm4);

    xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
    xmm6 = _mm_mullo_epi16(xmm6,xmm1);
    xmm6 = _mm_add_epi16(xmm6,xmm0);
    xmm6 = _mm_add_epi16(xmm6,xmm5);

    xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
    xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
    xmm6 = _mm_srli_epi16(xmm6,5);

    xmm2 = _mm_packus_epi16(xmm2,xmm7);
    xmm3 = _mm_packus_epi16(xmm3,xmm7);
    xmm6 = _mm_packus_epi16(xmm6,xmm7);

    xmm5 = _mm_unpacklo_epi8(xmm2,xmm6);
    xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
    xmm6 = _mm_avg_epu8(xmm4,xmm5);

    xmm6 = _mm_slli_epi16(xmm6,8);
    xmm6 = _mm_srli_epi16(xmm6,8);
    xmm6 = _mm_packus_epi16(xmm6,xmm7);

    ret = _mm_movepi64_pi64(xmm6);
    _mm_empty();

    return(ret);
}