void png_read_filter_row_sub4_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   __m128i racc = _mm_setzero_si128();
   __m128i* rp = (__m128i*)(row);

   PNG_UNUSED(prev_row)

   for (i = (row_info->rowbytes + 15) >> 4; i > 0; i--)
   {
      __m128i rb = _mm_load_si128(rp);
#ifndef __SSSE3__
      racc = _mm_srli_si128(racc, 12);
      racc = _mm_or_si128(racc, _mm_slli_si128(rb, 4));
#else
      racc = _mm_alignr_epi8(rb, racc, 12);
#endif
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 4);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 4);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 4);
      rb = _mm_add_epi8(rb, racc);
      racc = rb;
      _mm_store_si128(rp++, rb);
   }
}

// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int i;
  if (length <= 0) return;
  if (inverse) {
    const int max_pos = length & ~7;
    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (i = 0; i < max_pos; i += 8) {
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
      const __m128i A1 = _mm_add_epi8(A0, last);
      const __m128i A2 = _mm_slli_si128(A1, 1);
      const __m128i A3 = _mm_add_epi8(A1, A2);
      const __m128i A4 = _mm_slli_si128(A3, 2);
      const __m128i A5 = _mm_add_epi8(A3, A4);
      const __m128i A6 = _mm_slli_si128(A5, 4);
      const __m128i A7 = _mm_add_epi8(A5, A6);
      _mm_storel_epi64((__m128i*)(dst + i), A7);
      last = _mm_srli_epi64(A7, 56);
    }
    for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
  } else {
    const int max_pos = length & ~31;
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i +  0    ));
      const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i +  0 - 1));
      const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16    ));
      const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
      const __m128i C0 = _mm_sub_epi8(A0, B0);
      const __m128i C1 = _mm_sub_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)(dst + i +  0), C0);
      _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
    }
    for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
  }
}

void png_read_filter_row_sub3_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* The Sub filter predicts each pixel as the previous pixel, a.
    * There is no pixel to the left of the first pixel.  It's encoded directly.
    * That works with our main loop if we just say that left pixel was zero.
    */
   png_size_t rb;

   __m128i a, d = _mm_setzero_si128();

   png_debug(1, "in png_read_filter_row_sub3_sse2");

   rb = row_info->rowbytes;
   while (rb >= 4) {
      a = d; d = load4(row);
      d = _mm_add_epi8(d, a);
      store3(row, d);

      row += 3;
      rb  -= 3;
   }
   if (rb > 0) {
      a = d; d = load3(row);
      d = _mm_add_epi8(d, a);
      store3(row, d);

      row += 3;
      rb  -= 3;
   }
   PNG_UNUSED(prev)
}

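/* Not part of the original listing: a minimal sketch of the load4()/load3()/
 * store4()/store3() helpers the libpng-style filters in this listing rely on.
 * The assumption is simply that 3- or 4-byte pixels are moved between memory
 * and the low lanes of an __m128i through a 32-bit temporary with memcpy
 * (from <string.h>); the real helpers may differ.
 */
static __m128i load4(const void* p) {
   png_uint_32 packed;
   memcpy(&packed, p, 4);               /* read exactly 4 bytes */
   return _mm_cvtsi32_si128(packed);    /* pixel sits in the low 32 bits */
}

static __m128i load3(const void* p) {
   png_uint_32 packed = 0;
   memcpy(&packed, p, 3);               /* read exactly 3 bytes, zero-pad */
   return _mm_cvtsi32_si128(packed);
}

static void store4(void* p, __m128i v) {
   png_uint_32 packed = _mm_cvtsi128_si32(v);
   memcpy(p, &packed, 4);               /* write exactly 4 bytes */
}

static void store3(void* p, __m128i v) {
   png_uint_32 packed = _mm_cvtsi128_si32(v);
   memcpy(p, &packed, 3);               /* write only 3 bytes */
}
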
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);            // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);           // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);   // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);           // 0  x  db2  0
    const __m128i I = _mm_add_epi8(H, F);             // r' x  b''  0
    const __m128i J = _mm_srli_epi16(I, 8);           // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

static void PredictLineTop(const uint8_t* src, const uint8_t* pred,
                           uint8_t* dst, int length, int inverse) {
  int i;
  const int max_pos = length & ~31;
  assert(length >= 0);
  if (inverse) {
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i +  0]);
      const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
      const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i +  0]);
      const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
      const __m128i C0 = _mm_add_epi8(A0, B0);
      const __m128i C1 = _mm_add_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)&dst[i +  0], C0);
      _mm_storeu_si128((__m128i*)&dst[i + 16], C1);
    }
    for (; i < length; ++i) dst[i] = src[i] + pred[i];
  } else {
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)&src[i +  0]);
      const __m128i A1 = _mm_loadu_si128((const __m128i*)&src[i + 16]);
      const __m128i B0 = _mm_loadu_si128((const __m128i*)&pred[i +  0]);
      const __m128i B1 = _mm_loadu_si128((const __m128i*)&pred[i + 16]);
      const __m128i C0 = _mm_sub_epi8(A0, B0);
      const __m128i C1 = _mm_sub_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)&dst[i +  0], C0);
      _mm_storeu_si128((__m128i*)&dst[i + 16], C1);
    }
    for (; i < length; ++i) dst[i] = src[i] - pred[i];
  }
}

void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
    * There's no pixel to the left of the first pixel.  Luckily, it's
    * predicted to be half of the pixel above it.  So again, this works
    * perfectly with our loop if we make sure a starts at zero.
    */
   png_size_t rb;

   const __m128i zero = _mm_setzero_si128();

   __m128i    b;
   __m128i a, d = zero;

   png_debug(1, "in png_read_filter_row_avg3_sse2");

   rb = row_info->rowbytes;
   while (rb >= 4) {
      __m128i avg;
             b = load4(prev);
      a = d; d = load4(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a,b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                            _mm_set1_epi8(1)));

      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
   if (rb > 0) {
      __m128i avg;
             b = load3(prev);
      a = d; d = load3(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a,b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                            _mm_set1_epi8(1)));

      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
}

template <int bpp>
void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // The Avg filter predicts each pixel as the (truncated) average of a and b.
    // There's no pixel to the left of the first pixel.  Luckily, it's
    // predicted to be half of the pixel above it.  So again, this works
    // perfectly with our loop if we make sure a starts at zero.
    const __m128i zero = _mm_setzero_si128();
    __m128i    b;
    __m128i a, d = zero;

    int rb = row_info->rowbytes;
    while (rb > 0) {
               b = load<bpp>(prev);
        a = d; d = load<bpp>(row );

        // PNG requires a truncating average here, so sadly we can't just use
        // _mm_avg_epu8...
        __m128i avg = _mm_avg_epu8(a,b);
        // ...but we can fix it up by subtracting off 1 if it rounded up.
        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                              _mm_set1_epi8(1)));

        d = _mm_add_epi8(d, avg);
        store<bpp>(row, d);

        prev += bpp;
        row  += bpp;
        rb   -= bpp;
    }
}

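// Not part of the original listing: a minimal sketch of the load<bpp>() and
// store<bpp>() templates the Skia-style filters in this listing call.  The
// real helpers may differ; the assumption here is only that they move bpp
// (3 or 4) bytes between memory and the low lanes of an __m128i via memcpy.
template <int bpp>
static __m128i load(const void* p) {
    uint32_t packed = 0;
    memcpy(&packed, p, bpp);              // read only bpp bytes
    return _mm_cvtsi32_si128(packed);
}

template <int bpp>
static void store(void* p, __m128i v) {
    uint32_t packed = _mm_cvtsi128_si32(v);
    memcpy(p, &packed, bpp);              // write only bpp bytes
}
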
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);   // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);    // pred = (L > T)? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}

// Predictor10: average of (average of (L,TL), average of (T, TR)).
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    for (j = 0; j < 4; ++j) {
      __m128i avgLTL, avg;
      Average2_m128i(&L, &TL, &avgLTL);
      Average2_m128i(&avgTTR, &avgLTL, &avg);
      L = _mm_add_epi8(avg, src);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Rotate the pre-computed values for the next iteration.
      avgTTR = _mm_srli_si128(avgTTR, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}

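// Not part of the original listing: a plausible sketch of the Average2_m128i()
// helper used by PredictorAdd10_SSE2 above.  The assumption is that it returns
// the truncated per-byte average (a + b) >> 1, i.e. _mm_avg_epu8 (which rounds
// up) with the round-up undone wherever the low bits of a and b differ -- the
// same trick the Avg PNG filters in this listing use.
static void Average2_m128i(const __m128i* const a, const __m128i* const b,
                           __m128i* const out) {
  const __m128i avg = _mm_avg_epu8(*a, *b);                  // rounds up
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a, *b),
                                    _mm_set1_epi8(1));       // 1 where it rounded up
  *out = _mm_sub_epi8(avg, one);                             // truncated average
}
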
inline void casefoldRange(char* dest, const char* begin, const char* end)
{
    if (end - begin < 64)
    {
        // short string, don't bother optimizing
        for (const char* i = begin; i != end; ++i)
            *dest++ = casefold(*i);
    }
    else
    {
        // Shift 'A'..'Z' range ([65..90]) to [102..127] to use one signed comparison insn
        __m128i shiftAmount = _mm_set1_epi8(127 - 'Z');
        __m128i lowerBound = _mm_set1_epi8(127 - ('Z' - 'A') - 1);
        __m128i upperBit = _mm_set1_epi8(0x20);

        const char* i = begin;

        for (; i + 16 < end; i += 16)
        {
            __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(i));
            __m128i upperMask = _mm_cmpgt_epi8(_mm_add_epi8(v, shiftAmount), lowerBound);
            __m128i cfv = _mm_or_si128(v, _mm_and_si128(upperMask, upperBit));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), cfv);
            dest += 16;
        }

        for (; i != end; ++i)
            *dest++ = casefold(*i);
    }
}

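// Not part of the original listing: a minimal scalar casefold() matching the
// ASCII-only behaviour the SIMD path above assumes (set bit 0x20 for 'A'..'Z',
// leave every other byte untouched).  The real helper may do more than this.
inline char casefold(char c)
{
    return (c >= 'A' && c <= 'Z') ? char(c | 0x20) : c;
}
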
inline COLORREF MakeColor2(COLORREF a, COLORREF b, int alpha)
{
#ifdef USE_SSE2
    // (a * alpha + b * (256 - alpha)) / 256 -> ((a - b) * alpha) / 256 + b
    __m128i xmm0, xmm1, xmm2, xmm3;
    COLORREF color;
    xmm0 = _mm_setzero_si128();
    xmm1 = _mm_cvtsi32_si128( a );
    xmm2 = _mm_cvtsi32_si128( b );
    xmm3 = _mm_cvtsi32_si128( alpha );

    xmm1 = _mm_unpacklo_epi8( xmm1, xmm0 );     // a:a:a:a
    xmm2 = _mm_unpacklo_epi8( xmm2, xmm0 );     // b:b:b:b
    xmm3 = _mm_shufflelo_epi16( xmm3, 0 );      // alpha:alpha:alpha:alpha

    xmm1 = _mm_sub_epi16( xmm1, xmm2 );         // (a - b)
    xmm1 = _mm_mullo_epi16( xmm1, xmm3 );       // (a - b) * alpha
    xmm1 = _mm_srli_epi16( xmm1, 8 );           // ((a - b) * alpha) / 256
    xmm1 = _mm_add_epi8( xmm1, xmm2 );          // ((a - b) * alpha) / 256 + b

    xmm1 = _mm_packus_epi16( xmm1, xmm0 );
    color = _mm_cvtsi128_si32( xmm1 );

    return color;
#else
    const int ap = alpha;
    const int bp = 256 - ap;
    BYTE valR = (BYTE)((GetRValue(a) * ap + GetRValue(b) * bp) / 256);
    BYTE valG = (BYTE)((GetGValue(a) * ap + GetGValue(b) * bp) / 256);
    BYTE valB = (BYTE)((GetBValue(a) * ap + GetBValue(b) * bp) / 256);
    return RGB(valR, valG, valB);
#endif
}

/*!
  Compute the image addition: \f$ Ires = I1 + I2 \f$.

  \param I1 : The first image.
  \param I2 : The second image.
  \param Ires : \f$ Ires = I1 + I2 \f$
  \param saturate : If true, saturate the result to [0 ; 255] using
  vpMath::saturate, otherwise overflow may occur.
*/
void vpImageTools::imageAdd(const vpImage<unsigned char> &I1,
                            const vpImage<unsigned char> &I2,
                            vpImage<unsigned char> &Ires,
                            const bool saturate)
{
  if ((I1.getHeight() != I2.getHeight()) || (I1.getWidth() != I2.getWidth())) {
    throw (vpException(vpException::dimensionError, "The two images do not have the same size"));
  }

  if ((I1.getHeight() != Ires.getHeight()) || (I1.getWidth() != Ires.getWidth())) {
    Ires.resize(I1.getHeight(), I1.getWidth());
  }

  unsigned char *ptr_I1   = I1.bitmap;
  unsigned char *ptr_I2   = I2.bitmap;
  unsigned char *ptr_Ires = Ires.bitmap;
  unsigned int cpt = 0;

#if VISP_HAVE_SSE2
  if (Ires.getSize() >= 16) {
    for (; cpt <= Ires.getSize() - 16; cpt += 16, ptr_I1 += 16, ptr_I2 += 16, ptr_Ires += 16) {
      const __m128i v1   = _mm_loadu_si128((const __m128i *)ptr_I1);
      const __m128i v2   = _mm_loadu_si128((const __m128i *)ptr_I2);
      const __m128i vres = saturate ? _mm_adds_epu8(v1, v2) : _mm_add_epi8(v1, v2);

      _mm_storeu_si128((__m128i *)ptr_Ires, vres);
    }
  }
#endif

  for (; cpt < Ires.getSize(); cpt++, ++ptr_I1, ++ptr_I2, ++ptr_Ires) {
    *ptr_Ires = saturate ?
                vpMath::saturate<unsigned char>((short int)*ptr_I1 + (short int)*ptr_I2) :
                *ptr_I1 + *ptr_I2;
  }
}

static void TransformColor(const VP8LMultipliers* const m,
                           uint32_t* argb_data, int num_pixels) {
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);   // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);          // r 0   b   0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);   // x db2 0   0
    const __m128i G = _mm_srli_epi32(F, 16);          // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);             // x dr  x  db
    const __m128i I = _mm_and_si128(H, mask_rb);      // 0 dr  0  db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}

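// Not part of the original listing: a sketch of the CST_5b() constant helper
// assumed by TransformColor above.  It mirrors the CST() macro shown in
// TransformColorInverse_SSE2 earlier in this listing: sign-extend the stored
// 8-bit multiplier and pre-shift it so _mm_mulhi_epi16 produces the delta term.
#define CST_5b(X)  (((int16_t)((X) << 8)) >> 5)   // sign-extend, pre-shift by 5
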
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size/stride; k < length; ++k)
    {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
        // TODO: assembly optimization use PSHUFD on moves before calculations,
        // lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);        // AABBGGRR
            d = _mm_load_si128(source128_2);        // AABBGGRR

            srb = _mm_and_si128(lomask, s);         // 00BB00RR    // unpack
            sga = _mm_srli_epi16(s, 8);             // AA00GG00    // unpack

            drb = _mm_and_si128(lomask, d);         // 00BB00RR    // unpack
            dga = _mm_srli_epi16(d, 8);             // AA00GG00    // unpack

            srb = _mm_sub_epi16(srb, drb);          // BBBBRRRR    // sub
            srb = _mm_mullo_epi16(srb, a);          // BBBBRRRR    // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);          // AAAAGGGG    // sub
            sga = _mm_mullo_epi16(sga, a);          // AAAAGGGG    // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);           // 00BB00RR    // prepack and div
            sga = _mm_andnot_si128(lomask, sga);    // AA00GG00    // prepack and div

            srb = _mm_or_si128(srb, sga);           // AABBGGRR    // pack

            srb = _mm_add_epi8(srb, d);             // AABBGGRR    // add, there is no overflow (R.N)

            _mm_store_si128(dest128, srb);
        }
    }
}

__m128i test_mm_add_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_add_epi8
  // DAG: add <16 x i8>
  //
  // ASM-LABEL: test_mm_add_epi8
  // ASM: paddb
  return _mm_add_epi8(A, B);
}

void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
                                     png_const_bytep prev)
{
   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    * and two pixels from the previous row, b and c:
    *   prev: c b
    *   row:  a d
    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    * p=a+b-c.
    *
    * The first pixel has no left context, and so uses an Up filter, p = b.
    * This works naturally with our main loop's p = a+b-c if we force a and c
    * to zero.
    * Here we zero b and d, which become c and a respectively at the start of
    * the loop.
    */
   png_debug(1, "in png_read_filter_row_paeth4_sse2");
   const __m128i zero = _mm_setzero_si128();
   __m128i c, b = zero,
           a, d = zero;

   int rb = row_info->rowbytes;
   while (rb > 0) {
      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
       * intermediates.
       */
      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      __m128i pa = _mm_sub_epi16(b,c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      __m128i pb = _mm_sub_epi16(a,c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      __m128i pc = _mm_add_epi16(pa,pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                        if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                     c));

      /* Note `_epi8`: we need addition to wrap modulo 256. */
      d = _mm_add_epi8(d, nearest);
      store4(row, _mm_packus_epi16(d,d));

      prev += 4;
      row  += 4;
      rb   -= 4;
   }
}

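/* Not part of the original listing: minimal sketches of the abs_i16() and
 * if_then_else() helpers the Paeth filter above depends on, assuming plain
 * SSE2 (no SSSE3 _mm_abs_epi16).  The real helpers may differ.
 */
static __m128i abs_i16(__m128i x) {
   /* Read this as: return x < 0 ? -x : x, done branchlessly. */
   const __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
   x = _mm_xor_si128(x, is_negative);    /* ~x where negative */
   x = _mm_sub_epi16(x, is_negative);    /* +1 where negative */
   return x;
}

static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
   /* c is a lane mask of all-ones or all-zeros: pick t where set, else e. */
   return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
}
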
void PreOver_FastSSE2(void* dest, const void* source1, const void* source2, size_t size)
{
    static const size_t stride = sizeof(__m128i)*4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i d, s, a, rb, ag;

    // TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
    for (int k = 0, length = size/stride; k < length; ++k)
    {
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
        {
            // TODO: assembly optimization use PSHUFD on moves before calculations,
            // lower latency than MOVDQA (R.N)
            // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
            s = _mm_load_si128(source128_1);    // AABBGGRR
            d = _mm_load_si128(source128_2);    // AABBGGRR

            // set alpha to lo16 from dest_
            rb = _mm_srli_epi32(d, 24);         // 000000AA
            a = _mm_slli_epi32(rb, 16);         // 00AA0000
            a = _mm_or_si128(rb, a);            // 00AA00AA

            // fix alpha a = a > 127 ? a+1 : a
            // NOTE: If removed an *overflow* will occur with large values (R.N)
            rb = _mm_srli_epi16(a, 7);
            a = _mm_add_epi16(a, rb);

            rb = _mm_and_si128(lomask, s);      // 00BB00RR    unpack
            rb = _mm_mullo_epi16(rb, a);        // BBBBRRRR    mul (D[A]*S)
            rb = _mm_srli_epi16(rb, 8);         // 00BB00RR    prepack and div [(D[A]*S)]/255

            ag = _mm_srli_epi16(s, 8);          // 00AA00GG    unpack
            ag = _mm_mullo_epi16(ag, a);        // AAAAGGGG    mul (D[A]*S)
            ag = _mm_andnot_si128(lomask, ag);  // AA00GG00    prepack and div [(D[A]*S)]/255

            rb = _mm_or_si128(rb, ag);          // AABBGGRR    pack

            rb = _mm_sub_epi8(s, rb);           // sub S-[(D[A]*S)/255]
            d = _mm_add_epi8(d, rb);            // add D+[S-(D[A]*S)/255]

            _mm_store_si128(dest128, d);
        }
    }
}

// Author: Niclas P Andersson
void Lerp_OLD(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    __m128i ps1, ps2, pd1, pd2, m0, m1, pr1, pr2;
    __m128i* pSource = (__m128i*)source1;
    __m128i* pDest   = (__m128i*)source2;
    __m128i* pResult = (__m128i*)dest;
    __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f+0.5f));

    m0 = _mm_setzero_si128();
    int count = size/4;
    for ( int i = 0; i < count; i+=4 )
    {
        ps1 = _mm_load_si128(pSource);      // load 4 pixels from source
        pd1 = _mm_load_si128(pDest);        // load 4 pixels from dest
        ps2 = _mm_unpackhi_epi64(ps1, m0);  // move the 2 high pixels from source
        pd2 = _mm_unpackhi_epi64(pd1, m0);  // move the 2 high pixels from dest

        // compute the 2 "lower" pixels
        ps1 = _mm_unpacklo_epi8(ps1, m0);   // unpack the 2 low pixels from source (bytes -> words)
        pd1 = _mm_unpacklo_epi8(pd1, m0);   // unpack the 2 low pixels from dest (bytes -> words)

        pr1 = _mm_sub_epi16(ps1, pd1);      // x = src - dest
        pr1 = _mm_mullo_epi16(pr1, a);      // y = x*alpha
        pr1 = _mm_srli_epi16(pr1, 8);       // w = y/256
        pr1 = _mm_add_epi8(pr1, pd1);       // z = w + dest

        // same thing for the 2 "high" pixels
        ps2 = _mm_unpacklo_epi8(ps2, m0);
        pd2 = _mm_unpacklo_epi8(pd2, m0);

        pr2 = _mm_sub_epi16(ps2, pd2);      // x = src - dest
        pr2 = _mm_mullo_epi16(pr2, a);      // y = x*alpha
        pr2 = _mm_srli_epi16(pr2, 8);       // w = y/256
        pr2 = _mm_add_epi8(pr2, pd2);       // z = w + dest

        m1 = _mm_packus_epi16(pr1, pr2);    // pack all 4 together again (words -> bytes)
        _mm_store_si128(pResult, m1);

        pSource++;
        pDest++;
        pResult++;
    }
}

SIMDValue SIMDInt8x16Operation::OpAdd(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_add_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // a + b

    return X86SIMDValue::ToSIMDValue(x86Result);
}

static inline __m128i calculate_pixel_avg(const __m128i rb, const __m128i prb,
                                          __m128i pixel, const __m128i mask)
{
    __m128i round;

    round = _mm_xor_si128(prb, pixel);
    pixel = _mm_avg_epu8(pixel, prb);
    round = _mm_and_si128(round, mask);
    pixel = _mm_sub_epi8(pixel, round);
    return _mm_add_epi8(pixel, rb);
}

__m64 _m_paddb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_add_epi8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}

static void adddiff_sse2_t(Byte *pDst, ptrdiff_t dst_pitch, const Byte *pSrc, ptrdiff_t src_pitch, int width, int height)
{
    int mod32_width = (width / 32) * 32;
    auto pDst2 = pDst;
    auto pSrc2 = pSrc;

    auto v128 = _mm_set1_epi32(0x80808080);

    for (int j = 0; j < height; ++j) {
        for (int i = 0; i < mod32_width; i += 32) {
            _mm_prefetch(reinterpret_cast<const char*>(pDst)+i+128, _MM_HINT_T0);
            _mm_prefetch(reinterpret_cast<const char*>(pSrc)+i+128, _MM_HINT_T0);

            auto dst  = simd_load_si128<mem_mode>(pDst+i);
            auto dst2 = simd_load_si128<mem_mode>(pDst+i+16);
            auto src  = simd_load_si128<mem_mode>(pSrc+i);
            auto src2 = simd_load_si128<mem_mode>(pSrc+i+16);

            auto dstsub  = _mm_sub_epi8(dst, v128);
            auto dstsub2 = _mm_sub_epi8(dst2, v128);

            auto srcsub  = _mm_sub_epi8(src, v128);
            auto srcsub2 = _mm_sub_epi8(src2, v128);

            auto added  = _mm_adds_epi8(dstsub, srcsub);
            auto added2 = _mm_adds_epi8(dstsub2, srcsub2);

            auto result  = _mm_add_epi8(added, v128);
            auto result2 = _mm_add_epi8(added2, v128);

            simd_store_si128<mem_mode>(pDst+i, result);
            simd_store_si128<mem_mode>(pDst+i+16, result2);
        }
        pDst += dst_pitch;
        pSrc += src_pitch;
    }

    if (width > mod32_width) {
        adddiff_c(pDst2 + mod32_width, dst_pitch, pSrc2 + mod32_width, src_pitch, width - mod32_width, height);
    }
}

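// Not part of the original listing: adddiff_sse2_t above is written as if it
// sits inside a template over a memory-access mode (mem_mode).  A minimal
// sketch of that arrangement follows; the enum and helper names here are
// assumptions for illustration, not the real library API.
enum class MemoryMode { Aligned, Unaligned };

template <MemoryMode mode>
static __m128i simd_load_si128(const Byte* ptr) {
    // Pick the aligned or unaligned 128-bit load depending on the mode.
    return mode == MemoryMode::Aligned
        ? _mm_load_si128(reinterpret_cast<const __m128i*>(ptr))
        : _mm_loadu_si128(reinterpret_cast<const __m128i*>(ptr));
}

template <MemoryMode mode>
static void simd_store_si128(Byte* ptr, __m128i value) {
    // Mirror of the load helper for stores.
    if (mode == MemoryMode::Aligned) {
        _mm_store_si128(reinterpret_cast<__m128i*>(ptr), value);
    } else {
        _mm_storeu_si128(reinterpret_cast<__m128i*>(ptr), value);
    }
}
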
// Predictor0: ARGB_BLACK.
static void PredictorAdd0_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  const __m128i black = _mm_set1_epi32(ARGB_BLACK);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i res = _mm_add_epi8(src, black);
    _mm_storeu_si128((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[0](in + i, upper + i, num_pixels - i, out + i);
  }
}

void png_read_filter_row_sub3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   __m128i racc = _mm_setzero_si128();

   PNG_UNUSED(prev_row)

   __m128i nrb = _mm_load_si128((__m128i*)(rp));

   for (i = 0; i < row_info->rowbytes; i += 15, rp += 15)
   {
      __m128i rb = nrb;
#ifndef __SSSE3__
      nrb = _mm_loadu_si128((__m128i*)(rp + 15));
      racc = _mm_srli_si128(_mm_slli_si128(racc, 1), 13);
      racc = _mm_or_si128(racc, _mm_slli_si128(rb, 3));
#else
      nrb = _mm_lddqu_si128((__m128i*)(rp + 15));
      racc = _mm_alignr_epi8(rb, _mm_slli_si128(racc, 1), 13);
#endif
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = _mm_slli_si128(racc, 3);
      rb = _mm_add_epi8(rb, racc);
      racc = rb;
      _mm_storeu_si128((__m128i*)rp, rb);
   }
}

SIMDValue SIMDInt8x16Operation::OpNeg(const SIMDValue& value)
{
    X86SIMDValue x86Result;

    X86SIMDValue SIGNMASK, temp;
    X86SIMDValue negativeOnes = { { -1, -1, -1, -1 } };
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    temp.m128i_value = _mm_andnot_si128(v.m128i_value, negativeOnes.m128i_value); // (~value) & (negative ones)
    SIGNMASK.m128i_value = _mm_set1_epi8(0x00000001);                             // set SIGNMASK to 1
    x86Result.m128i_value = _mm_add_epi8(SIGNMASK.m128i_value, temp.m128i_value); // add 16 integers respectively

    return X86SIMDValue::ToSIMDValue(x86Result);
}

// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;
  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);
  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);
  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}

// Predictor1: left.
static void PredictorAdd1_SSE2(const uint32_t* in, const uint32_t* upper,
                               int num_pixels, uint32_t* out) {
  int i;
  __m128i prev = _mm_set1_epi32(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // a | b | c | d
    const __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    // 0 | a | b | c
    const __m128i shift0 = _mm_slli_si128(src, 4);
    // a | a + b | b + c | c + d
    const __m128i sum0 = _mm_add_epi8(src, shift0);
    // 0 | 0 | a | a + b
    const __m128i shift1 = _mm_slli_si128(sum0, 8);
    // a | a + b | a + b + c | a + b + c + d
    const __m128i sum1 = _mm_add_epi8(sum0, shift1);
    const __m128i res = _mm_add_epi8(sum1, prev);
    _mm_storeu_si128((__m128i*)&out[i], res);
    // replicate prev output on the four lanes
    prev = _mm_shuffle_epi32(res, (3 << 0) | (3 << 2) | (3 << 4) | (3 << 6));
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[1](in + i, upper + i, num_pixels - i, out + i);
  }
}

static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) {
  const __m128i mask = _mm_set1_epi32(0x0000ff00);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i in_00g0 = _mm_and_si128(in, mask);     // 00g0|00g0|...
    const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8);  // 0g00|0g00|...
    const __m128i in_000g = _mm_srli_epi32(in_00g0, 8);  // 000g|000g|...
    const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g);
    const __m128i out = _mm_add_epi8(in, in_0g0g);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i);
}

static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}

template <int bpp>
static void sk_sub_sse2(png_row_infop row_info, uint8_t* row, const uint8_t*) {
    // The Sub filter predicts each pixel as the previous pixel, a.
    // There is no pixel to the left of the first pixel.  It's encoded directly.
    // That works with our main loop if we just say that left pixel was zero.
    __m128i a, d = _mm_setzero_si128();

    int rb = row_info->rowbytes;
    while (rb > 0) {
        a = d; d = load<bpp>(row);
        d = _mm_add_epi8(d, a);
        store<bpp>(row, d);

        row += bpp;
        rb  -= bpp;
    }
}