/* region1 ^= constant * region2, 16 bytes at a time, using SSSE3 nibble-table
 * lookups (tl[constant] holds the 16 per-nibble products). */
void maddrc4_shuffle_ssse3(uint8_t* region1, const uint8_t* region2,
                           uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m128i in1, in2, out, t1, t2, m1, m2, l, h;

	if (constant == 0)
		return;

	if (constant == 1) {
		xorr_sse2(region1, region2, length);
		return;
	}

	t1 = _mm_loadu_si128((void *)tl[constant]);
	t2 = _mm_slli_epi64(t1, 4);
	m1 = _mm_set1_epi8(0x0f);
	m2 = _mm_set1_epi8(0xf0);

	for (end = region1 + length; region1 < end; region1 += 16, region2 += 16) {
		in2 = _mm_load_si128((void *)region2);
		in1 = _mm_load_si128((void *)region1);
		l = _mm_and_si128(in2, m1);
		l = _mm_shuffle_epi8(t1, l);
		h = _mm_and_si128(in2, m2);
		h = _mm_srli_epi64(h, 4);
		h = _mm_shuffle_epi8(t2, h);
		out = _mm_xor_si128(h, l);
		out = _mm_xor_si128(out, in1);
		_mm_store_si128((void *)region1, out);
	}
}
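For reference, a scalar sketch (not part of the original library) of what the SSSE3 loop above computes per byte, assuming tl[constant] is the same 16-entry product table indexable per nibble and that its entries fit in the low nibble (which the shift-by-4 trick for the high-nibble table requires): each source byte is split into its two nibbles, each nibble's product is looked up, the high-nibble product is shifted back into place, and the result is XORed into the destination.

// Scalar sketch of the nibble-table multiply-and-add done by the SSSE3 loop.
// Assumes tl[c][x] < 16 for all x, as the vector code's table shift implies.
static void maddrc4_scalar_ref(uint8_t *region1, const uint8_t *region2,
                               uint8_t constant, size_t length)
{
    for (size_t i = 0; i < length; ++i) {
        const uint8_t b  = region2[i];
        const uint8_t lo = tl[constant][b & 0x0f];               // product of low nibble
        const uint8_t hi = (uint8_t)(tl[constant][b >> 4] << 4); // product of high nibble, repositioned
        region1[i] ^= (uint8_t)(lo ^ hi);
    }
}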
inline void casefoldRange(char* dest, const char* begin, const char* end)
{
    if (end - begin < 64) {
        // short string, don't bother optimizing
        for (const char* i = begin; i != end; ++i)
            *dest++ = casefold(*i);
    } else {
        // Shift 'A'..'Z' range ([65..90]) to [102..127] to use one signed comparison insn
        __m128i shiftAmount = _mm_set1_epi8(127 - 'Z');
        __m128i lowerBound = _mm_set1_epi8(127 - ('Z' - 'A') - 1);
        __m128i upperBit = _mm_set1_epi8(0x20);

        const char* i = begin;
        for (; i + 16 < end; i += 16) {
            __m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(i));
            __m128i upperMask = _mm_cmpgt_epi8(_mm_add_epi8(v, shiftAmount), lowerBound);
            __m128i cfv = _mm_or_si128(v, _mm_and_si128(upperMask, upperBit));
            _mm_storeu_si128(reinterpret_cast<__m128i*>(dest), cfv);
            dest += 16;
        }

        for (; i != end; ++i)
            *dest++ = casefold(*i);
    }
}
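casefoldRange() falls back to a scalar casefold() helper that is not shown here. A minimal sketch consistent with the SIMD path (which ORs the 0x20 bit into bytes in 'A'..'Z') could be:

// Hypothetical scalar helper assumed by casefoldRange(): ASCII-only fold to
// lowercase, mirroring the vector path that ORs in 0x20 for 'A'..'Z'.
inline char casefold(char c)
{
    return (c >= 'A' && c <= 'Z') ? static_cast<char>(c | 0x20) : c;
}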
void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
{
    size_t i = 0;

#ifdef ENABLE_SSE2
    const __m128i attrDepth_vec128 = _mm_set1_epi32(attr.depth);
    const __m128i attrOpaquePolyID_vec128 = _mm_set1_epi8(attr.opaquePolyID);
    const __m128i attrTranslucentPolyID_vec128 = _mm_set1_epi8(attr.translucentPolyID);
    const __m128i attrStencil_vec128 = _mm_set1_epi8(attr.stencil);
    const __m128i attrIsFogged_vec128 = _mm_set1_epi8(attr.isFogged);
    const __m128i attrIsTranslucentPoly_vec128 = _mm_set1_epi8(attr.isTranslucentPoly);

    const size_t sseCount = count - (count % 16);
    for (; i < sseCount; i += 16) {
        // Advance by i each iteration; depth is 32-bit, so 16 elements span four vectors.
        _mm_stream_si128((__m128i *)(this->depth + i + 0), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->depth + i + 4), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->depth + i + 8), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->depth + i + 12), attrDepth_vec128);
        _mm_stream_si128((__m128i *)(this->opaquePolyID + i), attrOpaquePolyID_vec128);
        _mm_stream_si128((__m128i *)(this->translucentPolyID + i), attrTranslucentPolyID_vec128);
        _mm_stream_si128((__m128i *)(this->stencil + i), attrStencil_vec128);
        _mm_stream_si128((__m128i *)(this->isFogged + i), attrIsFogged_vec128);
        _mm_stream_si128((__m128i *)(this->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
    }
#endif

    for (; i < count; i++) {
        this->SetAtIndex(i, attr);
    }
}
/* region *= constant, 16 bytes at a time, using SSSE3 nibble-table lookups
 * (tl[constant] holds the 16 per-nibble products). */
void mulrc4_shuffle_ssse3(uint8_t *region, uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m128i in, out, t1, t2, m1, m2, l, h;

	if (constant == 0) {
		memset(region, 0, length);
		return;
	}

	if (constant == 1)
		return;

	t1 = _mm_loadu_si128((void *)tl[constant]);
	t2 = _mm_slli_epi64(t1, 4);
	m1 = _mm_set1_epi8(0x0f);
	m2 = _mm_set1_epi8(0xf0);

	for (end = region + length; region < end; region += 16) {
		in = _mm_load_si128((void *)region);
		l = _mm_and_si128(in, m1);
		l = _mm_shuffle_epi8(t1, l);
		h = _mm_and_si128(in, m2);
		h = _mm_srli_epi64(h, 4);
		h = _mm_shuffle_epi8(t2, h);
		out = _mm_xor_si128(h, l);
		_mm_store_si128((void *)region, out);
	}
}
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);           // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);    // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);    // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);         // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);       // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
  uint8_t levels[16], ctxs[16];
  uint16_t abs_levels[16];
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;

  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }

  {   // precompute clamped levels and contexts, packed to 8b.
    const __m128i zero = _mm_setzero_si128();
    const __m128i kCst2 = _mm_set1_epi8(2);
    const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
    const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
    const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
    const __m128i D0 = _mm_sub_epi16(zero, c0);
    const __m128i D1 = _mm_sub_epi16(zero, c1);
    const __m128i E0 = _mm_max_epi16(c0, D0);   // abs(v), 16b
    const __m128i E1 = _mm_max_epi16(c1, D1);
    const __m128i F = _mm_packs_epi16(E0, E1);
    const __m128i G = _mm_min_epu8(F, kCst2);    // context = 0,1,2
    const __m128i H = _mm_min_epu8(F, kCst67);   // clamp_level in [0..67]
    _mm_storeu_si128((__m128i*)&ctxs[0], G);
    _mm_storeu_si128((__m128i*)&levels[0], H);
    _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
    _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
  }
  for (; n < res->last; ++n) {
    const int ctx = ctxs[n];
    const int level = levels[n];
    const int flevel = abs_levels[n];   // full level
    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
    const int level = levels[n];
    const int flevel = abs_levels[n];
    assert(flevel != 0);
    cost += VP8LevelFixedCosts[flevel] + t[level];
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
      const int ctx = ctxs[n];
      const int last_p0 = res->prob[b][ctx][0];
      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
}
void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
                                   png_const_bytep prev)
{
   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
    * There's no pixel to the left of the first pixel.  Luckily, it's
    * predicted to be half of the pixel above it.  So again, this works
    * perfectly with our loop if we make sure a starts at zero.
    */
   png_size_t rb;
   const __m128i zero = _mm_setzero_si128();
   __m128i b;
   __m128i a, d = zero;

   png_debug(1, "in png_read_filter_row_avg3_sse2");
   rb = row_info->rowbytes;
   while (rb >= 4) {
      __m128i avg;
             b = load4(prev);
      a = d; d = load4(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a, b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a, b),
                                            _mm_set1_epi8(1)));

      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
   if (rb > 0) {
      __m128i avg;
             b = load3(prev);
      a = d; d = load3(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a, b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a, b),
                                            _mm_set1_epi8(1)));

      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
}
// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
                                       const __m128i* const fl) {
  const __m128i k3 = _mm_set1_epi8(3);
  const __m128i k4 = _mm_set1_epi8(4);
  __m128i v3 = _mm_adds_epi8(*fl, k3);
  __m128i v4 = _mm_adds_epi8(*fl, k4);

  SignedShift8b(&v4);                  // v4 >> 3
  SignedShift8b(&v3);                  // v3 >> 3
  *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
  *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
}
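DoSimpleFilter() relies on a SignedShift8b() helper that performs an arithmetic right shift by 3 on each signed byte, which SSE2 has no single instruction for. One way to implement it (a sketch; the helper's exact definition is not shown here) is to widen to 16-bit lanes, shift arithmetically, and repack:

// Sketch of a SignedShift8b() helper: arithmetic >> 3 of each signed byte.
// Each byte is placed in the high half of a 16-bit lane by interleaving with
// zeros, shifted right by 3 + 8 with sign extension, then packed back.
#include <emmintrin.h>
static inline void SignedShift8b(__m128i* const x) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i lo_0 = _mm_unpacklo_epi8(zero, *x);
  const __m128i hi_0 = _mm_unpackhi_epi8(zero, *x);
  const __m128i lo_1 = _mm_srai_epi16(lo_0, 3 + 8);
  const __m128i hi_1 = _mm_srai_epi16(hi_0, 3 + 8);
  *x = _mm_packs_epi16(lo_1, hi_1);    // results fit in int8, no saturation occurs
}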
static void NeedsFilter(const __m128i* p1, const __m128i* p0,
                        const __m128i* q0, const __m128i* q1,
                        int thresh, __m128i* mask) {
  __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  *mask = _mm_set1_epi8(0xFE);
  t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
  t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2

  *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
  *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
  *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2

  t1 = _mm_set1_epi8(thresh);
  *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
  *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
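Both NeedsFilter() variants in this collection use an MM_ABS(p, q) helper for the per-byte absolute difference. A sketch of such a macro (not shown in the excerpts): subtract with unsigned saturation in both directions, so one result is zero and the other is the magnitude, then OR them together.

// Sketch of an MM_ABS(p, q) helper: per-byte |p - q| built from two
// unsigned saturating subtractions (one of them is always all zeros).
#define MM_ABS(p, q)  _mm_or_si128(_mm_subs_epu8((q), (p)), \
                                   _mm_subs_epu8((p), (q)))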
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4. Resulting in a more
 * precise version of a box filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i fours = _mm_set1_epi8(4);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeh_epi32(pred_buf_m128i, top);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storel_epi64(pred_buf_m128i, top);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeu_si128(pred_buf_m128i, top);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        top_1 = _mm_maddubs_epi16(top_1, fours);
        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
      }
    }
    input += input_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
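_mm_loadh_epi32() and _mm_storeh_epi32() are not standard SSE intrinsics; they are small helpers defined elsewhere in the file this routine comes from. A plausible sketch, assuming they simply move one 32-bit value through the lowest lane of an XMM register:

// Plausible sketches (assumed, not the originals) of the 4-byte load/store
// helpers used by the width == 4 path above.
#include <emmintrin.h>
static inline __m128i _mm_loadh_epi32(__m128i const *mem_addr) {
  return _mm_cvtsi32_si128(*((const int *)mem_addr));   // 32 bits into lane 0
}
static inline void _mm_storeh_epi32(__m128i *const mem_addr, __m128i a) {
  *((int *)mem_addr) = _mm_cvtsi128_si32(a);            // lane 0 back to memory
}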
template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height,
    uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride)
{
    assert(width >= A);
    if (align)
    {
        assert(Aligned(backgroundValue) && Aligned(backgroundValueStride));
        assert(Aligned(backgroundCount) && Aligned(backgroundCountStride));
        assert(Aligned(mask) && Aligned(maskStride));
    }

    const __m128i _threshold = _mm_set1_epi8((char)threshold);
    size_t alignedWidth = AlignLo(width, A);
    __m128i tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
    for (size_t row = 0; row < height; ++row)
    {
        for (size_t col = 0; col < alignedWidth; col += A)
            EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01);
        if (alignedWidth != width)
            EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask);
        backgroundValue += backgroundValueStride;
        backgroundCount += backgroundCountStride;
        mask += maskStride;
    }
}
template <int bpp>
void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // The Avg filter predicts each pixel as the (truncated) average of a and b.
    // There's no pixel to the left of the first pixel.  Luckily, it's
    // predicted to be half of the pixel above it.  So again, this works
    // perfectly with our loop if we make sure a starts at zero.
    const __m128i zero = _mm_setzero_si128();
    __m128i b;
    __m128i a, d = zero;

    int rb = row_info->rowbytes;
    while (rb > 0) {
        b = load<bpp>(prev);
        a = d; d = load<bpp>(row);

        // PNG requires a truncating average here, so sadly we can't just use _mm_avg_epu8...
        __m128i avg = _mm_avg_epu8(a, b);
        // ...but we can fix it up by subtracting off 1 if it rounded up.
        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a, b), _mm_set1_epi8(1)));

        d = _mm_add_epi8(d, avg);
        store<bpp>(row, d);

        prev += bpp;
        row  += bpp;
        rb   -= bpp;
    }
}
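The load<bpp>/store<bpp> helpers this template depends on are not shown. A minimal sketch, assuming they move exactly bpp (3 or 4) bytes through the low 32 bits of an XMM register:

// Minimal sketch (assumed, not the original helpers) of load<bpp>/store<bpp>:
// copy exactly bpp bytes between memory and the low lane of an XMM register.
#include <cstdint>
#include <cstring>
#include <emmintrin.h>

template <int bpp> static __m128i load(const void* p) {
    static_assert(bpp <= 4, "bpp must fit in 32 bits");
    uint32_t packed = 0;
    std::memcpy(&packed, p, bpp);                      // read only bpp bytes
    return _mm_cvtsi32_si128(static_cast<int>(packed));
}

template <int bpp> static void store(void* p, __m128i v) {
    static_assert(bpp <= 4, "bpp must fit in 32 bits");
    const uint32_t packed = static_cast<uint32_t>(_mm_cvtsi128_si32(v));
    std::memcpy(p, &packed, bpp);                      // write only bpp bytes
}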
static void
clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n, const uint8_t *src2_1)
{
  __m128i xmm1;
  uint8_t max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_epi8(max);
  for (; n >= 16; n -= 16) {
    __m128i xmm0;
    xmm0 = _mm_loadu_si128((__m128i *)src1);
    xmm0 = _mm_min_epu8(xmm0, xmm1);
    _mm_store_si128((__m128i *)dest, xmm0);
    dest += 16;
    src1 += 16;
  }
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
void imageFilterMean_SSE2(unsigned char *src1, unsigned char *src2, unsigned char *dst, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        MEAN_PIXEL();
        --n; ++dst; ++src1; ++src2;
    }

    // Do bulk of processing using SSE2 (find the mean of 16 8-bit unsigned integers, with saturation)
    __m128i mask = _mm_set1_epi8(0x7F);
    while(n >= 16) {
        __m128i s1 = _mm_loadu_si128((__m128i*)src1);
        s1 = _mm_srli_epi16(s1, 1);   // shift right 1
        s1 = _mm_and_si128(s1, mask); // apply byte-mask
        __m128i s2 = _mm_loadu_si128((__m128i*)src2);
        s2 = _mm_srli_epi16(s2, 1);   // shift right 1
        s2 = _mm_and_si128(s2, mask); // apply byte-mask
        __m128i r = _mm_adds_epu8(s1, s2);
        _mm_store_si128((__m128i*)dst, r);

        n -= 16; src1 += 16; src2 += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_MEAN();
}
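MEAN_PIXEL() and BASIC_MEAN() are macros defined elsewhere in the original source. Hypothetical stand-ins that mirror the vector arithmetic above (each output byte becomes (a >> 1) + (b >> 1), the same halve-then-add the SSE2 loop performs) might look like this; the "++n;" before BASIC_MEAN() pairs with the pre-decrement in this loop form.

// Hypothetical stand-ins (not the original macros), mirroring the vector math.
#define MEAN_PIXEL() (*dst = (unsigned char)((*src1 >> 1) + (*src2 >> 1)))
#define BASIC_MEAN()                  \
    while (--n > 0) {                 \
        MEAN_PIXEL();                 \
        ++dst; ++src1; ++src2;        \
    }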
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
{
    if (rcon)
    {
        input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                               input2);

        *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

        input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
        input1 = _mm_alignr_epi8(input1, input1, 1);
    }

    __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
    smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

    __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

    input1 = _mm_and_si128(low_nibs, input1);

    __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

    input1 = _mm_xor_si128(input1, t);

    __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
    __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));
    __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
    __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

    return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                   _mm_shuffle_epi8(sb1t, t6),
                   smeared);
}
static int HafCpu_Histogram3Thresholds_DATA_U8
    (
        vx_uint32     dstHist[],
        vx_uint8      distThreshold0,
        vx_uint8      distThreshold1,
        vx_uint8      distThreshold2,
        vx_uint32     srcWidth,
        vx_uint32     srcHeight,
        vx_uint8    * pSrcImage,
        vx_uint32     srcImageStrideInBytes
    )
{
    // offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
    // thresh: source threshold in -128..127 range
    __m128i offset = _mm_set1_epi8((char)0x80);
    __m128i T0 = _mm_set1_epi8((char)((distThreshold0 - 1) ^ 0x80));
    __m128i T1 = _mm_set1_epi8((char)((distThreshold1 - 1) ^ 0x80));
    __m128i T2 = _mm_set1_epi8((char)((distThreshold2 - 1) ^ 0x80));
    __m128i onemask = _mm_set1_epi8((char)1);
    // process one pixel row at a time that counts "pixel < srcThreshold"
    __m128i count0 = _mm_set1_epi8((char)0);
    __m128i count1 = _mm_set1_epi8((char)0);
    __m128i count2 = _mm_set1_epi8((char)0);
    vx_uint8 * srcRow = pSrcImage;
    vx_uint32 width = (srcWidth + 15) >> 4;
    for (unsigned int y = 0; y < srcHeight; y++) {
        __m128i * src = (__m128i *)srcRow;
        for (unsigned int x = 0; x < width; x++) {
            __m128i pixels = _mm_load_si128(src++);
            pixels = _mm_xor_si128(pixels, offset);
            __m128i cmpout;
            cmpout = _mm_cmpgt_epi8(pixels, T0);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count0 = _mm_add_epi32(count0, cmpout);
            cmpout = _mm_cmpgt_epi8(pixels, T1);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count1 = _mm_add_epi32(count1, cmpout);
            cmpout = _mm_cmpgt_epi8(pixels, T2);
            cmpout = _mm_and_si128(cmpout, onemask);
            cmpout = _mm_sad_epu8(cmpout, onemask);
            count2 = _mm_add_epi32(count2, cmpout);
        }
        srcRow += srcImageStrideInBytes;
    }
    // extract histogram from count: special case needed when T1 == T2
    dstHist[0] = M128I(count0).m128i_u32[0] + M128I(count0).m128i_u32[2];
    dstHist[1] = M128I(count1).m128i_u32[0] + M128I(count1).m128i_u32[2] - dstHist[0];
    dstHist[2] = M128I(count2).m128i_u32[0] + M128I(count2).m128i_u32[2] - dstHist[0] - dstHist[1];
    dstHist[3] = srcWidth * srcHeight - dstHist[0] - dstHist[1] - dstHist[2];
    if (M128I(T1).m128i_i8[0] == M128I(T2).m128i_i8[0]) {
        dstHist[2] = dstHist[3];
        dstHist[3] = 0;
    }
    return AGO_SUCCESS;
}
SIMDValue SIMDInt8x16Operation::OpSplat(int8 x)
{
    X86SIMDValue x86Result;
    // set 16 signed 8-bit integers values to input value x
    x86Result.m128i_value = _mm_set1_epi8(x);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i* p0,
                                  __m128i* q0, __m128i* q1,
                                  const __m128i* mask, int hev_thresh) {
  __m128i not_hev;
  __m128i t1, t2, t3;
  const __m128i sign_bit = _mm_set1_epi8(0x80);

  // compute hev mask
  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

  // Do +4 side
  t2 = _mm_set1_epi8(4);
  t2 = _mm_adds_epi8(t1, t2);          // 3 * (q0 - p0) + (p1 - q1) + 4
  SIGNED_SHIFT_N(t2, 3);               // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  t3 = t2;                             // save t2
  *q0 = _mm_subs_epi8(*q0, t2);        // q0 -= t2

  // Now do +3 side
  t2 = _mm_set1_epi8(3);
  t2 = _mm_adds_epi8(t1, t2);          // +3 instead of +4
  SIGNED_SHIFT_N(t2, 3);               // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);        // p0 += t2

  t2 = _mm_set1_epi8(1);
  t3 = _mm_adds_epi8(t3, t2);
  SIGNED_SHIFT_N(t3, 1);               // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4
  t3 = _mm_and_si128(not_hev, t3);     // if !hev
  *q1 = _mm_subs_epi8(*q1, t3);        // q1 -= t3
  *p1 = _mm_adds_epi8(*p1, t3);        // p1 += t3

  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
}
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
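Both Average2 helpers here lean on the identity in the comment: _mm_avg_epu8 computes the rounded-up average ((a + b + 1) >> 1), and subtracting (a ^ b) & 1 corrects it to the truncated average. A standalone scalar check of the identity over all byte pairs (illustration only, not from the original source):

// Exhaustive check of (a + b) >> 1 == ((a + b + 1) >> 1) - ((a ^ b) & 1)
// for all unsigned byte values: the sum is odd exactly when the low bits
// of a and b differ, which is when _mm_avg_epu8 rounds up.
#include <assert.h>
int main(void) {
  for (int a = 0; a < 256; ++a) {
    for (int b = 0; b < 256; ++b) {
      assert(((a + b) >> 1) == ((a + b + 1) >> 1) - ((a ^ b) & 1));
    }
  }
  return 0;
}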
Bitboard operator ~ () const {
#if defined (HAVE_SSE2) || defined (HAVE_SSE4)
    Bitboard tmp;
    _mm_store_si128(&tmp.m_, _mm_andnot_si128(this->m_, _mm_set1_epi8(static_cast<char>(0xffu))));
    return tmp;
#else
    return Bitboard(~this->p(0), ~this->p(1));
#endif
}
void png_read_filter_row_avg4_sse(png_row_infop row_info, png_bytep row,
                                  png_const_bytep prev_row)
{
   png_size_t i;
   __m128i* rp = (__m128i*)row;
   const __m128i* prp = (const __m128i*)prev_row;
   __m128i pixel = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);

   for (i = (row_info->rowbytes + 15) >> 4; i > 0; i--)
   {
      __m128i prb = _mm_load_si128(prp++);
      __m128i rb = _mm_load_si128(rp);

      // First pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Second pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Third pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Fourth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      _mm_store_si128(rp++, rb);
   }
}
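calculate_pixel_avg() is not shown in this excerpt. A plausible sketch, consistent with the other Avg-filter routines in this collection (truncating average of the left and above pixels added to the filtered bytes); the argument names and body below are assumptions, not the original helper:

// Plausible sketch of calculate_pixel_avg(): 'raw' carries the filtered bytes
// of the current pixel in its low 32 bits, 'above' the pixel from the previous
// row, 'left' the pixel reconstructed just before, 'one_mask' is set1_epi8(1).
static __m128i calculate_pixel_avg(__m128i raw, __m128i above, __m128i left,
                                   __m128i one_mask)
{
   // _mm_avg_epu8 rounds up; PNG's Avg filter truncates, so subtract 1
   // wherever the low bits of 'left' and 'above' differ (odd sum).
   __m128i avg = _mm_avg_epu8(left, above);
   avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(left, above), one_mask));
   return _mm_add_epi8(raw, avg);
}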
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, __m128i* const mask) {
  const __m128i m_thresh = _mm_set1_epi8(thresh);
  const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
  const __m128i kFE = _mm_set1_epi8(0xFE);
  const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
  const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2

  const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
  const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
  const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2

  const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
  *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16 = _mm_set1_epi8(1);
    for (int i = 0; i < length; i += 16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if (_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); // change -1 values to 1
            // horizontal sum of 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp, zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1, 2);
            __m128i sum3 = _mm_add_epi16(sum1, sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
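A scalar reference for the same count (illustration only): note the SSE routine above always loads full 16-byte blocks, so it expects length to be a multiple of 16, while this version has no such restriction.

// Scalar reference for countZeroBytes_SSE.
int countZeroBytes_scalar(const char* values, int length) {
    int zeroCount = 0;
    for (int i = 0; i < length; ++i) {
        if (values[i] == 0) ++zeroCount;
    }
    return zeroCount;
}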
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
    const Uint8* source1, Uint8* dest)
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    Uint32 i;

    for (i = 0; i < (size / 4); i++)
    {
        t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
        t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
        t2 = (__m128i)_mm_load_ss((float*)&alpha[i * 4]);
        t2 = _mm_unpacklo_epi8(t2, t2);
        t2 = _mm_unpacklo_epi16(t2, t2);

        t3 = _mm_unpacklo_epi8(t0, t0);
        t4 = _mm_unpacklo_epi8(t1, t1);
        t5 = _mm_unpacklo_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t9 = _mm_adds_epu16(t7, t8);
        t9 = _mm_srli_epi16(t9, 8);

        t3 = _mm_unpackhi_epi8(t0, t0);
        t4 = _mm_unpackhi_epi8(t1, t1);
        t5 = _mm_unpackhi_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t10 = _mm_adds_epu16(t7, t8);
        t10 = _mm_srli_epi16(t10, 8);

        t10 = _mm_packus_epi16(t9, t10);
        _mm_stream_si128((__m128i*)&dest[i * 16], t10);
    }
}
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
    mlib_s8 *z,
    const mlib_u8 *x,
    mlib_s32 n)
{
    if (n < 1)
        return (MLIB_FAILURE);

    mlib_s32 i, ax, az, nstep, n1, n2, n3, xval;
    mlib_u8 *px = (mlib_u8 *)x;
    mlib_s8 *pz = (mlib_s8 *)z;
    __m128i zbuf, xbuf, mask;
    mask = _mm_set1_epi8(127);

    ax = (mlib_addr)x & 15;
    az = (mlib_addr)z & 15;
    nstep = 16 / sizeof (mlib_u8);
    n1 = ((16 - ax) & 15) / sizeof (mlib_u8);
    n2 = (n - n1) / nstep;
    n3 = n - n1 - n2 * nstep;

    if (n2 < 1) {
        for (i = 0; i < n; i++) {
            xval = *px++;
            if (xval > 127)
                xval = 127;
            *pz++ = xval;
        }
    } else {
        for (i = 0; i < n1; i++) {
            xval = *px++;
            if (xval > 127)
                xval = 127;
            *pz++ = xval;
        }
        for (i = 0; i < n2; i++) {
            xbuf = _mm_load_si128((__m128i *)px);
            zbuf = _mm_min_epu8(xbuf, mask);
            _mm_storeu_si128((__m128i *)pz, zbuf);
            px += nstep;
            pz += nstep;
        }
        for (i = 0; i < n3; i++) {
            xval = *px++;
            if (xval > 127)
                xval = 127;
            *pz++ = xval;
        }
    }
    return (MLIB_SUCCESS);
}
__m128i aes_schedule_mangle_last_dec(__m128i k)
{
    const __m128i deskew1 = _mm_set_epi32(
        0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
    const __m128i deskew2 = _mm_set_epi32(
        0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);

    k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
    return aes_schedule_transform(k, deskew1, deskew2);
}
static void HE16(uint8_t* dst) {     // horizontal
  int j;
  const __m128i kShuffle3 = _mm_set1_epi8(3);
  for (j = 16; j > 0; --j) {
    const __m128i in = _mm_cvtsi32_si128(*(int*)(dst - 4));
    const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
    _mm_storeu_si128((__m128i*)dst, values);
    dst += BPS;
  }
}
mlib_status
__mlib_VectorSet_S8(
    mlib_s8 *z,
    const mlib_s8 *c,
    mlib_s32 n)
{
    mlib_s8 c0 = *c;
    __m128i val = _mm_set1_epi8(c0);
    SET_VALUE(mlib_s8);
}
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
                                        __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128(a0);
  const __m128i A1 = _mm_cvtsi32_si128(a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
  const __m128i sign_bit = _mm_set1_epi8(0x80);
  const __m128i k64 = _mm_set1_epi8(0x40);
  const __m128i zero = _mm_setzero_si128();
  __m128i not_hev;
  __m128i t1, t2, t3;

  // compute hev mask
  GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

  t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
  t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
  t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
  t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
  t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

  t2 = _mm_set1_epi8(3);
  t3 = _mm_set1_epi8(4);
  t2 = _mm_adds_epi8(t1, t2);          // 3 * (q0 - p0) + (p1 - q1) + 3
  t3 = _mm_adds_epi8(t1, t3);          // 3 * (q0 - p0) + (p1 - q1) + 4
  SignedShift8b(&t2);                  // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
  SignedShift8b(&t3);                  // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
  *p0 = _mm_adds_epi8(*p0, t2);        // p0 += t2
  *q0 = _mm_subs_epi8(*q0, t3);        // q0 -= t3
  FLIP_SIGN_BIT2(*p0, *q0);

  // this is equivalent to signed (a + 1) >> 1 calculation
  t2 = _mm_add_epi8(t3, sign_bit);
  t3 = _mm_avg_epu8(t2, zero);
  t3 = _mm_sub_epi8(t3, k64);

  t3 = _mm_and_si128(not_hev, t3);     // if !hev
  *q1 = _mm_subs_epi8(*q1, t3);        // q1 -= t3
  *p1 = _mm_adds_epi8(*p1, t3);        // p1 += t3
  FLIP_SIGN_BIT2(*p1, *q1);
}