// input: 8 bytes ABCDEFGH -> output: A0E0B0F0C0G0D0H0
static void LoadTwoPixels_SSE2(const uint8_t* const src, __m128i* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
  const __m128i B = _mm_unpacklo_epi8(A, zero);              // A0B0C0D0E0F0G0H0
  const __m128i C = _mm_srli_si128(B, 8);                    // E0F0G0H0
  *out = _mm_unpacklo_epi16(B, C);
}
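/* For reference, a minimal scalar sketch (an illustration, not part of the
 * original source) of the same ABCDEFGH -> A0E0B0F0C0G0D0H0 interleave:
 * each byte is widened to 16 bits and the two 4-byte halves are interleaved. */
static void LoadTwoPixels_C_sketch(const uint8_t* const src, uint16_t out[8]) {
  int i;
  for (i = 0; i < 4; ++i) {
    out[2 * i + 0] = src[i];      // A, B, C, D zero-extended to 16 bits
    out[2 * i + 1] = src[i + 4];  // E, F, G, H zero-extended to 16 bits
  }
}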
template<> void
copyMask_<ushort>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep,
                  uchar* _dst, size_t dstep, Size size)
{
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const ushort* src = (const ushort*)_src;
        ushort* dst = (ushort*)_dst;
        int x = 0;
#if CV_SSE4_2
        if(USE_SSE4_2)
        {
            __m128i zero = _mm_setzero_si128();

            for( ; x <= size.width - 8; x += 8 )
            {
                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src + x));
                __m128i _mask = _mm_loadl_epi64((const __m128i*)(mask + x));
                _mask = _mm_unpacklo_epi8(_mask, _mask);
                __m128i rDst = _mm_lddqu_si128((const __m128i*)(dst + x));
                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
                _mm_storeu_si128((__m128i*)(dst + x), rDst);
            }
        }
#endif
        for( ; x < size.width; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }
}
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
    __m128i t0, t1, t2;
    Uint32 i;

    for (i = 0; i < (size / 8); i++) {
        t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
        t0 = _mm_unpacklo_epi16(t0, t0);

        t1 = _mm_unpacklo_epi16(t0, t0);
        t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
        t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
        t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t2 = _mm_unpackhi_epi16(t0, t0);
        t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
        t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
        t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));

        t1 = _mm_packus_epi16(t1, t2);
        _mm_stream_si128((__m128i*)&dest[i * 16], t1);
    }
}
int operator() (const uchar * ptr, int len, int & x0, int & x1, int & x2, int & x3)
{
    int x = 0;

    if( useSIMD )
    {
        __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
        __m128i dx = _mm_set1_epi16(8);
        __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

        for( ; x <= len - 8; x += 8 )
        {
            __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
            __m128i sx = _mm_mullo_epi16(qx, qx);

            qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
            qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
            qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
            qx3 = _mm_add_epi32(qx3, _mm_madd_epi16( _mm_mullo_epi16(p, qx), sx));

            qx = _mm_add_epi16(qx, dx);
        }

        // buf is a scratch int[4] provided by the enclosing functor; it must be
        // 16-byte aligned because _mm_store_si128 is used below.
        _mm_store_si128((__m128i*)buf, qx0);
        x0 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx1);
        x1 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx2);
        x2 = buf[0] + buf[1] + buf[2] + buf[3];
        _mm_store_si128((__m128i*)buf, qx3);
        x3 = buf[0] + buf[1] + buf[2] + buf[3];
    }

    return x;
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int i;
  if (length <= 0) return;
  if (inverse) {
    const int max_pos = length & ~7;
    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (i = 0; i < max_pos; i += 8) {
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
      const __m128i A1 = _mm_add_epi8(A0, last);
      const __m128i A2 = _mm_slli_si128(A1, 1);
      const __m128i A3 = _mm_add_epi8(A1, A2);
      const __m128i A4 = _mm_slli_si128(A3, 2);
      const __m128i A5 = _mm_add_epi8(A3, A4);
      const __m128i A6 = _mm_slli_si128(A5, 4);
      const __m128i A7 = _mm_add_epi8(A5, A6);
      _mm_storel_epi64((__m128i*)(dst + i), A7);
      last = _mm_srli_epi64(A7, 56);
    }
    for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
  } else {
    const int max_pos = length & ~31;
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i +  0    ));
      const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i +  0 - 1));
      const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16    ));
      const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
      const __m128i C0 = _mm_sub_epi8(A0, B0);
      const __m128i C1 = _mm_sub_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)(dst + i +  0), C0);
      _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
    }
    for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
  }
}
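/* A minimal scalar sketch (not from the original source) of what
 * PredictLineLeft computes in each direction: the inverse path is a running
 * prefix sum modulo 256 (which the SIMD code builds with the shift/add
 * ladder), the forward path is a simple left delta. Like the SIMD version,
 * it assumes dst[-1] / src[-1] are readable. */
static void PredictLineLeft_C_sketch(const uint8_t* src, uint8_t* dst,
                                     int length, int inverse) {
  int i;
  if (inverse) {
    for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] + dst[i - 1]);
  } else {
    for (i = 0; i < length; ++i) dst[i] = (uint8_t)(src[i] - src[i - 1]);
  }
}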
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride) {
    uint8_t *dst = (uint8_t*)_dst;
    ptrdiff_t stride = _stride;
    int shift = 5;
    int offset = 16;
    __m128i r0, r1, r2, r3, r4, r5, r6, r9;

    r9 = _mm_setzero_si128();
    r2 = _mm_set1_epi16(offset);
    r0 = _mm_load_si128((__m128i*)(coeffs));
    r1 = _mm_load_si128((__m128i*)(coeffs + 8));
    r0 = _mm_adds_epi16(r0, r2);
    r1 = _mm_adds_epi16(r1, r2);
    r0 = _mm_srai_epi16(r0, shift);
    r1 = _mm_srai_epi16(r1, shift);
    r3 = _mm_loadl_epi64((__m128i*)(dst));
    r4 = _mm_loadl_epi64((__m128i*)(dst + stride));
    r5 = _mm_loadl_epi64((__m128i*)(dst + 2 * stride));
    r6 = _mm_loadl_epi64((__m128i*)(dst + 3 * stride));
    r3 = _mm_unpacklo_epi8(r3, r9);
    r4 = _mm_unpacklo_epi8(r4, r9);
    r5 = _mm_unpacklo_epi8(r5, r9);
    r6 = _mm_unpacklo_epi8(r6, r9);
    r3 = _mm_unpacklo_epi64(r3, r4);
    r4 = _mm_unpacklo_epi64(r5, r6);
    r3 = _mm_adds_epi16(r3, r0);
    r4 = _mm_adds_epi16(r4, r1);
    r3 = _mm_packus_epi16(r3, r4);
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(r3);
    dst += stride;
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 4));
    dst += stride;
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 8));
    dst += stride;
    *((uint32_t *)(dst)) = _mm_cvtsi128_si32(_mm_srli_si128(r3, 12));
}
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));

  p0 = _mm_unpacklo_epi64(p0, p1);
  p2 = _mm_unpacklo_epi64(p2, p3);
  p4 = _mm_unpacklo_epi64(p4, p5);
  p6 = _mm_unpacklo_epi64(p6, p7);

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
    p0 = _mm_adds_epu8(p0, d);
    p2 = _mm_adds_epu8(p2, d);
    p4 = _mm_adds_epu8(p4, d);
    p6 = _mm_adds_epu8(p6, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
    p0 = _mm_subs_epu8(p0, d);
    p2 = _mm_subs_epu8(p2, d);
    p4 = _mm_subs_epu8(p4, d);
    p6 = _mm_subs_epu8(p6, d);
  }

  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
  p0 = _mm_srli_si128(p0, 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
  p2 = _mm_srli_si128(p2, 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
  p4 = _mm_srli_si128(p4, 8);
  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
  p6 = _mm_srli_si128(p6, 8);
  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
unsigned int vp9_avg_4x4_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 4));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 8) >> 4;
}
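/* Scalar reference (an illustrative sketch, not copied from libvpx): the SSE2
 * version above computes the rounded mean of the 16 pixels in a 4x4 block
 * with pitch p. */
static unsigned int avg_4x4_c_sketch(const uint8_t *s, int p) {
  unsigned int sum = 0;
  int r, c;
  for (r = 0; r < 4; ++r)
    for (c = 0; c < 4; ++c)
      sum += s[r * p + c];
  return (sum + 8) >> 4;  // round to nearest
}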
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
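/* Scalar sketch of what the routine above computes (an illustration, not the
 * library's own C path): each of the 8 output bytes is one source row
 * filtered by the 8-tap kernel, with the mulhrs-style rounding approximated
 * here as (sum + 64) >> 7 and a clamp to [0, 255]. */
static void filter_horiz_w8_c_sketch(const uint8_t *src_x, ptrdiff_t src_pitch,
                                     uint8_t *dst, const int16_t *x_filter) {
  int r, k;
  for (r = 0; r < 8; ++r) {
    int sum = 0;
    for (k = 0; k < 8; ++k) sum += src_x[r * src_pitch + k] * x_filter[k];
    sum = (sum + 64) >> 7;
    dst[r] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
  }
}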
static FORCE_INLINE void FlowInterSimple_double_8px_AVX2(
        int w, PixelType *pdst,
        const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets); /// maybe do it another way

    __m256i dstF = lookup_double_AVX2(VXFullF, VYFullF, prefF, w, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_double_AVX2(VXFullB, VYFullB, prefB, w, dwords_ref_pitch, dwords_w);

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    __m256i dstF_dstB = _mm256_add_epi32(dstF, dstB);
    dstF_dstB = _mm256_slli_epi32(dstF_dstB, 8);

    __m256i dst;
    if (sizeof(PixelType) == 1) {
        __m256i dstB_dstF = _mm256_sub_epi16(dstB, dstF);
        __m256i maskf_maskb = _mm256_sub_epi16(maskf, maskb);
        dst = _mm256_madd_epi16(dstB_dstF, maskf_maskb);
    } else {
        __m256i dstB_dstF = _mm256_sub_epi32(dstB, dstF);
        __m256i maskf_maskb = _mm256_sub_epi32(maskf, maskb);
        dst = _mm256_mullo_epi32(dstB_dstF, maskf_maskb);
    }

    dst = _mm256_add_epi32(dst, dstF_dstB);
    dst = _mm256_srai_epi32(dst, 9);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword

    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i twos = _mm_set1_epi8(2);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storel_epi64(pred_buf_m128i, sum);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeu_si128(pred_buf_m128i, sum);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        __m128i bot_1 = _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        top_1 = _mm_maddubs_epi16(top_1, twos);
        bot_1 = _mm_maddubs_epi16(bot_1, twos);
        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
      }
    }
    input += luma_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
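/* Scalar sketch of the subsampling arithmetic above (an illustration under the
 * same CFL_BUF_LINE assumptions, not libaom's C fallback verbatim): each
 * output is the 2x2 box sum of luma samples shifted left by 1, i.e. the Q3
 * representation of the 4:2:0 average. */
static void cfl_luma_subsampling_420_lbd_c_sketch(const uint8_t *input,
                                                  int input_stride,
                                                  uint16_t *pred_buf_q3,
                                                  int width, int height) {
  int i, j;
  for (j = 0; j < height; j += 2) {
    for (i = 0; i < width; i += 2) {
      const int bot = i + input_stride;
      pred_buf_q3[i >> 1] = (uint16_t)(
          (input[i] + input[i + 1] + input[bot] + input[bot + 1]) << 1);
    }
    input += input_stride << 1;
    pred_buf_q3 += CFL_BUF_LINE;
  }
}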
inline __m128i load_aligned_int32(const uint16_t* src)
{
    __m128i tmp = _mm_loadl_epi64((const __m128i*)src);
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
    __m128i res = _mm_cvtepu16_epi32(tmp);
#else
    __m128i res = _mm_unpacklo_epi16(tmp, _mm_set1_epi16(0));
#endif
    return res;
}
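/* Hypothetical usage sketch (not part of xsimd, assumes the same intrinsics
 * headers as above): widen four uint16_t values to 32-bit lanes and store
 * them out; each lane is zero-extended. */
static void widen_u16_example()
{
    const uint16_t in[4] = { 1, 2, 300, 65535 };
    int32_t out[4];
    _mm_storeu_si128((__m128i*)out, load_aligned_int32(in));
    // out is now { 1, 2, 300, 65535 }
}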
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);

  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // extend to 16bit
  p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
  p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

  // convert floating point weights to 16bit integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);               // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, _mm_setzero_si128()); // 32->16bit

  // prepare the weights
  __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
  __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
  w12 = _mm_unpacklo_epi16(w12, w12); // w2 w2 w2 w2 w1 w1 w1 w1
  w34 = _mm_unpacklo_epi16(w34, w34); // w4 w4 w4 w4 w3 w3 w3 w3

  // multiply each pixel with its weight (2 pixel per SSE mul)
  __m128i L12 = _mm_mullo_epi16(p12, w12);
  __m128i L34 = _mm_mullo_epi16(p34, w34);

  // sum the results
  __m128i L1234 = _mm_add_epi16(L12, L34);
  __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i L = _mm_add_epi16(L1234, Lhi);

  // convert back to 8bit
  __m128i L8 = _mm_srli_epi16(L, 8); // divide by 256
  L8 = _mm_packus_epi16(L8, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(L8);
}
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // convert RGBA RGBA RGBA RGBA to RRRR GGGG BBBB AAAA (AoS to SoA)
  __m128i p1234 = _mm_unpacklo_epi8(p12, p34);
  __m128i p34xx = _mm_unpackhi_epi64(p1234, _mm_setzero_si128());
  __m128i p1234_8bit = _mm_unpacklo_epi8(p1234, p34xx);

  // extend to 16bit
  __m128i pRG = _mm_unpacklo_epi8(p1234_8bit, _mm_setzero_si128());
  __m128i pBA = _mm_unpackhi_epi8(p1234_8bit, _mm_setzero_si128());

  // convert weights to integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);   // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, weighti); // 32->2x16bit

  //outRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
  __m128i outRG = _mm_madd_epi16(pRG, weighti);
  //outBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
  __m128i outBA = _mm_madd_epi16(pBA, weighti);

  // horizontal add that will produce the output values (in 32bit)
  __m128i out = _mm_hadd_epi32(outRG, outBA);
  out = _mm_srli_epi32(out, 8); // divide by 256

  // convert 32bit->8bit
  out = _mm_packus_epi32(out, _mm_setzero_si128());
  out = _mm_packus_epi16(out, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(out);
}
// Copy num_bytes bytes (the name suggests a multiple of 8) from pu1_src to
// pu1_dst, eight bytes at a time.
void ihevc_memcpy_mul_8_ssse3(UWORD8 *pu1_dst, UWORD8 *pu1_src, UWORD32 num_bytes)
{
    int col;

    for(col = num_bytes; col >= 8; col -= 8)
    {
        __m128i src_temp16x8b;
        src_temp16x8b = _mm_loadl_epi64((__m128i *)(pu1_src));
        pu1_src += 8;
        _mm_storel_epi64((__m128i *)(pu1_dst), src_temp16x8b);
        pu1_dst += 8;
    }
}
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
// Average four newly filtered 16-bit pixels with the existing dst values,
// rounding up, and store the result.
static void write4pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  _mm_storel_epi64((__m128i *)dst, v);
}
// Same as write4pixelsAccum(), but only the low two 16-bit pixels are written.
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}
// Load the 64-bit IV into the state, encrypt it under the key, copy the
// result to stateR, then XOR the constant 5 into the low byte of stateS.
static inline void jambu_initialization(__m128i *key, const unsigned char *iv,
                                        __m128i *stateS, __m128i *stateR)
{
    __m128i c5 = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 5);

    *stateS = _mm_loadl_epi64((__m128i*)iv);
    aes_enc_128(stateS, key);
    *stateR = *stateS;
    *stateS = _mm_xor_si128(*stateS, c5);
    return;
}
void dotmul_intrinsic(unsigned short A[], unsigned short B[], unsigned int &C, int SIZE)
{
    register int k;
    short sarr[4];
    register __m128i partial_sum = _mm_setzero_si128();
    register __m128i catch_multiplication = _mm_setzero_si128();

    for(k = 0; k < SIZE; k += 4)
    {
        // load 64 bit integer data (4 x unsigned short)
        register __m128i a = _mm_loadl_epi64((__m128i*)&A[k]);
        register __m128i b = _mm_loadl_epi64((__m128i*)&B[k]);

        catch_multiplication = _mm_mullo_epi16(a, b);
        partial_sum = _mm_add_epi16(partial_sum, catch_multiplication);
    }

    _mm_storel_epi64((__m128i*) sarr, partial_sum);
    C = sarr[0] + sarr[1] + sarr[2] + sarr[3];
}
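/* Scalar reference for comparison (a sketch, not from the original source).
 * Note that the SIMD version above accumulates in 16-bit lanes and assumes
 * SIZE is a multiple of 4, so it matches this full-precision reference only
 * while each lane's partial sum stays within 16 bits. */
void dotmul_scalar(unsigned short A[], unsigned short B[], unsigned int &C, int SIZE)
{
    unsigned int sum = 0;
    for (int k = 0; k < SIZE; ++k)
        sum += (unsigned int)A[k] * B[k];
    C = sum;
}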
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(_mm_loadl_epi64(
        (const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}
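/* Scalar sketch of the two outputs above (an illustration, not libvpx's C
 * code verbatim): *sum is the signed sum of (src - ref) over the 8x8 block
 * and *sse is the sum of squared differences; the return value is unused. */
static unsigned int get8x8var_c_sketch(const uint8_t *src, int src_stride,
                                       const uint8_t *ref, int ref_stride,
                                       unsigned int *sse, int *sum) {
  int s = 0;
  unsigned int ss = 0;
  int i, j;
  for (i = 0; i < 8; ++i)
    for (j = 0; j < 8; ++j) {
      const int d = src[i * src_stride + j] - ref[i * ref_stride + j];
      s += d;
      ss += d * d;
    }
  *sum = s;
  *sse = ss;
  return 0;
}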
void vpx_comp_avg_pred_sse2(uint8_t *comp_pred, const uint8_t *pred, int width,
                            int height, const uint8_t *ref, int ref_stride) {
  /* comp_pred and pred must be 16 byte aligned. */
  assert(((intptr_t)comp_pred & 0xf) == 0);
  assert(((intptr_t)pred & 0xf) == 0);
  if (width > 8) {
    int x, y;
    for (y = 0; y < height; ++y) {
      for (x = 0; x < width; x += 16) {
        const __m128i p = _mm_load_si128((const __m128i *)(pred + x));
        const __m128i r = _mm_loadu_si128((const __m128i *)(ref + x));
        const __m128i avg = _mm_avg_epu8(p, r);
        _mm_store_si128((__m128i *)(comp_pred + x), avg);
      }
      comp_pred += width;
      pred += width;
      ref += ref_stride;
    }
  } else {  // width must be 4 or 8.
    int i;
    // Process 16 elements at a time. comp_pred and pred have width == stride
    // and therefore live in contiguous memory. 4*4, 4*8, 8*4, 8*8, and 8*16
    // are all divisible by 16 so just ref needs to be massaged when loading.
    for (i = 0; i < width * height; i += 16) {
      const __m128i p = _mm_load_si128((const __m128i *)pred);
      __m128i r;
      __m128i avg;
      if (width == ref_stride) {
        r = _mm_loadu_si128((const __m128i *)ref);
        ref += 16;
      } else if (width == 4) {
        r = _mm_set_epi32(loadu_uint32(ref + 3 * ref_stride),
                          loadu_uint32(ref + 2 * ref_stride),
                          loadu_uint32(ref + ref_stride), loadu_uint32(ref));
        ref += 4 * ref_stride;
      } else {
        const __m128i r_0 = _mm_loadl_epi64((const __m128i *)ref);
        assert(width == 8);
        r = _mm_castps_si128(_mm_loadh_pi(_mm_castsi128_ps(r_0),
                                          (const __m64 *)(ref + ref_stride)));
        ref += 2 * ref_stride;
      }
      avg = _mm_avg_epu8(p, r);
      _mm_store_si128((__m128i *)comp_pred, avg);

      pred += 16;
      comp_pred += 16;
    }
  }
}
inline __m128i load_aligned_int32(const int8_t* src)
{
    __m128i tmp = _mm_loadl_epi64((const __m128i*)src);
#if XSIMD_X86_INSTR_SET >= XSIMD_X86_SSE4_1_VERSION
    __m128i res = _mm_cvtepi8_epi32(tmp);
#else
    __m128i mask = _mm_cmplt_epi8(tmp, _mm_set1_epi8(0));
    __m128i tmp1 = _mm_unpacklo_epi8(tmp, mask);
    mask = _mm_cmplt_epi16(tmp1, _mm_set1_epi16(0));
    __m128i res = _mm_unpacklo_epi16(tmp1, mask);
#endif
    return res;
}
void LOADERDECL TexCoord_ReadIndex_Float2_SSSE3()
{
	static_assert(!std::numeric_limits<I>::is_signed, "Only unsigned I is sane!");

	auto const index = DataRead<I>();
	const u32 *pData = (const u32 *)(cached_arraybases[ARRAY_TEXCOORD0 + tcIndex] +
	                                 (index * g_main_cp_state.array_strides[ARRAY_TEXCOORD0 + tcIndex]));
	GC_ALIGNED128(const __m128i a = _mm_loadl_epi64((__m128i*)pData));
	GC_ALIGNED128(const __m128i b = _mm_shuffle_epi8(a, kMaskSwap32));
	_mm_storel_epi64((__m128i*)VertexManager::s_pCurBufferPointer, b);
	VertexManager::s_pCurBufferPointer += sizeof(float) * 2;
	LOG_TEX<2>();
	tcIndex++;
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static void GradientPredictInverse(const uint8_t* const in,
                                   const uint8_t* const top,
                                   uint8_t* const row, int length) {
  if (length > 0) {
    int i;
    const int max_pos = length & ~7;
    const __m128i zero = _mm_setzero_si128();
    __m128i A = _mm_set_epi32(0, 0, 0, row[-1]);   // left sample
    for (i = 0; i < max_pos; i += 8) {
      const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
      const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
      const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
      const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
      const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
      const __m128i D = _mm_unpacklo_epi8(tmp2, zero);   // base input
      const __m128i E = _mm_sub_epi16(B, C);  // unclipped gradient basis B - C
      __m128i out = zero;                     // accumulator for output
      __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
      int k = 8;
      while (1) {
        const __m128i tmp3 = _mm_add_epi16(A, E);        // delta = A + B - C
        const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
        const __m128i tmp5 = _mm_max_epi16(tmp4, zero);  // clipped delta
        const __m128i tmp6 = _mm_add_epi16(tmp5, D);     // add to in[] values
        A = _mm_and_si128(tmp6, mask_hi);                // 1-complement clip
        out = _mm_or_si128(out, A);                      // accumulate output
        if (--k == 0) break;
        A = _mm_slli_si128(A, 2);                        // rotate left sample
        mask_hi = _mm_slli_si128(mask_hi, 2);            // rotate mask
      }
      A = _mm_srli_si128(A, 14);     // prepare left sample for next iteration
      _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
    }
    for (; i < length; ++i) {
      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
    }
  }
}
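/* For context, a scalar sketch of the gradient predictor used in the tail
 * loop above. GradientPredictorC itself is defined elsewhere in the source;
 * the definition below is the usual a + b - c clipped to 8 bits and is stated
 * here as an assumption, not a copy of that code. */
static uint8_t GradientPredictor_sketch(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;
  return ((g & ~0xff) == 0) ? (uint8_t)g : (g < 0) ? 0 : 255;  // clip to [0, 255]
}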