void vp9_add_constant_residual_32x32_sse2(const int16_t diff, uint8_t *dest,
                                          int stride) {
  uint8_t abs_diff;
  __m128i d;
  int i = 8;

  // Clip |diff| to the [0, 255] range and replicate it across all 16 bytes.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
  }

  do {
    // Prediction data.
    __m128i p0 = _mm_load_si128((const __m128i *)(dest + 0 * stride));
    __m128i p1 = _mm_load_si128((const __m128i *)(dest + 0 * stride + 16));
    __m128i p2 = _mm_load_si128((const __m128i *)(dest + 1 * stride));
    __m128i p3 = _mm_load_si128((const __m128i *)(dest + 1 * stride + 16));
    __m128i p4 = _mm_load_si128((const __m128i *)(dest + 2 * stride));
    __m128i p5 = _mm_load_si128((const __m128i *)(dest + 2 * stride + 16));
    __m128i p6 = _mm_load_si128((const __m128i *)(dest + 3 * stride));
    __m128i p7 = _mm_load_si128((const __m128i *)(dest + 3 * stride + 16));

    // Do saturating addition or subtraction according to the sign of diff.
    if (diff >= 0) {
      p0 = _mm_adds_epu8(p0, d);
      p1 = _mm_adds_epu8(p1, d);
      p2 = _mm_adds_epu8(p2, d);
      p3 = _mm_adds_epu8(p3, d);
      p4 = _mm_adds_epu8(p4, d);
      p5 = _mm_adds_epu8(p5, d);
      p6 = _mm_adds_epu8(p6, d);
      p7 = _mm_adds_epu8(p7, d);
    } else {
      p0 = _mm_subs_epu8(p0, d);
      p1 = _mm_subs_epu8(p1, d);
      p2 = _mm_subs_epu8(p2, d);
      p3 = _mm_subs_epu8(p3, d);
      p4 = _mm_subs_epu8(p4, d);
      p5 = _mm_subs_epu8(p5, d);
      p6 = _mm_subs_epu8(p6, d);
      p7 = _mm_subs_epu8(p7, d);
    }

    // Store results
    _mm_store_si128((__m128i *)(dest + 0 * stride), p0);
    _mm_store_si128((__m128i *)(dest + 0 * stride + 16), p1);
    _mm_store_si128((__m128i *)(dest + 1 * stride), p2);
    _mm_store_si128((__m128i *)(dest + 1 * stride + 16), p3);
    _mm_store_si128((__m128i *)(dest + 2 * stride), p4);
    _mm_store_si128((__m128i *)(dest + 2 * stride + 16), p5);
    _mm_store_si128((__m128i *)(dest + 3 * stride), p6);
    _mm_store_si128((__m128i *)(dest + 3 * stride + 16), p7);

    dest += 4 * stride;
  } while (--i);
}
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}
void S_Interpolate_4x4_IntPel_Mono_Add_Later(unsigned char *current_part_ptr,
                                             int current_part_stride,
                                             unsigned char *ref_part_ptr,
                                             int ref_part_stride) {
  static const unsigned int c_0[4] = { 0, 0, 0, 0 };
  // Use a fixed 32-bit type for the 4-byte row loads; 'unsigned long' is
  // 8 bytes on LP64 targets and would over-read each 4-pixel row.
  uint32_t s_row0_0, s_row1_0, s_row2_0, s_row3_0;
  __m128i v_row0_0, v_row1_0, v_row2_0, v_row3_0;
  __m128i v_Zero = _mm_loadu_si128((__m128i*)c_0);

  // Load one 4-pixel row per register.
  s_row0_0 = *(uint32_t*)(ref_part_ptr + (0 * ref_part_stride));
  s_row1_0 = *(uint32_t*)(ref_part_ptr + (1 * ref_part_stride));
  s_row2_0 = *(uint32_t*)(ref_part_ptr + (2 * ref_part_stride));
  s_row3_0 = *(uint32_t*)(ref_part_ptr + (3 * ref_part_stride));

  v_row0_0 = _mm_cvtsi32_si128(s_row0_0);
  v_row1_0 = _mm_cvtsi32_si128(s_row1_0);
  v_row2_0 = _mm_cvtsi32_si128(s_row2_0);
  v_row3_0 = _mm_cvtsi32_si128(s_row3_0);

  // Widen each 4x8-bit row to 4x16-bit by interleaving with zero.
  v_row0_0 = _mm_unpacklo_epi8(v_row0_0, v_Zero);
  v_row1_0 = _mm_unpacklo_epi8(v_row1_0, v_Zero);
  v_row2_0 = _mm_unpacklo_epi8(v_row2_0, v_Zero);
  v_row3_0 = _mm_unpacklo_epi8(v_row3_0, v_Zero);

  // Store the widened rows (8 bytes per row) for the later residual add.
  _mm_storel_epi64((__m128i*)(current_part_ptr + (0 * current_part_stride)), v_row0_0);
  _mm_storel_epi64((__m128i*)(current_part_ptr + (1 * current_part_stride)), v_row1_0);
  _mm_storel_epi64((__m128i*)(current_part_ptr + (2 * current_part_stride)), v_row2_0);
  _mm_storel_epi64((__m128i*)(current_part_ptr + (3 * current_part_stride)), v_row3_0);
}
inline COLORREF MakeColor2(COLORREF a, COLORREF b, int alpha)
{
#ifdef USE_SSE2
	// (a * alpha + b * (256 - alpha)) / 256 -> ((a - b) * alpha) / 256 + b
	__m128i xmm0, xmm1, xmm2, xmm3;
	COLORREF color;

	xmm0 = _mm_setzero_si128();
	xmm1 = _mm_cvtsi32_si128( a );
	xmm2 = _mm_cvtsi32_si128( b );
	xmm3 = _mm_cvtsi32_si128( alpha );

	xmm1 = _mm_unpacklo_epi8( xmm1, xmm0 );   // a:a:a:a
	xmm2 = _mm_unpacklo_epi8( xmm2, xmm0 );   // b:b:b:b
	xmm3 = _mm_shufflelo_epi16( xmm3, 0 );    // alpha:alpha:alpha:alpha

	xmm1 = _mm_sub_epi16( xmm1, xmm2 );       // (a - b)
	xmm1 = _mm_mullo_epi16( xmm1, xmm3 );     // (a - b) * alpha
	xmm1 = _mm_srli_epi16( xmm1, 8 );         // ((a - b) * alpha) / 256
	xmm1 = _mm_add_epi8( xmm1, xmm2 );        // ((a - b) * alpha) / 256 + b

	xmm1 = _mm_packus_epi16( xmm1, xmm0 );
	color = _mm_cvtsi128_si32( xmm1 );

	return color;
#else
	const int ap = alpha;
	const int bp = 256 - ap;
	BYTE valR = (BYTE)((GetRValue(a) * ap + GetRValue(b) * bp) / 256);
	BYTE valG = (BYTE)((GetGValue(a) * ap + GetGValue(b) * bp) / 256);
	BYTE valB = (BYTE)((GetBValue(a) * ap + GetBValue(b) * bp) / 256);
	return RGB(valR, valG, valB);
#endif
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));

  p0 = _mm_unpacklo_epi64(p0, p1);
  p2 = _mm_unpacklo_epi64(p2, p3);
  p4 = _mm_unpacklo_epi64(p4, p5);
  p6 = _mm_unpacklo_epi64(p6, p7);

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
    p0 = _mm_adds_epu8(p0, d);
    p2 = _mm_adds_epu8(p2, d);
    p4 = _mm_adds_epu8(p4, d);
    p6 = _mm_adds_epu8(p6, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);
    p0 = _mm_subs_epu8(p0, d);
    p2 = _mm_subs_epu8(p2, d);
    p4 = _mm_subs_epu8(p4, d);
    p6 = _mm_subs_epu8(p6, d);
  }

  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
  p0 = _mm_srli_si128(p0, 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
  p2 = _mm_srli_si128(p2, 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
  p4 = _mm_srli_si128(p4, 8);
  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
  p6 = _mm_srli_si128(p6, 8);
  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
                                        __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128(a0);
  const __m128i A1 = _mm_cvtsi32_si128(a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
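// Illustrative scalar companion (not from libwebp) for Average2_uint32 above:
// per byte, the rounded average from _mm_avg_epu8 minus the carry bit
// ((a ^ b) & 1) equals the truncating average (a + b) >> 1. Name is
// illustrative only.
static uint32_t Average2_uint32_scalar(uint32_t a0, uint32_t a1) {
  uint32_t out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    const uint32_t a = (a0 >> (8 * i)) & 0xff;
    const uint32_t b = (a1 >> (8 * i)) & 0xff;
    // ((a + b + 1) >> 1) - ((a ^ b) & 1) == (a + b) >> 1
    out |= (((a + b) >> 1) & 0xff) << (8 * i);
  }
  return out;
}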
void png_read_filter_row_paeth3_sse(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row) {
  png_size_t i;
  png_bytep rp = row;
  png_const_bytep prp = prev_row;
  // Note: the 4-byte loads and stores below touch memory past each 3-byte
  // pixel (and past the end of the row on the last iteration), so the row
  // buffers need a few bytes of readable/writable slack beyond rowbytes.
  __m128i npix = _mm_cvtsi32_si128(*(uint32_t*)rp);
  __m128i ppix = _mm_setzero_si128();    // Same as 'a' in C version.
  __m128i prppix = _mm_setzero_si128();  // Same as 'c' in C version.
  const __m128i zero = _mm_setzero_si128();

  for (i = 0; i < row_info->rowbytes; i += 3, rp += 3, prp += 3) {
    __m128i prpix = _mm_cvtsi32_si128(*(uint32_t*)prp);  // Same as 'b' in C ver.
    __m128i pix, pa, pb, pc, temp;

    prpix = _mm_unpacklo_epi8(prpix, zero);
    temp = _mm_sub_epi16(prpix, prppix);  // p = b - c
    pc = _mm_sub_epi16(ppix, prppix);     // pc = a - c
#ifndef __SSSE3__
    pa = _mm_max_epi16(temp, _mm_sub_epi16(prppix, prpix));
    pb = _mm_max_epi16(pc, _mm_sub_epi16(prppix, ppix));
    temp = _mm_add_epi16(temp, pc);
    pc = _mm_max_epi16(temp, _mm_sub_epi16(zero, temp));
#else
    pa = _mm_abs_epi16(temp);  // pa = abs(p)
    pb = _mm_abs_epi16(pc);    // pb = abs(pc)
    temp = _mm_add_epi16(temp, pc);
    pc = _mm_abs_epi16(temp);  // pc = abs(p + pc)
#endif
    temp = _mm_cmplt_epi16(pb, pa);  // if (pb < pa) pa = pb, a = b
    pa = _mm_andnot_si128(temp, pa);
    pa = _mm_or_si128(pa, _mm_and_si128(temp, pb));
    ppix = _mm_andnot_si128(temp, ppix);
    ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prpix));

    // Pre-load the next pixel before this iteration's store clobbers its
    // first byte.
    pix = npix;
    npix = _mm_cvtsi32_si128(*(uint32_t*)(rp + 3));

    temp = _mm_cmplt_epi16(pc, pa);  // if (pc < pa) a = c
    ppix = _mm_andnot_si128(temp, ppix);
    ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prppix));

    pix = _mm_unpacklo_epi8(pix, zero);
    prppix = prpix;
    ppix = _mm_add_epi16(ppix, pix);
    // Keep only the low byte of each 16-bit lane before packing.
    ppix = _mm_slli_epi16(ppix, 8);
    ppix = _mm_srli_epi16(ppix, 8);
    pix = _mm_packus_epi16(ppix, zero);
    *(uint32_t*)rp = _mm_cvtsi128_si32(pix);
  }
}
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}
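// Hedged scalar sketch of what ClampedAddSubtractFull computes: per byte
// channel, c0 + c1 - c2 clamped to [0, 255] (the clamp is what
// _mm_packus_epi16 performs above). Helper name is illustrative only.
static uint32_t ClampedAddSubtractFull_scalar(uint32_t c0, uint32_t c1,
                                              uint32_t c2) {
  uint32_t out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    int v = (int)((c0 >> (8 * i)) & 0xff) + (int)((c1 >> (8 * i)) & 0xff) -
            (int)((c2 >> (8 * i)) & 0xff);
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= (uint32_t)v << (8 * i);
  }
  return out;
}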
inline __m128i Convert8DigitsSSE2(uint32_t value) {
    assert(value <= 99999999);

    // abcd, efgh = abcdefgh divmod 10000
    const __m128i abcdefgh = _mm_cvtsi32_si128(value);
    const __m128i abcd = _mm_srli_epi64(
        _mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
    const __m128i efgh = _mm_sub_epi32(
        abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

    // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

    // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
    const __m128i v1a = _mm_slli_epi64(v1, 2);

    // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
    const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
    const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

    // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
    const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
    const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

    // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
    const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

    // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
    const __m128i v6 = _mm_slli_epi64(v5, 16);

    // v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
    const __m128i v7 = _mm_sub_epi16(v4, v6);

    return v7;
}
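// Hedged usage sketch (not RapidJSON's own writer): turning the eight 16-bit
// digits returned by Convert8DigitsSSE2 into ASCII characters. Leading zeros
// are kept here; a real writer would typically skip them. The helper name is
// illustrative.
inline void Write8DigitsSSE2(uint32_t value, char* buffer) {
    const __m128i digits = Convert8DigitsSSE2(value);
    // Narrow the 16-bit digits (0..9) to bytes and add '0' to form characters.
    const __m128i ascii =
        _mm_add_epi8(_mm_packus_epi16(digits, _mm_setzero_si128()),
                     _mm_set1_epi8('0'));
    // The most significant digit sits in lane 0, so the store writes the
    // number left-to-right into buffer[0..7].
    _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), ascii);
}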
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i pa;
    GetSumAbsDiff32(&T, &TL, &pa);  // pa = sum |T-TL|
    for (j = 0; j < 4; ++j) {
      const __m128i L_lo = _mm_unpacklo_epi32(L, L);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, L);
      const __m128i pb = _mm_sad_epu8(L_lo, TL_lo);  // pb = sum |L-TL|
      const __m128i mask = _mm_cmpgt_epi32(pb, pa);
      const __m128i A = _mm_and_si128(mask, L);
      const __m128i B = _mm_andnot_si128(mask, T);
      const __m128i pred = _mm_or_si128(A, B);  // pred = (pb > pa) ? L : T
      L = _mm_add_epi8(src, pred);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Shift the pre-computed value for the next iteration.
      T = _mm_srli_si128(T, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
      pa = _mm_srli_si128(pa, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
// Predictor10: average of (average of (L,TL), average of (T, TR)).
static void PredictorAdd10_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i, j;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i TR = _mm_loadu_si128((const __m128i*)&upper[i + 1]);
    __m128i avgTTR;
    Average2_m128i(&T, &TR, &avgTTR);
    for (j = 0; j < 4; ++j) {
      __m128i avgLTL, avg;
      Average2_m128i(&L, &TL, &avgLTL);
      Average2_m128i(&avgTTR, &avgLTL, &avg);
      L = _mm_add_epi8(avg, src);
      out[i + j] = _mm_cvtsi128_si32(L);
      // Rotate the pre-computed values for the next iteration.
      avgTTR = _mm_srli_si128(avgTTR, 4);
      TL = _mm_srli_si128(TL, 4);
      src = _mm_srli_si128(src, 4);
    }
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[10](in + i, upper + i, num_pixels - i, out + i);
  }
}
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value) {
  int iLevel = 0;
  int lOffset = 0;
  int pOffset = 0;
  int32_t cmpmask = 0;
  int32_t eqmask = 0;
  __m128i key = _mm_cvtsi32_si128(value);
  key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0, 0, 0, 0));

  while (iLevel < levels) {
    int f = fanout[iLevel];
    pOffset = lOffset;
    lOffset *= f - 1;

    int iter = 0;
    int position = 0;
    while (iter < f / 4) {
      __m128i delimiters =
          _mm_load_si128((__m128i const*)&tree[iLevel][lOffset + iter * 4]);
      __m128i compare = _mm_cmpgt_epi32(key, delimiters);
      cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
      cmpmask ^= 0x0F;
      if (cmpmask) {
        position = _bit_scan_forward(cmpmask);
        break;
      }
      iter++;
    }
    int offset = lOffset + iter * 4 + position;
    lOffset = offset + pOffset;
    iLevel++;
  }
  return lOffset;
}
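// Scalar sketch (illustrative, not part of the original) of the 4-wide step
// above: after inverting the movemask with ^= 0x0F, _bit_scan_forward yields
// the index of the first delimiter in the block that the key is not greater
// than. Helper name is hypothetical.
static int firstNotGreaterIndex(const int32_t delim[4], int32_t key) {
  int k;
  for (k = 0; k < 4; ++k) {
    if (!(key > delim[k])) return k;  // same predicate as the inverted mask
  }
  return -1;  // key exceeds all four delimiters; the SIMD loop advances iter
}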
static void PredictorAdd12_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  const __m128i L8 = _mm_cvtsi32_si128(out[-1]);
  __m128i L = _mm_unpacklo_epi8(L8, zero);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    // Load 4 pixels at a time.
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    const __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i T_hi = _mm_unpackhi_epi8(T, zero);
    const __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i TL_hi = _mm_unpackhi_epi8(TL, zero);
    __m128i diff_lo = _mm_sub_epi16(T_lo, TL_lo);
    __m128i diff_hi = _mm_sub_epi16(T_hi, TL_hi);
    DO_PRED12(diff_lo, 0, 0);
    DO_PRED12_SHIFT(diff_lo, 0);
    DO_PRED12(diff_lo, 1, 1);
    DO_PRED12_SHIFT(diff_lo, 1);
    DO_PRED12(diff_hi, 0, 2);
    DO_PRED12_SHIFT(diff_hi, 0);
    DO_PRED12(diff_hi, 1, 3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[12](in + i, upper + i, num_pixels - i, out + i);
  }
}
static void PredictorAdd11_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  __m128i pa;
  __m128i L = _mm_cvtsi32_si128(out[-1]);
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    __m128i T = _mm_loadu_si128((const __m128i*)&upper[i]);
    __m128i TL = _mm_loadu_si128((const __m128i*)&upper[i - 1]);
    __m128i src = _mm_loadu_si128((const __m128i*)&in[i]);
    {
      // We can unpack with any value on the upper 32 bits, provided it's the
      // same on both operands (so that their sum of abs diff is zero). Here we
      // use T.
      const __m128i T_lo = _mm_unpacklo_epi32(T, T);
      const __m128i TL_lo = _mm_unpacklo_epi32(TL, T);
      const __m128i T_hi = _mm_unpackhi_epi32(T, T);
      const __m128i TL_hi = _mm_unpackhi_epi32(TL, T);
      const __m128i s_lo = _mm_sad_epu8(T_lo, TL_lo);
      const __m128i s_hi = _mm_sad_epu8(T_hi, TL_hi);
      pa = _mm_packs_epi32(s_lo, s_hi);  // pa = sum |T-TL|
    }
    DO_PRED11(0);
    DO_PRED11_SHIFT;
    DO_PRED11(1);
    DO_PRED11_SHIFT;
    DO_PRED11(2);
    DO_PRED11_SHIFT;
    DO_PRED11(3);
  }
  if (i != num_pixels) {
    VP8LPredictorsAdd_C[11](in + i, upper + i, num_pixels - i, out + i);
  }
}
// 'bpp' is the per-pixel byte count; it is used as a compile-time constant in
// the static_assert, so a template declaration is assumed here. 'packed' is
// zero-initialized so the bytes beyond bpp are well-defined.
template <int bpp>
static __m128i load(const void* p) {
    static_assert(bpp <= 4, "");
    uint32_t packed = 0;
    memcpy(&packed, p, bpp);
    return _mm_cvtsi32_si128(packed);
}
// NOTE: 'type' (kDilate or kErode) and 'direction' (kX or kY) are
// compile-time values in the surrounding file; a template declaration is
// assumed here so the snippet is self-contained.
template<MorphType type, MorphDirection direction>
static void SkMorph_SSE2(const SkPMColor* src, SkPMColor* dst, int radius,
                         int width, int height, int srcStride, int dstStride) {
    const int srcStrideX = direction == kX ? 1 : srcStride;
    const int dstStrideX = direction == kX ? 1 : dstStride;
    const int srcStrideY = direction == kX ? srcStride : 1;
    const int dstStrideY = direction == kX ? dstStride : 1;
    radius = SkMin32(radius, width - 1);
    const SkPMColor* upperSrc = src + radius * srcStrideX;
    for (int x = 0; x < width; ++x) {
        const SkPMColor* lp = src;
        const SkPMColor* up = upperSrc;
        SkPMColor* dptr = dst;
        for (int y = 0; y < height; ++y) {
            __m128i max = type == kDilate ? _mm_setzero_si128()
                                          : _mm_set1_epi32(0xFFFFFFFF);
            for (const SkPMColor* p = lp; p <= up; p += srcStrideX) {
                __m128i src_pixel = _mm_cvtsi32_si128(*p);
                max = type == kDilate ? _mm_max_epu8(src_pixel, max)
                                      : _mm_min_epu8(src_pixel, max);
            }
            *dptr = _mm_cvtsi128_si32(max);
            dptr += dstStrideY;
            lp += srcStrideY;
            up += srcStrideY;
        }
        if (x >= radius) {
            src += srcStrideX;
        }
        if (x + radius < width - 1) {
            upperSrc += srcStrideX;
        }
        dst += dstStrideX;
    }
}
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)),
                   v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                    std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
uint32_t halfsiphash(const unsigned char key[16], const unsigned char *m,
                     size_t len) {
  xmmi k, v02, v20, v13, v11, v33, mi;
  uint32_t last7;
  uint32_t lo, hi;
  size_t i, blocks;

  k = _mm_loadu_si128((xmmi *)(key + 0));
  v02 = siphash_init[0].v;
  v13 = siphash_init[1].v;
  v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));

  last7 = (len & 0xff) << 24;

  for (i = 0, blocks = (len & ~3); i < blocks; i += 4) {
    mi = _mm_loadl_epi64((xmmi *)(m + i));
    v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
    sipcompress()
    sipcompress()
    v02 = _mm_xor_si128(v02, mi);
  }

  switch (len - blocks) {
    case 3: last7 |= (uint32_t)m[i + 2] << 16;
    case 2: last7 |= (uint32_t)m[i + 1] << 8;
    case 1: last7 |= (uint32_t)m[i + 0];
    case 0:
    default:;
  };

  mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128(last7), _mm_cvtsi32_si128(0));
  v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  sipcompress()
  sipcompress()
  v02 = _mm_xor_si128(v02, mi);
  v02 = _mm_xor_si128(v02, siphash_final.v);
  sipcompress()
  sipcompress()
  sipcompress()
  sipcompress()

  v02 = _mm_xor_si128(v02, v13);
  v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1, 0, 3, 2)));
  lo = _mm_cvtsi128_si32(v02);
  return lo;
}
__m128i test_mm_cvtsi32_si128(int A) {
  // DAG-LABEL: test_mm_cvtsi32_si128
  // DAG: insertelement <4 x i32> undef, i32 %{{.*}}, i32 0
  //
  // ASM-LABEL: test_mm_cvtsi32_si128
  // ASM: movd
  return _mm_cvtsi32_si128(A);
}
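// Standalone sketch (not part of the clang test suite) demonstrating the
// semantics under test: _mm_cvtsi32_si128 places the 32-bit value in lane 0
// and zeroes the remaining three lanes.
#include <emmintrin.h>
#include <stdio.h>

int main(void) {
  int out[4];
  _mm_storeu_si128((__m128i *)out, _mm_cvtsi32_si128(0x12345678));
  printf("%08x %08x %08x %08x\n", out[0], out[1], out[2], out[3]);
  // Expected: 12345678 00000000 00000000 00000000
  return 0;
}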
unsigned int vp9_sad3x16_sse2(const unsigned char *src_ptr, int src_stride,
                              const unsigned char *ref_ptr, int ref_stride) {
  int r;
  __m128i s0, s1, s2, s3;
  __m128i r0, r1, r2, r3;
  __m128i sad = _mm_setzero_si128();
  __m128i mask;
  const int offset = (uintptr_t)src_ptr & 3;

  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
   * Here, for offset=1, we adjust src_ptr to be 4-byte aligned. Then, movd
   * takes much less time.
   */
  if (offset == 1)
    src_ptr -= 1;

  /* mask = 0xffffffffffff0000ffffffffffff0000 */
  mask = _mm_cmpeq_epi32(sad, sad);
  mask = _mm_slli_epi64(mask, 16);

  for (r = 0; r < 16; r += 4) {
    s0 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
    s1 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
    s2 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
    s3 = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
    r0 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
    r1 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
    r2 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
    r3 = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));

    s0 = _mm_unpacklo_epi8(s0, s1);
    r0 = _mm_unpacklo_epi8(r0, r1);
    s2 = _mm_unpacklo_epi8(s2, s3);
    r2 = _mm_unpacklo_epi8(r2, r3);
    s0 = _mm_unpacklo_epi64(s0, s2);
    r0 = _mm_unpacklo_epi64(r0, r2);

    // throw out extra byte
    if (offset == 1)
      s0 = _mm_and_si128(s0, mask);
    else
      s0 = _mm_slli_epi64(s0, 16);
    r0 = _mm_slli_epi64(r0, 16);

    sad = _mm_add_epi16(sad, _mm_sad_epu8(s0, r0));

    src_ptr += src_stride * 4;
    ref_ptr += ref_stride * 4;
  }

  sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8));
  return _mm_cvtsi128_si32(sad);
}
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  const uint32_t output = _mm_cvtsi128_si32(A5);
  return output;
}
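// Hedged scalar sketch of ClampedAddSubtractHalf: per byte channel, take the
// truncating average of c0 and c1, move it half-way further from c2, and
// clamp. The (B0 > A0) correction above reproduces C's truncation toward zero
// for the signed halving. Helper name is illustrative only.
static uint32_t ClampedAddSubtractHalf_scalar(uint32_t c0, uint32_t c1,
                                              uint32_t c2) {
  uint32_t out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    const int avg =
        (int)(((c0 >> (8 * i)) & 0xff) + ((c1 >> (8 * i)) & 0xff)) >> 1;
    int v = avg + (avg - (int)((c2 >> (8 * i)) & 0xff)) / 2;  // trunc toward 0
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= (uint32_t)v << (8 * i);
  }
  return out;
}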
static void HE16(uint8_t* dst) {  // horizontal
  int j;
  const __m128i kShuffle3 = _mm_set1_epi8(3);
  for (j = 16; j > 0; --j) {
    const __m128i in = _mm_cvtsi32_si128(*(int*)(dst - 4));
    const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
    _mm_storeu_si128((__m128i*)dst, values);
    dst += BPS;
  }
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}
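// Hedged scalar companion for Average3, assuming the Average2_128i helper
// returns the truncating per-channel average in 16-bit lanes (as in the
// 16-bit average helper earlier in this collection). Name is illustrative.
static uint32_t Average3_scalar(uint32_t a0, uint32_t a1, uint32_t a2) {
  uint32_t out = 0;
  int i;
  for (i = 0; i < 4; ++i) {
    const uint32_t v0 = (a0 >> (8 * i)) & 0xff;
    const uint32_t v1 = (a1 >> (8 * i)) & 0xff;
    const uint32_t v2 = (a2 >> (8 * i)) & 0xff;
    // avg(a1, avg(a0, a2)) with truncating shifts, per channel.
    out |= (((((v0 + v2) >> 1) + v1) >> 1) & 0xff) << (8 * i);
  }
  return out;
}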
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
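// Hedged scalar sketch of Select_SSE2: summed over the four byte channels,
// return 'a' when |b - c| is no larger than |a - c|, else 'b'. Requires
// <stdlib.h> for abs(); helper name is illustrative only.
static uint32_t Select_scalar(uint32_t a, uint32_t b, uint32_t c) {
  int diff_sum = 0;  // sum of (|b - c| - |a - c|), as in 'pa_minus_pb' above
  int i;
  for (i = 0; i < 4; ++i) {
    const int va = (int)((a >> (8 * i)) & 0xff);
    const int vb = (int)((b >> (8 * i)) & 0xff);
    const int vc = (int)((c >> (8 * i)) & 0xff);
    diff_sum += abs(vb - vc) - abs(va - vc);
  }
  return (diff_sum <= 0) ? a : b;
}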
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
								const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );

	const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
	const __m128 vector_fade_color = _mm_load_ps( fadeColor );
	const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );
	for ( int i = 0; i < decal->numVerts; i++ ) {
		const idDrawVert &srcVert = decal->verts[i];
		idDrawVert &dstVert = verts[numVerts + i];

		__m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 0 ) );
		__m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );

		__m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );
		__m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
		__m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
		__m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
		__m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
		v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 0 ), v0 );
		_mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
	}

	// copy indexes
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int i = 0; i < decal->numIndexes; i += 8 ) {
		__m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );
		vi = _mm_add_epi16( vi, vector_short_num_verts );
		_mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
	}

	_mm_sfence();
}
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}