/*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
{
    // Compares the CLUT buffer selected by csa with the (swizzled) GS local
    // memory at GSmem.  Returns true at the first 64-byte group that differs
    // and false when every compared group matches.
    // Each pass compares 64 bytes and subtracts 64 from clutsize, so the loop
    // only ends normally when clutsize is a multiple of 64.
    u64* gs_qw   = (u64*)GSmem;
    u64* clut_qw = (u64*)GetClutBufferAddress<u32>(csa);

    for (; clutsize > 0; clutsize -= 64, clut_qw += 8) {
#ifdef ZEROGS_SSE2
        // Local memory is swizzled; the trailing numbers are the original
        // author's annotation of which entries each vector holds.
        const __m128i* gs_vec   = (const __m128i*)gs_qw;
        const __m128i* clut_vec = (const __m128i*)clut_qw;

        const __m128i gs0 = _mm_load_si128(gs_vec + 0); //  9  8 1 0
        const __m128i gs1 = _mm_load_si128(gs_vec + 1); // 11 10 3 2
        const __m128i gs2 = _mm_load_si128(gs_vec + 2); // 13 12 5 4
        const __m128i gs3 = _mm_load_si128(gs_vec + 3); // 15 14 7 6

        // De-swizzle with 64-bit unpacks and compare against the linear CLUT.
        const __m128i eq0 = _mm_cmpeq_epi32(_mm_unpacklo_epi64(gs0, gs1), _mm_load_si128(clut_vec + 0));
        const __m128i eq1 = _mm_cmpeq_epi32(_mm_unpacklo_epi64(gs2, gs3), _mm_load_si128(clut_vec + 1));
        const __m128i eq2 = _mm_cmpeq_epi32(_mm_unpackhi_epi64(gs0, gs1), _mm_load_si128(clut_vec + 2));
        const __m128i eq3 = _mm_cmpeq_epi32(_mm_unpackhi_epi64(gs2, gs3), _mm_load_si128(clut_vec + 3));

        const __m128i all_eq = _mm_and_si128(_mm_and_si128(eq0, eq1), _mm_and_si128(eq2, eq3));

        // Every byte of the mask must be set for the group to be identical.
        if (_mm_movemask_epi8(all_eq) != 0xFFFF)
            return true;
#else
        // Slow portable reference version (kept for debugging).
        // kSwizzle[k] is the 64-bit slot of local memory that holds CLUT slot k.
        static const int kSwizzle[8] = { 0, 2, 4, 6, 1, 3, 5, 7 };
        for (int k = 0; k < 8; ++k) {
            if (clut_qw[k] != gs_qw[kSwizzle[k]])
                return true;
        }
#endif

        // Advance to the next memory block ...
        int advance = 32;
        // ... or go back to the previous block and down one memory column.
        if (clutsize & 0x40)
            advance -= (64 - 8);
        // Going down one column may cross the block boundary: skip a block.
        if (clutsize == 0x240)
            advance += 32;
        gs_qw += advance;
    }

    return false;
}
// In-place shuffle of four 128-bit rows:
//   1. gather the low / high 64-bit halves of (B0,B1) and (B2,B3),
//   2. apply the (3,1,2,0) permutation to the 32-bit lanes and then to the
//      16-bit lanes of each half of every gathered vector,
//   3. interleave the results as 32-bit lanes back into B0..B3.
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
{
    __m128i row[4];
    row[0] = _mm_unpacklo_epi64(B0, B1);
    row[1] = _mm_unpacklo_epi64(B2, B3);
    row[2] = _mm_unpackhi_epi64(B0, B1);
    row[3] = _mm_unpackhi_epi64(B2, B3);

    // The three permutations act on each vector independently, so they can be
    // applied vector by vector.
    for (int i = 0; i < 4; ++i) {
        __m128i v = row[i];
        v = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 1, 2, 0));
        v = _mm_shufflehi_epi16(v, _MM_SHUFFLE(3, 1, 2, 0));
        v = _mm_shufflelo_epi16(v, _MM_SHUFFLE(3, 1, 2, 0));
        row[i] = v;
    }

    B0 = _mm_unpacklo_epi32(row[0], row[1]);
    B1 = _mm_unpackhi_epi32(row[0], row[1]);
    B2 = _mm_unpacklo_epi32(row[2], row[3]);
    B3 = _mm_unpackhi_epi32(row[2], row[3]);
}
// Copies 64 vectors (1024 bytes) from GS local memory at vm into the CLUT
// buffer selected by csa, de-swizzling on the fly: every group of four source
// vectors is rewritten as lo64(0,1), lo64(2,3), hi64(0,1), hi64(2,3).
// Both buffers are accessed with aligned loads/stores.
__forceinline void GSMem_to_ClutBuffer__T32_I8_CSM1_sse2(u32* vm, u32 csa)
{
    const __m128i* gs_vec   = (const __m128i*)vm;
    __m128i*       clut_vec = (__m128i*)GetClutBufferAddress<u32>(csa);

    // Two passes of 32 vectors each.
    for (int pass = 0; pass < 2; ++pass) {
        for (int i = 0; i < 16; i += 4) {
            // Each step handles the group at i and its partner 16 vectors on;
            // their outputs land next to each other in the CLUT buffer.
            for (int half = 0; half < 2; ++half) {
                const __m128i* in  = gs_vec + i + half * 16;
                __m128i*       out = clut_vec + i * 2 + half * 4;

                const __m128i a = _mm_load_si128(in + 0);
                const __m128i b = _mm_load_si128(in + 1);
                const __m128i c = _mm_load_si128(in + 2);
                const __m128i d = _mm_load_si128(in + 3);

                _mm_store_si128(out + 0, _mm_unpacklo_epi64(a, b));
                _mm_store_si128(out + 1, _mm_unpacklo_epi64(c, d));
                _mm_store_si128(out + 2, _mm_unpackhi_epi64(a, b));
                _mm_store_si128(out + 3, _mm_unpackhi_epi64(c, d));
            }
        }
        gs_vec   += 32;
        clut_vec += 32;
    }
}
// In-place transpose of an 8x8 matrix of 16-bit values held in x0..x7
// (one row per register), done as three rounds of interleaves:
// 16-bit, then 32-bit, then 64-bit.
inline FORCE_INLINE void transpose8_epi16(__m128i &x0, __m128i &x1, __m128i &x2, __m128i &x3,
                                          __m128i &x4, __m128i &x5, __m128i &x6, __m128i &x7)
{
    // Round 1: interleave 16-bit lanes of neighbouring rows.
    const __m128i w01_lo = _mm_unpacklo_epi16(x0, x1);
    const __m128i w23_lo = _mm_unpacklo_epi16(x2, x3);
    const __m128i w45_lo = _mm_unpacklo_epi16(x4, x5);
    const __m128i w67_lo = _mm_unpacklo_epi16(x6, x7);
    const __m128i w01_hi = _mm_unpackhi_epi16(x0, x1);
    const __m128i w23_hi = _mm_unpackhi_epi16(x2, x3);
    const __m128i w45_hi = _mm_unpackhi_epi16(x4, x5);
    const __m128i w67_hi = _mm_unpackhi_epi16(x6, x7);

    // Round 2: interleave 32-bit lanes, merging rows 0-3 and rows 4-7.
    const __m128i d0123_a = _mm_unpacklo_epi32(w01_lo, w23_lo);
    const __m128i d0123_b = _mm_unpackhi_epi32(w01_lo, w23_lo);
    const __m128i d4567_a = _mm_unpacklo_epi32(w45_lo, w67_lo);
    const __m128i d4567_b = _mm_unpackhi_epi32(w45_lo, w67_lo);
    const __m128i d0123_c = _mm_unpacklo_epi32(w01_hi, w23_hi);
    const __m128i d0123_d = _mm_unpackhi_epi32(w01_hi, w23_hi);
    const __m128i d4567_c = _mm_unpacklo_epi32(w45_hi, w67_hi);
    const __m128i d4567_d = _mm_unpackhi_epi32(w45_hi, w67_hi);

    // Round 3: join the 64-bit halves into the final columns.
    x0 = _mm_unpacklo_epi64(d0123_a, d4567_a);
    x1 = _mm_unpackhi_epi64(d0123_a, d4567_a);
    x2 = _mm_unpacklo_epi64(d0123_b, d4567_b);
    x3 = _mm_unpackhi_epi64(d0123_b, d4567_b);
    x4 = _mm_unpacklo_epi64(d0123_c, d4567_c);
    x5 = _mm_unpackhi_epi64(d0123_c, d4567_c);
    x6 = _mm_unpacklo_epi64(d0123_d, d4567_d);
    x7 = _mm_unpackhi_epi64(d0123_d, d4567_d);
}
/* DC-only forward transform of a 4x4 block.
 *
 * Sums the 16 input samples (four rows of four int16 values, `stride`
 * elements apart), doubles the sum and passes the resulting vector to
 * store_output(); the doubled sum sits in the lowest 32-bit lane. */
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  const __m128i zero = _mm_setzero_si128();

  /* One 64-bit load per row (4 samples each). */
  const __m128i row0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  const __m128i row1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  const __m128i row2 = _mm_loadl_epi64((const __m128i *)(input + 2 * stride));
  const __m128i row3 = _mm_loadl_epi64((const __m128i *)(input + 3 * stride));

  /* Pack two rows per register and add them as 16-bit lanes. */
  const __m128i rows03 = _mm_unpacklo_epi64(row0, row3);
  const __m128i rows12 = _mm_unpacklo_epi64(row1, row2);
  const __m128i sum16 = _mm_add_epi16(rows03, rows12);

  /* Sign-extend to 32 bits: place each value in the upper half of a 32-bit
   * lane, then shift it back down arithmetically. */
  const __m128i lo32 = _mm_srai_epi32(_mm_unpacklo_epi16(zero, sum16), 16);
  const __m128i hi32 = _mm_srai_epi32(_mm_unpackhi_epi16(zero, sum16), 16);
  const __m128i sum32 = _mm_add_epi32(lo32, hi32);

  /* Horizontal reduction of the four 32-bit partial sums into lane 0. */
  const __m128i pairs = _mm_add_epi32(_mm_unpacklo_epi32(sum32, zero),
                                      _mm_unpackhi_epi32(sum32, zero));
  const __m128i total = _mm_add_epi32(pairs, _mm_srli_si128(pairs, 8));

  /* Output is twice the sum. */
  __m128i dc = _mm_slli_epi32(total, 1);
  store_output(&dc, output);
}
// Loads a 16-row by 4-column strip of pixels as four column vectors.
// r0 points at row 0 and r8 at row 8; rows are 'stride' bytes apart.
// With the pixels around the edge (|) numbered as
//   00 01 | 02 03
//   10 11 | 12 13
//    ...  |  ...
//   e0 e1 | e2 e3
//   f0 f1 | f2 f3
// the outputs are (most significant byte first):
//   p1 = f0 e0 d0 c0 b0 a0 90 80  70 60 50 40 30 20 10 00
//   p0 = f1 e1 d1 c1 b1 a1 91 81  71 61 51 41 31 21 11 01
//   q0 = f2 e2 d2 c2 b2 a2 92 82  72 62 52 42 32 22 12 02
//   q1 = f3 e3 d3 c3 b3 a3 93 83  73 63 53 43 33 23 13 03
static WEBP_INLINE void Load16x4(const uint8_t* r0, const uint8_t* r8,
                                 int stride,
                                 __m128i* p1, __m128i* p0,
                                 __m128i* q0, __m128i* q1) {
  __m128i top_01, top_23, bot_01, bot_23;

  // Per the layout above, each Load8x4() call yields two columns per register:
  //   *p1 = 71 61 51 41 31 21 11 01  70 60 50 40 30 20 10 00
  //   *q0 = 73 63 53 43 33 23 13 03  72 62 52 42 32 22 12 02
  //   *p0 = f1 e1 d1 c1 b1 a1 91 81  f0 e0 d0 c0 b0 a0 90 80
  //   *q1 = f3 e3 d3 c3 b3 a3 93 83  f2 e2 d2 c2 b2 a2 92 82
  Load8x4(r0, stride, p1, q0);
  Load8x4(r8, stride, p0, q1);

  top_01 = *p1;
  top_23 = *q0;
  bot_01 = *p0;
  bot_23 = *q1;

  // Stitch the top (rows 0-7) and bottom (rows 8-f) halves of each column.
  *p1 = _mm_unpacklo_epi64(top_01, bot_01);
  *p0 = _mm_unpackhi_epi64(top_01, bot_01);
  *q0 = _mm_unpacklo_epi64(top_23, bot_23);
  *q1 = _mm_unpackhi_epi64(top_23, bot_23);
}
/* Cubic-interpolated inner product for one gint16 output sample.
 *
 * o       - receives the single result sample.
 * a       - input samples; read with unaligned loads.
 * b       - base of four tap rows placed bstride BYTES apart; read with
 *           aligned loads, so every row must be 16-byte aligned.
 * len     - number of taps; processed in groups of 8, so full groups of 8
 *           are read even when len is not a multiple of 8.
 * icoeff  - four 16-bit interpolation coefficients, one per tap row.
 * bstride - distance in bytes between consecutive tap rows.
 *
 * Each row's dot product with a is shifted down by PRECISION_S16, the four
 * results are weighted by icoeff, summed, rounded, shifted down by
 * PRECISION_S16 again and saturated to 16 bits. */
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  /* 64-bit load of the four coefficients into the low half, upper half
   * zeroed.  _mm_loadl_epi64 has no alignment requirement and avoids
   * dereferencing the gint16 data through a (long long *) cast. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) icoeff);
  const gint16 *c[4] = {
    (const gint16 *) ((const gint8 *) b + 0 * bstride),
    (const gint16 *) ((const gint8 *) b + 1 * bstride),
    (const gint16 *) ((const gint8 *) b + 2 * bstride),
    (const gint16 *) ((const gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  /* Widen each coefficient to a 32-bit lane (upper 16 bits zero) so that the
   * madd below multiplies it with the low 16 bits of each row sum. */
  f = _mm_unpacklo_epi16 (f, sum[0]);

  /* Accumulate the four dot products, 8 samples per step. */
  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((const __m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((const __m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((const __m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((const __m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3],
        _mm_madd_epi16 (t[0], _mm_load_si128 ((const __m128i *) (c[3] + i))));
  }

  /* Transpose the 4x4 matrix of partial sums and add its rows, leaving the
   * total of sum[k] in 32-bit lane k of sum[0]. */
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);

  sum[0] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]),
      _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] = _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]),
      _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  /* Scale the row results, weight them by the coefficients ... */
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);
  /* ... and fold the four weighted lanes into lane 0. */
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] = _mm_add_epi32 (sum[0],
      _mm_shuffle_epi32 (sum[0], _MM_SHUFFLE (1, 1, 1, 1)));

  /* Round, scale back and saturate to 16 bits. */
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
static inline void yuv_to_packed_shader_3(uint8_t** dstp, const uint8_t** srcp, const int dpitch, const int spitch, const int width, const int height, void* _buff) noexcept { const uint8_t* sr = srcp[0]; const uint8_t* sg = srcp[1]; const uint8_t* sb = srcp[2]; uint8_t* d = dstp[0]; float* buff = reinterpret_cast<float*>(_buff); const uint8_t *rlsb, *glsb, *blsb; if (STACK16) { rlsb = sr + height * spitch; glsb = sg + height * spitch; blsb = sb + height * spitch; } const __m128i zero = _mm_setzero_si128(); const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255)); for (int y = 0; y < height; ++y) { for (int x = 0; x < width; x += 4) { __m128i r, g, b; if (!STACK16) { r = _mm_cvtepu8_epi32(loadl(sr + x)); g = _mm_cvtepu8_epi32(loadl(sg + x)); b = _mm_cvtepu8_epi32(loadl(sb + x)); } else { r = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(rlsb + x), loadl(sr + x)), zero); g = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(glsb + x), loadl(sg + x)), zero); b = _mm_unpacklo_epi16(_mm_unpacklo_epi8(loadl(blsb + x), loadl(sb + x)), zero); } __m128i rg = _mm_unpacklo_epi32(r, g); __m128i ba = _mm_unpacklo_epi32(b, zero); __m128 rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba)); __m128 rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba)); _mm_store_ps(buff + 4 * x + 0, _mm_mul_ps(rgba0, rcp)); _mm_store_ps(buff + 4 * x + 4, _mm_mul_ps(rgba1, rcp)); rg = _mm_unpackhi_epi32(r, g); ba = _mm_unpackhi_epi32(b, zero); rgba0 = _mm_cvtepi32_ps(_mm_unpacklo_epi64(rg, ba)); rgba1 = _mm_cvtepi32_ps(_mm_unpackhi_epi64(rg, ba)); _mm_store_ps(buff + 4 * x + 8, _mm_mul_ps(rgba0, rcp)); _mm_store_ps(buff + 4 * x + 12, _mm_mul_ps(rgba1, rcp)); } convert_float_to_half(d, buff, width * 4); d += dpitch; sr += spitch; sg += spitch; sb += spitch; if (STACK16) { rlsb += spitch; glsb += spitch; blsb += spitch; } } }
static INLINE unsigned int highbd_masked_sad4xh_ssse3( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int y; __m128i res = _mm_setzero_si128(); const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m128i round_const = _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m128i one = _mm_set1_epi16(1); for (y = 0; y < height; y += 2) { const __m128i src = _mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)src_ptr), _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride])); const __m128i a = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr), _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride])); const __m128i b = _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr), _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride])); // Zero-extend mask to 16 bits const __m128i m = _mm_unpacklo_epi8( _mm_unpacklo_epi32( _mm_cvtsi32_si128(*(const uint32_t *)m_ptr), _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])), _mm_setzero_si128()); const __m128i m_inv = _mm_sub_epi16(mask_max, m); const __m128i data_l = _mm_unpacklo_epi16(a, b); const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv); __m128i pred_l = _mm_madd_epi16(data_l, mask_l); pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i data_r = _mm_unpackhi_epi16(a, b); const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv); __m128i pred_r = _mm_madd_epi16(data_r, mask_r); pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); const __m128i pred = _mm_packs_epi32(pred_l, pred_r); const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src)); res = _mm_add_epi32(res, _mm_madd_epi16(diff, one)); src_ptr += src_stride * 2; a_ptr += a_stride * 2; b_ptr += 
b_stride * 2; m_ptr += m_stride * 2; } res = _mm_hadd_epi32(res, res); res = _mm_hadd_epi32(res, res); int sad = _mm_cvtsi128_si32(res); return (sad + 31) >> 6; }
/* Adds the constant `diff` to every pixel of the 8x8 block at `dest`
 * (rows `stride` bytes apart), saturating each pixel to [0, 255]. */
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  const int subtract = diff < 0;
  uint8_t magnitude;
  __m128i delta;
  __m128i pair[4]; /* pair[i] holds rows 2i (low half) and 2i+1 (high half) */
  int i;

  /* Read the whole block before writing anything back. */
  for (i = 0; i < 4; ++i) {
    const __m128i even =
        _mm_loadl_epi64((const __m128i *)(dest + (2 * i) * stride));
    const __m128i odd =
        _mm_loadl_epi64((const __m128i *)(dest + (2 * i + 1) * stride));
    pair[i] = _mm_unpacklo_epi64(even, odd);
  }

  /* |diff| clipped to 255; the sign picks saturating add or subtract. */
  if (subtract) {
    magnitude = (diff < -255) ? 255 : (uint8_t)(-diff);
  } else {
    magnitude = (diff > 255) ? 255 : (uint8_t)diff;
  }
  delta = _mm_set1_epi8((char)magnitude);

  for (i = 0; i < 4; ++i) {
    pair[i] = subtract ? _mm_subs_epu8(pair[i], delta)
                       : _mm_adds_epu8(pair[i], delta);
  }

  /* Low half goes to the even row, high half to the odd row. */
  for (i = 0; i < 4; ++i) {
    _mm_storel_epi64((__m128i *)(dest + (2 * i) * stride), pair[i]);
    _mm_storel_epi64((__m128i *)(dest + (2 * i + 1) * stride),
                     _mm_srli_si128(pair[i], 8));
  }
}
/* Applies the 8-tap filter x_filter horizontally to eight rows at once.
 *
 * Reads 8 pixels from each of 8 rows (src_pitch bytes apart) starting at
 * src_x, and writes 8 bytes to dst: dst[r] is the filtered, rounded
 * (shift by 7) and clamped result for row r.
 * x_filter is read with an aligned 128-bit load; only the low byte of each
 * tap is used, as a signed 8-bit value. */
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i taps = _mm_load_si128((const __m128i *)x_filter);
  /* Pack the taps to bytes and replicate each adjacent pair across a vector. */
  const __m128i taps_01 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0200u));
  const __m128i taps_23 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0604u));
  const __m128i taps_45 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0a08u));
  const __m128i taps_67 = _mm_shuffle_epi8(taps, _mm_set1_epi16(0x0e0cu));
  __m128i row[8];
  __m128i acc;
  int r;

  for (r = 0; r < 8; ++r) {
    row[r] = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * r));
  }

  {
    /* Transpose so that each vector holds one pair of neighbouring pixels
     * (columns 0-1, 2-3, 4-5, 6-7) for all eight rows.
     * Notation "rc": row r, column c.
     * 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17 */
    const __m128i w01 = _mm_unpacklo_epi16(row[0], row[1]);
    /* 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37 */
    const __m128i w23 = _mm_unpacklo_epi16(row[2], row[3]);
    /* 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57 */
    const __m128i w45 = _mm_unpacklo_epi16(row[4], row[5]);
    /* 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77 */
    const __m128i w67 = _mm_unpacklo_epi16(row[6], row[7]);

    /* 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33 */
    const __m128i top_lo = _mm_unpacklo_epi32(w01, w23);
    /* 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37 */
    const __m128i top_hi = _mm_unpackhi_epi32(w01, w23);
    /* 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73 */
    const __m128i bot_lo = _mm_unpacklo_epi32(w45, w67);
    /* 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77 */
    const __m128i bot_hi = _mm_unpackhi_epi32(w45, w67);

    /* 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71 */
    const __m128i px_01 = _mm_unpacklo_epi64(top_lo, bot_lo);
    const __m128i px_23 = _mm_unpackhi_epi64(top_lo, bot_lo);
    const __m128i px_45 = _mm_unpacklo_epi64(top_hi, bot_hi);
    const __m128i px_67 = _mm_unpackhi_epi64(top_hi, bot_hi);

    /* Multiply each pixel pair by its tap pair and add within the pair. */
    const __m128i x0 = _mm_maddubs_epi16(px_01, taps_01);
    const __m128i x1 = _mm_maddubs_epi16(px_23, taps_23);
    const __m128i x2 = _mm_maddubs_epi16(px_45, taps_45);
    const __m128i x3 = _mm_maddubs_epi16(px_67, taps_67);

    /* Saturating adds are order-sensitive: outer taps first, then the
     * smaller and finally the larger of the two middle terms. */
    const __m128i mid_min = _mm_min_epi16(x2, x1);
    const __m128i mid_max = _mm_max_epi16(x2, x1);
    acc = _mm_adds_epi16(x0, x3);
    acc = _mm_adds_epi16(acc, mid_min);
    acc = _mm_adds_epi16(acc, mid_max);
  }

  /* Round and shift each 16-bit value right by 7. */
  acc = _mm_mulhrs_epi16(acc, k_256);
  /* Clamp to 8 bits and store the 8 results. */
  acc = _mm_packus_epi16(acc, acc);
  _mm_storel_epi64((__m128i *)dst, acc);
}
/* Sum of absolute differences over a block 3 pixels wide and 16 rows high.
 *
 * Every row is fetched with one 32-bit load, i.e. one byte more than the
 * block needs, and the surplus byte is discarded before the SAD.  When
 * src_ptr sits one byte past a 4-byte boundary it is moved back by one so
 * the load is aligned; the surplus byte is then the first one instead of
 * the last one. */
unsigned int vp9_sad3x16_sse2(const unsigned char *src_ptr, int src_stride,
                              const unsigned char *ref_ptr, int ref_stride) {
  const int offset = (uintptr_t)src_ptr & 3;
  const __m128i zero = _mm_setzero_si128();
  /* 0xffffffffffff0000 in both 64-bit halves. */
  const __m128i drop_low_pair = _mm_slli_epi64(_mm_cmpeq_epi32(zero, zero), 16);
  __m128i total = zero;
  int row;

  /* In the current use case the offset is 1 if CONFIG_SUBPELREFMV is off.
   * Realigning src_ptr makes the 32-bit loads considerably cheaper. */
  if (offset == 1) src_ptr -= 1;

  for (row = 0; row < 16; row += 4) {
    /* Byte-interleave rows 0/1 and 2/3, then put both pairs in one register. */
    const __m128i s01 = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(const int *)(src_ptr + 0 * src_stride)),
        _mm_cvtsi32_si128(*(const int *)(src_ptr + 1 * src_stride)));
    const __m128i s23 = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(const int *)(src_ptr + 2 * src_stride)),
        _mm_cvtsi32_si128(*(const int *)(src_ptr + 3 * src_stride)));
    const __m128i r01 = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(const int *)(ref_ptr + 0 * ref_stride)),
        _mm_cvtsi32_si128(*(const int *)(ref_ptr + 1 * ref_stride)));
    const __m128i r23 = _mm_unpacklo_epi8(
        _mm_cvtsi32_si128(*(const int *)(ref_ptr + 2 * ref_stride)),
        _mm_cvtsi32_si128(*(const int *)(ref_ptr + 3 * ref_stride)));
    __m128i s = _mm_unpacklo_epi64(s01, s23);
    __m128i r = _mm_unpacklo_epi64(r01, r23);

    /* Throw out the extra byte of each row: after interleaving it occupies
     * either the lowest or the highest 16 bits of each 64-bit half. */
    s = (offset == 1) ? _mm_and_si128(s, drop_low_pair)
                      : _mm_slli_epi64(s, 16);
    r = _mm_slli_epi64(r, 16);

    total = _mm_add_epi16(total, _mm_sad_epu8(s, r));

    src_ptr += src_stride * 4;
    ref_ptr += ref_stride * 4;
  }

  /* Combine the two 64-bit partial sums. */
  total = _mm_add_epi16(total, _mm_srli_si128(total, 8));
  return _mm_cvtsi128_si32(total);
}
/*Transpose 4 vectors with 4 32-bit values, in place.*/
OD_SIMD_INLINE void od_transpose4(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3) {
  /*Interleave 32-bit lanes of rows (0,1) and rows (2,3).*/
  const __m128i lo01 = _mm_unpacklo_epi32(*t0, *t1);
  const __m128i hi01 = _mm_unpackhi_epi32(*t0, *t1);
  const __m128i lo23 = _mm_unpacklo_epi32(*t2, *t3);
  const __m128i hi23 = _mm_unpackhi_epi32(*t2, *t3);
  /*Join the 64-bit halves to form the columns.*/
  *t0 = _mm_unpacklo_epi64(lo01, lo23);
  *t1 = _mm_unpackhi_epi64(lo01, lo23);
  *t2 = _mm_unpacklo_epi64(hi01, hi23);
  *t3 = _mm_unpackhi_epi64(hi01, hi23);
}
/*Transpose 8 vectors with 8 16-bit values, in place.
  Three interleave rounds are used: 16-bit lanes, 32-bit lanes, then 64-bit
   halves.*/
OD_SIMD_INLINE void od_transpose16x8(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3,  __m128i *t4, __m128i *t5,
 __m128i *t6, __m128i *t7) {
  /*Round 1: columns 0..3 of each row pair (00112233).*/
  const __m128i p01_lo = _mm_unpacklo_epi16(*t0, *t1);
  const __m128i p23_lo = _mm_unpacklo_epi16(*t2, *t3);
  const __m128i p45_lo = _mm_unpacklo_epi16(*t4, *t5);
  const __m128i p67_lo = _mm_unpacklo_epi16(*t6, *t7);
  /*Round 1: columns 4..7 of each row pair (44556677).*/
  const __m128i p01_hi = _mm_unpackhi_epi16(*t0, *t1);
  const __m128i p23_hi = _mm_unpackhi_epi16(*t2, *t3);
  const __m128i p45_hi = _mm_unpackhi_epi16(*t4, *t5);
  const __m128i p67_hi = _mm_unpackhi_epi16(*t6, *t7);
  /*Round 2: columns 0,1 (00001111).*/
  const __m128i q0123_c01 = _mm_unpacklo_epi32(p01_lo, p23_lo);
  const __m128i q4567_c01 = _mm_unpacklo_epi32(p45_lo, p67_lo);
  /*Round 2: columns 2,3 (22223333).*/
  const __m128i q0123_c23 = _mm_unpackhi_epi32(p01_lo, p23_lo);
  const __m128i q4567_c23 = _mm_unpackhi_epi32(p45_lo, p67_lo);
  /*Round 2: columns 4,5 (44445555).*/
  const __m128i q0123_c45 = _mm_unpacklo_epi32(p01_hi, p23_hi);
  const __m128i q4567_c45 = _mm_unpacklo_epi32(p45_hi, p67_hi);
  /*Round 2: columns 6,7 (66667777).*/
  const __m128i q0123_c67 = _mm_unpackhi_epi32(p01_hi, p23_hi);
  const __m128i q4567_c67 = _mm_unpackhi_epi32(p45_hi, p67_hi);
  /*Round 3: one full column per output vector.*/
  *t0 = _mm_unpacklo_epi64(q0123_c01, q4567_c01);
  *t1 = _mm_unpackhi_epi64(q0123_c01, q4567_c01);
  *t2 = _mm_unpacklo_epi64(q0123_c23, q4567_c23);
  *t3 = _mm_unpackhi_epi64(q0123_c23, q4567_c23);
  *t4 = _mm_unpacklo_epi64(q0123_c45, q4567_c45);
  *t5 = _mm_unpackhi_epi64(q0123_c45, q4567_c45);
  *t6 = _mm_unpacklo_epi64(q0123_c67, q4567_c67);
  *t7 = _mm_unpackhi_epi64(q0123_c67, q4567_c67);
}
// Converts num_pixels 32-bit BGRA pixels from src into 16-bit RGBA4444 pixels
// in dst.  Eight pixels are converted per SSE2 iteration (unaligned loads and
// stores); any remaining pixels are handed to the C implementation.
// Per output pixel the two bytes are (r & 0xf0) | (g >> 4) and
// (b & 0xf0) | (a >> 4); their order depends on WEBP_SWAP_16BIT_CSP.
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i low_nibble = _mm_set1_epi8(0x0f);
  const __m128i high_nibble = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;

  for (; num_pixels >= 8; num_pixels -= 8) {
    const __m128i px0123 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i px4567 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    // Three byte-interleave rounds turn the packed pixels into planes.
    const __m128i s1_lo = _mm_unpacklo_epi8(px0123, px4567);  // b0b4g0g4r0r4a0a4...
    const __m128i s1_hi = _mm_unpackhi_epi8(px0123, px4567);  // b2b6g2g6r2r6a2a6...
    const __m128i s2_lo = _mm_unpacklo_epi8(s1_lo, s1_hi);    // b0b2b4b6g0g2g4g6...
    const __m128i s2_hi = _mm_unpackhi_epi8(s1_lo, s1_hi);    // b1b3b5b7g1g3g5g7...
    const __m128i plane_bg = _mm_unpacklo_epi8(s2_lo, s2_hi); // b0...b7 | g0...g7
    const __m128i plane_ra = _mm_unpackhi_epi8(s2_lo, s2_hi); // r0...r7 | a0...a7
    const __m128i plane_ga = _mm_unpackhi_epi64(plane_bg, plane_ra);  // g0...g7 | a0...a7
    const __m128i plane_rb = _mm_unpacklo_epi64(plane_ra, plane_bg);  // r0...r7 | b0...b7
    // g and a: move the high nibble of every byte down to the low nibble.
    const __m128i ga_low = _mm_and_si128(_mm_srli_epi16(plane_ga, 4), low_nibble);
    // r and b: keep only the high nibble of every byte.
    const __m128i rb_high = _mm_and_si128(plane_rb, high_nibble);
    const __m128i rg_ba = _mm_or_si128(ga_low, rb_high);  // rg0..rg7 | ba0..ba7
    const __m128i ba_only = _mm_srli_si128(rg_ba, 8);     // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(ba_only, rg_ba);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg_ba, ba_only);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
  }

  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
/* Byte-shuffle routine for elements that are 2 bytes wide.

   Splits the interleaved bytes of `src` into two streams in `dest`: byte 0 of
   every element goes to dest[0 ..], byte 1 to dest[total_elements ..].
   Elements are processed 16 at a time (32 source bytes) while the element
   index is below vectorizable_elements; loads and stores are unaligned. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements)
{
  static const size_t bytesoftype = 2;
  size_t j;

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    const uint8_t* const chunk = src + (j * bytesoftype);
    __m128i part[2];
    int k;

    /* For each 16-byte half: gather the even bytes into the low 8 bytes and
       the odd bytes into the high 8 bytes. */
    for (k = 0; k < 2; k++) {
      __m128i v = _mm_loadu_si128((const __m128i*)(chunk + (k * sizeof(__m128i))));
      __m128i halves_swapped;

      /* Regroup 16-bit words so even-indexed words come first. */
      v = _mm_shufflelo_epi16(v, 0xd8);
      v = _mm_shufflehi_epi16(v, 0xd8);
      v = _mm_shuffle_epi32(v, 0xd8);
      /* Interleave bytes of the low half with those of the high half. */
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi8(v, halves_swapped);
      v = _mm_shuffle_epi32(v, 0xd8);
      /* Same again at 16-bit granularity. */
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi16(v, halves_swapped);
      part[k] = _mm_shuffle_epi32(v, 0xd8);
    }

    /* Quad-word transpose: low halves form stream 0, high halves stream 1. */
    _mm_storeu_si128((__m128i*)(dest + j),
                     _mm_unpacklo_epi64(part[0], part[1]));
    _mm_storeu_si128((__m128i*)(dest + j + total_elements),
                     _mm_unpackhi_epi64(part[0], part[1]));
  }
}
// Converts num_pixels 32-bit BGRA pixels from src into RGBA bytes in dst.
// Eight pixels are converted per SSE2 iteration (unaligned loads and stores);
// whatever is left over is handed to the C implementation.
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);  // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);  // bgra4|bgra5|bgra6|bgra7
    // Three byte-interleave rounds turn the packed pixels into planes.
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    // Re-interleave the planes in R, G, B, A order.
    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
    _mm_storeu_si128(out++, rgba0);
    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs: skip the call when nothing remains, as the RGBA4444 variant
  // does.
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
// Horizontal inverse Haar step followed by the final output conversion, for
// 16-bit coefficients.
//
// For each output row and each group of 16 coefficients, the even-indexed
// values (E) and odd-indexed values (O) are separated and combined as
//     X0 = E - ((O + 1) >> 1),   X1 = O + X0,
// then re-interleaved.  When shift != 0 the values get +1 and an arithmetic
// right shift by `shift`; finally 1 << (active_bits - 1) is added and the
// result is clamped to [0, (1 << active_bits) - 1].
//
// Input is read from _idata at absolute (x, y) with row pitch istride
// (in int16 units); output is written through odata relative to
// (ooffset_x, ooffset_y) with row pitch ostride (in 16-bit units, hence the
// factor 2 on the byte index).  Aligned loads and stores are used.
// iwidth and iheight are unused.
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata, const int istride, const char *odata, const int ostride, const int iwidth, const int iheight, const int ooffset_x, const int ooffset_y, const int owidth, const int oheight)
{
    int16_t *idata = (int16_t *)_idata;
    const int skip = 1;
    const __m128i ONE    = _mm_set1_epi16(1);
    const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
    // Moves even-indexed 16-bit lanes to the low half, odd-indexed to the high half.
    const __m128i SHUF   = _mm_set_epi8(15,14, 11,10, 7,6, 3,2, 13,12, 9,8, 5,4, 1,0);
    const __m128i CLIP   = _mm_set1_epi16((1 << active_bits) - 1);
    const __m128i ZERO   = _mm_set1_epi16(0);
    (void)iwidth;
    (void)iheight;

    const int y_end = ooffset_y + oheight;
    const int x_end = ooffset_x + owidth;

    for (int y = ooffset_y; y < y_end; y += skip) {
        const int out_row = (y - ooffset_y)*ostride;

        for (int x = ooffset_x; x < x_end; x += 16) {
            const __m128i lo = _mm_shuffle_epi8(_mm_load_si128((__m128i *)&idata[y*istride + x + 0]), SHUF);
            const __m128i hi = _mm_shuffle_epi8(_mm_load_si128((__m128i *)&idata[y*istride + x + 8]), SHUF);

            const __m128i even = _mm_unpacklo_epi64(lo, hi);
            const __m128i odd  = _mm_unpackhi_epi64(lo, hi);

            const __m128i X0 = _mm_sub_epi16(even, _mm_srai_epi16(_mm_add_epi16(odd, ONE), 1));
            const __m128i X1 = _mm_add_epi16(odd, X0);

            // Re-interleave; element 0 covers outputs x..x+7, element 1 x+8..x+15.
            __m128i px[2];
            px[0] = _mm_unpacklo_epi16(X0, X1);
            px[1] = _mm_unpackhi_epi16(X0, X1);

            for (int half = 0; half < 2; ++half) {
                __m128i v = px[half];
                if (shift != 0) {
                    v = _mm_srai_epi16(_mm_add_epi16(v, ONE), shift);
                }
                v = _mm_add_epi16(v, OFFSET);
                v = _mm_min_epi16(v, CLIP);
                v = _mm_max_epi16(v, ZERO);
                _mm_store_si128((__m128i *)&odata[2*(out_row + x + 8*half - ooffset_x)], v);
            }
        }
    }
}
/* Linear interpolation between two rows of gint16 samples.
 *
 * op      - output row, len samples (processed in groups of 8).
 * ap      - first input row; the second row starts astride BYTES later.
 * len     - number of samples; full groups of 8 are always processed.
 * icp     - interpolation coefficients; icp[0] weights the first row and
 *           icp[1] the second.  8 bytes are read from it.
 * astride - distance in bytes between the two input rows.
 *
 * out[i] = sat16 ((row0[i] * icp[0] + row1[i] * icp[1]
 *                  + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16)
 *
 * All row accesses use aligned 128-bit loads/stores, so op and both input
 * rows must be 16-byte aligned. */
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  /* 64-bit load of the coefficients into the low half, upper half zeroed.
   * _mm_loadl_epi64 has no alignment requirement and avoids dereferencing
   * the gint16 data through a (gint64 *) cast. */
  __m128i f = _mm_loadl_epi64 ((const __m128i *) ic);
  const gint16 *c[2] = {
    (const gint16 *) ((const gint8 *) a + 0 * astride),
    (const gint16 *) ((const gint8 *) a + 1 * astride)
  };

  /* Replicate the (icp[0], icp[1]) pair into all four 32-bit lanes. */
  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((const __m128i *) (c[0] + i));
    tb = _mm_load_si128 ((const __m128i *) (c[1] + i));

    /* Interleave the rows so one madd computes row0 * f0 + row1 * f1. */
    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    /* Round and scale back. */
    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    /* Saturate to 16 bits and store 8 samples. */
    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
// Fills nXW consecutive pixels starting at pDst with the colour clr.
// The SSE2 path moves a pixel as 64 bits and stores two pixels per aligned
// 128-bit write; it is taken for runs of at least 4 pixels (and, on non-x64
// builds, only when SSE2 is available and pDst or pDst + 1 is aligned).
// Otherwise pixels are written one at a time.
void ie_FillLine(iePwBGRA pDst, DWORD nXW, iewBGRA clr)
{
#ifndef __X64__
	const int bUseSSE2 = g_bSSE2 && (nXW >= 4) && (_mm_isAligned(pDst) || _mm_isAligned(pDst + 1));
#else
	const int bUseSSE2 = (nXW >= 4);
#endif

	if (!bUseSSE2) {
		// Plain scalar fill
		while (nXW--) *pDst++ = clr;
		return;
	}

	// Do fill using SSE2!
	if (!_mm_isAligned(pDst)) {
		// One scalar pixel brings the destination to an aligned address
		*pDst++ = clr;
		nXW--;
	}

	// Duplicate the pixel into both 64-bit halves of the register
	__m128i r0 = _mm_loadl_epi64((const __m128i *)&clr);
	r0 = _mm_unpacklo_epi64(r0, r0);

	DWORD nPairs = nXW >> 1;
	while (nPairs--) {
		_mm_store_si128((__m128i *)pDst, r0);
		pDst += 2;
	}

	// Odd pixel at the end
	if (nXW & 1) {
		_mm_storel_epi64((__m128i *)pDst, r0);
	}
}
/* Byte-shuffle routine for elements that are 2 bytes wide.

   Processes size / 32 groups of 32 source bytes.  The even-indexed bytes of
   all groups are written to the first half of the output and the odd-indexed
   bytes to the second half (the halves are size / 32 vectors apart).
   `src` is read with unaligned loads; `dest` is written through __m128i
   pointers and therefore has to be 16-byte aligned.  Bytes beyond the last
   complete group of 32 are not touched. */
static void
shuffle2(uint8_t* dest, uint8_t* src, size_t size)
{
  const size_t nvec = size / (16*2);
  __m128i* const out = (__m128i*)dest;
  size_t i;

  for (i = 0; i < nvec; i++) {
    const uint8_t* const chunk = src + i * (16*2);
    __m128i part[2];
    size_t k;

    /* For each 16-byte half: gather the even bytes into the low 8 bytes and
       the odd bytes into the high 8 bytes. */
    for (k = 0; k < 2; k++) {
      __m128i v = _mm_loadu_si128((__m128i*)(chunk + k*16));
      __m128i halves_swapped;

      /* Regroup 16-bit words so even-indexed words come first. */
      v = _mm_shufflelo_epi16(v, 0xd8);
      v = _mm_shufflehi_epi16(v, 0xd8);
      v = _mm_shuffle_epi32(v, 0xd8);
      /* Interleave bytes of the low half with those of the high half. */
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi8(v, halves_swapped);
      v = _mm_shuffle_epi32(v, 0xd8);
      /* Same again at 16-bit granularity. */
      halves_swapped = _mm_shuffle_epi32(v, 0x4e);
      v = _mm_unpacklo_epi16(v, halves_swapped);
      part[k] = _mm_shuffle_epi32(v, 0xd8);
    }

    /* Quad-word transpose, then store one vector into each output half. */
    out[i]        = _mm_unpacklo_epi64(part[0], part[1]);
    out[nvec + i] = _mm_unpackhi_epi64(part[0], part[1]);
  }
}
// Convert 16 packed ARGB 16b-values to r[], g[], b[] static WEBP_INLINE void RGBA32PackedToPlanar_16b_SSE41( const uint16_t* const rgbx, __m128i* const r, __m128i* const g, __m128i* const b) { const __m128i in0 = LOAD_16(rgbx + 0); // r0 | g0 | b0 |x| r1 | g1 | b1 |x const __m128i in1 = LOAD_16(rgbx + 8); // r2 | g2 | b2 |x| r3 | g3 | b3 |x const __m128i in2 = LOAD_16(rgbx + 16); // r4 | ... const __m128i in3 = LOAD_16(rgbx + 24); // r6 | ... // aarrggbb as 16-bit. const __m128i shuff0 = _mm_set_epi8(-1, -1, -1, -1, 13, 12, 5, 4, 11, 10, 3, 2, 9, 8, 1, 0); const __m128i shuff1 = _mm_set_epi8(13, 12, 5, 4, -1, -1, -1, -1, 11, 10, 3, 2, 9, 8, 1, 0); const __m128i A0 = _mm_shuffle_epi8(in0, shuff0); const __m128i A1 = _mm_shuffle_epi8(in1, shuff1); const __m128i A2 = _mm_shuffle_epi8(in2, shuff0); const __m128i A3 = _mm_shuffle_epi8(in3, shuff1); // R0R1G0G1 // B0B1**** // R2R3G2G3 // B2B3**** // (OR is used to free port 5 for the unpack) const __m128i B0 = _mm_unpacklo_epi32(A0, A1); const __m128i B1 = _mm_or_si128(A0, A1); const __m128i B2 = _mm_unpacklo_epi32(A2, A3); const __m128i B3 = _mm_or_si128(A2, A3); // Gather the channels. *r = _mm_unpacklo_epi64(B0, B2); *g = _mm_unpackhi_epi64(B0, B2); *b = _mm_unpackhi_epi64(B1, B3); }
/* Load exactly 4 bytes from p and zero-extend them to four 16-bit lanes in
 * the low 64 bits of the result (upper 64 bits are zero).  The bytes are
 * assembled individually so no more than 4 bytes are read and no alignment
 * or aliasing assumption is made about p. */
static inline __m128i transform_skip_load4_epi16(const uint8_t *p)
{
    const uint32_t v = (uint32_t)p[0] | ((uint32_t)p[1] << 8) |
                       ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
    return _mm_unpacklo_epi8(_mm_cvtsi32_si128((int)v), _mm_setzero_si128());
}

/* Store the low 4 bytes of v to p, byte by byte (no alignment or aliasing
 * assumption about p). */
static inline void transform_skip_store4(uint8_t *p, __m128i v)
{
    const uint32_t w = (uint32_t)_mm_cvtsi128_si32(v);
    p[0] = (uint8_t)w;
    p[1] = (uint8_t)(w >> 8);
    p[2] = (uint8_t)(w >> 16);
    p[3] = (uint8_t)(w >> 24);
}

/*
 * Add a rounded, scaled 4x4 block of residuals to a 4x4 block of 8-bit
 * pixels, in place.
 *
 * For each of the 16 positions: dst = clip_to_u8(dst + ((coeff + 16) >> 5)),
 * where the additions saturate to the signed 16-bit range.
 *
 * _dst    top-left pixel of the 4x4 destination block; only 4 bytes per row
 *         are read and written.
 * coeffs  16 residuals in row-major order; must be 16-byte aligned (aligned
 *         vector loads are used).
 * _stride distance in bytes between consecutive destination rows.
 */
void ff_hevc_transform_skip_8_sse(uint8_t *_dst, int16_t *coeffs, ptrdiff_t _stride)
{
    enum { SHIFT = 5, OFFSET = 16 };

    uint8_t *const row0 = _dst;
    uint8_t *const row1 = row0 + _stride;
    uint8_t *const row2 = row1 + _stride;
    uint8_t *const row3 = row2 + _stride;

    const __m128i rounding = _mm_set1_epi16(OFFSET);

    /* Residuals for rows 0-1 and rows 2-3, rounded and scaled down. */
    __m128i res01 = _mm_load_si128((const __m128i *)coeffs);
    __m128i res23 = _mm_load_si128((const __m128i *)(coeffs + 8));
    res01 = _mm_srai_epi16(_mm_adds_epi16(res01, rounding), SHIFT);
    res23 = _mm_srai_epi16(_mm_adds_epi16(res23, rounding), SHIFT);

    /* Current pixels widened to 16 bits, two rows per vector. */
    const __m128i pix01 = _mm_unpacklo_epi64(transform_skip_load4_epi16(row0),
                                             transform_skip_load4_epi16(row1));
    const __m128i pix23 = _mm_unpacklo_epi64(transform_skip_load4_epi16(row2),
                                             transform_skip_load4_epi16(row3));

    /* Add the residuals and clamp back to 8 bits: 16 result bytes, 4 per row. */
    const __m128i packed = _mm_packus_epi16(_mm_adds_epi16(pix01, res01),
                                            _mm_adds_epi16(pix23, res23));

    transform_skip_store4(row0, packed);
    transform_skip_store4(row1, _mm_srli_si128(packed, 4));
    transform_skip_store4(row2, _mm_srli_si128(packed, 8));
    transform_skip_store4(row3, _mm_srli_si128(packed, 12));
}
// Copy 64 bytes (four 128-bit vectors) from 'vm' into the CLUT buffer
// selected by 'csa', reordering 64-bit halves on the way:
//   out[0] = low halves of in[0], in[1]     out[1] = low halves of in[2], in[3]
//   out[2] = high halves of in[0], in[1]    out[3] = high halves of in[2], in[3]
// Aligned loads/stores are used, so 'vm' and the address returned by
// GetClutBufferAddress<u32>() must both be 16-byte aligned.
__forceinline void GSMem_to_ClutBuffer__T32_I4_CSM1_sse2(u32* vm, u32 csa)
{
    __m128i* const clut_out = (__m128i*)GetClutBufferAddress<u32>(csa);
    const __m128i* const gs_in = (const __m128i*)vm;

    // Read the whole source first, then write.
    const __m128i q0 = _mm_load_si128(gs_in + 0);
    const __m128i q1 = _mm_load_si128(gs_in + 1);
    const __m128i q2 = _mm_load_si128(gs_in + 2);
    const __m128i q3 = _mm_load_si128(gs_in + 3);

    const __m128i low01  = _mm_unpacklo_epi64(q0, q1);
    const __m128i low23  = _mm_unpacklo_epi64(q2, q3);
    const __m128i high01 = _mm_unpackhi_epi64(q0, q1);
    const __m128i high23 = _mm_unpackhi_epi64(q2, q3);

    _mm_store_si128(clut_out + 0, low01);
    _mm_store_si128(clut_out + 1, low23);
    _mm_store_si128(clut_out + 2, high01);
    _mm_store_si128(clut_out + 3, high23);
}
/*
 * Compiler code-generation test for the _mm_unpacklo_epi64 intrinsic: the
 * function simply returns _mm_unpacklo_epi64(A, B).
 *
 * The DAG-prefixed and ASM-prefixed comments inside the function are pattern
 * directives that look like they are consumed by an output-matching tool
 * (FileCheck-style); they expect a <2 x i64> shufflevector with indices
 * <0, 2> in the IR and an unpcklqdq instruction in the assembly.  Treat them
 * as part of the test, not as documentation, and do not reword them.
 */
__m128i test_mm_unpacklo_epi64(__m128i A, __m128i B) { // DAG-LABEL: test_mm_unpacklo_epi64 // DAG: shufflevector <2 x i64> %{{.*}}, <2 x i64> %{{.*}}, <2 x i32> <i32 0, i32 2> // // ASM-LABEL: test_mm_unpacklo_epi64 // ASM: unpcklqdq return _mm_unpacklo_epi64(A, B); }
/*
 * Transpose a 4x4 matrix of 32-bit lanes.
 *
 * stateIn[r] holds row r (four 32-bit lanes); on return, lane c of
 * stateOut[r] equals lane r of stateIn[c].
 *
 * All four input rows are read before anything is written, so the call is
 * also correct when stateOut and stateIn point to the same array.
 */
static inline void transpose_state(__m128i *stateOut, __m128i *stateIn)
{
    const __m128i row0 = stateIn[0];
    const __m128i row1 = stateIn[1];
    const __m128i row2 = stateIn[2];
    const __m128i row3 = stateIn[3];

    /* Interleave 32-bit lanes of row pairs: lo* carry columns 0-1,
       hi* carry columns 2-3. */
    const __m128i lo01 = _mm_unpacklo_epi32(row0, row1);
    const __m128i lo23 = _mm_unpacklo_epi32(row2, row3);
    const __m128i hi01 = _mm_unpackhi_epi32(row0, row1);
    const __m128i hi23 = _mm_unpackhi_epi32(row2, row3);

    /* Combine 64-bit halves to form the transposed rows. */
    stateOut[0] = _mm_unpacklo_epi64(lo01, lo23);
    stateOut[1] = _mm_unpackhi_epi64(lo01, lo23);
    stateOut[2] = _mm_unpacklo_epi64(hi01, hi23);
    stateOut[3] = _mm_unpackhi_epi64(hi01, hi23);
}
/*
 * Build a width x height block of 16-bit predictions by taking every 8th
 * sample of the reference, both horizontally (sample step 8) and vertically
 * (row step ref_stride * 8).
 *
 * pred        output, written contiguously row after row (width samples per
 *             row); unaligned stores are used.
 * width       block width; handled 8 samples at a time when >= 8, otherwise
 *             4 samples at a time.
 * height      number of output rows.
 * ref8        reference pointer, converted with CONVERT_TO_SHORTPTR (defined
 *             elsewhere) to a pointer to 16-bit samples.
 * ref_stride  reference stride; multiplied by 8 to step one output row.
 *
 * Each reference sample is read as a single 16-bit value, so nothing beyond
 * the last needed sample is touched.
 */
void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
                                    const uint8_t *ref8, const int ref_stride) {
  const int stride = ref_stride << 3;
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int i, j;

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        // One sample in the lowest 16-bit lane of each vector.
        const __m128i s0 = _mm_cvtsi32_si128(ref[0]);
        const __m128i s1 = _mm_cvtsi32_si128(ref[8]);
        const __m128i s2 = _mm_cvtsi32_si128(ref[16]);
        const __m128i s3 = _mm_cvtsi32_si128(ref[24]);
        const __m128i s4 = _mm_cvtsi32_si128(ref[32]);
        const __m128i s5 = _mm_cvtsi32_si128(ref[40]);
        const __m128i s6 = _mm_cvtsi32_si128(ref[48]);
        const __m128i s7 = _mm_cvtsi32_si128(ref[56]);

        // Merge pairwise: 16-bit, then 32-bit, then 64-bit interleave.
        const __m128i p01 = _mm_unpacklo_epi16(s0, s1);
        const __m128i p23 = _mm_unpacklo_epi16(s2, s3);
        const __m128i p45 = _mm_unpacklo_epi16(s4, s5);
        const __m128i p67 = _mm_unpacklo_epi16(s6, s7);
        const __m128i q0123 = _mm_unpacklo_epi32(p01, p23);
        const __m128i q4567 = _mm_unpacklo_epi32(p45, p67);

        _mm_storeu_si128((__m128i *)(pred),
                         _mm_unpacklo_epi64(q0123, q4567));

        pred += 8;
        ref += 64;  // 8 * 8;
      }
      // Rewind the samples consumed on this row and move 8 reference rows on.
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        const __m128i s0 = _mm_cvtsi32_si128(ref[0]);
        const __m128i s1 = _mm_cvtsi32_si128(ref[8]);
        const __m128i s2 = _mm_cvtsi32_si128(ref[16]);
        const __m128i s3 = _mm_cvtsi32_si128(ref[24]);

        const __m128i p01 = _mm_unpacklo_epi16(s0, s1);
        const __m128i p23 = _mm_unpacklo_epi16(s2, s3);

        _mm_storel_epi64((__m128i *)(pred), _mm_unpacklo_epi32(p01, p23));

        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
static INLINE unsigned int highbd_masked_sad8xh_avx2( const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride, const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride, int height) { const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8); const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8); const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8); int y; __m256i res = _mm256_setzero_si256(); const __m256i mask_max = _mm256_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS)); const __m256i round_const = _mm256_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1); const __m256i one = _mm256_set1_epi16(1); for (y = 0; y < height; y += 2) { const __m256i src = xx_loadu2_m128i(src_ptr + src_stride, src_ptr); const __m256i a = xx_loadu2_m128i(a_ptr + a_stride, a_ptr); const __m256i b = xx_loadu2_m128i(b_ptr + b_stride, b_ptr); // Zero-extend mask to 16 bits const __m256i m = _mm256_cvtepu8_epi16(_mm_unpacklo_epi64( _mm_loadl_epi64((const __m128i *)(m_ptr)), _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)))); const __m256i m_inv = _mm256_sub_epi16(mask_max, m); const __m256i data_l = _mm256_unpacklo_epi16(a, b); const __m256i mask_l = _mm256_unpacklo_epi16(m, m_inv); __m256i pred_l = _mm256_madd_epi16(data_l, mask_l); pred_l = _mm256_srai_epi32(_mm256_add_epi32(pred_l, round_const), AOM_BLEND_A64_ROUND_BITS); const __m256i data_r = _mm256_unpackhi_epi16(a, b); const __m256i mask_r = _mm256_unpackhi_epi16(m, m_inv); __m256i pred_r = _mm256_madd_epi16(data_r, mask_r); pred_r = _mm256_srai_epi32(_mm256_add_epi32(pred_r, round_const), AOM_BLEND_A64_ROUND_BITS); // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15, // so it is safe to do signed saturation here. const __m256i pred = _mm256_packs_epi32(pred_l, pred_r); // There is no 16-bit SAD instruction, so we have to synthesize // an 8-element SAD. 
We do this by storing 4 32-bit partial SADs, // and accumulating them at the end const __m256i diff = _mm256_abs_epi16(_mm256_sub_epi16(pred, src)); res = _mm256_add_epi32(res, _mm256_madd_epi16(diff, one)); src_ptr += src_stride << 1; a_ptr += a_stride << 1; b_ptr += b_stride << 1; m_ptr += m_stride << 1; } // At this point, we have four 32-bit partial SADs stored in 'res'. res = _mm256_hadd_epi32(res, res); res = _mm256_hadd_epi32(res, res); int sad = _mm256_extract_epi32(res, 0) + _mm256_extract_epi32(res, 4); return (sad + 31) >> 6; }
/**
 * Compute the differences of two candidate rows against one original row.
 *
 * Loads 8 bytes from each of buf1, buf2 and orig, zero-extends every byte
 * to 16 bits and returns sixteen 16-bit lanes:
 *   lanes 0..7  = buf1[i] - orig[i]
 *   lanes 8..15 = buf2[i] - orig[i]
 *
 * NOTE(review): assumes kvz_pixel is an 8-bit type - confirm.
 */
INLINE static __m256i diff_row_dual_avx2(const kvz_pixel *buf1, const kvz_pixel *buf2, const kvz_pixel *orig)
{
  const __m128i cand_a  = _mm_loadl_epi64((__m128i*)buf1);
  const __m128i cand_b  = _mm_loadl_epi64((__m128i*)buf2);
  const __m128i orig_px = _mm_loadl_epi64((__m128i*)orig);

  // Candidates side by side; the original row duplicated into both halves.
  const __m128i cand_pair = _mm_unpacklo_epi64(cand_a, cand_b);
  const __m128i orig_pair = _mm_broadcastq_epi64(orig_px);

  const __m256i cand_wide = _mm256_cvtepu8_epi16(cand_pair);
  const __m256i orig_wide = _mm256_cvtepu8_epi16(orig_pair);

  return _mm256_sub_epi16(cand_wide, orig_wide);
}
// Masked SAD of an 8-wide, 'height'-tall block of 8-bit samples.
// Two rows are handled per iteration. Each predicted sample is the blend
// a * m + b * (M - m), rounded by xx_roundn_epu16(.., AOM_BLEND_A64_ROUND_BITS)
// (helper defined elsewhere), with M = 1 << AOM_BLEND_A64_ROUND_BITS.
// The absolute differences against 'src' are summed and the function
// returns (sum + 31) >> 6.
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  __m128i acc = _mm_setzero_si128();

  for (int rows_left = height; rows_left > 0; rows_left -= 2) {
    // Source and mask: current row in the low 8 bytes, next row in the high.
    const __m128i src_row0 = _mm_loadl_epi64((const __m128i *)src_ptr);
    const __m128i src_row1 =
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride));
    const __m128i src = _mm_unpacklo_epi64(src_row0, src_row1);

    const __m128i a_row0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a_row1 = _mm_loadl_epi64((const __m128i *)(a_ptr + a_stride));
    const __m128i b_row0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b_row1 = _mm_loadl_epi64((const __m128i *)(b_ptr + b_stride));

    const __m128i m_row0 = _mm_loadl_epi64((const __m128i *)m_ptr);
    const __m128i m_row1 = _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride));
    const __m128i m = _mm_unpacklo_epi64(m_row0, m_row1);
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    // First row: interleave (a, b) with (m, M - m) and multiply-add pairs.
    const __m128i data_row0 = _mm_unpacklo_epi8(a_row0, b_row0);
    const __m128i mask_row0 = _mm_unpacklo_epi8(m, m_inv);
    const __m128i pred_row0 = xx_roundn_epu16(
        _mm_maddubs_epi16(data_row0, mask_row0), AOM_BLEND_A64_ROUND_BITS);

    // Second row: its mask bytes live in the high half of m / m_inv.
    const __m128i data_row1 = _mm_unpacklo_epi8(a_row1, b_row1);
    const __m128i mask_row1 = _mm_unpackhi_epi8(m, m_inv);
    const __m128i pred_row1 = xx_roundn_epu16(
        _mm_maddubs_epi16(data_row1, mask_row1), AOM_BLEND_A64_ROUND_BITS);

    // Back to 8 bits, then accumulate the two 8-byte SADs.
    const __m128i pred = _mm_packus_epi16(pred_row0, pred_row1);
    acc = _mm_add_epi32(acc, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }

  // _mm_sad_epu8 leaves one partial sum in each 64-bit half; add both.
  int32_t sad =
      _mm_cvtsi128_si32(acc) + _mm_cvtsi128_si32(_mm_srli_si128(acc, 8));
  return (sad + 31) >> 6;
}