void S_Interpolate_4x4_IntPel_Mono_Add_Later(unsigned char *current_part_ptr, int current_part_stride,
                                             unsigned char *ref_part_ptr, int ref_part_stride)
{
    unsigned int s_row0_0, s_row1_0, s_row2_0, s_row3_0;
    __m128i v_row0_0, v_row1_0, v_row2_0, v_row3_0;
    __m128i v_Zero = _mm_setzero_si128();

    // Load 4 bytes (one 4-pixel row) per reference line. Use a 32-bit type here:
    // unsigned long is 8 bytes on LP64 targets and would over-read the row.
    s_row0_0 = *(unsigned int*)(ref_part_ptr + (0 * ref_part_stride));
    s_row1_0 = *(unsigned int*)(ref_part_ptr + (1 * ref_part_stride));
    s_row2_0 = *(unsigned int*)(ref_part_ptr + (2 * ref_part_stride));
    s_row3_0 = *(unsigned int*)(ref_part_ptr + (3 * ref_part_stride));

    v_row0_0 = _mm_cvtsi32_si128(s_row0_0);
    v_row1_0 = _mm_cvtsi32_si128(s_row1_0);
    v_row2_0 = _mm_cvtsi32_si128(s_row2_0);
    v_row3_0 = _mm_cvtsi32_si128(s_row3_0);

    // Zero-extend the 4 pixels of each row to 16 bits.
    v_row0_0 = _mm_unpacklo_epi8(v_row0_0, v_Zero);
    v_row1_0 = _mm_unpacklo_epi8(v_row1_0, v_Zero);
    v_row2_0 = _mm_unpacklo_epi8(v_row2_0, v_Zero);
    v_row3_0 = _mm_unpacklo_epi8(v_row3_0, v_Zero);

    // Store 8 bytes (4 x 16-bit pixels) per destination row.
    _mm_storel_epi64((__m128i*)(current_part_ptr + (0 * current_part_stride)), v_row0_0);
    _mm_storel_epi64((__m128i*)(current_part_ptr + (1 * current_part_stride)), v_row1_0);
    _mm_storel_epi64((__m128i*)(current_part_ptr + (2 * current_part_stride)), v_row2_0);
    _mm_storel_epi64((__m128i*)(current_part_ptr + (3 * current_part_stride)), v_row3_0);
}
static void ConvertBGRAToBGR_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) {
  const __m128i mask_l = _mm_set_epi32(0, 0x00ffffff, 0, 0x00ffffff);
  const __m128i mask_h = _mm_set_epi32(0x00ffffff, 0, 0x00ffffff, 0);
  const __m128i* in = (const __m128i*)src;
  const uint8_t* const end = dst + num_pixels * 3;
  // the last storel_epi64 below writes 8 bytes starting at offset 18
  while (dst + 26 <= end) {
    const __m128i bgra0 = _mm_loadu_si128(in++);       // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);       // bgra4|bgra5|bgra6|bgra7
    const __m128i a0l = _mm_and_si128(bgra0, mask_l);  // bgr0|0|bgr0|0
    const __m128i a4l = _mm_and_si128(bgra4, mask_l);  // bgr0|0|bgr0|0
    const __m128i a0h = _mm_and_si128(bgra0, mask_h);  // 0|bgr0|0|bgr0
    const __m128i a4h = _mm_and_si128(bgra4, mask_h);  // 0|bgr0|0|bgr0
    const __m128i b0h = _mm_srli_epi64(a0h, 8);        // 000b|gr00|000b|gr00
    const __m128i b4h = _mm_srli_epi64(a4h, 8);        // 000b|gr00|000b|gr00
    const __m128i c0 = _mm_or_si128(a0l, b0h);         // bgrbgr00|bgrbgr00
    const __m128i c4 = _mm_or_si128(a4l, b4h);         // bgrbgr00|bgrbgr00
    const __m128i c2 = _mm_srli_si128(c0, 8);
    const __m128i c6 = _mm_srli_si128(c4, 8);
    _mm_storel_epi64((__m128i*)(dst +  0), c0);
    _mm_storel_epi64((__m128i*)(dst +  6), c2);
    _mm_storel_epi64((__m128i*)(dst + 12), c4);
    _mm_storel_epi64((__m128i*)(dst + 18), c6);
    dst += 24;
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToBGR_C((const uint32_t*)in, num_pixels, dst);
  }
}
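// A minimal scalar sketch of the per-pixel packing the SSE2 loop above performs,
// and roughly what the VP8LConvertBGRAToBGR_C tail handler is assumed to do:
// keep the low three bytes (B, G, R) of each little-endian BGRA word. The exact
// behaviour of libwebp's C fallback is an assumption here, not taken from the source.
static void ConvertBGRAToBGR_Scalar(const uint32_t* src, int num_pixels, uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t bgra = src[i];
    dst[3 * i + 0] = (uint8_t)(bgra >> 0);   // B
    dst[3 * i + 1] = (uint8_t)(bgra >> 8);   // G
    dst[3 * i + 2] = (uint8_t)(bgra >> 16);  // R
  }
}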
int64_t vp9_block_error_avx2(const int16_t *coeff, const int16_t *dqcoeff,
                             intptr_t block_size, int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registers to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0; i < block_size; i += 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }

  // save the higher 64 bit of each 128 bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the higher 64 bit to the low 64 bit
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);

  // add each 64 bit from each of the 128 bit lane of the 256 bit
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));
  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));

  // store the results
  _mm_storel_epi64((__m128i *)(&sse), sse_reg128);
  _mm_storel_epi64((__m128i *)(ssz), ssz_reg128);
  return sse;
}
template <bool even>
void ReduceGray4x4(const uint8_t *src, size_t srcWidth, size_t srcHeight, size_t srcStride,
                   uint8_t *dst, size_t dstWidth, size_t dstHeight, size_t dstStride)
{
    assert((srcWidth + 1) / 2 == dstWidth && (srcHeight + 1) / 2 == dstHeight && srcWidth > A);

    size_t alignedDstWidth = Simd::AlignLo(dstWidth, HA);
    size_t srcTail = Simd::AlignHi(srcWidth - A, 2);

    Buffer buffer(Simd::AlignHi(dstWidth, A));

    __m128i tmp = ReduceColNose(src);
    Store<true>((__m128i*)buffer.src0, tmp);
    Store<true>((__m128i*)buffer.src1, tmp);
    size_t srcCol = A, dstCol = HA;
    for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA)
    {
        tmp = ReduceColBody(src + srcCol);
        Store<true>((__m128i*)(buffer.src0 + dstCol), tmp);
        Store<true>((__m128i*)(buffer.src1 + dstCol), tmp);
    }
    tmp = ReduceColTail<even>(src + srcTail);
    Store<false>((__m128i*)(buffer.src0 + dstWidth - HA), tmp);
    Store<false>((__m128i*)(buffer.src1 + dstWidth - HA), tmp);

    for (size_t row = 0; row < srcHeight; row += 2, dst += dstStride)
    {
        const uint8_t *src2 = src + srcStride*(row + 1);
        const uint8_t *src3 = src2 + srcStride;
        if (row >= srcHeight - 2)
        {
            src2 = src + srcStride*(srcHeight - 1);
            src3 = src2;
        }

        Store<true>((__m128i*)buffer.src2, ReduceColNose(src2));
        Store<true>((__m128i*)buffer.src3, ReduceColNose(src3));
        size_t srcCol = A, dstCol = HA;
        for (; srcCol < srcWidth - A; srcCol += A, dstCol += HA)
        {
            Store<true>((__m128i*)(buffer.src2 + dstCol), ReduceColBody(src2 + srcCol));
            Store<true>((__m128i*)(buffer.src3 + dstCol), ReduceColBody(src3 + srcCol));
        }
        Store<false>((__m128i*)(buffer.src2 + dstWidth - HA), ReduceColTail<even>(src2 + srcTail));
        Store<false>((__m128i*)(buffer.src3 + dstWidth - HA), ReduceColTail<even>(src3 + srcTail));

        for (size_t col = 0; col < alignedDstWidth; col += HA)
            _mm_storel_epi64((__m128i*)(dst + col), ReduceRow<true>(buffer, col));

        if (alignedDstWidth != dstWidth)
            _mm_storel_epi64((__m128i*)(dst + dstWidth - HA), ReduceRow<false>(buffer, dstWidth - HA));

        Swap(buffer.src0, buffer.src2);
        Swap(buffer.src1, buffer.src3);
    }
}
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 8;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound = _mm_set1_epi16(1 << 7);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
      const __m128i v4 = _mm_adds_epu16(v2, v3);
      const __m128i v5 = _mm_adds_epu16(v4, kRound);
      const __m128i v6 = _mm_srli_epi16(v5, 8);
      const __m128i v7 = _mm_packus_epi16(v6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v7);
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
// Special case for left-based prediction (when preds==dst-1 or preds==src-1).
static void PredictLineLeft(const uint8_t* src, uint8_t* dst, int length,
                            int inverse) {
  int i;
  if (length <= 0) return;
  if (inverse) {
    const int max_pos = length & ~7;
    __m128i last = _mm_set_epi32(0, 0, 0, dst[-1]);
    for (i = 0; i < max_pos; i += 8) {
      const __m128i A0 = _mm_loadl_epi64((const __m128i*)(src + i));
      const __m128i A1 = _mm_add_epi8(A0, last);
      const __m128i A2 = _mm_slli_si128(A1, 1);
      const __m128i A3 = _mm_add_epi8(A1, A2);
      const __m128i A4 = _mm_slli_si128(A3, 2);
      const __m128i A5 = _mm_add_epi8(A3, A4);
      const __m128i A6 = _mm_slli_si128(A5, 4);
      const __m128i A7 = _mm_add_epi8(A5, A6);
      _mm_storel_epi64((__m128i*)(dst + i), A7);
      last = _mm_srli_epi64(A7, 56);
    }
    for (; i < length; ++i) dst[i] = src[i] + dst[i - 1];
  } else {
    const int max_pos = length & ~31;
    for (i = 0; i < max_pos; i += 32) {
      const __m128i A0 = _mm_loadu_si128((const __m128i*)(src + i +  0    ));
      const __m128i B0 = _mm_loadu_si128((const __m128i*)(src + i +  0 - 1));
      const __m128i A1 = _mm_loadu_si128((const __m128i*)(src + i + 16    ));
      const __m128i B1 = _mm_loadu_si128((const __m128i*)(src + i + 16 - 1));
      const __m128i C0 = _mm_sub_epi8(A0, B0);
      const __m128i C1 = _mm_sub_epi8(A1, B1);
      _mm_storeu_si128((__m128i*)(dst + i +  0), C0);
      _mm_storeu_si128((__m128i*)(dst + i + 16), C1);
    }
    for (; i < length; ++i) dst[i] = src[i] - src[i - 1];
  }
}
static WEBP_INLINE void ProcessRow(const __m128i* const A0,
                                   const __m128i* const A1,
                                   const __m128i* const A2,
                                   const __m128i* const A3,
                                   const __m128i* const mult,
                                   uint8_t* const dst) {
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
  const __m128i C0 = _mm_add_epi64(B0, rounder);
  const __m128i C1 = _mm_add_epi64(B1, rounder);
  const __m128i C2 = _mm_add_epi64(B2, rounder);
  const __m128i C3 = _mm_add_epi64(B3, rounder);
  const __m128i D0 = _mm_srli_epi64(C0, WEBP_RESCALER_RFIX);
  const __m128i D1 = _mm_srli_epi64(C1, WEBP_RESCALER_RFIX);
  const __m128i D2 = _mm_and_si128(C2, mask);
  const __m128i D3 = _mm_and_si128(C3, mask);
  const __m128i E0 = _mm_or_si128(D0, D2);
  const __m128i E1 = _mm_or_si128(D1, D3);
  const __m128i F = _mm_packs_epi32(E0, E1);
  const __m128i G = _mm_packus_epi16(F, F);
  _mm_storel_epi64((__m128i*)dst, G);
}
void nibble_sort_tom(unsigned long *buf) {
  for (int i = 0; i < TEST_SIZE; ++i) {
    __m128i x = _mm_and_si128(_mm_set_epi64x(buf[i] >> 4, buf[i]), g_mask);
    x = S(x, 0);
    x = S(x, 1);
    x = S(x, 0);
    x = S(x, 2);
    x = S(x, 3);
    x = S(x, 0);
    x = S(x, 4);
    x = S(x, 5);
    x = S(x, 3);
    /* The final step is different: its output is already in the right layout
     * for reassembly before the final write. */
    const __m128i a0 = _mm_shuffle_epi8(x, g_shuffles[0][0]);
    const __m128i b0 = _mm_shuffle_epi8(x, g_shuffles[0][1]);
    const __m128i a1 = _mm_min_epi8(a0, b0);
    const __m128i b1 = _mm_max_epi8(a0, b0);
    const __m128i out = _mm_or_si128(a1, _mm_slli_epi64(b1, 4));
    _mm_storel_epi64((__m128i *)&buf[i], out);
  }
}
static inline long
conv_yF_yHalf (const float *src, uint16_t *dst, long samples)
{
  const __v4sf *s_vec;
  uint64_t     *d_vec;
  long n = samples;

  s_vec = (const __v4sf *)src;
  d_vec = (uint64_t *)dst;

  while (n >= 4)
    {
      __m128  in_val  = _mm_loadu_ps ((float *)s_vec++);
      __m128i out_val = _mm_cvtps_ph (in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      _mm_storel_epi64 ((__m128i *)d_vec++, out_val);
      n -= 4;
    }

  src = (const float *)s_vec;
  dst = (uint16_t *)d_vec;

  while (n)
    {
      __m128  in_val  = _mm_load_ss (src++);
      __m128i out_val = _mm_cvtps_ph (in_val, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
      *dst++ = _mm_extract_epi16 (out_val, 0);
      n -= 1;
    }

  return samples;
}
static void vpx_highbd_filter_block1d4_h4_sse2(
    const uint16_t *src_ptr, ptrdiff_t src_stride, uint16_t *dst_ptr,
    ptrdiff_t dst_stride, uint32_t height, const int16_t *kernel, int bd) {
  // We will load multiple shifted versions of the row and shuffle them into
  // 16-bit words of the form
  // ... s[2] s[1] s[0] s[-1]
  // ... s[4] s[3] s[2] s[1]
  // Then we call multiply and add to get partial results
  // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
  // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
  // The two results are then added together to get the even output

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i res_reg;
  __m128i even, odd;

  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used

  const __m128i reg_round =
      _mm_set1_epi32(CONV8_ROUNDING_NUM);  // Used for rounding
  const __m128i reg_max = _mm_set1_epi16((1 << bd) - 1);
  const __m128i reg_zero = _mm_setzero_si128();
  int h;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 4);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 6);

    // Output 2 0
    even = mm_madd_add_epi16_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                  &kernel_reg_45);

    // Output 3 1
    odd = mm_madd_add_epi16_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                 &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    res_reg = _mm_unpacklo_epi32(even, odd);
    res_reg = mm_round_epi32_sse2(&res_reg, &reg_round, CONV8_ROUNDING_BITS);
    res_reg = _mm_packs_epi32(res_reg, reg_zero);

    // Saturate the result and save
    res_reg = _mm_min_epi16(res_reg, reg_max);
    res_reg = _mm_max_epi16(res_reg, reg_zero);

    _mm_storel_epi64((__m128i *)dst_ptr, res_reg);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
 * precise version of a box filter for 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      if (width == 8) {
        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top), 2);
        _mm_storel_epi64(pred_buf_m128i, sum);
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i sum = _mm_slli_epi16(_mm_hadd_epi16(top, top_1), 2);
        _mm_storeu_si128(pred_buf_m128i, sum);
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i sum_1 = _mm_slli_epi16(_mm_hadd_epi16(top_2, top_3), 2);
          _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
        }
      }
    }
    pred_buf_m128i += CFL_BUF_LINE_I128;
    input += input_stride;
  } while (pred_buf_m128i < end);
}
// Predictor13: ClampedAddSubtractHalf
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 2 <= num_pixels; i += 2) {
    // we can only process two pixels at a time
    const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
    const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
    const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i sum = _mm_add_epi16(T_lo, L_lo);
    const __m128i avg = _mm_srli_epi16(sum, 1);
    const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
    const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
    const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
    const __m128i A3 = _mm_srai_epi16(A2, 1);
    const __m128i A4 = _mm_add_epi16(avg, A3);
    const __m128i pred = _mm_packus_epi16(A4, A4);
    const __m128i res = _mm_sub_epi8(src, pred);
    _mm_storel_epi64((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
  }
}
/** Average each 2x2 pixels into 1x1 pixel (arithmetic average)
 *  - <b>Input format:</b> uint8_t, 1 channel
 *  - <b>Output format:</b> uint8_t, 1 channel
 *  - <b>Preconditions:</b> in & out aligned to 16 bytes, w = k*16 (w = width in pixels), widthStep = w*1
 *  - <b>Notes:</b>
 *  - <b>Requires:</b> SSE2
 *  - <b>Invoked from:</b> mrpt::utils::CImage::scaleHalfSmooth()
 */
void image_SSE2_scale_half_smooth_1c8u(const uint8_t* in, uint8_t* out, int w, int h)
{
    MRPT_ALIGN16 const unsigned long long mask[2] = {0x00FF00FF00FF00FFull, 0x00FF00FF00FF00FFull};
    const uint8_t* nextRow = in + w;
    __m128i m = _mm_load_si128((const __m128i*)mask);
    int sw = w >> 4;
    int sh = h >> 1;

    for (int i = 0; i < sh; i++)
    {
        for (int j = 0; j < sw; j++)
        {
            __m128i here = _mm_load_si128((const __m128i*)in);
            __m128i next = _mm_load_si128((const __m128i*)nextRow);
            here = _mm_avg_epu8(here, next);
            next = _mm_and_si128(_mm_srli_si128(here, 1), m);
            here = _mm_and_si128(here, m);
            here = _mm_avg_epu16(here, next);
            _mm_storel_epi64((__m128i*)out, _mm_packus_epi16(here, here));
            in += 16;
            nextRow += 16;
            out += 8;
        }
        in += w;
        nextRow += w;
    }
}
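// A minimal scalar sketch of what the SSE2 loop above computes per output pixel:
// a rounded average of the two rounded vertical averages of a 2x2 block
// ((a+b+1)/2 applied twice), which can differ from the exact (a+b+c+d+2)/4 by at
// most 1. The function name is illustrative, not taken from MRPT.
static void scale_half_smooth_1c8u_scalar(const uint8_t* in, uint8_t* out, int w, int h)
{
    for (int y = 0; y + 1 < h; y += 2)
    {
        const uint8_t* row0 = in + y * w;
        const uint8_t* row1 = row0 + w;
        for (int x = 0; x + 1 < w; x += 2)
        {
            const int v0 = (row0[x] + row1[x] + 1) >> 1;          // vertical average, even column
            const int v1 = (row0[x + 1] + row1[x + 1] + 1) >> 1;  // vertical average, odd column
            *out++ = (uint8_t)((v0 + v1 + 1) >> 1);               // horizontal average of the two
        }
    }
}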
static INLINE void cfl_luma_subsampling_444_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + height * CFL_BUF_LINE;
  do {
    if (width == 4) {
      const __m128i row = _mm_slli_epi16(_mm_loadl_epi64((__m128i *)input), 3);
      _mm_storel_epi64((__m128i *)pred_buf_q3, row);
    } else {
      const __m128i row = _mm_slli_epi16(_mm_loadu_si128((__m128i *)input), 3);
      _mm_storeu_si128((__m128i *)pred_buf_q3, row);
      if (width >= 16) {
        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        row_1 = _mm_slli_epi16(row_1, 3);
        _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1, row_1);
        if (width == 32) {
          __m128i row_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          row_2 = _mm_slli_epi16(row_2, 3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 2, row_2);
          __m128i row_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          row_3 = _mm_slli_epi16(row_3, 3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 3, row_3);
        }
      }
    }
    input += input_stride;
    pred_buf_q3 += CFL_BUF_LINE;
  } while (pred_buf_q3 < end);
}
static INLINE void cfl_predict_lbd_ssse3(const int16_t *pred_buf_q3,
                                         uint8_t *dst, int dst_stride,
                                         int alpha_q3, int width, int height) {
  const __m128i alpha_sign = _mm_set1_epi16(alpha_q3);
  const __m128i alpha_q12 = _mm_slli_epi16(_mm_abs_epi16(alpha_sign), 9);
  const __m128i dc_q0 = _mm_set1_epi16(*dst);
  __m128i *row = (__m128i *)pred_buf_q3;
  const __m128i *row_end = row + height * CFL_BUF_LINE_I128;
  do {
    __m128i res = predict_unclipped(row, alpha_q12, alpha_sign, dc_q0);
    if (width < 16) {
      res = _mm_packus_epi16(res, res);
      if (width == 4)
        _mm_storeh_epi32((__m128i *)dst, res);
      else
        _mm_storel_epi64((__m128i *)dst, res);
    } else {
      __m128i next = predict_unclipped(row + 1, alpha_q12, alpha_sign, dc_q0);
      res = _mm_packus_epi16(res, next);
      _mm_storeu_si128((__m128i *)dst, res);
      if (width == 32) {
        res = predict_unclipped(row + 2, alpha_q12, alpha_sign, dc_q0);
        next = predict_unclipped(row + 3, alpha_q12, alpha_sign, dc_q0);
        res = _mm_packus_epi16(res, next);
        _mm_storeu_si128((__m128i *)(dst + 16), res);
      }
    }
    dst += dst_stride;
  } while ((row += CFL_BUF_LINE_I128) < row_end);
}
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
 * precise version of a box filter for 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i fours = _mm_set1_epi8(4);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeh_epi32(pred_buf_m128i, top);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storel_epi64(pred_buf_m128i, top);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeu_si128(pred_buf_m128i, top);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        top_1 = _mm_maddubs_epi16(top_1, fours);
        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
      }
    }
    input += input_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
void ie_FillLine(iePwBGRA pDst, DWORD nXW, iewBGRA clr)
{
#ifndef __X64__
    if (g_bSSE2 && (nXW >= 4) && (_mm_isAligned(pDst) || _mm_isAligned(pDst + 1))) {
#else
    if (nXW >= 4) {
#endif
        // Do fill using SSE2!
        if (!_mm_isAligned(pDst)) {
            // Fill until destination is aligned
            *pDst++ = clr;
            nXW--;
        }

        __m128i r0 = _mm_loadl_epi64((const __m128i *)&clr);
        r0 = _mm_unpacklo_epi64(r0, r0);

        for (DWORD nXW_2 = nXW >> 1; nXW_2--;) {
            _mm_store_si128((__m128i *)pDst, r0);
            pDst += 2;
        }

        if (nXW & 1) {
            _mm_storel_epi64((__m128i *)pDst, r0);
        }
        return;
    }

    while (nXW--) *pDst++ = clr;
}
/**
 * Multiplies the pixels by 8 (scaling in Q3).
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_444_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i zeros = _mm_setzero_si128();
  const int luma_stride = input_stride;
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      __m128i row = _mm_loadh_epi32((__m128i *)input);
      row = _mm_unpacklo_epi8(row, zeros);
      _mm_storel_epi64(pred_buf_m128i, _mm_slli_epi16(row, 3));
    } else if (width == 8) {
      __m128i row = _mm_loadl_epi64((__m128i *)input);
      row = _mm_unpacklo_epi8(row, zeros);
      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row, 3));
    } else {
      __m128i row = _mm_loadu_si128((__m128i *)input);
      const __m128i row_lo = _mm_unpacklo_epi8(row, zeros);
      const __m128i row_hi = _mm_unpackhi_epi8(row, zeros);
      _mm_storeu_si128(pred_buf_m128i, _mm_slli_epi16(row_lo, 3));
      _mm_storeu_si128(pred_buf_m128i + 1, _mm_slli_epi16(row_hi, 3));
      if (width == 32) {
        __m128i row_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i row_1_lo = _mm_unpacklo_epi8(row_1, zeros);
        const __m128i row_1_hi = _mm_unpackhi_epi8(row_1, zeros);
        _mm_storeu_si128(pred_buf_m128i + 2, _mm_slli_epi16(row_1_lo, 3));
        _mm_storeu_si128(pred_buf_m128i + 3, _mm_slli_epi16(row_1_hi, 3));
      }
    }
    input += luma_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
static WEBP_INLINE void ProcessRow_Floor_SSE2(const __m128i* const A0,
                                              const __m128i* const A1,
                                              const __m128i* const A2,
                                              const __m128i* const A3,
                                              const __m128i* const mult,
                                              uint8_t* const dst) {
  const __m128i mask = _mm_set_epi32(0xffffffffu, 0, 0xffffffffu, 0);
  const __m128i B0 = _mm_mul_epu32(*A0, *mult);
  const __m128i B1 = _mm_mul_epu32(*A1, *mult);
  const __m128i B2 = _mm_mul_epu32(*A2, *mult);
  const __m128i B3 = _mm_mul_epu32(*A3, *mult);
  const __m128i D0 = _mm_srli_epi64(B0, WEBP_RESCALER_RFIX);
  const __m128i D1 = _mm_srli_epi64(B1, WEBP_RESCALER_RFIX);
#if (WEBP_RESCALER_RFIX < 32)
  const __m128i D2 =
      _mm_and_si128(_mm_slli_epi64(B2, 32 - WEBP_RESCALER_RFIX), mask);
  const __m128i D3 =
      _mm_and_si128(_mm_slli_epi64(B3, 32 - WEBP_RESCALER_RFIX), mask);
#else
  const __m128i D2 = _mm_and_si128(B2, mask);
  const __m128i D3 = _mm_and_si128(B3, mask);
#endif
  const __m128i E0 = _mm_or_si128(D0, D2);
  const __m128i E1 = _mm_or_si128(D1, D3);
  const __m128i F = _mm_packs_epi32(E0, E1);
  const __m128i G = _mm_packus_epi16(F, F);
  _mm_storel_epi64((__m128i*)dst, G);
}
void vp9_add_constant_residual_8x8_sse2(const int16_t diff, uint8_t *dest,
                                        int stride) {
  uint8_t abs_diff;
  __m128i d;

  // Prediction data.
  __m128i p0 = _mm_loadl_epi64((const __m128i *)(dest + 0 * stride));
  __m128i p1 = _mm_loadl_epi64((const __m128i *)(dest + 1 * stride));
  __m128i p2 = _mm_loadl_epi64((const __m128i *)(dest + 2 * stride));
  __m128i p3 = _mm_loadl_epi64((const __m128i *)(dest + 3 * stride));
  __m128i p4 = _mm_loadl_epi64((const __m128i *)(dest + 4 * stride));
  __m128i p5 = _mm_loadl_epi64((const __m128i *)(dest + 5 * stride));
  __m128i p6 = _mm_loadl_epi64((const __m128i *)(dest + 6 * stride));
  __m128i p7 = _mm_loadl_epi64((const __m128i *)(dest + 7 * stride));

  p0 = _mm_unpacklo_epi64(p0, p1);
  p2 = _mm_unpacklo_epi64(p2, p3);
  p4 = _mm_unpacklo_epi64(p4, p5);
  p6 = _mm_unpacklo_epi64(p6, p7);

  // Clip diff value to [0, 255] range. Then, do addition or subtraction
  // according to its sign.
  if (diff >= 0) {
    abs_diff = (diff > 255) ? 255 : diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_adds_epu8(p0, d);
    p2 = _mm_adds_epu8(p2, d);
    p4 = _mm_adds_epu8(p4, d);
    p6 = _mm_adds_epu8(p6, d);
  } else {
    abs_diff = (diff < -255) ? 255 : -diff;
    d = _mm_shuffle_epi32(_mm_cvtsi32_si128((int)(abs_diff * 0x01010101u)), 0);

    p0 = _mm_subs_epu8(p0, d);
    p2 = _mm_subs_epu8(p2, d);
    p4 = _mm_subs_epu8(p4, d);
    p6 = _mm_subs_epu8(p6, d);
  }

  _mm_storel_epi64((__m128i *)(dest + 0 * stride), p0);
  p0 = _mm_srli_si128(p0, 8);
  _mm_storel_epi64((__m128i *)(dest + 1 * stride), p0);

  _mm_storel_epi64((__m128i *)(dest + 2 * stride), p2);
  p2 = _mm_srli_si128(p2, 8);
  _mm_storel_epi64((__m128i *)(dest + 3 * stride), p2);

  _mm_storel_epi64((__m128i *)(dest + 4 * stride), p4);
  p4 = _mm_srli_si128(p4, 8);
  _mm_storel_epi64((__m128i *)(dest + 5 * stride), p4);

  _mm_storel_epi64((__m128i *)(dest + 6 * stride), p6);
  p6 = _mm_srli_si128(p6, 8);
  _mm_storel_epi64((__m128i *)(dest + 7 * stride), p6);
}
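// A minimal scalar sketch of the operation the SSE2 routine above vectorizes:
// add one constant residual to every pixel of an 8x8 block with clamping to
// [0, 255]. Written from the code above as a reference, not taken from libvpx;
// the name is illustrative. Clamping |diff| to 255 before a saturating byte
// add/sub, as the SSE2 code does, gives the same result as this plain clamp.
static void add_constant_residual_8x8_scalar(int16_t diff, uint8_t *dest, int stride) {
  for (int r = 0; r < 8; ++r) {
    for (int c = 0; c < 8; ++c) {
      const int v = dest[r * stride + c] + diff;
      dest[r * stride + c] = (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));  // clamp to [0, 255]
    }
  }
}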
static WEBP_INLINE void YuvToRgbSSE2(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const rgb) {
  const __m128i tmp0 = GetRGBA32b(y, u, v);
  const __m128i tmp1 = _mm_packs_epi32(tmp0, tmp0);
  const __m128i tmp2 = _mm_packus_epi16(tmp1, tmp1);
  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
  _mm_storel_epi64((__m128i*)rgb, tmp2);
}
int test() {
  const float src[] = { 0.0f, 0.0f, 0.0f, 0.0f };
  short dst[8];
  __m128 v_src = _mm_load_ps(src);
  __m128i v_dst = _mm_cvtps_ph(v_src, 0);
  _mm_storel_epi64((__m128i*)dst, v_dst);
  return (int)dst[0];
}
static WEBP_INLINE void YuvToBgrSSE2(uint8_t y, uint8_t u, uint8_t v,
                                     uint8_t* const bgr) {
  const __m128i tmp0 = GetRGBA32b(y, u, v);
  const __m128i tmp1 = _mm_shuffle_epi32(tmp0, _MM_SHUFFLE(3, 0, 1, 2));
  const __m128i tmp2 = _mm_packs_epi32(tmp1, tmp1);
  const __m128i tmp3 = _mm_packus_epi16(tmp2, tmp2);
  // Note: we store 8 bytes at a time, not 3 bytes! -> memory stomp
  _mm_storel_epi64((__m128i*)bgr, tmp3);
}
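// The two helpers above deliberately store 8 bytes even though only 3 are
// meaningful, so callers must guarantee at least 8 writable bytes at the store
// address. A minimal hedged sketch (name is illustrative, not from libwebp) of
// how a caller near the end of an output row could stay safe: convert into a
// small scratch buffer, then copy only the 3 useful bytes. Assumes <string.h>
// for memcpy.
static void YuvToRgbSafeTail(uint8_t y, uint8_t u, uint8_t v, uint8_t* rgb_last3) {
  uint8_t tmp[8];               // scratch large enough for the 8-byte store
  YuvToRgbSSE2(y, u, v, tmp);   // the full 8-byte store lands in scratch
  memcpy(rgb_last3, tmp, 3);    // keep only the R, G, B bytes
}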
static void vpx_filter_block1d8_h4_sse2(const uint8_t *src_ptr,
                                        ptrdiff_t src_stride, uint8_t *dst_ptr,
                                        ptrdiff_t dst_stride, uint32_t height,
                                        const int16_t *kernel) {
  __m128i kernel_reg;                    // Kernel
  __m128i kernel_reg_23, kernel_reg_45;  // Segments of the kernel used
  const __m128i reg_32 = _mm_set1_epi16(32);  // Used for rounding
  int h;

  __m128i src_reg, src_reg_shift_1, src_reg_shift_2, src_reg_shift_3;
  __m128i dst_first;
  __m128i even, odd;

  // Start one pixel before as we need tap/2 - 1 = 1 sample from the past
  src_ptr -= 1;

  // Load Kernel
  kernel_reg = _mm_loadu_si128((const __m128i *)kernel);
  kernel_reg = _mm_srai_epi16(kernel_reg, 1);
  kernel_reg_23 = extract_quarter_2_epi16_sse2(&kernel_reg);
  kernel_reg_45 = extract_quarter_3_epi16_sse2(&kernel_reg);

  for (h = height; h > 0; --h) {
    // We will load multiple shifted versions of the row and shuffle them into
    // 16-bit words of the form
    // ... s[2] s[1] s[0] s[-1]
    // ... s[4] s[3] s[2] s[1]
    // Then we call multiply and add to get partial results
    // s[2]k[3]+s[1]k[2] s[0]k[3]+s[-1]k[2]
    // s[4]k[5]+s[3]k[4] s[2]k[5]+s[1]k[4]
    // The two results are then added together to get the even output
    src_reg = _mm_loadu_si128((const __m128i *)src_ptr);
    src_reg_shift_1 = _mm_srli_si128(src_reg, 1);
    src_reg_shift_2 = _mm_srli_si128(src_reg, 2);
    src_reg_shift_3 = _mm_srli_si128(src_reg, 3);

    // Output 6 4 2 0
    even = mm_madd_add_epi8_sse2(&src_reg, &src_reg_shift_2, &kernel_reg_23,
                                 &kernel_reg_45);

    // Output 7 5 3 1
    odd = mm_madd_add_epi8_sse2(&src_reg_shift_1, &src_reg_shift_3,
                                &kernel_reg_23, &kernel_reg_45);

    // Combine to get the first half of the dst
    dst_first = mm_zip_epi32_sse2(&even, &odd);
    dst_first = mm_round_epi16_sse2(&dst_first, &reg_32, 6);

    // Saturate and convert to 8-bit words
    dst_first = _mm_packus_epi16(dst_first, _mm_setzero_si128());

    _mm_storel_epi64((__m128i *)dst_ptr, dst_first);

    src_ptr += src_stride;
    dst_ptr += dst_stride;
  }
}
static uint64_t aom_sum_squares_i16_64n_sse2(const int16_t *src, uint32_t n) {
  const __m128i v_zext_mask_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff);
  __m128i v_acc0_q = _mm_setzero_si128();
  __m128i v_acc1_q = _mm_setzero_si128();

  const int16_t *const end = src + n;

  assert(n % 64 == 0);

  while (src < end) {
    const __m128i v_val_0_w = xx_load_128(src);
    const __m128i v_val_1_w = xx_load_128(src + 8);
    const __m128i v_val_2_w = xx_load_128(src + 16);
    const __m128i v_val_3_w = xx_load_128(src + 24);
    const __m128i v_val_4_w = xx_load_128(src + 32);
    const __m128i v_val_5_w = xx_load_128(src + 40);
    const __m128i v_val_6_w = xx_load_128(src + 48);
    const __m128i v_val_7_w = xx_load_128(src + 56);

    const __m128i v_sq_0_d = _mm_madd_epi16(v_val_0_w, v_val_0_w);
    const __m128i v_sq_1_d = _mm_madd_epi16(v_val_1_w, v_val_1_w);
    const __m128i v_sq_2_d = _mm_madd_epi16(v_val_2_w, v_val_2_w);
    const __m128i v_sq_3_d = _mm_madd_epi16(v_val_3_w, v_val_3_w);
    const __m128i v_sq_4_d = _mm_madd_epi16(v_val_4_w, v_val_4_w);
    const __m128i v_sq_5_d = _mm_madd_epi16(v_val_5_w, v_val_5_w);
    const __m128i v_sq_6_d = _mm_madd_epi16(v_val_6_w, v_val_6_w);
    const __m128i v_sq_7_d = _mm_madd_epi16(v_val_7_w, v_val_7_w);

    const __m128i v_sum_01_d = _mm_add_epi32(v_sq_0_d, v_sq_1_d);
    const __m128i v_sum_23_d = _mm_add_epi32(v_sq_2_d, v_sq_3_d);
    const __m128i v_sum_45_d = _mm_add_epi32(v_sq_4_d, v_sq_5_d);
    const __m128i v_sum_67_d = _mm_add_epi32(v_sq_6_d, v_sq_7_d);

    const __m128i v_sum_0123_d = _mm_add_epi32(v_sum_01_d, v_sum_23_d);
    const __m128i v_sum_4567_d = _mm_add_epi32(v_sum_45_d, v_sum_67_d);

    const __m128i v_sum_d = _mm_add_epi32(v_sum_0123_d, v_sum_4567_d);

    v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_and_si128(v_sum_d, v_zext_mask_q));
    v_acc1_q = _mm_add_epi64(v_acc1_q, _mm_srli_epi64(v_sum_d, 32));

    src += 64;
  }

  v_acc0_q = _mm_add_epi64(v_acc0_q, v_acc1_q);
  v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8));

#if ARCH_X86_64
  return (uint64_t)_mm_cvtsi128_si64(v_acc0_q);
#else
  {
    uint64_t tmp;
    _mm_storel_epi64((__m128i *)&tmp, v_acc0_q);
    return tmp;
  }
#endif
}
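// A minimal scalar reference for the routine above: the sum of squared int16_t
// samples accumulated in a uint64_t. The name is illustrative; aom's own C
// fallback may differ in shape.
static uint64_t sum_squares_i16_scalar(const int16_t *src, uint32_t n) {
  uint64_t acc = 0;
  uint32_t i;
  for (i = 0; i < n; ++i) {
    acc += (int64_t)src[i] * src[i];  // each square fits in 32 bits; accumulate in 64
  }
  return acc;
}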
void vpx_highbd_d45_predictor_4x4_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_loadu_si128((const __m128i *)above);
  const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2);
  const __m128i CDEFGH00 = _mm_srli_si128(ABCDEFGH, 4);
  const __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGH0, &CDEFGH00);
  (void)left;
  (void)bd;
  _mm_storel_epi64((__m128i *)dst, avg3);
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 2));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 4));
  dst += stride;
  _mm_storel_epi64((__m128i *)dst, _mm_srli_si128(avg3, 6));
  dst[3] = above[7];  // aka H
}
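// avg3_epu16 isn't shown here; a hedged scalar sketch of this 4x4 D45 predictor,
// assuming the usual libvpx three-tap filter AVG3(a, b, c) = (a + 2*b + c + 2) >> 2.
// Each output row reuses the same filtered "above" samples shifted by one, and
// the bottom-right pixel is pinned to above[7], matching the SIMD code above.
static void highbd_d45_predictor_4x4_scalar(uint16_t *dst, ptrdiff_t stride,
                                            const uint16_t *above) {
  int r, c;
  for (r = 0; r < 4; ++r) {
    for (c = 0; c < 4; ++c) {
      if (r == 3 && c == 3) {
        dst[r * stride + c] = above[7];  // aka H
      } else {
        dst[r * stride + c] = (uint16_t)((above[r + c] + 2 * above[r + c + 1] +
                                          above[r + c + 2] + 2) >> 2);
      }
    }
  }
}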
void aom_highbd_upsampled_pred_sse2(uint16_t *pred, int width, int height,
                                    const uint8_t *ref8, const int ref_stride) {
  const int stride = ref_stride << 3;
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);
  int i, j;

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        _mm_storeu_si128((__m128i *)(pred), t0);
        pred += 8;
        ref += 64;  // 8 * 8;
      }
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        _mm_storel_epi64((__m128i *)(pred), t0);
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
static void filter_horiz_w8_ssse3(const uint8_t *src_x, ptrdiff_t src_pitch,
                                  uint8_t *dst, const int16_t *x_filter) {
  const __m128i k_256 = _mm_set1_epi16(1 << 8);
  const __m128i f_values = _mm_load_si128((const __m128i *)x_filter);
  // pack and duplicate the filter values
  const __m128i f1f0 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0200u));
  const __m128i f3f2 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0604u));
  const __m128i f5f4 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0a08u));
  const __m128i f7f6 = _mm_shuffle_epi8(f_values, _mm_set1_epi16(0x0e0cu));
  const __m128i A = _mm_loadl_epi64((const __m128i *)src_x);
  const __m128i B = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch));
  const __m128i C = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 2));
  const __m128i D = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 3));
  const __m128i E = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 4));
  const __m128i F = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 5));
  const __m128i G = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 6));
  const __m128i H = _mm_loadl_epi64((const __m128i *)(src_x + src_pitch * 7));
  // 00 01 10 11 02 03 12 13 04 05 14 15 06 07 16 17
  const __m128i tr0_0 = _mm_unpacklo_epi16(A, B);
  // 20 21 30 31 22 23 32 33 24 25 34 35 26 27 36 37
  const __m128i tr0_1 = _mm_unpacklo_epi16(C, D);
  // 40 41 50 51 42 43 52 53 44 45 54 55 46 47 56 57
  const __m128i tr0_2 = _mm_unpacklo_epi16(E, F);
  // 60 61 70 71 62 63 72 73 64 65 74 75 66 67 76 77
  const __m128i tr0_3 = _mm_unpacklo_epi16(G, H);
  // 00 01 10 11 20 21 30 31 02 03 12 13 22 23 32 33
  const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
  // 04 05 14 15 24 25 34 35 06 07 16 17 26 27 36 37
  const __m128i tr1_1 = _mm_unpackhi_epi32(tr0_0, tr0_1);
  // 40 41 50 51 60 61 70 71 42 43 52 53 62 63 72 73
  const __m128i tr1_2 = _mm_unpacklo_epi32(tr0_2, tr0_3);
  // 44 45 54 55 64 65 74 75 46 47 56 57 66 67 76 77
  const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
  // 00 01 10 11 20 21 30 31 40 41 50 51 60 61 70 71
  const __m128i s1s0 = _mm_unpacklo_epi64(tr1_0, tr1_2);
  const __m128i s3s2 = _mm_unpackhi_epi64(tr1_0, tr1_2);
  const __m128i s5s4 = _mm_unpacklo_epi64(tr1_1, tr1_3);
  const __m128i s7s6 = _mm_unpackhi_epi64(tr1_1, tr1_3);
  // multiply 2 adjacent elements with the filter and add the result
  const __m128i x0 = _mm_maddubs_epi16(s1s0, f1f0);
  const __m128i x1 = _mm_maddubs_epi16(s3s2, f3f2);
  const __m128i x2 = _mm_maddubs_epi16(s5s4, f5f4);
  const __m128i x3 = _mm_maddubs_epi16(s7s6, f7f6);
  // add and saturate the results together
  const __m128i min_x2x1 = _mm_min_epi16(x2, x1);
  const __m128i max_x2x1 = _mm_max_epi16(x2, x1);
  __m128i temp = _mm_adds_epi16(x0, x3);
  temp = _mm_adds_epi16(temp, min_x2x1);
  temp = _mm_adds_epi16(temp, max_x2x1);
  // round and shift by 7 bit each 16 bit
  temp = _mm_mulhrs_epi16(temp, k_256);
  // shrink to 8 bit each 16 bits
  temp = _mm_packus_epi16(temp, temp);
  // save only 8 bytes convolve result
  _mm_storel_epi64((__m128i *)dst, temp);
}
void read_luma_inter_pred_avg_8x16_intrinsic(BYTE *address1, BYTE *address2,
                                             INT stride_src, BYTE *dst, INT stride_dst)
{
    int i;
    int src_stride = stride_src;
    int dst_stride = stride_dst;
    const unsigned char* src1 = address1;
    const unsigned char* src2 = address2;

    for (i = 0; i < 16; i += 8)
    {
        __declspec(align(16)) __m128i r0, r1, r2, r3, r4, r5, r6, r7,
            r0_x, r1_x, r2_x, r3_x, r4_x, r5_x, r6_x, r7_x;
        int stride2 = (src_stride << 1);
        int stride4 = (src_stride << 2);
        int dst_stride2 = (dst_stride << 1);
        int dst_stride4 = (dst_stride << 2);

        r0 = _mm_loadl_epi64((__m128i*)(src1));
        r1 = _mm_loadl_epi64((__m128i*)(src1 + src_stride));
        r2 = _mm_loadl_epi64((__m128i*)(src1 + stride2));
        r3 = _mm_loadl_epi64((__m128i*)(src1 + stride2 + src_stride));
        r4 = _mm_loadl_epi64((__m128i*)(src1 + stride4));
        r5 = _mm_loadl_epi64((__m128i*)(src1 + stride4 + src_stride));
        r6 = _mm_loadl_epi64((__m128i*)(src1 + stride4 + stride2));
        r7 = _mm_loadl_epi64((__m128i*)(src1 + stride4 + stride2 + src_stride));

        r0_x = _mm_loadl_epi64((__m128i*)(src2));
        r1_x = _mm_loadl_epi64((__m128i*)(src2 + src_stride));
        r2_x = _mm_loadl_epi64((__m128i*)(src2 + stride2));
        r3_x = _mm_loadl_epi64((__m128i*)(src2 + stride2 + src_stride));
        r4_x = _mm_loadl_epi64((__m128i*)(src2 + stride4));
        r5_x = _mm_loadl_epi64((__m128i*)(src2 + stride4 + src_stride));
        r6_x = _mm_loadl_epi64((__m128i*)(src2 + stride4 + stride2));
        r7_x = _mm_loadl_epi64((__m128i*)(src2 + stride4 + stride2 + src_stride));

        r0 = _mm_avg_epu8(r0, r0_x);
        r1 = _mm_avg_epu8(r1, r1_x);
        r2 = _mm_avg_epu8(r2, r2_x);
        r3 = _mm_avg_epu8(r3, r3_x);
        r4 = _mm_avg_epu8(r4, r4_x);
        r5 = _mm_avg_epu8(r5, r5_x);
        r6 = _mm_avg_epu8(r6, r6_x);
        r7 = _mm_avg_epu8(r7, r7_x);

        _mm_storel_epi64((__m128i*)(dst), r0);
        _mm_storel_epi64((__m128i*)(dst + dst_stride), r1);
        _mm_storel_epi64((__m128i*)(dst + dst_stride2), r2);
        _mm_storel_epi64((__m128i*)(dst + dst_stride2 + dst_stride), r3);
        _mm_storel_epi64((__m128i*)(dst + dst_stride4), r4);
        _mm_storel_epi64((__m128i*)(dst + dst_stride4 + dst_stride), r5);
        _mm_storel_epi64((__m128i*)(dst + dst_stride4 + dst_stride2), r6);
        _mm_storel_epi64((__m128i*)(dst + dst_stride4 + dst_stride2 + dst_stride), r7);

        src1 += (stride4 << 1);
        src2 += (stride4 << 1);
        dst += (dst_stride4 << 1);
    }
}