//FINL
int16 __ext_v_sum_int16(int16* x, int len)
{
  __m128i msum = _mm_setzero_si128();
#ifdef SORA_PLATFORM
  __int16 ret;
#else
  int16_t ret;
#endif
  const int wlen = 8;

  // Accumulate eight int16 lanes at a time.
  for (int i = 0; i < len / wlen; i++)
  {
    __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
    msum = _mm_add_epi16(msum, mx);
  }

  // Horizontal reduction: rotate the four 32-bit lanes and add three times,
  // so every 32-bit lane of mout holds the same pair of 16-bit partial sums.
  __m128i mout = msum;
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi16(mout, msum);
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi16(mout, msum);
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi16(mout, msum);

  // Fold the two 16-bit partial sums held in the low 32 bits.
  unsigned int temp = _mm_cvtsi128_si32(mout);
  ret = temp;
  ret += (temp >> 16);

  // Scalar tail for lengths that are not a multiple of 8.
  for (int i = (len / wlen) * wlen; i < len; i++)
  {
    ret += x[i];
  }

  return ret;
}
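// A minimal scalar reference for the reduction above (hypothetical helper,
// not from the original source). Wrapping uint16 arithmetic matches the
// mod-2^16 behavior of _mm_add_epi16, so it is usable for differential tests.
#include <stdint.h>

static int16_t v_sum_int16_ref(const int16_t *x, int len)
{
  uint16_t acc = 0;  // unsigned so the wraparound is well defined in C
  for (int i = 0; i < len; i++) {
    acc = (uint16_t)(acc + (uint16_t)x[i]);
  }
  return (int16_t)acc;
}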
__m64 interpolvline_2(unsigned char* image, int PicWidthInPix)
{
  __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
  __m64 ret;

  xmm7 = _mm_setzero_si128();

  xmm0 = _mm_movpi64_epi64(*((__m64*)(image - 2*PicWidthInPix)));
  xmm0 = _mm_unpacklo_epi8(xmm0,xmm7);
  xmm1 = _mm_movpi64_epi64(*((__m64*)(image - 1*PicWidthInPix)));
  xmm1 = _mm_unpacklo_epi8(xmm1,xmm7);
  xmm2 = _mm_movpi64_epi64(*((__m64*)(image - 0*PicWidthInPix)));
  xmm2 = _mm_unpacklo_epi8(xmm2,xmm7);
  xmm3 = _mm_movpi64_epi64(*((__m64*)(image + 1*PicWidthInPix)));
  xmm3 = _mm_unpacklo_epi8(xmm3,xmm7);
  xmm4 = _mm_movpi64_epi64(*((__m64*)(image + 2*PicWidthInPix)));
  xmm4 = _mm_unpacklo_epi8(xmm4,xmm7);
  xmm5 = _mm_movpi64_epi64(*((__m64*)(image + 3*PicWidthInPix)));
  xmm5 = _mm_unpacklo_epi8(xmm5,xmm7);

  // filter on 8 values
  xmm6 = _mm_add_epi16(xmm2,xmm3);
  xmm6 = _mm_slli_epi16(xmm6,2);
  xmm6 = _mm_sub_epi16(xmm6,xmm1);
  xmm6 = _mm_sub_epi16(xmm6,xmm4);

  xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
  xmm6 = _mm_mullo_epi16(xmm6,xmm1);
  xmm6 = _mm_add_epi16(xmm6,xmm0);
  xmm6 = _mm_add_epi16(xmm6,xmm5);

  xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
  xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
  xmm6 = _mm_srli_epi16(xmm6,5);

  xmm6 = _mm_packus_epi16(xmm6,xmm7);
  ret = _mm_movepi64_pi64(xmm6);
  _mm_empty();

  return(ret);
}
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i twos = _mm_set1_epi8(2);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + (height >> 1) * CFL_BUF_LINE_I128;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      // _mm_loadh_epi32/_mm_storeh_epi32 are 4-byte load/store helpers local
      // to this file, not standard Intel intrinsics.
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadh_epi32((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeh_epi32(pred_buf_m128i, sum);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storel_epi64(pred_buf_m128i, sum);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, twos);
      __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      bot = _mm_maddubs_epi16(bot, twos);
      const __m128i sum = _mm_add_epi16(top, bot);
      _mm_storeu_si128(pred_buf_m128i, sum);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        top_1 = _mm_maddubs_epi16(top_1, twos);
        bot_1 = _mm_maddubs_epi16(bot_1, twos);
        __m128i sum_1 = _mm_add_epi16(top_1, bot_1);
        _mm_storeu_si128(pred_buf_m128i + 1, sum_1);
      }
    }
    input += luma_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
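// A hedged scalar model of one output of the subsampling above (hypothetical
// helper): each Q3 value is the 2x2 box sum times 2, i.e. 8x the average,
// which is the average in Q3 fixed point. _mm_maddubs_epi16 with a vector of
// 2s forms the horizontal pairs; the vertical pair is the plain _mm_add_epi16.
#include <stdint.h>

static uint16_t cfl_420_lbd_box_q3_model(const uint8_t *input, int stride,
                                         int x, int y) {
  const int tl = input[(2 * y + 0) * stride + 2 * x + 0];
  const int tr = input[(2 * y + 0) * stride + 2 * x + 1];
  const int bl = input[(2 * y + 1) * stride + 2 * x + 0];
  const int br = input[(2 * y + 1) * stride + 2 * x + 1];
  return (uint16_t)((tl + tr + bl + br) * 2);
}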
SIMDValue SIMDInt16x8Operation::OpAdd(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_add_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // a + b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
OD_SIMD_INLINE void od_mc_butterfly_2x2_16x8(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3) {
  __m128i a;
  __m128i b;
  __m128i c;
  __m128i d;
  /*a = t0 + t1, c = (t0 + t1) - (t1 + t1) = t0 - t1
    b = t2 + t3, d = (t2 + t3) - (t3 + t3) = t2 - t3*/
  a = _mm_add_epi16(*t0, *t1);
  c = _mm_add_epi16(*t1, *t1);
  c = _mm_sub_epi16(a, c);
  b = _mm_add_epi16(*t2, *t3);
  d = _mm_add_epi16(*t3, *t3);
  d = _mm_sub_epi16(b, d);
  *t0 = a;
  *t1 = b;
  *t2 = c;
  *t3 = d;
}
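// The doubled-add form above expresses t0 - t1 as (t0 + t1) - (t1 + t1),
// reusing the sum it needs anyway. A hedged scalar model of one lane of the
// butterfly (hypothetical helper); the int16 narrowing wraps exactly like
// _mm_add_epi16/_mm_sub_epi16 lanes do.
#include <stdint.h>

static void butterfly_lane_model(int16_t *t0, int16_t *t1) {
  const int16_t sum = (int16_t)(*t0 + *t1);
  const int16_t diff = (int16_t)(sum - (int16_t)(*t1 + *t1));  // == *t0 - *t1
  *t0 = sum;
  *t1 = diff;
}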
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
static WEBP_INLINE void DoFilter6(__m128i* p2, __m128i* p1, __m128i* p0,
                                  __m128i* q0, __m128i* q1, __m128i* q2,
                                  const __m128i* mask, int hev_thresh) {
  __m128i a, not_hev;
  const __m128i sign_bit = _mm_set1_epi8(0x80);

  // compute hev mask
  GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

  // convert to signed values
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);

  GET_BASE_DELTA(*p1, *p0, *q0, *q1, a);

  { // do simple filter on pixels with hev
    const __m128i m = _mm_andnot_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    DO_SIMPLE_FILTER(*p0, *q0, f);
  }
  { // do strong filter on pixels with not hev
    const __m128i zero = _mm_setzero_si128();
    const __m128i nine = _mm_set1_epi16(0x0900);
    const __m128i sixty_three = _mm_set1_epi16(63);

    const __m128i m = _mm_and_si128(not_hev, *mask);
    const __m128i f = _mm_and_si128(a, m);
    const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
    const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

    const __m128i f9_lo = _mm_mulhi_epi16(f_lo, nine);   // Filter (lo) * 9
    const __m128i f9_hi = _mm_mulhi_epi16(f_hi, nine);   // Filter (hi) * 9
    const __m128i f18_lo = _mm_add_epi16(f9_lo, f9_lo);  // Filter (lo) * 18
    const __m128i f18_hi = _mm_add_epi16(f9_hi, f9_hi);  // Filter (hi) * 18

    const __m128i a2_lo = _mm_add_epi16(f9_lo, sixty_three);   // Filter * 9 + 63
    const __m128i a2_hi = _mm_add_epi16(f9_hi, sixty_three);   // Filter * 9 + 63

    const __m128i a1_lo = _mm_add_epi16(f18_lo, sixty_three);  // F... * 18 + 63
    const __m128i a1_hi = _mm_add_epi16(f18_hi, sixty_three);  // F... * 18 + 63

    const __m128i a0_lo = _mm_add_epi16(f18_lo, a2_lo);  // Filter * 27 + 63
    const __m128i a0_hi = _mm_add_epi16(f18_hi, a2_hi);  // Filter * 27 + 63

    UPDATE_2PIXELS(*p2, *q2, a2_lo, a2_hi);
    UPDATE_2PIXELS(*p1, *q1, a1_lo, a1_hi);
    UPDATE_2PIXELS(*p0, *q0, a0_lo, a0_hi);
  }

  // unoffset
  FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
  FLIP_SIGN_BIT2(*p2, *q2);
}
__m64 interpolhline_2(unsigned char* image)
{
  __m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
  unsigned char* imagetmp = image - 2;
  __m64 ret;

  xmm7 = _mm_setzero_si128();
  xmm6 = _mm_loadu_si128(((__m128i*)imagetmp));

  xmm0 = _mm_unpacklo_epi8(xmm6,xmm7);
  xmm6 = _mm_srli_si128(xmm6,1);
  xmm1 = _mm_unpacklo_epi8(xmm6,xmm7);
  xmm6 = _mm_srli_si128(xmm6,1);
  xmm2 = _mm_unpacklo_epi8(xmm6,xmm7);
  xmm6 = _mm_srli_si128(xmm6,1);
  xmm3 = _mm_unpacklo_epi8(xmm6,xmm7);
  xmm6 = _mm_srli_si128(xmm6,1);
  xmm4 = _mm_unpacklo_epi8(xmm6,xmm7);
  xmm6 = _mm_srli_si128(xmm6,1);
  xmm5 = _mm_unpacklo_epi8(xmm6,xmm7);

  // filter on 8 values
  xmm6 = _mm_add_epi16(xmm2,xmm3);
  xmm6 = _mm_slli_epi16(xmm6,2);
  xmm6 = _mm_sub_epi16(xmm6,xmm1);
  xmm6 = _mm_sub_epi16(xmm6,xmm4);

  xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
  xmm6 = _mm_mullo_epi16(xmm6,xmm1);
  xmm6 = _mm_add_epi16(xmm6,xmm0);
  xmm6 = _mm_add_epi16(xmm6,xmm5);

  xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
  xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
  xmm6 = _mm_srli_epi16(xmm6,5);

  xmm6 = _mm_packus_epi16(xmm6,xmm7);
  ret = _mm_movepi64_pi64(xmm6);
  _mm_empty();

  return(ret);
}
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);

  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1)));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0)));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0)));

  unsigned sum = _mm_extract_epi16(row3, 0);
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}
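// Per-channel scalar model of Average3 above (hypothetical helper), assuming
// Average2_128i returns the truncating per-channel mean of its two pixels:
// each 8-bit channel becomes ((a0 + a2) / 2 + a1) / 2.
#include <stdint.h>

static uint8_t average3_channel_model(uint8_t a0, uint8_t a1, uint8_t a2) {
  return (uint8_t)((((a0 + a2) >> 1) + a1) >> 1);
}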
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // extend to 16bit
  p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
  p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

  // convert floating point weights to 16bit integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);               // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, _mm_setzero_si128()); // 32->16bit

  // prepare the weights
  __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
  __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
  w12 = _mm_unpacklo_epi16(w12, w12); // w2 w2 w2 w2 w1 w1 w1 w1
  w34 = _mm_unpacklo_epi16(w34, w34); // w4 w4 w4 w4 w3 w3 w3 w3

  // multiply each pixel with its weight (2 pixel per SSE mul)
  __m128i L12 = _mm_mullo_epi16(p12, w12);
  __m128i L34 = _mm_mullo_epi16(p34, w34);

  // sum the results
  __m128i L1234 = _mm_add_epi16(L12, L34);
  __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i L = _mm_add_epi16(L1234, Lhi);

  // convert back to 8bit
  __m128i L8 = _mm_srli_epi16(L, 8); // divide by 256
  L8 = _mm_packus_epi16(L8, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(L8);
}
unsigned int vp9_get16x16var_sse2(const uint8_t *src, int src_stride,
                                  const uint8_t *ref, int ref_stride,
                                  unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 16; ++i) {
    const __m128i s = _mm_loadu_si128((const __m128i *)src);
    const __m128i r = _mm_loadu_si128((const __m128i *)ref);

    const __m128i src0 = _mm_unpacklo_epi8(s, zero);
    const __m128i ref0 = _mm_unpacklo_epi8(r, zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpackhi_epi8(s, zero);
    const __m128i ref1 = _mm_unpackhi_epi8(r, zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));

    src += src_stride;
    ref += ref_stride;
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0) +
         (int16_t)_mm_extract_epi16(vsum, 1);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}
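// Hedged usage sketch: libvpx-style callers derive the block variance from
// the two outputs as sse - sum^2 / (w * h); for 16x16 the divisor is 256.
// The wrapper name is illustrative, not from the original source.
#include <stdint.h>

static unsigned int variance_16x16_model(const uint8_t *src, int src_stride,
                                         const uint8_t *ref, int ref_stride) {
  unsigned int sse;
  int sum;
  vp9_get16x16var_sse2(src, src_stride, ref, ref_stride, &sse, &sum);
  return sse - (unsigned int)(((int64_t)sum * sum) >> 8);  // 16*16 == 1 << 8
}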
static INLINE void hor_transform_row_avx2(__m128i* row)
{
  // Stride-4 butterfly: add/subtract between the two 64-bit halves.
  __m128i mask_pos = _mm_set1_epi16(1);
  __m128i mask_neg = _mm_set1_epi16(-1);
  __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  // Stride-2 butterfly: add/subtract between adjacent 32-bit pairs.
  sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  // Stride-1 butterfly: add/subtract between adjacent 16-bit lanes.
  sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1, 0, 3, 2));
  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);
}
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}
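// Scalar model of one channel of ClampedAddSubtractFull above (hypothetical
// helper): _mm_packus_epi16 supplies the final clamp of c0 + c1 - c2 to
// [0, 255].
#include <stdint.h>

static uint8_t clamped_add_sub_full_model(uint8_t c0, uint8_t c1, uint8_t c2) {
  const int v = (int)c0 + c1 - c2;
  return (uint8_t)(v < 0 ? 0 : (v > 255 ? 255 : v));
}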
template <typename PixelType>
static FORCE_INLINE __m128i mm_min_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_min_epu8(a, b);
    else {
        // SSE2 has no unsigned 16-bit min; bias both operands into signed
        // range, take the signed min, then undo the bias.
        __m128i word_32768 = _mm_set1_epi16((short)0x8000);  // 32768 bit pattern

        __m128i a_minus = _mm_sub_epi16(a, word_32768);
        __m128i b_minus = _mm_sub_epi16(b, word_32768);

        return _mm_add_epi16(_mm_min_epi16(a_minus, b_minus), word_32768);
    }
}
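// Scalar model of the bias trick above (hypothetical helper): subtracting
// 0x8000 maps [0, 65535] order-preservingly onto [-32768, 32767], so the
// signed min of the biased values, plus the bias back, equals the unsigned
// min.
#include <stdint.h>

static uint16_t min_epu16_model(uint16_t a, uint16_t b) {
  const int16_t as = (int16_t)(a - 0x8000u);
  const int16_t bs = (int16_t)(b - 0x8000u);
  return (uint16_t)((as < bs ? as : bs) + 0x8000u);
}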
// Returns |x| for 16-bit lanes.
static __m128i abs_i16(__m128i x) {
#if defined(__SSSE3__)
    return _mm_abs_epi16(x);
#else
    // Read this all as, return x<0 ? -x : x.
    // To negate two's complement, you flip all the bits then add 1.
    __m128i is_negative = _mm_cmplt_epi16(x, _mm_setzero_si128());
    x = _mm_xor_si128(x, is_negative);                      // Flip negative lanes.
    x = _mm_add_epi16(x, _mm_srli_epi16(is_negative, 15));  // +1 to negative lanes, else +0.
    return x;
#endif
}
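// Scalar model of the SSE2 fallback above (hypothetical helper): m is 0xFFFF
// for a negative lane, so x ^ m flips the bits and (m >> 15) adds the final 1
// of the two's-complement negation; non-negative lanes pass through unchanged.
// As with _mm_abs_epi16, INT16_MIN maps to itself.
#include <stdint.h>

static int16_t abs_i16_model(int16_t x) {
  const uint16_t m = (uint16_t)(x < 0 ? 0xFFFFu : 0u);
  return (int16_t)((uint16_t)(((uint16_t)x ^ m) + (m >> 15)));
}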
__m64 _m_paddw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_add_epi16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
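// The .m128i_i64/.m64_i64 field accesses above are MSVC-specific. A hedged,
// compiler-neutral sketch of the same paddw emulation using the documented
// SSE2 MMX<->XMM move intrinsics (helper name is hypothetical):
#include <emmintrin.h>

static __m64 paddw_portable_model(__m64 a, __m64 b) {
  const __m128i lhs = _mm_movpi64_epi64(a);  // zero-extend __m64 into xmm
  const __m128i rhs = _mm_movpi64_epi64(b);
  return _mm_movepi64_pi64(_mm_add_epi16(lhs, rhs));  // low 64 bits back out
}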
unsigned int vp9_get8x8var_sse2(const uint8_t *src, int src_stride,
                                const uint8_t *ref, int ref_stride,
                                unsigned int *sse, int *sum) {
  const __m128i zero = _mm_setzero_si128();
  __m128i vsum = _mm_setzero_si128();
  __m128i vsse = _mm_setzero_si128();
  int i;

  for (i = 0; i < 8; i += 2) {
    const __m128i src0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + i * src_stride)), zero);
    const __m128i ref0 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + i * ref_stride)), zero);
    const __m128i diff0 = _mm_sub_epi16(src0, ref0);

    const __m128i src1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(src + (i + 1) * src_stride)), zero);
    const __m128i ref1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(ref + (i + 1) * ref_stride)), zero);
    const __m128i diff1 = _mm_sub_epi16(src1, ref1);

    vsum = _mm_add_epi16(vsum, diff0);
    vsum = _mm_add_epi16(vsum, diff1);
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff0, diff0));
    vsse = _mm_add_epi32(vsse, _mm_madd_epi16(diff1, diff1));
  }

  // sum
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 4));
  vsum = _mm_add_epi16(vsum, _mm_srli_si128(vsum, 2));
  *sum = (int16_t)_mm_extract_epi16(vsum, 0);

  // sse
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 8));
  vsse = _mm_add_epi32(vsse, _mm_srli_si128(vsse, 4));
  *sse = _mm_cvtsi128_si32(vsse);

  return 0;
}
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                const int istride,
                                                char *odata,  // written through below, so must not be const
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF = _mm_set_epi8(15,14, 11,10, 7,6, 3,2,
                                    13,12, 9,8, 5,4, 1,0);
  const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO = _mm_set1_epi16(0);
  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y += skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y*istride + x + 8]);

      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);

      __m128i E0 = _mm_unpacklo_epi64(D0, D8);
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);

      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);
      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);
      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
Image<uint16_t> blur_fast(Image<uint16_t> in) {
    Image<uint16_t> out(in.width()-8, in.height()-2);
    begin_timing;
    __m128i one_third = _mm_set1_epi16(21846);
#pragma omp parallel for
    for (int yTile = 0; yTile < out.height(); yTile += 32) {
        __m128i a, b, c, sum, avg;
        __m128i tmp[(128/8) * (32 + 2)];
        for (int xTile = 0; xTile < out.width(); xTile += 128) {
            __m128i *tmpPtr = tmp;
            for (int y = 0; y < 32+2; y++) {
                const uint16_t *inPtr = &(in(xTile, yTile+y));
                for (int x = 0; x < 128; x += 8) {
                    a = _mm_load_si128((__m128i*)(inPtr));
                    b = _mm_loadu_si128((__m128i*)(inPtr+1));
                    c = _mm_loadu_si128((__m128i*)(inPtr+2));
                    sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                    avg = _mm_mulhi_epi16(sum, one_third);
                    _mm_store_si128(tmpPtr++, avg);
                    inPtr += 8;
                }
            }
            tmpPtr = tmp;
            for (int y = 0; y < 32; y++) {
                __m128i *outPtr = (__m128i *)(&(out(xTile, yTile+y)));
                for (int x = 0; x < 128; x += 8) {
                    a = _mm_load_si128(tmpPtr+(2*128)/8);
                    b = _mm_load_si128(tmpPtr+128/8);
                    c = _mm_load_si128(tmpPtr++);
                    sum = _mm_add_epi16(_mm_add_epi16(a, b), c);
                    avg = _mm_mulhi_epi16(sum, one_third);
                    _mm_store_si128(outPtr++, avg);
                }
            }
        }
    }
    end_timing;
    return out;
}
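// A note on the constant above, as a hedged scalar model (hypothetical
// helper): 21846 is 2^16/3 rounded up, so for sums that fit in int16
// _mm_mulhi_epi16(sum, 21846) computes (sum * 21846) >> 16 == sum / 3.
#include <stdint.h>

static int16_t div3_mulhi_model(int16_t sum) {
  return (int16_t)(((int32_t)sum * 21846) >> 16);
}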
static INLINE void ver_add_sub_avx2(__m128i (*temp_hor)[8], __m128i (*temp_ver)[8])
{
  // First stage
  for (int i = 0; i < 8; i += 2) {
    (*temp_ver)[i+0] = _mm_hadd_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
    (*temp_ver)[i+1] = _mm_hsub_epi16((*temp_hor)[i + 0], (*temp_hor)[i + 1]);
  }

  // Second stage
  for (int i = 0; i < 8; i += 4) {
    (*temp_hor)[i + 0] = _mm_add_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 1] = _mm_add_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
    (*temp_hor)[i + 2] = _mm_sub_epi16((*temp_ver)[i + 0], (*temp_ver)[i + 2]);
    (*temp_hor)[i + 3] = _mm_sub_epi16((*temp_ver)[i + 1], (*temp_ver)[i + 3]);
  }

  // Third stage
  for (int i = 0; i < 4; ++i) {
    (*temp_ver)[i + 0] = _mm_add_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
    (*temp_ver)[i + 4] = _mm_sub_epi16((*temp_hor)[0 + i], (*temp_hor)[4 + i]);
  }
}
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
static void GradientPredictInverse(const uint8_t* const in,
                                   const uint8_t* const top,
                                   uint8_t* const row, int length) {
  if (length > 0) {
    int i;
    const int max_pos = length & ~7;
    const __m128i zero = _mm_setzero_si128();
    __m128i A = _mm_set_epi32(0, 0, 0, row[-1]);   // left sample
    for (i = 0; i < max_pos; i += 8) {
      const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]);
      const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]);
      const __m128i B = _mm_unpacklo_epi8(tmp0, zero);
      const __m128i C = _mm_unpacklo_epi8(tmp1, zero);
      const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]);
      const __m128i D = _mm_unpacklo_epi8(tmp2, zero);   // base input
      const __m128i E = _mm_sub_epi16(B, C);  // unclipped gradient basis B - C
      __m128i out = zero;                     // accumulator for output
      __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff);
      int k = 8;
      while (1) {
        const __m128i tmp3 = _mm_add_epi16(A, E);        // delta = A + B - C
        const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi);
        const __m128i tmp5 = _mm_max_epi16(tmp4, zero);  // clipped delta
        const __m128i tmp6 = _mm_add_epi16(tmp5, D);     // add to in[] values
        A = _mm_and_si128(tmp6, mask_hi);                // 1-complement clip
        out = _mm_or_si128(out, A);                      // accumulate output
        if (--k == 0) break;
        A = _mm_slli_si128(A, 2);                        // rotate left sample
        mask_hi = _mm_slli_si128(mask_hi, 2);            // rotate mask
      }
      A = _mm_srli_si128(A, 14);     // prepare left sample for next iteration
      _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero));
    }
    for (; i < length; ++i) {
      row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]);
    }
  }
}
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
                                const decal_t * decal, const float fadeColor[4] ) {
  assert_16_byte_aligned( &verts[numVerts] );
  assert_16_byte_aligned( &indexes[numIndexes] );
  assert_16_byte_aligned( decal->indexes );
  assert_16_byte_aligned( decal->verts );
  assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
  assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
  assert_16_byte_aligned( fadeColor );

  const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
  const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
  const __m128 vector_fade_color = _mm_load_ps( fadeColor );
  const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

  // copy vertices and apply depth/time based fading
  assert_offsetof( idDrawVert, color, 6 * 4 );
  for ( int i = 0; i < decal->numVerts; i++ ) {
    const idDrawVert &srcVert = decal->verts[i];
    idDrawVert &dstVert = verts[numVerts + i];

    __m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
    __m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );

    __m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );
    __m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
    __m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
    __m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
    __m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
    v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

    _mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
    _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
  }

  // copy indexes
  assert( ( decal->numIndexes & 7 ) == 0 );
  assert( sizeof( triIndex_t ) == 2 );
  for ( int i = 0; i < decal->numIndexes; i += 8 ) {
    __m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );
    vi = _mm_add_epi16( vi, vector_short_num_verts );
    _mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
  }

  _mm_sfence();
}
SIMDValue SIMDInt16x8Operation::OpNeg(const SIMDValue& value)
{
    X86SIMDValue x86Result;
    X86SIMDValue SIGNMASK, temp;
    X86SIMDValue negativeOnes = { { -1, -1, -1, -1 } };
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    temp.m128i_value = _mm_andnot_si128(v.m128i_value, negativeOnes.m128i_value);  // (~value) & (all ones) == ~value
    SIGNMASK.m128i_value = _mm_set1_epi16(0x0001);                                 // set every 16-bit lane to 1
    x86Result.m128i_value = _mm_add_epi16(SIGNMASK.m128i_value, temp.m128i_value); // ~value + 1 in each of the 8 lanes

    return X86SIMDValue::ToSIMDValue(x86Result);
}
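// OpNeg above relies on the two's-complement identity -v == ~v + 1 in each
// 16-bit lane. A hedged standalone equivalent in plain intrinsics
// (hypothetical helper):
#include <emmintrin.h>

static __m128i neg_epi16_model(__m128i v) {
  const __m128i all_ones = _mm_set1_epi32(-1);          // every bit set
  const __m128i not_v = _mm_andnot_si128(v, all_ones);  // ~v
  return _mm_add_epi16(not_v, _mm_set1_epi16(1));       // ~v + 1 == -v
}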
SIMD_INLINE __m128i AdjustedYuvToHue16(__m128i y, __m128i u, __m128i v, const __m128 & KF_255_DIV_6)
{
    const __m128i red = AdjustedYuvToRed16(y, v);
    const __m128i green = AdjustedYuvToGreen16(y, u, v);
    const __m128i blue = AdjustedYuvToBlue16(y, u);
    const __m128i max = MaxI16(red, green, blue);
    const __m128i range = _mm_subs_epi16(max, MinI16(red, green, blue));

    const __m128i redMaxMask = _mm_cmpeq_epi16(red, max);
    const __m128i greenMaxMask = _mm_andnot_si128(redMaxMask, _mm_cmpeq_epi16(green, max));
    const __m128i blueMaxMask = _mm_andnot_si128(redMaxMask, _mm_andnot_si128(greenMaxMask, K_INV_ZERO));

    const __m128i redMaxCase = _mm_and_si128(redMaxMask,
        _mm_add_epi16(_mm_sub_epi16(green, blue), _mm_mullo_epi16(range, K16_0006)));
    const __m128i greenMaxCase = _mm_and_si128(greenMaxMask,
        _mm_add_epi16(_mm_sub_epi16(blue, red), _mm_mullo_epi16(range, K16_0002)));
    const __m128i blueMaxCase = _mm_and_si128(blueMaxMask,
        _mm_add_epi16(_mm_sub_epi16(red, green), _mm_mullo_epi16(range, K16_0004)));

    const __m128i dividend = _mm_or_si128(_mm_or_si128(redMaxCase, greenMaxCase), blueMaxCase);

    return _mm_andnot_si128(_mm_cmpeq_epi16(range, K_ZERO),
        _mm_and_si128(MulDiv16(dividend, range, KF_255_DIV_6), K16_00FF));
}
static inline __m128i v4_mul_color_sse2(__m128i x, __m128i y)
{
   const __m128i zero = _mm_setzero_si128();
   const __m128i sym4_mask = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);

   __m128i x_l = _mm_unpacklo_epi8(x, zero);
   __m128i x_h = _mm_unpackhi_epi8(x, zero);

   __m128i y_l = _mm_unpacklo_epi8(y, zero);
   __m128i y_h = _mm_unpackhi_epi8(y, zero);

   __m128i r_l = _mm_mullo_epi16(x_l, y_l);
   __m128i r_h = _mm_mullo_epi16(x_h, y_h);

   r_l = _mm_add_epi16(r_l, sym4_mask);
   r_h = _mm_add_epi16(r_h, sym4_mask);

   r_l = _mm_srli_epi16(r_l, 8);
   r_h = _mm_srli_epi16(r_h, 8);

   return _mm_packus_epi16(r_l, r_h);
}
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  return _mm_cvtsi128_si32(hgfedcba);
}
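// The unpack/srai pair above sign-extends packed int8 to int16 without
// SSE4.1's _mm_cvtepi8_epi16: duplicating each byte into both halves of a
// 16-bit lane and shifting right arithmetically by 8 preserves the sign.
// A hedged standalone form of the same idiom (hypothetical helper):
#include <emmintrin.h>

static __m128i sign_extend_lo8_model(__m128i v) {
  return _mm_srai_epi16(_mm_unpacklo_epi8(v, v), 8);
}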