static void GradientPredictInverse(const uint8_t* const in, const uint8_t* const top, uint8_t* const row, int length) { if (length > 0) { int i; const int max_pos = length & ~7; const __m128i zero = _mm_setzero_si128(); __m128i A = _mm_set_epi32(0, 0, 0, row[-1]); // left sample for (i = 0; i < max_pos; i += 8) { const __m128i tmp0 = _mm_loadl_epi64((const __m128i*)&top[i]); const __m128i tmp1 = _mm_loadl_epi64((const __m128i*)&top[i - 1]); const __m128i B = _mm_unpacklo_epi8(tmp0, zero); const __m128i C = _mm_unpacklo_epi8(tmp1, zero); const __m128i tmp2 = _mm_loadl_epi64((const __m128i*)&in[i]); const __m128i D = _mm_unpacklo_epi8(tmp2, zero); // base input const __m128i E = _mm_sub_epi16(B, C); // unclipped gradient basis B - C __m128i out = zero; // accumulator for output __m128i mask_hi = _mm_set_epi32(0, 0, 0, 0xff); int k = 8; while (1) { const __m128i tmp3 = _mm_add_epi16(A, E); // delta = A + B - C const __m128i tmp4 = _mm_min_epi16(tmp3, mask_hi); const __m128i tmp5 = _mm_max_epi16(tmp4, zero); // clipped delta const __m128i tmp6 = _mm_add_epi16(tmp5, D); // add to in[] values A = _mm_and_si128(tmp6, mask_hi); // 1-complement clip out = _mm_or_si128(out, A); // accumulate output if (--k == 0) break; A = _mm_slli_si128(A, 2); // rotate left sample mask_hi = _mm_slli_si128(mask_hi, 2); // rotate mask } A = _mm_srli_si128(A, 14); // prepare left sample for next iteration _mm_storel_epi64((__m128i*)&row[i], _mm_packus_epi16(out, zero)); } for (; i < length; ++i) { row[i] = in[i] + GradientPredictorC(row[i - 1], top[i], top[i - 1]); } } }
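/* A minimal scalar sketch of the predictor the tail loop above relies on:
 * clip(A + B - C) into [0, 255], which is also what the SIMD body computes
 * per lane. The exact libwebp GradientPredictorC helper may differ in form,
 * so treat this as an illustrative reference only (assumes <stdint.h>). */
static uint8_t GradientPredictorRef(uint8_t a, uint8_t b, uint8_t c) {
  const int g = a + b - c;                              /* unclipped gradient */
  return (g < 0) ? 0 : (g > 255) ? 255 : (uint8_t)g;    /* clip to 8 bits */
}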
static void ConvertBGRAToRGB565(const uint32_t* src, int num_pixels, uint8_t* dst) { const __m128i mask_0xe0 = _mm_set1_epi8(0xe0); const __m128i mask_0xf8 = _mm_set1_epi8(0xf8); const __m128i mask_0x07 = _mm_set1_epi8(0x07); const __m128i* in = (const __m128i*)src; __m128i* out = (__m128i*)dst; while (num_pixels >= 8) { const __m128i bgra0 = _mm_loadu_si128(in++); // bgra0|bgra1|bgra2|bgra3 const __m128i bgra4 = _mm_loadu_si128(in++); // bgra4|bgra5|bgra6|bgra7 const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4); // b0b4g0g4r0r4a0a4... const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4); // b2b6g2g6r2r6a2a6... const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h); // b0b2b4b6g0g2g4g6... const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h); // b1b3b5b7g1g3g5g7... const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h); // b0...b7 | g0...g7 const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h); // r0...r7 | a0...a7 const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h); // g0...g7 | a0...a7 const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l); // r0...r7 | b0...b7 const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8); // -r0..-r7|-b0..-b7 const __m128i g_lo1 = _mm_srli_epi16(ga0, 5); const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07); // g0-...g7-|xx (3b) const __m128i g_hi1 = _mm_slli_epi16(ga0, 3); const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0); // -g0...-g7|xx (3b) const __m128i b0 = _mm_srli_si128(rb1, 8); // -b0...-b7|0 const __m128i rg1 = _mm_or_si128(rb1, g_lo2); // gr0...gr7|xx const __m128i b1 = _mm_srli_epi16(b0, 3); const __m128i gb1 = _mm_or_si128(b1, g_hi2); // bg0...bg7|xx #ifdef WEBP_SWAP_16BIT_CSP const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1); // rggb0...rggb7 #else const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1); // bgrb0...bgrb7 #endif _mm_storeu_si128(out++, rgba); num_pixels -= 8; } // left-overs VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out); }
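/* Scalar sketch of what the vector loop above emits per pixel on the
 * non-WEBP_SWAP_16BIT_CSP path: one byte holding the 5 red bits plus the top
 * 3 green bits, then one byte holding the low 3 green bits plus the 5 blue
 * bits. It assumes a little-endian uint32_t BGRA layout (byte 0 = blue);
 * VP8LConvertBGRAToRGB565_C remains the authoritative scalar fallback. */
static void ConvertBGRAToRGB565Ref(const uint32_t* src, int num_pixels,
                                   uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t bgra = src[i];
    const uint8_t b = (bgra >>  0) & 0xff;
    const uint8_t g = (bgra >>  8) & 0xff;
    const uint8_t r = (bgra >> 16) & 0xff;
    *dst++ = (r & 0xf8) | (g >> 5);            /* rrrrrggg */
    *dst++ = ((g << 3) & 0xe0) | (b >> 3);     /* gggbbbbb */
  }
}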
void vpx_highbd_d135_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); const __m128i XABCDEFG = _mm_loadu_si128((const __m128i *)(above - 1)); const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above); const __m128i BCDEFGH0 = _mm_srli_si128(ABCDEFGH, 2); const __m128i IJKLMNOP = _mm_load_si128((const __m128i *)left); const __m128i XIJKLMNO = _mm_alignr_epi8(IJKLMNOP, _mm_slli_si128(XABCDEFG, 14), 14); const __m128i AXIJKLMN = _mm_alignr_epi8(XIJKLMNO, _mm_slli_si128(ABCDEFGH, 14), 14); const __m128i avg3 = avg3_epu16(&XABCDEFG, &ABCDEFGH, &BCDEFGH0); __m128i avg3_left = avg3_epu16(&IJKLMNOP, &XIJKLMNO, &AXIJKLMN); __m128i rowa = avg3; int i; (void)bd; for (i = 0; i < 8; ++i) { rowa = _mm_alignr_epi8(rowa, rotr_epu16(&avg3_left, &rotrw), 14); _mm_store_si128((__m128i *)dst, rowa); dst += stride; } }
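/* The rotate helpers used above are not shown in this snippet. A plausible
 * sketch (an assumption, not necessarily the exact libvpx definitions):
 * rotate_right_epu16 is a pshufb control that rotates eight 16-bit lanes
 * right by one lane, and rotr_epu16 applies it in place and returns the
 * rotated value. DECLARE_ALIGNED/INLINE are the library's usual macros. */
DECLARE_ALIGNED(16, static const uint8_t, rotate_right_epu16[16]) = {
  2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 0, 1
};

static INLINE __m128i rotr_epu16(__m128i *a, const __m128i *rotrw) {
  *a = _mm_shuffle_epi8(*a, *rotrw);
  return *a;
}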
size_t sse4_strstr_unrolled_len3(const char* s, size_t n, const char* needle) { const __m128i prefix = _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle)); const __m128i zeros = _mm_setzero_si128(); for (size_t i = 0; i < n; i += 8) { const __m128i data = _mm_loadu_si128(reinterpret_cast<const __m128i*>(s + i)); const __m128i lastbyte = _mm_cvtepu8_epi16(_mm_srli_si128(data, 3)); const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0); const __m128i cmp = _mm_cmpeq_epi16(_mm_sub_epi16(result, lastbyte), zeros); unsigned mask = _mm_movemask_epi8(cmp) & 0x5555; if (mask != 0) { return i + bits::get_first_bit_set(mask)/2; } } return std::string::npos; }
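/* Scalar sketch of what the MPSADBW loop above computes: the offset of the
 * first 3-byte match of needle[0..2] in s, or npos when there is none. The
 * sketch ignores the 16-byte over-read the SSE4.1 version performs past each
 * probed position. */
static size_t strstr_len3_ref(const char* s, size_t n, const char* needle) {
    for (size_t i = 0; i + 3 <= n; i++) {
        if (s[i] == needle[0] && s[i + 1] == needle[1] && s[i + 2] == needle[2])
            return i;
    }
    return (size_t)-1;  /* stands in for std::string::npos */
}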
void test8bit (void) { i1 = _mm_cmpistrm (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistri (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistra (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistrc (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistro (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistrs (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ k1 = _mm_cmpistrz (i2, i3, k4); /* { dg-error "the third argument must be an 8-bit immediate" } */ i1 = _mm_cmpestrm (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestri (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestra (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestrc (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestro (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestrs (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ k1 = _mm_cmpestrz (i2, k2, i3, k3, k4); /* { dg-error "the fifth argument must be an 8-bit immediate" } */ b1 = _mm256_blend_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ k1 = _cvtss_sh (f1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm256_cvtps_ph (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ b1 = _mm256_dp_ps (b2, b3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ e1 = _mm256_permute2f128_pd (e2, e3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ b1 = _mm256_permute2f128_ps (b2, b3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ b1 = _mm256_permute_ps (b2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_aeskeygenassist_si128 (i2, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_blend_epi16 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_clmulepi64_si128 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_cvtps_ph (a1, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ d1 = _mm_dp_pd (d2, d3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ a1 = _mm_dp_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ a1 = _mm_insert_ps (a2, a3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_mpsadbw_epu8 (i2, i3, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ a1 = _mm_permute_ps (a2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_slli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ i1 = _mm_srli_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */ }
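/* For contrast, the same builtins compile once the count/control argument is
 * a literal 8-bit immediate instead of the variable k4. A small sketch that
 * reuses the test file's i1/i2/i3 declarations: */
void test8bit_ok (void)
{
  i1 = _mm_slli_si128 (i2, 4);           /* immediate byte shift: accepted */
  i1 = _mm_srli_si128 (i2, 4);           /* immediate byte shift: accepted */
  i1 = _mm_blend_epi16 (i2, i3, 0xAA);   /* literal 8-bit immediate: accepted */
}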
unsigned int vp9_sad16x3_sse2( const unsigned char *src_ptr, int src_stride, const unsigned char *ref_ptr, int ref_stride) { __m128i s0, s1, s2; __m128i r0, r1, r2; __m128i sad; s0 = _mm_loadu_si128((const __m128i *)(src_ptr + 0 * src_stride)); s1 = _mm_loadu_si128((const __m128i *)(src_ptr + 1 * src_stride)); s2 = _mm_loadu_si128((const __m128i *)(src_ptr + 2 * src_stride)); r0 = _mm_loadu_si128((const __m128i *)(ref_ptr + 0 * ref_stride)); r1 = _mm_loadu_si128((const __m128i *)(ref_ptr + 1 * ref_stride)); r2 = _mm_loadu_si128((const __m128i *)(ref_ptr + 2 * ref_stride)); sad = _mm_sad_epu8(s0, r0); sad = _mm_add_epi16(sad, _mm_sad_epu8(s1, r1)); sad = _mm_add_epi16(sad, _mm_sad_epu8(s2, r2)); sad = _mm_add_epi16(sad, _mm_srli_si128(sad, 8)); return _mm_cvtsi128_si32(sad); }
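/* Scalar reference for the 16x3 SAD above (illustrative sketch only). */
static unsigned int sad16x3_ref(const unsigned char *src_ptr, int src_stride,
                                const unsigned char *ref_ptr, int ref_stride) {
  unsigned int sad = 0;
  int r, c;
  for (r = 0; r < 3; ++r) {
    for (c = 0; c < 16; ++c) {
      const int d = src_ptr[c] - ref_ptr[c];
      sad += (d < 0) ? -d : d;
    }
    src_ptr += src_stride;
    ref_ptr += ref_stride;
  }
  return sad;
}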
void vpx_highbd_d135_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride, const uint16_t *above, const uint16_t *left, int bd) { const __m128i rotrw = _mm_load_si128((const __m128i *)rotate_right_epu16); const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1)); const __m128i B0 = _mm_load_si128((const __m128i *)above); const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7)); const __m128i B1 = _mm_load_si128((const __m128i *)(above + 8)); const __m128i L0 = _mm_load_si128((const __m128i *)left); const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8)); const __m128i C0 = _mm_alignr_epi8(B1, B0, 2); const __m128i C1 = _mm_srli_si128(B1, 2); const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0); const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1); const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14); const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14); const __m128i L0_ = _mm_alignr_epi8(XL0, _mm_slli_si128(B0, 14), 14); const __m128i L1_ = _mm_alignr_epi8(XL1, XL0, 14); __m128i rowa_0 = avg3_0; __m128i rowa_1 = avg3_1; __m128i avg3_left[2]; int i, j; (void)bd; avg3_left[0] = avg3_epu16(&L0, &XL0, &L0_); avg3_left[1] = avg3_epu16(&L1, &XL1, &L1_); for (i = 0; i < 2; ++i) { __m128i avg_left = avg3_left[i]; for (j = 0; j < 8; ++j) { rowa_1 = _mm_alignr_epi8(rowa_1, rowa_0, 14); rowa_0 = _mm_alignr_epi8(rowa_0, rotr_epu16(&avg_left, &rotrw), 14); _mm_store_si128((__m128i *)dst, rowa_0); _mm_store_si128((__m128i *)(dst + 8), rowa_1); dst += stride; } } }
unsigned int luma_sse2(const uint8_t *pSrc, intptr_t nSrcPitch) { __m128i sum = zeroes; for (unsigned y = 0; y < height; y++) { for (unsigned x = 0; x < width; x += 16) { __m128i src; if (width == 4) src = _mm_cvtsi32_si128(*(const int *)pSrc); else if (width == 8) src = _mm_loadl_epi64((const __m128i *)pSrc); else src = _mm_loadu_si128((const __m128i *)&pSrc[x]); sum = _mm_add_epi64(sum, _mm_sad_epu8(src, zeroes)); } pSrc += nSrcPitch; } if (width >= 16) sum = _mm_add_epi64(sum, _mm_srli_si128(sum, 8)); return (unsigned)_mm_cvtsi128_si32(sum); }
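/* Scalar sketch of the templated luma sum above: it accumulates every byte of
 * a width x height block. width, height and zeroes are template parameters /
 * constants of the surrounding file, so they are passed explicitly here. */
static unsigned int luma_ref(const uint8_t *pSrc, intptr_t nSrcPitch,
                             unsigned width, unsigned height) {
  unsigned int sum = 0;
  unsigned x, y;
  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x++)
      sum += pSrc[x];
    pSrc += nSrcPitch;
  }
  return sum;
}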
#ifdef PARASAIL_TABLE parasail_result_t *result = parasail_result_new_table1(s1Len, s2Len); #else #ifdef PARASAIL_ROWCOL parasail_result_t *result = parasail_result_new_rowcol1(s1Len, s2Len); #else parasail_result_t *result = parasail_result_new(); #endif #endif int32_t i = 0; int32_t j = 0; int32_t end_query = 0; int32_t end_ref = 0; int32_t score = NEG_INF; __m128i vNegInf = _mm_set1_epi32(NEG_INF); __m128i vNegInf0 = _mm_srli_si128(vNegInf, 4); /* shift in a 0 */ __m128i vOpen = _mm_set1_epi32(open); __m128i vGap = _mm_set1_epi32(gap); __m128i vZero = _mm_set1_epi32(0); __m128i vOne = _mm_set1_epi32(1); __m128i vN = _mm_set1_epi32(N); __m128i vNegOne = _mm_set1_epi32(-1); __m128i vI = _mm_set_epi32(0,1,2,3); __m128i vJreset = _mm_set_epi32(0,-1,-2,-3); __m128i vMaxH = vNegInf; __m128i vEndI = vNegInf; __m128i vEndJ = vNegInf; __m128i vILimit = _mm_set1_epi32(s1Len); __m128i vJLimit = _mm_set1_epi32(s2Len);
static inline int32_t _mm_hmax_epi32_rpl(__m128i a) { a = _mm_max_epi32_rpl(a, _mm_srli_si128(a, 8)); a = _mm_max_epi32_rpl(a, _mm_srli_si128(a, 4)); return _mm_extract_epi32_rpl(a, 0); }
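/* Usage sketch: reduce four packed 32-bit lanes to their maximum. The
 * _rpl-suffixed helpers are assumed to be the surrounding file's portable
 * replacements for the SSE4.1 intrinsics of the same name. */
static int32_t max_of_four(int32_t a, int32_t b, int32_t c, int32_t d) {
  const __m128i v = _mm_set_epi32(d, c, b, a);
  return _mm_hmax_epi32_rpl(v);   /* largest of a, b, c, d */
}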
// Calculates bounding rectagnle of a point set or retrieves already calculated static Rect pointSetBoundingRect( const Mat& points ) { int npoints = points.checkVector(2); int depth = points.depth(); CV_Assert(npoints >= 0 && (depth == CV_32F || depth == CV_32S)); int xmin = 0, ymin = 0, xmax = -1, ymax = -1, i; bool is_float = depth == CV_32F; if( npoints == 0 ) return Rect(); const Point* pts = (const Point*)points.data; Point pt = pts[0]; #if CV_SSE4_2 if(cv::checkHardwareSupport(CV_CPU_SSE4_2)) { if( !is_float ) { __m128i minval, maxval; minval = maxval = _mm_loadl_epi64((const __m128i*)(&pt)); //min[0]=pt.x, min[1]=pt.y for( i = 1; i < npoints; i++ ) { __m128i ptXY = _mm_loadl_epi64((const __m128i*)&pts[i]); minval = _mm_min_epi32(ptXY, minval); maxval = _mm_max_epi32(ptXY, maxval); } xmin = _mm_cvtsi128_si32(minval); ymin = _mm_cvtsi128_si32(_mm_srli_si128(minval, 4)); xmax = _mm_cvtsi128_si32(maxval); ymax = _mm_cvtsi128_si32(_mm_srli_si128(maxval, 4)); } else { __m128 minvalf, maxvalf, z = _mm_setzero_ps(), ptXY = _mm_setzero_ps(); minvalf = maxvalf = _mm_loadl_pi(z, (const __m64*)(&pt)); for( i = 1; i < npoints; i++ ) { ptXY = _mm_loadl_pi(ptXY, (const __m64*)&pts[i]); minvalf = _mm_min_ps(minvalf, ptXY); maxvalf = _mm_max_ps(maxvalf, ptXY); } float xyminf[2], xymaxf[2]; _mm_storel_pi((__m64*)xyminf, minvalf); _mm_storel_pi((__m64*)xymaxf, maxvalf); xmin = cvFloor(xyminf[0]); ymin = cvFloor(xyminf[1]); xmax = cvFloor(xymaxf[0]); ymax = cvFloor(xymaxf[1]); } } else #endif { if( !is_float ) { xmin = xmax = pt.x; ymin = ymax = pt.y; for( i = 1; i < npoints; i++ ) { pt = pts[i]; if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } } else { Cv32suf v; // init values xmin = xmax = CV_TOGGLE_FLT(pt.x); ymin = ymax = CV_TOGGLE_FLT(pt.y); for( i = 1; i < npoints; i++ ) { pt = pts[i]; pt.x = CV_TOGGLE_FLT(pt.x); pt.y = CV_TOGGLE_FLT(pt.y); if( xmin > pt.x ) xmin = pt.x; if( xmax < pt.x ) xmax = pt.x; if( ymin > pt.y ) ymin = pt.y; if( ymax < pt.y ) ymax = pt.y; } v.i = CV_TOGGLE_FLT(xmin); xmin = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymin); ymin = cvFloor(v.f); // because right and bottom sides of the bounding rectangle are not inclusive // (note +1 in width and height calculation below), cvFloor is used here instead of cvCeil v.i = CV_TOGGLE_FLT(xmax); xmax = cvFloor(v.f); v.i = CV_TOGGLE_FLT(ymax); ymax = cvFloor(v.f); } } return Rect(xmin, ymin, xmax - xmin + 1, ymax - ymin + 1); }
void av1_fdct32x32_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0, in1, in2, in3; __m128i u0, u1; __m128i sum = _mm_setzero_si128(); int i; for (i = 0; i < 8; ++i) { in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 0)); in1 = _mm_load_si128((const __m128i *)(input + 8)); in2 = _mm_load_si128((const __m128i *)(input + 16)); in3 = _mm_load_si128((const __m128i *)(input + 24)); input += stride; sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); } u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); in1 = _mm_srai_epi32(in1, 16); sum = _mm_add_epi32(in0, in1); in0 = _mm_unpacklo_epi32(sum, u0); in1 = _mm_unpackhi_epi32(sum, u0); sum = _mm_add_epi32(in0, in1); in0 = _mm_srli_si128(sum, 8); in1 = _mm_add_epi32(sum, in0); in1 = _mm_srai_epi32(in1, 3); store_output(&in1, output); }
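/* Scalar sketch of the DC-only 32x32 forward transform above: the first
 * output coefficient is the sum of all 32x32 input samples shifted right by
 * 3. Only output[0] is meaningful here; store_output is assumed to behave
 * like the library's usual helper. */
static void fdct32x32_1_ref(const int16_t *input, tran_low_t *output,
                            int stride) {
  int32_t sum = 0;
  int r, c;
  for (r = 0; r < 32; ++r)
    for (c = 0; c < 32; ++c)
      sum += input[r * stride + c];
  output[0] = (tran_low_t)(sum >> 3);
}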
static inline __m128d RightShift( __m128d input ) { return _mm_castsi128_pd( _mm_srli_si128( _mm_castpd_si128( input ), SHIFT ) ); } // byte-shift the raw bits right by SHIFT bytes; the cast intrinsics are used because _mm_srli_si128 cannot appear in a constexpr function and vector-type C casts are non-portable
static uint32_t maxbitas32int(const __m128i accumulator) { const __m128i _tmp1 = _mm_or_si128(_mm_srli_si128(accumulator, 8), accumulator); /* (A,B,C,D) or (0,0,A,B) = (A,B,C or A,D or B) */ const __m128i _tmp2 = _mm_or_si128(_mm_srli_si128(_tmp1, 4), _tmp1); /* low lane is now A or B or C or D */ uint32_t ans = _mm_cvtsi128_si32(_tmp2); return bits(ans); }
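/* Scalar equivalent of the OR-reduction above (sketch): OR the four 32-bit
 * lanes together, then hand the result to the same bits() helper. */
static uint32_t maxbitas32int_ref(const uint32_t lanes[4]) {
  const uint32_t ored = lanes[0] | lanes[1] | lanes[2] | lanes[3];
  return bits(ored);
}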
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransform(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { __m128i tmp_0, tmp_1, tmp_2, tmp_3; // Load, combine and transpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_cvtepu8_epi16(transpose1_0); tmp_1 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_0, 8)); tmp_2 = _mm_cvtepu8_epi16(transpose1_1); tmp_3 = _mm_cvtepu8_epi16(_mm_srli_si128(transpose1_1, 8)); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); A_b0 = _mm_abs_epi16(A_b0); A_b2 = _mm_abs_epi16(A_b2); B_b0 = _mm_abs_epi16(B_b0); B_b2 = _mm_abs_epi16(B_b2); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b2 = _mm_sub_epi32(A_b0, B_b0); // cascading summation of the differences B_b0 = _mm_hadd_epi32(A_b2, A_b2); B_b2 = _mm_hadd_epi32(B_b0, B_b0); return _mm_cvtsi128_si32(B_b2); } }
static inline void desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags, struct rte_mbuf **rx_pkts) { __m128i ptype0, ptype1, vtag0, vtag1, csum; union { uint16_t e[4]; uint64_t dword; } vol; /* mask everything except rss type */ const __m128i rsstype_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x000F, 0x000F, 0x000F, 0x000F); /* mask the lower byte of ol_flags */ const __m128i ol_flags_msk = _mm_set_epi16( 0x0000, 0x0000, 0x0000, 0x0000, 0x00FF, 0x00FF, 0x00FF, 0x00FF); /* map rss type to rss hash flag */ const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0); /* mask everything except vlan present and l4/ip csum error */ const __m128i vlan_csum_msk = _mm_set_epi16( (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, (IXGBE_RXDADV_ERR_TCPE | IXGBE_RXDADV_ERR_IPE) >> 16, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP); /* map vlan present (0x8), IPE (0x2), L4E (0x1) to ol_flags */ const __m128i vlan_csum_map_lo = _mm_set_epi8( 0, 0, 0, 0, vlan_flags | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, vlan_flags | PKT_RX_IP_CKSUM_BAD, vlan_flags | PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD, vlan_flags | PKT_RX_IP_CKSUM_GOOD, 0, 0, 0, 0, PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD); const __m128i vlan_csum_map_hi = _mm_set_epi8( 0, 0, 0, 0, 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, 0, 0, 0, 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t), 0, PKT_RX_L4_CKSUM_GOOD >> sizeof(uint8_t)); ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]); ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]); vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]); vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]); ptype0 = _mm_unpacklo_epi32(ptype0, ptype1); ptype0 = _mm_and_si128(ptype0, rsstype_msk); ptype0 = _mm_shuffle_epi8(rss_flags, ptype0); vtag1 = _mm_unpacklo_epi32(vtag0, vtag1); vtag1 = _mm_and_si128(vtag1, vlan_csum_msk); /* csum bits are in the most significant, to use shuffle we need to * shift them. Change mask to 0xc000 to 0x0003. */ csum = _mm_srli_epi16(vtag1, 14); /* now or the most significant 64 bits containing the checksum * flags with the vlan present flags. */ csum = _mm_srli_si128(csum, 8); vtag1 = _mm_or_si128(csum, vtag1); /* convert VP, IPE, L4E to ol_flags */ vtag0 = _mm_shuffle_epi8(vlan_csum_map_hi, vtag1); vtag0 = _mm_slli_epi16(vtag0, sizeof(uint8_t)); vtag1 = _mm_shuffle_epi8(vlan_csum_map_lo, vtag1); vtag1 = _mm_and_si128(vtag1, ol_flags_msk); vtag1 = _mm_or_si128(vtag0, vtag1); vtag1 = _mm_or_si128(ptype0, vtag1); vol.dword = _mm_cvtsi128_si64(vtag1); rx_pkts[0]->ol_flags = vol.e[0]; rx_pkts[1]->ol_flags = vol.e[1]; rx_pkts[2]->ol_flags = vol.e[2]; rx_pkts[3]->ol_flags = vol.e[3]; }
void png_read_filter_row_avg3_sse(png_row_infop row_info, png_bytep row, png_const_bytep prev_row) { png_size_t i; png_bytep rp = row; png_const_bytep prp = prev_row; __m128i nrb = _mm_load_si128((__m128i*)(rp)); __m128i pixel = _mm_setzero_si128(); const __m128i mask = _mm_set1_epi8(0x01); for (i = 0; i < row_info->rowbytes; i += 15, rp += 15, prp += 15) { #ifndef __SSSE3__ __m128i prb = _mm_loadu_si128((__m128i*)prp); #else __m128i prb = _mm_lddqu_si128((__m128i*)prp); #endif __m128i rb = nrb; // First pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Second pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Third pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Fourth pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); prb = _mm_srli_si128(prb, 3); #ifndef __SSSE3__ rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else rb = _mm_alignr_epi8(pixel, rb, 3); #endif // Fifth pixel pixel = calculate_pixel_avg(rb, prb, pixel, mask); #ifndef __SSSE3__ nrb = _mm_loadu_si128((__m128i*)(rp + 15)); rb = _mm_srli_si128(rb, 3); rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 13)); #else nrb = _mm_lddqu_si128((__m128i*)(rp + 15)); rb = _mm_alignr_epi8(pixel, rb, 3); #endif rb = _mm_srli_si128(rb, 1); _mm_storeu_si128((__m128i*)rp, rb); } }
static uint32_t maxasint(const __m128i accumulator) { const __m128i _tmp1 = _mm_max_epu32(_mm_srli_si128(accumulator, 8), accumulator); /* max of (A,B,C,D) and (0,0,A,B) = (A,B,max(C,A),max(D,B)) */ const __m128i _tmp2 = _mm_max_epu32(_mm_srli_si128(_tmp1, 4), _tmp1); /* low lane is now max(A,B,C,D) */ return _mm_cvtsi128_si32(_tmp2); }
/** ******************************************************************************* * * @brief * Performs spatial edge adaptive filtering * * @par Description * Performs spatial edge adaptive filtering by detecting edge direction * * @param[in] pu1_src * Source buffer * * @param[in] pu1_out * Destination buffer * * @param[in] src_strd * Source stride * * @param[in] out_strd * Destination stride * @returns * None * * @remarks * ******************************************************************************* */ void ideint_spatial_filter_ssse3(UWORD8 *pu1_src, UWORD8 *pu1_out, WORD32 src_strd, WORD32 out_strd) { WORD32 i; WORD32 adiff[6]; WORD32 *pi4_diff; WORD32 shifts[2]; WORD32 dir_45_le_90, dir_45_le_135, dir_135_le_90; __m128i row1_0, row1_m1, row1_p1; __m128i row2_0, row2_m1, row2_p1; __m128i diff, diffs[3]; __m128i zero; /*****************************************************************/ /* Direction detection */ /*****************************************************************/ zero = _mm_setzero_si128(); diffs[0] = _mm_setzero_si128(); diffs[1] = _mm_setzero_si128(); diffs[2] = _mm_setzero_si128(); /* Load source */ row1_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1)); row1_0 = _mm_loadl_epi64((__m128i *) (pu1_src)); row1_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1)); pu1_src += src_strd; /* Unpack to 16 bits */ row1_m1 = _mm_unpacklo_epi8(row1_m1, zero); row1_0 = _mm_unpacklo_epi8(row1_0, zero); row1_p1 = _mm_unpacklo_epi8(row1_p1, zero); /*****************************************************************/ /* Calculating the difference along each of the 3 directions. */ /*****************************************************************/ for(i = 0; i < SUB_BLK_HT; i ++) { row2_m1 = _mm_loadl_epi64((__m128i *) (pu1_src - 1)); row2_0 = _mm_loadl_epi64((__m128i *) (pu1_src)); row2_p1 = _mm_loadl_epi64((__m128i *) (pu1_src + 1)); pu1_src += src_strd; /* Unpack to 16 bits */ row2_m1 = _mm_unpacklo_epi8(row2_m1, zero); row2_0 = _mm_unpacklo_epi8(row2_0, zero); row2_p1 = _mm_unpacklo_epi8(row2_p1, zero); diff = _mm_sad_epu8(row1_0, row2_0); diffs[0] = _mm_add_epi64(diffs[0], diff); diff = _mm_sad_epu8(row1_m1, row2_p1); diffs[1] = _mm_add_epi64(diffs[1], diff); diff = _mm_sad_epu8(row1_p1, row2_m1); diffs[2] = _mm_add_epi64(diffs[2], diff); row1_m1 = row2_m1; row1_0 = row2_0; row1_p1 = row2_p1; } /* Revert pu1_src increment */ pu1_src -= (SUB_BLK_HT + 1) * src_strd; adiff[0] = _mm_cvtsi128_si32(diffs[0]); adiff[1] = _mm_cvtsi128_si32(diffs[1]); adiff[2] = _mm_cvtsi128_si32(diffs[2]); adiff[3] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[0], 8)); adiff[4] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[1], 8)); adiff[5] = _mm_cvtsi128_si32(_mm_srli_si128(diffs[2], 8)); pi4_diff = adiff; for(i = 0; i < 2; i++) { /*****************************************************************/ /* Applying bias, to make the diff comparision more robust. */ /*****************************************************************/ pi4_diff[0] *= EDGE_BIAS_0; pi4_diff[1] *= EDGE_BIAS_1; pi4_diff[2] *= EDGE_BIAS_1; /*****************************************************************/ /* comapring the diffs */ /*****************************************************************/ dir_45_le_90 = (pi4_diff[2] <= pi4_diff[0]); dir_45_le_135 = (pi4_diff[2] <= pi4_diff[1]); dir_135_le_90 = (pi4_diff[1] <= pi4_diff[0]); /*****************************************************************/ /* Direction selection. 
*/ /*****************************************************************/ shifts[i] = 0; if(1 == dir_45_le_135) { if(1 == dir_45_le_90) shifts[i] = 1; } else { if(1 == dir_135_le_90) shifts[i] = -1; } pi4_diff += 3; } /*****************************************************************/ /* Directional interpolation */ /*****************************************************************/ for(i = 0; i < SUB_BLK_HT / 2; i++) { __m128i dst; __m128i row1, row2; UWORD32 *pu4_row1th, *pu4_row1tl; UWORD32 *pu4_row2th, *pu4_row2tl; UWORD32 *pu4_row1bh, *pu4_row1bl; UWORD32 *pu4_row2bh, *pu4_row2bl; pu4_row1th = (UWORD32 *)(pu1_src + shifts[0]); pu4_row1tl = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]); pu1_src += src_strd; pu4_row2th = (UWORD32 *)(pu1_src + shifts[0]); pu4_row2tl = (UWORD32 *)(pu1_src + SUB_BLK_WD + shifts[1]); pu4_row1bh = (UWORD32 *)(pu1_src - shifts[0]); pu4_row1bl = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]); pu1_src += src_strd; pu4_row2bh = (UWORD32 *)(pu1_src - shifts[0]); pu4_row2bl = (UWORD32 *)(pu1_src + SUB_BLK_WD - shifts[1]); row1 = _mm_set_epi32(*pu4_row1tl, *pu4_row1th, *pu4_row2tl, *pu4_row2th); row2 = _mm_set_epi32(*pu4_row1bl, *pu4_row1bh, *pu4_row2bl, *pu4_row2bh); dst = _mm_avg_epu8(row1, row2); _mm_storel_epi64((__m128i *)pu1_out, _mm_srli_si128(dst, 8)); pu1_out += out_strd; _mm_storel_epi64((__m128i *)pu1_out, dst); pu1_out += out_strd; } }
void precompute_partition_info_sums_intrin_ssse3(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) { const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; unsigned partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order); unsigned e1, e3; __m128i mm_res, mm_sum; if(bps <= 16) { FLAC__uint32 abs_residual_partition_sum; for(partition = residual_sample = 0; partition < partitions; partition++) { end += default_partition_samples; abs_residual_partition_sum = 0; mm_sum = _mm_setzero_si128(); e1 = (residual_sample + 3) & ~3; e3 = end & ~3; if(e1 > end) e1 = end; /* try flac -l 1 -b 16 and you'll be here */ /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast*/ for( ; residual_sample < e1; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */ for( ; residual_sample < e3; residual_sample+=4) { mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample)); mm_res = _mm_abs_epi32(mm_res); mm_sum = _mm_add_epi32(mm_sum, mm_res); } mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); mm_sum = _mm_hadd_epi32(mm_sum, mm_sum); abs_residual_partition_sum += _mm_cvtsi128_si32(mm_sum); for( ; residual_sample < end; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); abs_residual_partition_sums[partition] = abs_residual_partition_sum; } } else { /* have to pessimistically use 64 bits for accumulator */ FLAC__uint64 abs_residual_partition_sum; for(partition = residual_sample = 0; partition < partitions; partition++) { end += default_partition_samples; abs_residual_partition_sum = 0; mm_sum = _mm_setzero_si128(); e1 = (residual_sample + 1) & ~1; e3 = end & ~1; FLAC__ASSERT(e1 <= end); for( ; residual_sample < e1; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); for( ; residual_sample < e3; residual_sample+=2) { mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /* 0 0 r1 r0 */ mm_res = _mm_abs_epi32(mm_res); /* 0 0 |r1| |r0| */ mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8)); #ifdef FLAC__CPU_IA32 #ifdef _MSC_VER abs_residual_partition_sum += mm_sum.m128i_u64[0]; #else { FLAC__uint64 tmp[2]; _mm_storel_epi64((__m128i *)tmp, mm_sum); abs_residual_partition_sum += tmp[0]; } #endif #else abs_residual_partition_sum += _mm_cvtsi128_si64(mm_sum); #endif for( ; residual_sample < end; residual_sample++) abs_residual_partition_sum += abs(residual[residual_sample]); abs_residual_partition_sums[partition] = abs_residual_partition_sum; } } } /* now merge partitions for lower orders */ { unsigned from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { unsigned i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + 
abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } }
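/* Scalar sketch of the max_partition_order pass above: each partition sum is
 * simply the sum of |residual[i]| over that partition's samples; the real
 * routine then folds these sums into the coarser partition orders. Sketch
 * only, covering neither the 32-bit/64-bit accumulator split nor the merge. */
static void partition_sums_ref(const FLAC__int32 residual[],
                               FLAC__uint64 sums[],
                               unsigned residual_samples,
                               unsigned predictor_order,
                               unsigned max_partition_order) {
  const unsigned default_partition_samples =
      (residual_samples + predictor_order) >> max_partition_order;
  const unsigned partitions = 1u << max_partition_order;
  unsigned partition, sample = 0, end = (unsigned)(-(int)predictor_order);
  for (partition = 0; partition < partitions; partition++) {
    FLAC__uint64 sum = 0;
    end += default_partition_samples;
    for (; sample < end; sample++)
      sum += (residual[sample] < 0) ? -(FLAC__int64)residual[sample]
                                    : (FLAC__int64)residual[sample];
    sums[partition] = sum;
  }
}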
void av1_fdct16x16_1_sse2(const int16_t *input, tran_low_t *output, int stride) { __m128i in0, in1, in2, in3; __m128i u0, u1; __m128i sum = _mm_setzero_si128(); int i; for (i = 0; i < 2; ++i) { input += 8 * i; in0 = _mm_load_si128((const __m128i *)(input + 0 * stride)); in1 = _mm_load_si128((const __m128i *)(input + 1 * stride)); in2 = _mm_load_si128((const __m128i *)(input + 2 * stride)); in3 = _mm_load_si128((const __m128i *)(input + 3 * stride)); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 4 * stride)); in1 = _mm_load_si128((const __m128i *)(input + 5 * stride)); in2 = _mm_load_si128((const __m128i *)(input + 6 * stride)); in3 = _mm_load_si128((const __m128i *)(input + 7 * stride)); sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 8 * stride)); in1 = _mm_load_si128((const __m128i *)(input + 9 * stride)); in2 = _mm_load_si128((const __m128i *)(input + 10 * stride)); in3 = _mm_load_si128((const __m128i *)(input + 11 * stride)); sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); in0 = _mm_load_si128((const __m128i *)(input + 12 * stride)); in1 = _mm_load_si128((const __m128i *)(input + 13 * stride)); in2 = _mm_load_si128((const __m128i *)(input + 14 * stride)); in3 = _mm_load_si128((const __m128i *)(input + 15 * stride)); sum = _mm_add_epi16(sum, u1); u0 = _mm_add_epi16(in0, in1); u1 = _mm_add_epi16(in2, in3); sum = _mm_add_epi16(sum, u0); sum = _mm_add_epi16(sum, u1); } u0 = _mm_setzero_si128(); in0 = _mm_unpacklo_epi16(u0, sum); in1 = _mm_unpackhi_epi16(u0, sum); in0 = _mm_srai_epi32(in0, 16); in1 = _mm_srai_epi32(in1, 16); sum = _mm_add_epi32(in0, in1); in0 = _mm_unpacklo_epi32(sum, u0); in1 = _mm_unpackhi_epi32(sum, u0); sum = _mm_add_epi32(in0, in1); in0 = _mm_srli_si128(sum, 8); in1 = _mm_add_epi32(sum, in0); in1 = _mm_srai_epi32(in1, 1); store_output(&in1, output); }
/** * See av1_wedge_sse_from_residuals_c */ uint64_t av1_wedge_sse_from_residuals_sse2(const int16_t *r1, const int16_t *d, const uint8_t *m, int N) { int n = -N; int n8 = n + 8; uint64_t csse; const __m128i v_mask_max_w = _mm_set1_epi16(MAX_MASK_VALUE); const __m128i v_zext_q = _mm_set_epi32(0, 0xffffffff, 0, 0xffffffff); __m128i v_acc0_q = _mm_setzero_si128(); assert(N % 64 == 0); r1 += N; d += N; m += N; do { const __m128i v_r0_w = xx_load_128(r1 + n); const __m128i v_r1_w = xx_load_128(r1 + n8); const __m128i v_d0_w = xx_load_128(d + n); const __m128i v_d1_w = xx_load_128(d + n8); const __m128i v_m01_b = xx_load_128(m + n); const __m128i v_rd0l_w = _mm_unpacklo_epi16(v_d0_w, v_r0_w); const __m128i v_rd0h_w = _mm_unpackhi_epi16(v_d0_w, v_r0_w); const __m128i v_rd1l_w = _mm_unpacklo_epi16(v_d1_w, v_r1_w); const __m128i v_rd1h_w = _mm_unpackhi_epi16(v_d1_w, v_r1_w); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m0l_w = _mm_unpacklo_epi16(v_m0_w, v_mask_max_w); const __m128i v_m0h_w = _mm_unpackhi_epi16(v_m0_w, v_mask_max_w); const __m128i v_m1l_w = _mm_unpacklo_epi16(v_m1_w, v_mask_max_w); const __m128i v_m1h_w = _mm_unpackhi_epi16(v_m1_w, v_mask_max_w); const __m128i v_t0l_d = _mm_madd_epi16(v_rd0l_w, v_m0l_w); const __m128i v_t0h_d = _mm_madd_epi16(v_rd0h_w, v_m0h_w); const __m128i v_t1l_d = _mm_madd_epi16(v_rd1l_w, v_m1l_w); const __m128i v_t1h_d = _mm_madd_epi16(v_rd1h_w, v_m1h_w); const __m128i v_t0_w = _mm_packs_epi32(v_t0l_d, v_t0h_d); const __m128i v_t1_w = _mm_packs_epi32(v_t1l_d, v_t1h_d); const __m128i v_sq0_d = _mm_madd_epi16(v_t0_w, v_t0_w); const __m128i v_sq1_d = _mm_madd_epi16(v_t1_w, v_t1_w); const __m128i v_sum0_q = _mm_add_epi64(_mm_and_si128(v_sq0_d, v_zext_q), _mm_srli_epi64(v_sq0_d, 32)); const __m128i v_sum1_q = _mm_add_epi64(_mm_and_si128(v_sq1_d, v_zext_q), _mm_srli_epi64(v_sq1_d, 32)); v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum0_q); v_acc0_q = _mm_add_epi64(v_acc0_q, v_sum1_q); n8 += 16; n += 16; } while (n); v_acc0_q = _mm_add_epi64(v_acc0_q, _mm_srli_si128(v_acc0_q, 8)); #if ARCH_X86_64 csse = (uint64_t)_mm_cvtsi128_si64(v_acc0_q); #else xx_storel_64(&csse, v_acc0_q); #endif return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS); }
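/* Scalar sketch of what the vector loop above computes: the per-sample term
 * is MAX_MASK_VALUE * r1[i] + m[i] * d[i], saturated to the int16 range
 * (mirroring the _mm_packs_epi32 step), squared and accumulated. See
 * av1_wedge_sse_from_residuals_c for the authoritative reference. */
static uint64_t wedge_sse_ref(const int16_t *r1, const int16_t *d,
                              const uint8_t *m, int N) {
  uint64_t csse = 0;
  int i;
  for (i = 0; i < N; ++i) {
    int32_t t = MAX_MASK_VALUE * r1[i] + m[i] * d[i];
    if (t > INT16_MAX) t = INT16_MAX;     /* saturating pack */
    if (t < INT16_MIN) t = INT16_MIN;
    csse += (uint64_t)((int64_t)t * t);
  }
  return ROUND_POWER_OF_TWO(csse, 2 * WEDGE_WEIGHT_BITS);
}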
void fb_slvn_low(dig_t *c, const dig_t *a) { int i; dig_t *p, u0, u1, u2, u3; void *tab = fb_poly_get_slv(); __m128i m0, m1, m2, m3, m4, sqrt0, sqrt1, mask0, mask1, mask2, r0, r1, t0, t1, perm; perm = _mm_set_epi32(0x0F0D0B09, 0x07050301, 0x0E0C0A08, 0x06040200); mask2 = _mm_set_epi32(0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000); mask1 = _mm_set_epi32(0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0, 0xF0F0F0F0); mask0 = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); sqrt0 = _mm_set_epi32(0x03020302, 0x01000100, 0x03020302, 0x01000100); sqrt1 = _mm_set_epi32(0x0c080c08, 0x04000400, 0x0c080c08, 0x04000400); t0 = _mm_load_si128((__m128i *)a); t1 = _mm_load_si128((__m128i *)(a + 2)); r0 = r1 = _mm_setzero_si128(); m0 = _mm_shuffle_epi8(t1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_and_si128(m1, mask2); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m0 = _mm_and_si128(t0, mask2); m0 = _mm_shuffle_epi8(m0, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_srli_si128(m1, 8); m1 = _mm_andnot_si128(mask2, m1); m2 = _mm_slli_epi64(m2, 4); m1 = _mm_xor_si128(m1, m2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFFFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 6); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0xFFFF)); m0 = _mm_shuffle_epi8(m1, perm); m1 = _mm_and_si128(m0, mask0); m2 = _mm_and_si128(m0, mask1); m2 = _mm_srli_epi64(m2, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m1 = _mm_xor_si128(m1, m2); m2 = _mm_slli_si128(m1, 8); m1 = _mm_slli_epi64(m1, 4); m1 = _mm_xor_si128(m1, m2); m1 = _mm_srli_si128(m1, 7); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_si128(t0, 1); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x55)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x33)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 2)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x0F)); m1 = _mm_slli_epi64(m1, 4); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 4); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x5)); m1 = _mm_or_si128(m1, _mm_srli_epi64(m1, 1)); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x3)); m1 = _mm_slli_epi64(m1, 2); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); m1 = _mm_srli_epi64(t0, 2); m1 = _mm_and_si128(m1, _mm_set_epi32(0, 0, 0, 0x1)); m1 = _mm_slli_epi64(m1, 1); t0 = _mm_xor_si128(t0, m1); r0 = _mm_xor_si128(r0, m1); sqrt0 = _mm_set_epi32(0x03030202, 0x03030202, 0x01010000, 0x01010000); sqrt1 = _mm_set_epi32(0x0C0C0808, 0x0C0C0808, 0x04040000, 0x04040000); m1 = _mm_and_si128(t0, mask0); m2 = _mm_and_si128(t0, mask1); m3 = _mm_and_si128(t1, mask0); m4 = _mm_and_si128(t1, mask1); m2 = 
_mm_srli_epi64(m2, 4); m4 = _mm_srli_epi64(m4, 4); m2 = _mm_shuffle_epi8(sqrt1, m2); m1 = _mm_shuffle_epi8(sqrt0, m1); m4 = _mm_shuffle_epi8(sqrt1, m4); m3 = _mm_shuffle_epi8(sqrt0, m3); m1 = _mm_or_si128(m1, m2); m3 = _mm_or_si128(m3, m4); #ifndef __PCLMUL__ align dig_t x[2]; _mm_store_si128((__m128i *)x, m1); u0 = x[0]; u1 = x[1]; _mm_store_si128((__m128i *)x, m3); u2 = x[0]; u3 = x[1]; #else u0 = _mm_extract_epi64(m1, 0); u1 = _mm_extract_epi64(m1, 1); u2 = _mm_extract_epi64(m3, 0); u3 = _mm_extract_epi64(m3, 1); #endif for (i = 0; i < 8; i++) { p = (dig_t *)(tab + (16 * i + (u0 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u0 >>= 8; p = (dig_t *)(tab + (16 * (i + 8) + (u1 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u1 >>= 8; p = (dig_t *)(tab + (16 * (i + 16) + (u2 & 0x0F)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u2 >>= 8; p = (dig_t *)(tab + (16 * (i + 24) + (u3 & 0xF)) * sizeof(fb_st)); r0 = _mm_xor_si128(r0, *(__m128i *)(p)); r1 = _mm_xor_si128(r1, *(__m128i *)(p + 2)); u3 >>= 8; } _mm_store_si128((__m128i *)c, r0); _mm_store_si128((__m128i *)(c + 2), r1); }
/** * See av1_wedge_sign_from_residuals_c */ int av1_wedge_sign_from_residuals_sse2(const int16_t *ds, const uint8_t *m, int N, int64_t limit) { int64_t acc; __m128i v_sign_d; __m128i v_acc0_d = _mm_setzero_si128(); __m128i v_acc1_d = _mm_setzero_si128(); __m128i v_acc_q; // Input size limited to 8192 by the use of 32 bit accumulators and m // being between [0, 64]. Overflow might happen at larger sizes, // though it is practically impossible on real video input. assert(N < 8192); assert(N % 64 == 0); do { const __m128i v_m01_b = xx_load_128(m); const __m128i v_m23_b = xx_load_128(m + 16); const __m128i v_m45_b = xx_load_128(m + 32); const __m128i v_m67_b = xx_load_128(m + 48); const __m128i v_d0_w = xx_load_128(ds); const __m128i v_d1_w = xx_load_128(ds + 8); const __m128i v_d2_w = xx_load_128(ds + 16); const __m128i v_d3_w = xx_load_128(ds + 24); const __m128i v_d4_w = xx_load_128(ds + 32); const __m128i v_d5_w = xx_load_128(ds + 40); const __m128i v_d6_w = xx_load_128(ds + 48); const __m128i v_d7_w = xx_load_128(ds + 56); const __m128i v_m0_w = _mm_unpacklo_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m1_w = _mm_unpackhi_epi8(v_m01_b, _mm_setzero_si128()); const __m128i v_m2_w = _mm_unpacklo_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m3_w = _mm_unpackhi_epi8(v_m23_b, _mm_setzero_si128()); const __m128i v_m4_w = _mm_unpacklo_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m5_w = _mm_unpackhi_epi8(v_m45_b, _mm_setzero_si128()); const __m128i v_m6_w = _mm_unpacklo_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_m7_w = _mm_unpackhi_epi8(v_m67_b, _mm_setzero_si128()); const __m128i v_p0_d = _mm_madd_epi16(v_d0_w, v_m0_w); const __m128i v_p1_d = _mm_madd_epi16(v_d1_w, v_m1_w); const __m128i v_p2_d = _mm_madd_epi16(v_d2_w, v_m2_w); const __m128i v_p3_d = _mm_madd_epi16(v_d3_w, v_m3_w); const __m128i v_p4_d = _mm_madd_epi16(v_d4_w, v_m4_w); const __m128i v_p5_d = _mm_madd_epi16(v_d5_w, v_m5_w); const __m128i v_p6_d = _mm_madd_epi16(v_d6_w, v_m6_w); const __m128i v_p7_d = _mm_madd_epi16(v_d7_w, v_m7_w); const __m128i v_p01_d = _mm_add_epi32(v_p0_d, v_p1_d); const __m128i v_p23_d = _mm_add_epi32(v_p2_d, v_p3_d); const __m128i v_p45_d = _mm_add_epi32(v_p4_d, v_p5_d); const __m128i v_p67_d = _mm_add_epi32(v_p6_d, v_p7_d); const __m128i v_p0123_d = _mm_add_epi32(v_p01_d, v_p23_d); const __m128i v_p4567_d = _mm_add_epi32(v_p45_d, v_p67_d); v_acc0_d = _mm_add_epi32(v_acc0_d, v_p0123_d); v_acc1_d = _mm_add_epi32(v_acc1_d, v_p4567_d); ds += 64; m += 64; N -= 64; } while (N); v_sign_d = _mm_cmplt_epi32(v_acc0_d, _mm_setzero_si128()); v_acc0_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc0_d, v_sign_d), _mm_unpackhi_epi32(v_acc0_d, v_sign_d)); v_sign_d = _mm_cmplt_epi32(v_acc1_d, _mm_setzero_si128()); v_acc1_d = _mm_add_epi64(_mm_unpacklo_epi32(v_acc1_d, v_sign_d), _mm_unpackhi_epi32(v_acc1_d, v_sign_d)); v_acc_q = _mm_add_epi64(v_acc0_d, v_acc1_d); v_acc_q = _mm_add_epi64(v_acc_q, _mm_srli_si128(v_acc_q, 8)); #if ARCH_X86_64 acc = (uint64_t)_mm_cvtsi128_si64(v_acc_q); #else xx_storel_64(&acc, v_acc_q); #endif return acc > limit; }
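/* Scalar sketch of the sign decision above: accumulate sum(ds[i] * m[i]) in
 * 64 bits and compare against limit (see av1_wedge_sign_from_residuals_c for
 * the reference implementation). */
static int wedge_sign_ref(const int16_t *ds, const uint8_t *m, int N,
                          int64_t limit) {
  int64_t acc = 0;
  int i;
  for (i = 0; i < N; ++i) acc += (int64_t)ds[i] * m[i];
  return acc > limit;
}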
/***************************************************************************** * This function utilises 3 properties of the cost function lookup tables, * * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in * * vp9_encoder.c. * * For the joint cost: * * - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3] * * For the component costs: * * - For all i: mvsadcost[0][i] == mvsadcost[1][i] * * (Equal costs for both components) * * - For all i: mvsadcost[0][i] == mvsadcost[0][-i] * * (Cost function is even) * * If these do not hold, then this function cannot be used without * * modification, in which case you can revert to using the C implementation, * * which does not rely on these properties. * *****************************************************************************/ int vp9_diamond_search_sad_avx(const MACROBLOCK *x, const search_site_config *cfg, MV *ref_mv, MV *best_mv, int search_param, int sad_per_bit, int *num00, const vp9_variance_fn_ptr_t *fn_ptr, const MV *center_mv) { const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max); const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int); const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min); const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int); const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit); const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]); const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]); // search_param determines the length of the initial step and hence the number // of iterations. // 0 = initial step (MAX_FIRST_STEP) pel // 1 = (MAX_FIRST_STEP/2) pel, // 2 = (MAX_FIRST_STEP/4) pel... const MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param]; const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param]; const int tot_steps = cfg->total_steps - search_param; const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3, center_mv->col >> 3); const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int); const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row); const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col); int_mv bmv = pack_int_mv(ref_row, ref_col); int_mv new_bmv = bmv; __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int); const int what_stride = x->plane[0].src.stride; const int in_what_stride = x->e_mbd.plane[0].pre[0].stride; const uint8_t *const what = x->plane[0].src.buf; const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf + ref_row * in_what_stride + ref_col; // Work out the start point for the search const uint8_t *best_address = in_what; const uint8_t *new_best_address = best_address; #if ARCH_X86_64 __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif unsigned int best_sad; int i; int j; int step; // Check the prerequisite cost function properties that are easy to check // in an assert. See the function-level documentation for details on all // prerequisites. 
assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]); assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]); // Check the starting position best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride); best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit); *num00 = 0; for (i = 0, step = 0; step < tot_steps; step++) { for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) { __m128i v_sad_d; __m128i v_cost_d; __m128i v_outside_d; __m128i v_inside_d; __m128i v_diff_mv_w; #if ARCH_X86_64 __m128i v_blocka[2]; #else __m128i v_blocka[1]; #endif // Compute the candidate motion vectors const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]); const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w); // Clamp them to the search bounds __m128i v_these_mv_clamp_w = v_these_mv_w; v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w); v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w); // The ones that did not change are inside the search area v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w); // If none of them are inside, then move on if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) { continue; } // The inverse mask indicates which of the MVs are outside v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff)); // Shift right to keep the sign bit clear, we will use this later // to set the cost to the maximum value. v_outside_d = _mm_srli_epi32(v_outside_d, 1); // Compute the difference MV v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv); // We utilise the fact that the cost function is even, and use the // absolute difference. This allows us to use unsigned indexes later // and reduces cache pressure somewhat as only a half of the table // is ever referenced. v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w); // Compute the SIMD pointer offsets. 
{ #if ARCH_X86_64 // sizeof(intptr_t) == 8 // Load the offsets __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]); __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]); // Set the ones falling outside to zero v_bo10_q = _mm_and_si128(v_bo10_q, _mm_cvtepi32_epi64(v_inside_d)); v_bo32_q = _mm_and_si128(v_bo32_q, _mm_unpackhi_epi32(v_inside_d, v_inside_d)); // Compute the candidate addresses v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q); v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q); #else // ARCH_X86 // sizeof(intptr_t) == 4 __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]); v_bo_d = _mm_and_si128(v_bo_d, v_inside_d); v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d); #endif } fn_ptr->sdx4df(what, what_stride, (const uint8_t **)&v_blocka[0], in_what_stride, (uint32_t*)&v_sad_d); // Look up the component cost of the residual motion vector { const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0); const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1); const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2); const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3); const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4); const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5); const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6); const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7); // Note: This is a use case for vpgather in AVX2 const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0]; const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1]; const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2]; const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3]; __m128i v_cost_10_d, v_cost_32_d; v_cost_10_d = _mm_cvtsi32_si128(cost0); v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1); v_cost_32_d = _mm_cvtsi32_si128(cost2); v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1); v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d); } // Now add in the joint cost { const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w, _mm_setzero_si128()); const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d, v_joint_cost_0_d, v_sel_d); v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d); } // Multiply by sad_per_bit v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d); // ROUND_POWER_OF_TWO(v_cost_d, 8) v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80)); v_cost_d = _mm_srai_epi32(v_cost_d, 8); // Add the cost to the sad v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d); // Make the motion vectors outside the search area have max cost // by or'ing in the comparison mask, this way the minimum search won't // pick them. v_sad_d = _mm_or_si128(v_sad_d, v_outside_d); // Find the minimum value and index horizontally in v_sad_d { // Try speculatively on 16 bits, so we can use the minpos intrinsic const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d); const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w); uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0); uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1); // If the local best value is not saturated, just use it, otherwise // find the horizontal minimum again the hard way on 32 bits. // This is executed rarely. 
if (__unlikely__(local_best_sad == 0xffff)) { __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d; v_loval_d = v_sad_d; v_loidx_d = _mm_set_epi32(3, 2, 1, 0); v_hival_d = _mm_srli_si128(v_loval_d, 8); v_hiidx_d = _mm_srli_si128(v_loidx_d, 8); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); v_hival_d = _mm_srli_si128(v_loval_d, 4); v_hiidx_d = _mm_srli_si128(v_loidx_d, 4); v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d); v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d); v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d); local_best_sad = _mm_extract_epi32(v_loval_d, 0); local_best_idx = _mm_extract_epi32(v_loidx_d, 0); } // Update the global minimum if the local minimum is smaller if (__likely__(local_best_sad < best_sad)) { new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx]; new_best_address = ((const uint8_t **)v_blocka)[local_best_idx]; best_sad = local_best_sad; } } } bmv = new_bmv; best_address = new_best_address; v_bmv_w = _mm_set1_epi32(bmv.as_int); #if ARCH_X86_64 v_ba_q = _mm_set1_epi64x((intptr_t)best_address); #else v_ba_d = _mm_set1_epi32((intptr_t)best_address); #endif if (__unlikely__(best_address == in_what)) { (*num00)++; } } *best_mv = bmv.as_mv; return best_sad; }
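/* Standalone sketch of the horizontal argmin pattern used above: try the
 * 16-bit phminposuw fast path and fall back to a 32-bit reduction when the
 * packed value saturates. Assumes non-negative 32-bit inputs (the search code
 * clears the sign bit beforehand); the helper name is illustrative, not
 * libvpx API. */
static void hmin_epu32_with_index(__m128i v, uint32_t *min_val,
                                  uint32_t *min_idx) {
  const __m128i v_w = _mm_packus_epi32(v, v);      /* saturate to 16 bits */
  const __m128i minp = _mm_minpos_epu16(v_w);
  uint32_t val = (uint16_t)_mm_extract_epi16(minp, 0);
  uint32_t idx = (uint16_t)_mm_extract_epi16(minp, 1);
  if (val == 0xffff) {                             /* saturated: redo in 32 bits */
    uint32_t lanes[4];
    uint32_t i;
    _mm_storeu_si128((__m128i *)lanes, v);
    val = lanes[0];
    idx = 0;
    for (i = 1; i < 4; ++i) {
      if (lanes[i] < val) { val = lanes[i]; idx = i; }
    }
  }
  *min_val = val;
  *min_idx = idx;
}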
wchar_t * __cdecl wcsstr ( const wchar_t * wcs1, const wchar_t * wcs2 ) { const wchar_t *stmp1, *stmp2; __m128i zero, pattern, characters1, characters2; // An empty search string matches everything. if (0 == *wcs2) return (wchar_t *)wcs1; if (__isa_available > __ISA_AVAILABLE_SSE2) { wchar_t c; unsigned i; // Load XMM with first characters of wcs2. if (XMM_PAGE_SAFE(wcs2)) { pattern = _mm_loadu_si128((__m128i*)wcs2); } else { pattern = _mm_xor_si128(pattern, pattern); c = *(stmp2 = wcs2); for (i = 0; i < XMM_CHARS; ++i) { pattern = _mm_srli_si128(pattern, sizeof(wchar_t)); pattern = _mm_insert_epi16(pattern, c, (XMM_CHARS-1)); if (0 != c) c = *++stmp2; } } for(;;) { // Check for partial match, if none step forward and continue. if (XMM_PAGE_SAFE(wcs1)) { characters1 = _mm_loadu_si128((__m128i*)wcs1); // If no potential match or end found, try next XMMWORD. if (_mm_cmpistra(pattern, characters1, f_srch_sub)) { wcs1 += XMM_CHARS; continue; } // If end found there was no match. else if (!_mm_cmpistrc(pattern, characters1, f_srch_sub)) { return NULL; } // Get position of potential match. wcs1 += _mm_cmpistri(pattern, characters1, f_srch_sub); } else { // If end of string found there was no match. if (0 == *wcs1) { return NULL; } // If current character doesn't match first character // of search string try next character. if (*wcs1 != *wcs2) { ++wcs1; continue; } } // Potential match, compare to check for full match. stmp1 = wcs1; stmp2 = wcs2; for (;;) { // If next XMMWORD is page-safe for each string // do a XMMWORD comparison. if (XMM_PAGE_SAFE(stmp1) && XMM_PAGE_SAFE(stmp2)) { characters1 = _mm_loadu_si128((__m128i*)stmp1); characters2 = _mm_loadu_si128((__m128i*)stmp2); // If unequal then no match found. if (!_mm_cmpistro(characters2, characters1, f_srch_sub)) { break; } // If end of search string then match found. else if (_mm_cmpistrs(characters2, characters1, f_srch_sub)) { return (wchar_t *)wcs1; } stmp1 += XMM_CHARS; stmp2 += XMM_CHARS; continue; } // Compare next character. else { // If end of search string then match found. if (0 == *stmp2) { return (wchar_t *)wcs1; } // If unequal then no match found. if (*stmp1 != *stmp2) { break; } // Character matched - try next character. ++stmp1; ++stmp2; } } // Match not found at current position, try next. ++wcs1; } } else if (__isa_available == __ISA_AVAILABLE_SSE2) { unsigned offset, mask; // Build search pattern and zero pattern. Search pattern is // XMMWORD with the initial character of the search string // in every position. Zero pattern has a zero termination // character in every position. pattern = _mm_cvtsi32_si128(wcs2[0]); pattern = _mm_shufflelo_epi16(pattern, 0); pattern = _mm_shuffle_epi32(pattern, 0); zero = _mm_xor_si128(zero, zero); // Main loop for searching wcs1. for (;;) { // If XMM check is safe advance wcs1 to the next // possible match or end. if (XMM_PAGE_SAFE(wcs1)) { characters1 = _mm_loadu_si128((__m128i*)wcs1); characters2 = _mm_cmpeq_epi16(characters1, zero); characters1 = _mm_cmpeq_epi16(characters1, pattern); characters1 = _mm_or_si128(characters1, characters2); mask = _mm_movemask_epi8(characters1); // If no character match or end found try next XMMWORD. if (0 == mask) { wcs1 += XMM_CHARS; continue; } // Advance wcs1 pointer to next possible match or end. _BitScanForward(&offset, mask); wcs1 += (offset/sizeof(wchar_t)); } // If at the end of wcs1, then no match found. if (0 == wcs1[0]) return NULL; // If a first-character match is found compare // strings to look for match. 
if (wcs2[0] == wcs1[0]) { stmp1 = wcs1; stmp2 = wcs2; for (;;) { // If aligned as specified advance to next // possible difference or wcs2 end. if (XMM_PAGE_SAFE(stmp2) && XMM_PAGE_SAFE(stmp1)) { characters1 = _mm_loadu_si128((__m128i*)stmp1); characters2 = _mm_loadu_si128((__m128i*)stmp2); characters1 = _mm_cmpeq_epi16(characters1, characters2); characters2 = _mm_cmpeq_epi16(characters2, zero); characters1 = _mm_cmpeq_epi16(characters1, zero); characters1 = _mm_or_si128(characters1, characters2); mask = _mm_movemask_epi8(characters1); // If mask is zero there is no difference and // wcs2 does not end in this XMMWORD. Continue // with next XMMWORD. if (0 == mask) { stmp1 += XMM_CHARS; stmp2 += XMM_CHARS; continue; } // Advance string pointers to next significant // character. _BitScanForward(&offset, mask); stmp1 += (offset/sizeof(wchar_t)); stmp2 += (offset/sizeof(wchar_t)); } // If we've reached the end of wcs2 then a match // has been found. if (0 == stmp2[0]) return (wchar_t *)wcs1; // If we've reached a difference then no match // was found. if (stmp1[0] != stmp2[0]) break; // Otherwise advance to next character and try // again. ++stmp1; ++stmp2; } } // Current character wasn't a match, try next character. ++wcs1; } } else { const wchar_t *cp = wcs1; const wchar_t *s1, *s2; while (*cp) { s1 = cp; s2 = wcs2; while ( *s1 && *s2 && !(*s1-*s2) ) s1++, s2++; if (!*s2) return (wchar_t *) cp; cp++; } return NULL; } }
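/* A standalone sketch (not the CRT implementation above; the helper name is ours) of its SSE2 scanning step: broadcast the first character of the search string, compare one 16-byte block of the haystack against both that character and the terminator, and use the byte mask to locate the first lane of interest. The page-safety check is omitted, so the caller must guarantee that the 16-byte load is valid, and a 2-byte wchar_t (as on Windows) is assumed. Returns the character index of the first match or NUL, or -1 if neither occurs in these 8 characters. */
#include <emmintrin.h>
#include <wchar.h>
#if defined(_MSC_VER)
#include <intrin.h>
#endif

static int first_match_or_nul_index(const wchar_t *p, wchar_t c) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i pattern = _mm_set1_epi16((short)c);
  const __m128i chars = _mm_loadu_si128((const __m128i *)p);
  const __m128i hits = _mm_or_si128(_mm_cmpeq_epi16(chars, pattern),
                                    _mm_cmpeq_epi16(chars, zero));
  const unsigned mask = (unsigned)_mm_movemask_epi8(hits);
  if (mask == 0) return -1;  /* neither c nor NUL in these 8 characters */
#if defined(_MSC_VER)
  {
    unsigned long bit;
    _BitScanForward(&bit, mask);
    return (int)(bit / 2);   /* byte index -> character index */
  }
#else
  return __builtin_ctz(mask) / 2;
#endif
}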
uint32_t FLAC__fixed_compute_best_predictor_intrin_sse2(const FLAC__int32 data[], uint32_t data_len, float residual_bits_per_sample[FLAC__MAX_FIXED_ORDER + 1]) { FLAC__uint32 total_error_0, total_error_1, total_error_2, total_error_3, total_error_4; uint32_t i, order; __m128i total_err0, total_err1, total_err2; { FLAC__int32 itmp; __m128i last_error; last_error = _mm_cvtsi32_si128(data[-1]); // 0 0 0 le0 itmp = data[-2]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 0 le0 le1 itmp -= data[-3]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // 0 le0 le1 le2 itmp -= data[-3] - data[-4]; last_error = _mm_shuffle_epi32(last_error, _MM_SHUFFLE(2,1,0,0)); last_error = _mm_sub_epi32(last_error, _mm_cvtsi32_si128(itmp)); // le0 le1 le2 le3 total_err0 = total_err1 = _mm_setzero_si128(); for(i = 0; i < data_len; i++) { __m128i err0, err1, tmp; err0 = _mm_cvtsi32_si128(data[i]); // 0 0 0 e0 err1 = _mm_shuffle_epi32(err0, _MM_SHUFFLE(0,0,0,0)); // e0 e0 e0 e0 #if 1 /* OPT_SSE */ err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 le0 le1 le2 err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 0 le0 le1 err1 = _mm_sub_epi32(err1, last_error); last_error = _mm_srli_si128(last_error, 4); // 0 0 0 le0 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4 #else last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 8)); // le0 le1 le2+le0 le3+le1 last_error = _mm_add_epi32(last_error, _mm_srli_si128(last_error, 4)); // le0 le1+le0 le2+le0+le1 le3+le1+le2+le0 err1 = _mm_sub_epi32(err1, last_error); // e1 e2 e3 e4 #endif tmp = _mm_slli_si128(err0, 12); // e0 0 0 0 last_error = _mm_srli_si128(err1, 4); // 0 e1 e2 e3 last_error = _mm_or_si128(last_error, tmp); // e0 e1 e2 e3 tmp = _mm_srai_epi32(err0, 31); err0 = _mm_xor_si128(err0, tmp); err0 = _mm_sub_epi32(err0, tmp); tmp = _mm_srai_epi32(err1, 31); err1 = _mm_xor_si128(err1, tmp); err1 = _mm_sub_epi32(err1, tmp); total_err0 = _mm_add_epi32(total_err0, err0); // 0 0 0 te0 total_err1 = _mm_add_epi32(total_err1, err1); // te1 te2 te3 te4 } } total_error_0 = _mm_cvtsi128_si32(total_err0); total_err2 = total_err1; // te1 te2 te3 te4 total_err1 = _mm_srli_si128(total_err1, 8); // 0 0 te1 te2 total_error_4 = _mm_cvtsi128_si32(total_err2); total_error_2 = _mm_cvtsi128_si32(total_err1); total_err2 = _mm_srli_si128(total_err2, 4); // 0 te1 te2 te3 total_err1 = _mm_srli_si128(total_err1, 4); // 0 0 0 te1 total_error_3 = _mm_cvtsi128_si32(total_err2); total_error_1 = _mm_cvtsi128_si32(total_err1); /* prefer higher order */ if(total_error_0 < flac_min(flac_min(flac_min(total_error_1, total_error_2), total_error_3), total_error_4)) order = 0; else if(total_error_1 < flac_min(flac_min(total_error_2, total_error_3), total_error_4)) order = 1; else if(total_error_2 < flac_min(total_error_3, total_error_4)) order = 2; else if(total_error_3 < total_error_4) order = 3; else order = 4; /* Estimate the expected number of bits per residual signal sample. 
*/ /* 'total_error*' is linearly related to the variance of the residual */ /* signal, so we use it directly to compute E(|x|) */ FLAC__ASSERT(data_len > 0 || total_error_0 == 0); FLAC__ASSERT(data_len > 0 || total_error_1 == 0); FLAC__ASSERT(data_len > 0 || total_error_2 == 0); FLAC__ASSERT(data_len > 0 || total_error_3 == 0); FLAC__ASSERT(data_len > 0 || total_error_4 == 0); residual_bits_per_sample[0] = (float)((total_error_0 > 0) ? log(M_LN2 * (double)total_error_0 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[1] = (float)((total_error_1 > 0) ? log(M_LN2 * (double)total_error_1 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[2] = (float)((total_error_2 > 0) ? log(M_LN2 * (double)total_error_2 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[3] = (float)((total_error_3 > 0) ? log(M_LN2 * (double)total_error_3 / (double)data_len) / M_LN2 : 0.0); residual_bits_per_sample[4] = (float)((total_error_4 > 0) ? log(M_LN2 * (double)total_error_4 / (double)data_len) / M_LN2 : 0.0); return order; }
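/* A scalar sketch of the estimate computed at the end of the function above: total_error_* is a sum of absolute residuals, so total_error / data_len stands in for E(|x|), and the expected bits per residual sample is taken as log2(ln(2) * E(|x|)), mirroring the expression used above. The helper name is ours; on MSVC, M_LN2 may additionally require _USE_MATH_DEFINES. */
#include <math.h>
#include <stdint.h>

static float estimate_residual_bits_per_sample(uint64_t total_abs_error,
                                               uint32_t data_len) {
  if (total_abs_error == 0 || data_len == 0)
    return 0.0f;
  return (float)(log(M_LN2 * (double)total_abs_error / (double)data_len) / M_LN2);
}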
void matrix_vector_mul_SSE_f48_loop_unrolled (fl48** mat, fl48* &vec) { // NOTE: SIZE must be at least 8 and a multiple of 8 fl48* result = new fl48[SIZE]; __m128i load_mask = _mm_set_epi8(11, 10, 9, 8, 7, 6, 255, 255, 5, 4, 3, 2, 1, 0, 255, 255); for(unsigned i=0;i<SIZE;i+=8) { // rows: process 8 at a time because of the loop unroll __m128d running_sum1 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum2 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum3 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum4 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum5 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum6 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum7 = _mm_set1_pd(0.0); // running sum initially 0 __m128d running_sum8 = _mm_set1_pd(0.0); // running sum initially 0 for(unsigned j=0;j<SIZE;j+=2) { // columns: step 2 elements at a time __m128i mat_vect = _mm_loadu_si128((__m128i*) &mat[i][j]); // assumes each row stores its f48 values contiguously, so a 16-byte load covers two of them mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); __m128i vec_elem = _mm_loadu_si128((__m128i*) &vec[j]); vec_elem = _mm_shuffle_epi8(vec_elem, load_mask); __m128d mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum1 = _mm_add_pd(mult,running_sum1); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+1][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum2 = _mm_add_pd(mult,running_sum2); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+2][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum3 = _mm_add_pd(mult,running_sum3); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+3][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum4 = _mm_add_pd(mult,running_sum4); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+4][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum5 = _mm_add_pd(mult,running_sum5); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+5][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum6 = _mm_add_pd(mult,running_sum6); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+6][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum7 = _mm_add_pd(mult,running_sum7); mat_vect = _mm_loadu_si128((__m128i*) &mat[i+7][j]); mat_vect = _mm_shuffle_epi8(mat_vect, load_mask); mult = _mm_mul_pd((__m128d)mat_vect,(__m128d)vec_elem); running_sum8 = _mm_add_pd(mult,running_sum8); } __m128i mask = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0, 15, 14, 13, 12, 11, 10, 9, 8); __m128i sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum1, mask); running_sum1 = _mm_add_pd(running_sum1,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum2, mask); running_sum2 = _mm_add_pd(running_sum2,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum3, mask); running_sum3 = _mm_add_pd(running_sum3,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum4, mask); running_sum4 = _mm_add_pd(running_sum4,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum5, mask); running_sum5 = _mm_add_pd(running_sum5,(__m128d)sum_shuffled); sum_shuffled =
_mm_shuffle_epi8((__m128i)running_sum6, mask); running_sum6 = _mm_add_pd(running_sum6,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum7, mask); running_sum7 = _mm_add_pd(running_sum7,(__m128d)sum_shuffled); sum_shuffled = _mm_shuffle_epi8((__m128i)running_sum8, mask); running_sum8 = _mm_add_pd(running_sum8,(__m128d)sum_shuffled); // merge the eight sums into 4 registers __m128i mask_first = _mm_set_epi8(255,255,255,255,255,255,255,255, 7 ,6 ,5, 4, 3, 2, 1, 0); __m128i mask_second = _mm_set_epi8(7 ,6 ,5, 4, 3, 2, 1, 0, 255,255,255,255,255,255,255,255); running_sum1 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum1, mask_first); running_sum2 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum2, mask_second); running_sum1 = (__m128d)_mm_or_si128((__m128i)running_sum1, (__m128i)running_sum2); running_sum3 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum3, mask_first); running_sum4 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum4, mask_second); running_sum2 = (__m128d)_mm_or_si128((__m128i)running_sum3, (__m128i)running_sum4); running_sum5 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum5, mask_first); running_sum6 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum6, mask_second); running_sum3 = (__m128d)_mm_or_si128((__m128i)running_sum6, (__m128i)running_sum5); running_sum7 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum7, mask_first); running_sum8 = (__m128d)_mm_shuffle_epi8((__m128i)running_sum8, mask_second); running_sum4 = (__m128d)_mm_or_si128((__m128i)running_sum8, (__m128i)running_sum7); // running sums 1-4 now hold the eight results; sums 5-8 are no longer needed __m128i a01_round = convert_double_to_f48_SSE((__m128i)running_sum1); __m128i a23_round = convert_double_to_f48_SSE((__m128i)running_sum2); __m128i a45_round = convert_double_to_f48_SSE((__m128i)running_sum3); __m128i a67_round = convert_double_to_f48_SSE((__m128i)running_sum4); // arrange them for the memory write __m128i match_mask = _mm_set_epi8(3,2,1,0,255,255,255,255,255,255,255,255,255,255,255,255); // mask used to fill the 4 free bytes at the top of a01 __m128i a23_shuffled = _mm_shuffle_epi8((__m128i)a23_round, match_mask); // shuffle the start of a2 into the free space in a01 a01_round = _mm_or_si128(a01_round,a23_shuffled); a23_round = _mm_srli_si128 (a23_round, 4); // using _mm_srli_si128 instead of an epi64 shift because the epi64 shifts operate within each 64-bit element of the 128-bit register match_mask = _mm_set_epi8(7,6,5,4,3,2,1,0,255,255,255,255,255,255,255,255); // reset the match mask for a4 and the first bytes of a5 __m128i a45_shuffled = _mm_shuffle_epi8((__m128i)a45_round, match_mask); // shuffle a45 to fit in a23 a23_round = _mm_or_si128(a23_round,a45_shuffled); a45_round = _mm_srli_si128(a45_round, 8); // using _mm_srli_si128 instead of an epi64 shift because the epi64 shifts operate within each 64-bit element of the 128-bit register match_mask = _mm_set_epi8(11,10,9,8,7,6,5,4,3,2,1,0,255,255,255,255); __m128i a67_shuffled = _mm_shuffle_epi8((__m128i)a67_round, match_mask); a45_round = _mm_or_si128(a45_round,a67_shuffled); // write the packed results back to memory _mm_storeu_pd((double*)&result[i], (__m128d)a01_round); _mm_storeu_pd(bofs(&result[i],2), (__m128d)a23_round); _mm_storeu_pd(bofs(&result[i],4), (__m128d)a45_round); } vec = result; }
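/* A minimal sketch (independent of the SIZE, fl48 and bofs() definitions used above; the helper name is ours) of the f48 -> double unpack that the inner loop relies on: one 16-byte load picks up two packed 6-byte values, and _mm_shuffle_epi8 moves each 6-byte payload into the top 6 bytes of an 8-byte lane while zeroing the two low mantissa bytes (any shuffle index with the top bit set, such as 255, writes a zero byte). SSSE3 is assumed. */
#include <tmmintrin.h>

static __m128d load_two_f48_as_pd(const void *p /* at least 16 readable bytes */) {
  const __m128i expand = _mm_set_epi8(11, 10, 9, 8, 7, 6, 255, 255,
                                      5, 4, 3, 2, 1, 0, 255, 255);
  const __m128i raw = _mm_loadu_si128((const __m128i *)p);
  /* _mm_castsi128_pd is the standard no-op reinterpretation; the code above
     uses GCC-style vector casts such as (__m128d)mat_vect for the same thing. */
  return _mm_castsi128_pd(_mm_shuffle_epi8(raw, expand));
}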
static inline void calc_lbp_16_strip(IplImage * src, IplImage * dst, unsigned base) { const signed char* src_data = (signed char*)(src->imageData + base); unsigned char * dst_data = (unsigned char*)(dst->imageData + base); const signed char* const src_end = (signed char*)src->imageData + (src->height-1) * src->widthStep; __m128i pixels[3]; // Load first two rows //pixels[0] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); pixels[0] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); //pixels[0] = _mm_xor_si128(pixels[0], sign_bit.q); // conversion from unsigned to signed - invert sign bit src_data += src->widthStep; //pixels[1] = *(__m128i*)src_data;//_mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); pixels[1] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); //pixels[1] = _mm_xor_si128(pixels[1], sign_bit.q); src_data += src->widthStep; int phase = 2; __m128i * phase_map[3][3] = { {pixels+1, pixels+2, pixels}, {pixels+2, pixels, pixels+1}, {pixels, pixels+1, pixels+2}, }; while (src_data < src_end) { register __m128i weight = ones.q; register __m128i code = _mm_setzero_si128(); //pixels[phase] = _mm_set_epi64(*(__m64*)(src_data+8), *(__m64*)(src_data)); //pixels[phase] = _mm_xor_si128(pixels[phase], sign_bit.q); //pixels[phase] = _mm_xor_si128(_mm_lddqu_si128((__m128i*)src_data), sign_bit.q); pixels[phase] = _mm_lddqu_si128((__m128i*)src_data); src_data += src->widthStep; dst_data += dst->widthStep; _mm_prefetch(src_data, _MM_HINT_T0); register __m128i a = *(phase_map[phase][0]); register __m128i b = *(phase_map[phase][1]); register __m128i c = *(phase_map[phase][2]); phase++; phase = (phase == 3) ? 0 : phase; // X . . A // . o . B // . . . C code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(a, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . X . // . . // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, a), weight)); weight = _mm_slli_epi64(weight, 1); // . . X // . . // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(a, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . X // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(b, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . . // . . X code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_srli_si128(c, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . . // . X . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, c), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // . . // X . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(c, 1)), weight)); weight = _mm_slli_epi64(weight, 1); // . . . // X . // . . . code = _mm_or_si128(code, _mm_and_si128(_mm_cmplt_epi8(b, _mm_slli_si128(b, 1)), weight)); _mm_maskmoveu_si128(code, lbp_valid_mask.q, (char*)dst_data); // store the results - unaligned write } }
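/* A reduced sketch of the bit-accumulation pattern in the loop above, for a single neighbour direction: _mm_cmplt_epi8 yields 0x00/0xff per byte wherever the centre pixel is smaller than its neighbour, so AND-ing with the weight vector and OR-ing into the accumulator sets exactly that LBP bit for the 16 pixels processed in parallel. The helper name is ours. */
#include <emmintrin.h>

static __m128i lbp_accumulate_bit(__m128i code, __m128i centre,
                                  __m128i neighbour, __m128i weight) {
  const __m128i lt = _mm_cmplt_epi8(centre, neighbour); /* 0xff where centre < neighbour */
  return _mm_or_si128(code, _mm_and_si128(lt, weight));
}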
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[], unsigned residual_samples, unsigned predictor_order, unsigned min_partition_order, unsigned max_partition_order, unsigned bps) { const unsigned default_partition_samples = (residual_samples + predictor_order) >> max_partition_order; unsigned partitions = 1u << max_partition_order; FLAC__ASSERT(default_partition_samples > predictor_order); /* first do max_partition_order */ { const unsigned threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples); unsigned partition, residual_sample, end = (unsigned)(-(int)predictor_order); if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) { for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); unsigned e1, e3; end += default_partition_samples; e1 = (residual_sample + 3) & ~3; e3 = end & ~3; if(e1 > end) e1 = end; /* try flac -l 1 -b 16 and you'll be here */ /* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */ for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); /* abs(INT_MIN) is undefined, but if the residual is INT_MIN we have bigger problems */ mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=4) { __m128i mm_res = _mm_loadu_si128((const __m128i*)(residual+residual_sample)); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); mm_sum = _mm_add_epi32(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); mm_sum = _mm_add_epi32(mm_sum, mm_res); } mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 8)); mm_sum = _mm_add_epi32(mm_sum, _mm_srli_si128(mm_sum, 4)); abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum); } } else { /* have to pessimistically use 64 bits for accumulator */ for(partition = residual_sample = 0; partition < partitions; partition++) { __m128i mm_sum = _mm_setzero_si128(); unsigned e1, e3; end += default_partition_samples; e1 = (residual_sample + 1) & ~1; e3 = end & ~1; FLAC__ASSERT(e1 <= end); for( ; residual_sample < e1; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); /* 0 0 0 r0 */ __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); /* 0 0 0 |r0| == 00 |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < e3; residual_sample+=2) { __m128i mm_res = _mm_loadl_epi64((const __m128i*)(residual+residual_sample)); /* 0 0 r1 r0 */ __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = _mm_sub_epi32(mm_res, mm_mask); /* 0 0 |r1| |r0| */ mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0 |r1| 0 |r0| == |r1_64| |r0_64| */ mm_sum = _mm_add_epi64(mm_sum, mm_res); } for( ; residual_sample < end; residual_sample++) { __m128i mm_res = _mm_cvtsi32_si128(residual[residual_sample]); __m128i mm_mask = _mm_srai_epi32(mm_res, 31); mm_res = _mm_xor_si128(mm_res, mm_mask); mm_res = 
_mm_sub_epi32(mm_res, mm_mask); mm_sum = _mm_add_epi64(mm_sum, mm_res); } mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8)); _mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum); } } } /* now merge partitions for lower orders */ { unsigned from_partition = 0, to_partition = partitions; int partition_order; for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) { unsigned i; partitions >>= 1; for(i = 0; i < partitions; i++) { abs_residual_partition_sums[to_partition++] = abs_residual_partition_sums[from_partition ] + abs_residual_partition_sums[from_partition+1]; from_partition += 2; } } } }
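/* A small sketch of the pessimistic 64-bit accumulation step used above: the absolute values are non-negative 32-bit lanes, so shuffling "0 0 |r1| |r0|" into "0 |r1| 0 |r0|" zero-extends both to 64 bits, after which _mm_add_epi64 accumulates without overflow. The helper name is ours; res must point at two valid samples. */
#include <emmintrin.h>
#include <stdint.h>

static __m128i accumulate_two_abs_residuals_u64(__m128i sum64, const int32_t *res) {
  __m128i v = _mm_loadl_epi64((const __m128i *)res);   /* 0 0 r1 r0 */
  const __m128i sign = _mm_srai_epi32(v, 31);          /* per-lane sign mask */
  v = _mm_xor_si128(v, sign);
  v = _mm_sub_epi32(v, sign);                          /* 0 0 |r1| |r0| */
  v = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 1, 2, 0));   /* 0 |r1| 0 |r0| */
  return _mm_add_epi64(sum64, v);
}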