void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix) { const unsigned char *u = src + ((y-1) & 7)*sc2lines_width, *m = src + ((y+0) & 7)*sc2lines_width, *l = src + ((y+1) & 7)*sc2lines_width; for (unsigned i = 0; i < nPix; i += 8) { __m64 uu = *(__m64*)(u+i); __m64 ll = *(__m64*)(l+i); __m64 cmp = _mm_cmpeq_pi8(uu,ll); if (_mm_movemask_pi8(cmp) != 0xFF) { __m128i mm = _mm_loadu_si128((__m128i*)(m+i-4)); __m128i uu = _mm_loadu_si128((__m128i*)(u+i-4)); __m128i ll = _mm_loadu_si128((__m128i*)(l+i-4)); __m128i md = _mm_slli_si128(mm,1); __m128i mf = _mm_srli_si128(mm,1); __m128i maskall = _mm_or_si128(_mm_cmpeq_epi8(md,mf), _mm_cmpeq_epi8(uu,ll)); __m128i e0, e1, v1, v2, v3; e0 = _mm_cmpeq_epi8(md,uu); e0 = _mm_andnot_si128(maskall, e0); e0 = _mm_srli_si128(e0,4); e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128()); e1 = _mm_cmpeq_epi8(mf,uu); e1 = _mm_andnot_si128(maskall, e1); e1 = _mm_srli_si128(e1,4); e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1); e0 = _mm_or_si128(e0, e1); v1 = _mm_srli_si128(mm,4); v1 = _mm_unpacklo_epi8(v1,v1); v2 = _mm_srli_si128(uu,4); v2 = _mm_unpacklo_epi8(v2,v2); _mm_store_si128((__m128i*)(dst1 + 2*i), _mm_or_si128( _mm_and_si128(e0,v2), _mm_andnot_si128(e0,v1) ) ); e0 = _mm_cmpeq_epi8(md,ll); e0 = _mm_andnot_si128(maskall, e0); e0 = _mm_srli_si128(e0,4); e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128()); e1 = _mm_cmpeq_epi8(mf,ll); e1 = _mm_andnot_si128(maskall, e1); e1 = _mm_srli_si128(e1,4); e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1); e0 = _mm_or_si128(e0, e1); v3 = _mm_srli_si128(ll,4); v3 = _mm_unpacklo_epi8(v3,v3); _mm_store_si128((__m128i*)(dst2 + 2*i), _mm_or_si128( _mm_and_si128(e0,v3), _mm_andnot_si128(e0,v1) ) ); } else { __m64 v0 = *(__m64*)(m+i); __m128i v1 = _mm_movpi64_epi64(v0); v1 = _mm_unpacklo_epi8(v1,v1); _mm_store_si128((__m128i*)(dst1 + 2*i), v1); _mm_store_si128((__m128i*)(dst2 + 2*i), v1); } } }
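The kernel above is a vectorised take on the classic Scale2x rule: each source pixel expands to a 2x2 block, and a corner copies a neighbour's colour only when the vertical and horizontal neighbours disagree with each other but agree with that corner. A minimal scalar sketch of the standard rule (the names B/D/E/F/H are the usual up/left/centre/right/down neighbours, not identifiers from the code above):

static void scale2x_pixel(unsigned char B, unsigned char D, unsigned char E,
                          unsigned char F, unsigned char H,
                          unsigned char out[4]) {
    /* out[0..3] = top-left, top-right, bottom-left, bottom-right */
    if (B != H && D != F) {
        out[0] = (D == B) ? D : E;
        out[1] = (B == F) ? F : E;
        out[2] = (D == H) ? D : E;
        out[3] = (H == F) ? F : E;
    } else {
        out[0] = out[1] = out[2] = out[3] = E;
    }
}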
// Does one or two inverse transforms. static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst, int do_two) { // This implementation makes use of 16-bit fixed point versions of two // multiply constants: // K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16 // K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16 // // To be able to use signed 16-bit integers, we use the following trick to // have constants within range: // - Associated constants are obtained by subtracting the 16-bit fixed point // version of one: // k = K - (1 << 16) => K = k + (1 << 16) // K1 = 85267 => k1 = 20091 // K2 = 35468 => k2 = -30068 // - The multiplication of a variable by a constant become the sum of the // variable and the multiplication of that variable by the associated // constant: // (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x const __m128i k1 = _mm_set1_epi16(20091); const __m128i k2 = _mm_set1_epi16(-30068); __m128i T0, T1, T2, T3; // Load and concatenate the transform coefficients (we'll do two inverse // transforms in parallel). In the case of only one inverse transform, the // second half of the vectors will just contain random value we'll never // use nor store. __m128i in0, in1, in2, in3; { in0 = _mm_loadl_epi64((__m128i*)&in[0]); in1 = _mm_loadl_epi64((__m128i*)&in[4]); in2 = _mm_loadl_epi64((__m128i*)&in[8]); in3 = _mm_loadl_epi64((__m128i*)&in[12]); // a00 a10 a20 a30 x x x x // a01 a11 a21 a31 x x x x // a02 a12 a22 a32 x x x x // a03 a13 a23 a33 x x x x if (do_two) { const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]); const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]); const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]); const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]); in0 = _mm_unpacklo_epi64(in0, inB0); in1 = _mm_unpacklo_epi64(in1, inB1); in2 = _mm_unpacklo_epi64(in2, inB2); in3 = _mm_unpacklo_epi64(in3, inB3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } } // Vertical pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i a = _mm_add_epi16(in0, in2); const __m128i b = _mm_sub_epi16(in0, in2); // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3 const __m128i c1 = _mm_mulhi_epi16(in1, k2); const __m128i c2 = _mm_mulhi_epi16(in3, k1); const __m128i c3 = _mm_sub_epi16(in1, in3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3 const __m128i d1 = _mm_mulhi_epi16(in1, k1); const __m128i d2 = _mm_mulhi_epi16(in3, k2); const __m128i d3 = _mm_add_epi16(in1, in3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1); const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3); const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1); const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // First pass, c and d calculations are longer because of the "trick" // multiplications. const __m128i four = _mm_set1_epi16(4); const __m128i dc = _mm_add_epi16(T0, four); const __m128i a = _mm_add_epi16(dc, T2); const __m128i b = _mm_sub_epi16(dc, T2); // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3 const __m128i c1 = _mm_mulhi_epi16(T1, k2); const __m128i c2 = _mm_mulhi_epi16(T3, k1); const __m128i c3 = _mm_sub_epi16(T1, T3); const __m128i c4 = _mm_sub_epi16(c1, c2); const __m128i c = _mm_add_epi16(c3, c4); // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3 const __m128i d1 = _mm_mulhi_epi16(T1, k1); const __m128i d2 = _mm_mulhi_epi16(T3, k2); const __m128i d3 = _mm_add_epi16(T1, T3); const __m128i d4 = _mm_add_epi16(d1, d2); const __m128i d = _mm_add_epi16(d3, d4); // Second pass. const __m128i tmp0 = _mm_add_epi16(a, d); const __m128i tmp1 = _mm_add_epi16(b, c); const __m128i tmp2 = _mm_sub_epi16(b, c); const __m128i tmp3 = _mm_sub_epi16(a, d); const __m128i shifted0 = _mm_srai_epi16(tmp0, 3); const __m128i shifted1 = _mm_srai_epi16(tmp1, 3); const __m128i shifted2 = _mm_srai_epi16(tmp2, 3); const __m128i shifted3 = _mm_srai_epi16(tmp3, 3); // Transpose the two 4x4. 
// a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1); const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3); const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1); const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Add inverse transform to 'ref' and store. { const __m128i zero = _mm_set1_epi16(0); // Load the reference(s). __m128i ref0, ref1, ref2, ref3; if (do_two) { // Load eight bytes/pixels per line. ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); } else { // Load four bytes/pixels per line. ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]); ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]); ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]); ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]); } // Convert to 16b. ref0 = _mm_unpacklo_epi8(ref0, zero); ref1 = _mm_unpacklo_epi8(ref1, zero); ref2 = _mm_unpacklo_epi8(ref2, zero); ref3 = _mm_unpacklo_epi8(ref3, zero); // Add the inverse transform(s). ref0 = _mm_add_epi16(ref0, T0); ref1 = _mm_add_epi16(ref1, T1); ref2 = _mm_add_epi16(ref2, T2); ref3 = _mm_add_epi16(ref3, T3); // Unsigned saturate to 8b. ref0 = _mm_packus_epi16(ref0, ref0); ref1 = _mm_packus_epi16(ref1, ref1); ref2 = _mm_packus_epi16(ref2, ref2); ref3 = _mm_packus_epi16(ref3, ref3); // Store the results. if (do_two) { // Store eight bytes/pixels per line. _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0); _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1); _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2); _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3); } else { // Store four bytes/pixels per line. *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0); *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1); *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2); *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3); } } }
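The 16-bit "trick" multiplications above have a simple scalar counterpart: with k = K - (1 << 16), the fixed-point product (x * K) >> 16 equals ((x * k) >> 16) + x, and ((x * k) >> 16) is exactly what _mm_mulhi_epi16 returns per lane. A minimal sketch, assuming x and k fit in 16 bits as they do for K1/K2 above:

#include <stdint.h>

static int16_t mul_fix16(int16_t x, int32_t K) {
    const int32_t k = K - (1 << 16);                  /* k1 = 20091, k2 = -30068 */
    return (int16_t)((((int32_t)x * k) >> 16) + x);   /* == (x * K) >> 16 */
}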
#include "strategyselector.h" /** * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register. * \param ref_main Reference pixels * \param delta_pos Fractional pixel precise position of sample displacement * \param x Sample offset in direction x in ref_main array */ static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){ int8_t delta_int = delta_pos >> 5; int8_t delta_fract = delta_pos & (32-1); __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int])); __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1])); __m128i pairs = _mm_unpacklo_epi8(sample0, sample1); __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) ); sample0 = _mm_maddubs_epi16(pairs, weight); sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16)); sample0 = _mm_srli_epi16(sample0, 5); sample0 = _mm_packus_epi16(sample0, sample0); return sample0; } /** * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst. * \param dst Destination buffer * \param ref_main Reference pixels * \param sample_disp Sample displacement per row * \param vertical_mode Mode direction, true if vertical
void tuned_ConvertULY4ToRGB(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pYBegin, const uint8_t *pUBegin, const uint8_t *pVBegin, size_t cbWidth, ssize_t scbStride) { const int shift = 13; __m128i xy2rgb = _mm_set2_epi16_shift((-16 * C::Y2RGB + 0.5) / 0xff, C::Y2RGB, shift); __m128i vu2r = _mm_set2_epi16_shift(C::V2R, 0, shift); __m128i vu2g = _mm_set2_epi16_shift(C::V2G, C::U2G, shift); __m128i vu2b = _mm_set2_epi16_shift(0, C::U2B, shift); auto y = pYBegin; auto u = pUBegin; auto v = pVBegin; for (auto p = pDstBegin; p != pDstEnd; p += scbStride) { auto pp = p; for (; pp <= p + cbWidth - 16; pp += T::BYPP * 4) { __m128i yy = _mm_cvtsi32_si128(*(const int *)y); __m128i uu = _mm_cvtsi32_si128(*(const int *)u); __m128i vv = _mm_cvtsi32_si128(*(const int *)v); __m128i xy = _mm_unpacklo_epi8(_mm_unpacklo_epi8(yy, _mm_setone_si128()), _mm_setzero_si128()); // 00 ff 00 Y3 00 ff 00 Y2 00 ff 00 Y1 00 ff 00 Y0 __m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0 vu = _mm_sub_epi16(vu, _mm_set1_epi16(128)); __m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb); auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i { __m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb)); rgb = _mm_srai_epi32(rgb, shift); rgb = _mm_packs_epi32(rgb, rgb); rgb = _mm_packus_epi16(rgb, rgb); return rgb; }; __m128i rr = xyuv2rgb(vu2r); __m128i gg = xyuv2rgb(vu2g); __m128i bb = xyuv2rgb(vu2b); if (std::is_same<T, CBGRAColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128())); _mm_storeu_si128((__m128i *)pp, bgrx); } #ifdef __SSSE3__ else if (std::is_same<T, CBGRColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, rr)); __m128i bgr = _mm_shuffle_epi8(bgrx, _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0)); _mm_storeu_si128((__m128i *)pp, bgr); } #endif else if (std::is_same<T, CARGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb)); _mm_storeu_si128((__m128i *)pp, xrgb); } #ifdef __SSSE3__ else if (std::is_same<T, CRGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_setone_si128(), rr), _mm_unpacklo_epi8(gg, bb)); __m128i rgb = _mm_shuffle_epi8(xrgb, _mm_set_epi8(-1, -1, -1, -1, 15, 14, 13, 11, 10, 9, 7, 6, 5, 3, 2, 1)); _mm_storeu_si128((__m128i *)pp, rgb); } #endif y += 4; u += 4; v += 4; } for (; pp < p + cbWidth; pp += T::BYPP) { __m128i xy = _mm_cvtsi32_si128(*y | 0x00ff0000); __m128i uu = _mm_cvtsi32_si128(*u); __m128i vv = _mm_cvtsi32_si128(*v); __m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0 vu = _mm_sub_epi16(vu, _mm_set1_epi16(128)); __m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb); auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i { __m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb)); rgb = _mm_srai_epi32(rgb, shift); rgb = _mm_packs_epi32(rgb, rgb); rgb = _mm_packus_epi16(rgb, rgb); return rgb; }; __m128i rr = xyuv2rgb(vu2r); __m128i gg = xyuv2rgb(vu2g); __m128i bb = xyuv2rgb(vu2b); if (std::is_same<T, CBGRAColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128())); *(uint32_t *)pp = _mm_cvtsi128_si32(bgrx); } else if (std::is_same<T, CARGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), 
_mm_unpacklo_epi8(gg, bb)); *(uint32_t *)pp = _mm_cvtsi128_si32(xrgb); } else if (std::is_same<T, CBGRColorOrder>::value || std::is_same<T, CRGBColorOrder>::value) { *(pp + T::B) = (uint8_t)_mm_cvtsi128_si32(bb); *(pp + T::G) = (uint8_t)_mm_cvtsi128_si32(gg); *(pp + T::R) = (uint8_t)_mm_cvtsi128_si32(rr); } y += 1; u += 1; v += 1; } } }
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
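A scalar transcription of the formulas quoted in the comments above, run over a 4x4 block of src-ref differences. This is a sketch of the intended arithmetic with the rounding constants copied verbatim, not a drop-in replacement for the SSE2 path:

static void ftransform_scalar_sketch(const int d[16], int16_t out[16]) {
  int tmp[16];
  int i;
  for (i = 0; i < 4; ++i) {                      /* first pass, per row */
    const int a0 = d[i * 4 + 0] + d[i * 4 + 3];
    const int a1 = d[i * 4 + 1] + d[i * 4 + 2];
    const int a2 = d[i * 4 + 1] - d[i * 4 + 2];
    const int a3 = d[i * 4 + 0] - d[i * 4 + 3];
    const int b0 = a0 << 3, b1 = a1 << 3, b2 = a2 << 3, b3 = a3 << 3;
    tmp[0 + i * 4] = b0 + b1;
    tmp[2 + i * 4] = b0 - b1;
    tmp[1 + i * 4] = (b3 * 5352 + b2 * 2217 + 14500) >> 12;
    tmp[3 + i * 4] = (b3 * 2217 - b2 * 5352 +  7500) >> 12;
  }
  for (i = 0; i < 4; ++i) {                      /* second pass, per column */
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[ 8 + i];
    const int a2 = tmp[4 + i] - tmp[ 8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    out[0 + i]  = (int16_t)((a0 + a1 + 7) >> 4);
    out[8 + i]  = (int16_t)((a0 - a1 + 7) >> 4);
    out[4 + i]  = (int16_t)(((a3 * 5352 + a2 * 2217 + 12000) >> 16) + (a3 != 0));
    out[12 + i] = (int16_t)((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}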
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */ static void unshuffle16(uint8_t* dest, uint8_t* orig, size_t size) { size_t i, j, k; size_t neblock, numof16belem; __m128i xmm1[16], xmm2[16]; neblock = size / 16; numof16belem = neblock / 16; for (i = 0, k = 0; i < numof16belem; i++, k += 16) { /* Load the first 128 bytes in 16 XMM registrers */ for (j = 0; j < 16; j++) { xmm1[j] = ((__m128i *)orig)[j*numof16belem+i]; } /* Shuffle bytes */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]); /* Compute the hi 32 bytes */ xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]); } /* Shuffle 2-byte words */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]); /* Compute the hi 32 bytes */ xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]); } /* Shuffle 4-byte dwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]); /* Compute the hi 32 bytes */ xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]); } /* Shuffle 8-byte qwords */ for (j = 0; j < 8; j++) { /* Compute the low 32 bytes */ xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]); /* Compute the hi 32 bytes */ xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]); } /* Store the result vectors in proper order */ ((__m128i *)dest)[k+0] = xmm1[0]; ((__m128i *)dest)[k+1] = xmm1[8]; ((__m128i *)dest)[k+2] = xmm1[4]; ((__m128i *)dest)[k+3] = xmm1[12]; ((__m128i *)dest)[k+4] = xmm1[2]; ((__m128i *)dest)[k+5] = xmm1[10]; ((__m128i *)dest)[k+6] = xmm1[6]; ((__m128i *)dest)[k+7] = xmm1[14]; ((__m128i *)dest)[k+8] = xmm1[1]; ((__m128i *)dest)[k+9] = xmm1[9]; ((__m128i *)dest)[k+10] = xmm1[5]; ((__m128i *)dest)[k+11] = xmm1[13]; ((__m128i *)dest)[k+12] = xmm1[3]; ((__m128i *)dest)[k+13] = xmm1[11]; ((__m128i *)dest)[k+14] = xmm1[7]; ((__m128i *)dest)[k+15] = xmm1[15]; } }
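What the unpack trees above compute is just a byte transpose: output element i takes its j-th byte from byte-plane j of the shuffled buffer. A scalar reference with the same argument convention (illustrative only):

#include <stddef.h>
#include <stdint.h>

static void unshuffle16_scalar(uint8_t* dest, const uint8_t* orig, size_t size) {
  const size_t neblock = size / 16;              /* number of 16-byte elements */
  size_t i, j;
  for (i = 0; i < neblock; i++)
    for (j = 0; j < 16; j++)
      dest[i * 16 + j] = orig[j * neblock + i];
}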
IVTCScore ComputeScanImprovement_X8R8G8B8_SSE2(const void *src1, const void *src2, ptrdiff_t srcpitch, uint32 w, uint32 h) { IVTCScore score = {0}; __m128i zero = _mm_setzero_si128(); uint32 w2 = w >> 1; static const __m128i mask = { -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0 }; bool firstfield = true; do { __m128i var = zero; __m128i varshift = zero; const uint8 *src1r0 = (const uint8 *)src1; const uint8 *src1r1 = src1r0 + srcpitch; const uint8 *src1r2 = src1r1 + srcpitch; const uint8 *src2r = (const uint8 *)src2 + srcpitch; for(uint32 x=0; x<w2; ++x) { __m128i rA = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r0), zero); __m128i rB = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r1), zero); __m128i rC = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r2), zero); __m128i rE = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src2r), zero); __m128i rAC = _mm_add_epi16(rA, rC); __m128i d1 = _mm_sub_epi16(rAC, _mm_add_epi16(rB, rB)); // combing in current frame __m128i d3 = _mm_sub_epi16(rAC, _mm_add_epi16(rE, rE)); // combing in merged frame d1 = _mm_and_si128(d1, mask); d3 = _mm_and_si128(d3, mask); var = _mm_add_epi32(var, _mm_madd_epi16(d1, d1)); varshift = _mm_add_epi32(varshift, _mm_madd_epi16(d3, d3)); src1r0 += 8; src1r1 += 8; src1r2 += 8; src2r += 8; } if (w & 1) { __m128i rA = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r0), zero); __m128i rB = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r1), zero); __m128i rC = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r2), zero); __m128i rE = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src2r), zero); __m128i rAC = _mm_add_epi16(rA, rC); __m128i d1 = _mm_sub_epi16(rAC, _mm_add_epi16(rB, rB)); // combing in current frame __m128i d3 = _mm_sub_epi16(rAC, _mm_add_epi16(rE, rE)); // combing in merged frame d1 = _mm_and_si128(d1, mask); d3 = _mm_and_si128(d3, mask); var = _mm_add_epi32(var, _mm_madd_epi16(d1, d1)); varshift = _mm_add_epi32(varshift, _mm_madd_epi16(d3, d3)); } src1 = (const uint8 *)src1 + srcpitch; src2 = (const uint8 *)src2 + srcpitch; var = _mm_add_epi32(var, _mm_shuffle_epi32(var, 0xee)); varshift = _mm_add_epi32(varshift, _mm_shuffle_epi32(varshift, 0xee)); var = _mm_add_epi32(var, _mm_shuffle_epi32(var, 0x55)); varshift = _mm_add_epi32(varshift, _mm_shuffle_epi32(varshift, 0x55)); uint32 ivar = _mm_cvtsi128_si32(var); uint32 ivarshift = _mm_cvtsi128_si32(varshift); if (firstfield) { score.mVar[0] += ivar; score.mVarShift[0] += ivarshift; } else { score.mVar[1] += ivar; score.mVarShift[1] += ivarshift; } firstfield = !firstfield; } while(--h); return score; }
void CColorAdjustment::ProcessY(int Width, int Height, uint8_t *pData, int Pitch) { if (m_Brightness != 0 || m_Contrast != 0) { if (m_fUpdateYTable) { MakeYTable(m_YTable, m_Brightness, m_Contrast); m_fUpdateYTable = false; } #ifdef TVTVIDEODEC_SSE2_SUPPORT const bool fSSE2 = IsSSE2Enabled(); #endif for (int y = 0; y < Height; y++) { uint8_t *p = pData; int x = 0; #ifdef TVTVIDEODEC_SSE2_SUPPORT if (fSSE2 && !((uintptr_t)p & 15)) { const short c = (short)(min((m_Contrast * 512 / 100) + 512, (1 << 16) - 1)); const short b = (short)((m_Brightness * 255 / 100) + 16); const __m128i bc = _mm_set_epi16(b, c, b, c, b, c, b, c); const __m128i zero = _mm_setzero_si128(); const __m128i w16 = _mm_set1_epi16(16); const __m128i w512 = _mm_set1_epi16(512); for (; x + 16 <= Width; x += 16) { __m128i r = _mm_load_si128((const __m128i*)p); __m128i rl = _mm_unpacklo_epi8(r, zero); __m128i rh = _mm_unpackhi_epi8(r, zero); rl = _mm_subs_epi16(rl, w16); rh = _mm_subs_epi16(rh, w16); __m128i rll = _mm_unpacklo_epi16(rl, w512); __m128i rlh = _mm_unpackhi_epi16(rl, w512); __m128i rhl = _mm_unpacklo_epi16(rh, w512); __m128i rhh = _mm_unpackhi_epi16(rh, w512); rll = _mm_madd_epi16(rll, bc); rlh = _mm_madd_epi16(rlh, bc); rhl = _mm_madd_epi16(rhl, bc); rhh = _mm_madd_epi16(rhh, bc); rll = _mm_srai_epi32(rll, 9); rlh = _mm_srai_epi32(rlh, 9); rhl = _mm_srai_epi32(rhl, 9); rhh = _mm_srai_epi32(rhh, 9); rl = _mm_packs_epi32(rll, rlh); rh = _mm_packs_epi32(rhl, rhh); r = _mm_packus_epi16(rl, rh); _mm_store_si128((__m128i*)p, r); p += 16; } } #endif for (; x < Width; x++) { *p = m_YTable[*p]; p++; } pData += Pitch; } } }
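The SSE2 path above interleaves (Y - 16, 512) with (c, b) so that one pmaddwd evaluates (Y - 16) * c + 512 * b, which the subsequent >>9 scales back (c is a 9-bit fixed-point gain, b already includes the +16 black level). A scalar sketch of the same per-pixel formula (the function name is illustrative):

static uint8_t AdjustY(uint8_t y,
                       int c,   /* contrast*512/100 + 512 */
                       int b) { /* brightness*255/100 + 16 */
    int v = (((int)y - 16) * c + 512 * b) >> 9;
    if (v < 0)   v = 0;        /* packs/packus saturation in the SIMD path */
    if (v > 255) v = 255;
    return (uint8_t)v;
}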
void fb_sqrm_low(dig_t *c, const dig_t *a) { __m128i t0, m0, m1, m2, m3, m4, m5, m6, mask; align dig_t t[2*FB_DIGS]; t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100); mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F); m0 = _mm_load_si128((__m128i *)(a)); m1 = _mm_and_si128(m0, mask); m1 = _mm_shuffle_epi8(t0, m1); m2 = _mm_srli_epi64(m0, 4); m2 = _mm_and_si128(m2, mask); m2 = _mm_shuffle_epi8(t0, m2); m3 = _mm_unpacklo_epi8(m1, m2); m4 = _mm_unpackhi_epi8(m1, m2); m0 = _mm_load_si128((__m128i *)(a+2)); m1 = _mm_and_si128(m0, mask); m1 = _mm_shuffle_epi8(t0, m1); m2 = _mm_srli_epi64(m0, 4); m2 = _mm_and_si128(m2, mask); m2 = _mm_shuffle_epi8(t0, m2); m5 = _mm_unpacklo_epi8(m1, m2); m6 = _mm_unpackhi_epi8(m1, m2); m0 = m3; m1 = m4; m2 = m5; m3 = m6; _mm_store_si128((__m128i *) t + 0, m0); _mm_store_si128((__m128i *) t + 1, m1); _mm_store_si128((__m128i *) t + 2, m2); _mm_store_si128((__m128i *) t + 3, m3); const int ra = 52; const int rb = 55; const int rc = 57; const int rh = 59; const int lh = 5; const int la = 12; const int lb = 9; const int lc = 7; dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4]; a4 ^= (d >> rh); a4 ^= (d >> ra); a4 ^= (d >> rb); a4 ^= (d >> rc); a3 ^= (d << lh); a3 ^= (d << la); a3 ^= (d << lb); a3 ^= (d << lc); d = t[6]; a3 ^= (d >> rh); a3 ^= (d >> ra); a3 ^= (d >> rb); a3 ^= (d >> rc); a2 ^= (d << lh); a2 ^= (d << la); a2 ^= (d << lb); a2 ^= (d << lc); d = t[5]; a2 ^= (d >> rh); a2 ^= (d >> ra); a2 ^= (d >> rb); a2 ^= (d >> rc); a1 ^= (d << lh); a1 ^= (d << la); a1 ^= (d << lb); a1 ^= (d << lc); d = a4; a1 ^= (d >> rh); a1 ^= (d >> ra); a1 ^= (d >> rb); a1 ^= (d >> rc); a0 ^= (d << lh); a0 ^= (d << la); a0 ^= (d << lb); a0 ^= (d << lc); d = a3 >> rh; a0 ^= d; d <<= rh; a0 ^= (d >> ra); a0 ^= (d >> rb); a0 ^= (d >> rc); a3 ^= d; c[3] = a3; c[2] = a2; c[1] = a1; c[0] = a0; return; }
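The pshufb table t0 above encodes binary-field squaring a nibble at a time: squaring a polynomial over GF(2) only spreads its bits apart, so nibble b3b2b1b0 maps to the byte 0b3 0b2 0b1 0b0 (0x00, 0x01, 0x04, 0x05, ..., 0x55). A scalar sketch of that bit spreading for one byte:

static unsigned spread_bits(unsigned b) {        /* b < 256 */
    unsigned r = 0;
    for (int i = 0; i < 8; i++)
        r |= ((b >> i) & 1u) << (2 * i);
    return r;                                    /* e.g. spread_bits(0x0F) == 0x55 */
}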
void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride, uint8_t *dst, ptrdiff_t dst_stride, const int16_t *filter_x, int x_step_q4, const int16_t *filter_y, int y_step_q4, int w, int h) { const int bd = 8; assert(x_step_q4 == 16 && y_step_q4 == 16); assert(!(w & 7)); (void)x_step_q4; (void)y_step_q4; uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE]; int intermediate_height = h + SUBPEL_TAPS - 1; int i, j; const int center_tap = ((SUBPEL_TAPS - 1) / 2); const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap; const __m128i zero = _mm_setzero_si128(); // Add an offset to account for the "add_src" part of the convolve function. const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3); /* Horizontal filter */ { const __m128i coeffs_x = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x); // coeffs 4 5 4 5 6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) + (1 << (bd + FILTER_BITS - 1))); for (i = 0; i < intermediate_height; ++i) { for (j = 0; j < w; j += 8) { const __m128i data = _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]); // Filter even-index pixels const __m128i src_0 = _mm_unpacklo_epi8(data, zero); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4), _mm_add_epi32(res_2, res_6)); res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const), FILTER_BITS - EXTRAPREC_BITS); // Filter odd-index pixels const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5), _mm_add_epi32(res_3, res_7)); res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const), FILTER_BITS - EXTRAPREC_BITS); // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7 __m128i res = _mm_packs_epi32(res_even, res_odd); res = _mm_min_epi16(_mm_max_epi16(res, zero), _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1)); _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res); } } } /* Vertical filter */ { const __m128i coeffs_y = _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset); // coeffs 0 1 0 1 2 3 2 3 const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y); // coeffs 4 5 4 5 
6 7 6 7 const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y); // coeffs 0 1 0 1 0 1 0 1 const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0); // coeffs 2 3 2 3 2 3 2 3 const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0); // coeffs 4 5 4 5 4 5 4 5 const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1); // coeffs 6 7 6 7 6 7 6 7 const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1); const __m128i round_const = _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) - (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1))); for (i = 0; i < h; ++i) { for (j = 0; j < w; j += 8) { // Filter even-index pixels const uint16_t *data = &temp[i * MAX_SB_SIZE + j]; const __m128i src_0 = _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_2 = _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_4 = _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_6 = _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01); const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23); const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45); const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67); const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2), _mm_add_epi32(res_4, res_6)); // Filter odd-index pixels const __m128i src_1 = _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE), *(__m128i *)(data + 1 * MAX_SB_SIZE)); const __m128i src_3 = _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE), *(__m128i *)(data + 3 * MAX_SB_SIZE)); const __m128i src_5 = _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE), *(__m128i *)(data + 5 * MAX_SB_SIZE)); const __m128i src_7 = _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE), *(__m128i *)(data + 7 * MAX_SB_SIZE)); const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01); const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23); const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45); const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67); const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3), _mm_add_epi32(res_5, res_7)); // Rearrange pixels back into the order 0 ... 7 const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd); const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd); const __m128i res_lo_round = _mm_srai_epi32( _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS); const __m128i res_hi_round = _mm_srai_epi32( _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS); const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round); __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit); __m128i *const p = (__m128i *)&dst[i * dst_stride + j]; _mm_storel_epi64(p, res_8bit); } } } }
// Unroll 4x, interleave bytes, use pmaddubsw (all_x is small) while (count > 3) { count -= 4; int x0[4]; int x1[4]; __m128i all_x, sixteen_minus_x; PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F, sixteen_8bit, mask_dist_select, &all_x, &sixteen_minus_x, x0, x1); xy += 4; // First pair of pixel pairs // (4x(x1, 16-x1), 4x(x0, 16-x0)) __m128i scale_x; scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x); __m128i sum0 = ProcessTwoPixelPairs<has_alpha>( row0, row1, x0, x1, scale_x, all_y, neg_y, alpha); // second pair of pixel pairs // (4x (x3, 16-x3), 4x (16-x2, x2)) scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x); __m128i sum1 = ProcessTwoPixelPairs<has_alpha>( row0, row1, x0 + 2, x1 + 2, scale_x, all_y, neg_y, alpha); // Do the final packing of the two results
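The interleave-then-pmaddubsw idiom used above is worth isolating: putting the two source pixels of each pair in adjacent bytes and the weights (16 - x, x) in the matching byte positions lets one _mm_maddubs_epi16 produce p0*(16 - x) + p1*x per channel as 16-bit sums. A minimal sketch under that assumption (names are illustrative, not taken from the surrounding code):

#include <emmintrin.h>
#include <tmmintrin.h>

static __m128i lerp_pairs(__m128i p0, __m128i p1, int x /* 0..16 */) {
    const __m128i pairs  = _mm_unpacklo_epi8(p0, p1);              /* p0,p1,p0,p1,... */
    const __m128i weight = _mm_set1_epi16((short)((x << 8) | (16 - x)));
    return _mm_maddubs_epi16(pairs, weight);                       /* p0*(16-x) + p1*x */
}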
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr, unsigned int out_pitch, unsigned int output_height, int16_t *filter) { __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((__m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits in the filter secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits in the filter thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); for (i = 0; i < output_height; i++) { // load the first 16 bytes srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr)); // load the next 16 bytes in stride of src_pitch srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch)); srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6)); srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7)); // merge the result together srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2); srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4); srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2); srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4); // multiply 2 adjacent elements with the filter and add the result srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); // load the next 16 bytes in stride of two/three src_pitch srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2)); srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3)); // merge the result together srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); // multiply 2 adjacent elements with the filter and add the result srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters); srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); // load the next 16 bytes in stride of four/five src_pitch srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4)); srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5)); // merge the result together srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3); srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3); // multiply 2 adjacent elements with the filter and add the result srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, _mm_min_epi16(srcRegFilt4, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt6, 
srcRegFilt8)); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, _mm_max_epi16(srcRegFilt4, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt6, srcRegFilt8)); srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); src_ptr+=src_pitch; // save 16 bytes convolve result _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); output_ptr+=out_pitch; } }
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride, int tx_type) { __m128i in[2]; const __m128i zero = _mm_setzero_si128(); const __m128i eight = _mm_set1_epi16(8); in[0] = _mm_loadu_si128((const __m128i *)(input)); in[1] = _mm_loadu_si128((const __m128i *)(input + 8)); switch (tx_type) { case 0: // DCT_DCT idct4_sse2(in); idct4_sse2(in); break; case 1: // ADST_DCT idct4_sse2(in); iadst4_sse2(in); break; case 2: // DCT_ADST iadst4_sse2(in); idct4_sse2(in); break; case 3: // ADST_ADST iadst4_sse2(in); iadst4_sse2(in); break; default: assert(0); break; } // Final round and shift in[0] = _mm_add_epi16(in[0], eight); in[1] = _mm_add_epi16(in[1], eight); in[0] = _mm_srai_epi16(in[0], 4); in[1] = _mm_srai_epi16(in[1], 4); // Reconstruction and Store { __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest)); __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2)); d0 = _mm_unpacklo_epi32(d0, _mm_cvtsi32_si128(*(const int *)(dest + stride))); d2 = _mm_unpacklo_epi32( d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3))); d0 = _mm_unpacklo_epi8(d0, zero); d2 = _mm_unpacklo_epi8(d2, zero); d0 = _mm_add_epi16(d0, in[0]); d2 = _mm_add_epi16(d2, in[1]); d0 = _mm_packus_epi16(d0, d2); // store result[0] *(int *)dest = _mm_cvtsi128_si32(d0); // store result[1] d0 = _mm_srli_si128(d0, 4); *(int *)(dest + stride) = _mm_cvtsi128_si32(d0); // store result[2] d0 = _mm_srli_si128(d0, 4); *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0); // store result[3] d0 = _mm_srli_si128(d0, 4); *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0); } }
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
  *out = _mm_unpacklo_epi8(A, zero);
}
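A small usage sketch for the helper above (the caller name and the bias step are illustrative): widening to 16 bits makes room for arithmetic that would wrap in 8 bits, after which _mm_packus_epi16 narrows back with unsigned saturation.

static void AddBiasEightPixels_SSE2(const uint8_t* src, int bias, uint8_t* dst) {
  __m128i v;
  LoadEightPixels_SSE2(src, &v);                        // A0B0C0D0E0F0G0H0
  v = _mm_add_epi16(v, _mm_set1_epi16((short)bias));    // 16-bit add, no 8-bit wrap
  v = _mm_packus_epi16(v, v);                           // saturate back to 8 bits
  _mm_storel_epi64((__m128i*)dst, v);
}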
void SGMStereo::calcRowCosts(unsigned char*& leftSobelRow, int*& leftCensusRow, unsigned char*& rightSobelRow, int*& rightCensusRow, unsigned short* costImageRow) { const int widthStepCost = width_*disparityTotal_; const __m128i registerZero = _mm_setzero_si128(); for (int y = 1; y < height_; ++y) { int addRowIndex = y + aggregationWindowRadius_; int addRowAggregatedCostIndex = std::min(addRowIndex, height_ - 1)%(aggregationWindowRadius_*2 + 2); unsigned short* addRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*addRowAggregatedCostIndex; if (addRowIndex < height_) { calcPixelwiseSAD(leftSobelRow, rightSobelRow); addPixelwiseHamming(leftCensusRow, rightCensusRow); memset(addRowAggregatedCost, 0, disparityTotal_*sizeof(unsigned short)); // x = 0 for (int x = 0; x <= aggregationWindowRadius_; ++x) { int scale = x == 0 ? aggregationWindowRadius_ + 1 : 1; for (int d = 0; d < disparityTotal_; ++d) { addRowAggregatedCost[d] += static_cast<unsigned short>(pixelwiseCostRow_[disparityTotal_*x + d]*scale); } } // x = 1...width-1 int subRowAggregatedCostIndex = std::max(y - aggregationWindowRadius_ - 1, 0)%(aggregationWindowRadius_*2 + 2); const unsigned short* subRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*subRowAggregatedCostIndex; const unsigned short* previousCostRow = costImageRow - widthStepCost; for (int x = 1; x < width_; ++x) { const unsigned char* addPixelwiseCost = pixelwiseCostRow_ + std::min((x + aggregationWindowRadius_)*disparityTotal_, (width_ - 1)*disparityTotal_); const unsigned char* subPixelwiseCost = pixelwiseCostRow_ + std::max((x - aggregationWindowRadius_ - 1)*disparityTotal_, 0); for (int d = 0; d < disparityTotal_; d += 16) { __m128i registerAddPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(addPixelwiseCost + d)); __m128i registerAddPixelwiseHigh = _mm_unpackhi_epi8(registerAddPixelwiseLow, registerZero); registerAddPixelwiseLow = _mm_unpacklo_epi8(registerAddPixelwiseLow, registerZero); __m128i registerSubPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(subPixelwiseCost + d)); __m128i registerSubPixelwiseHigh = _mm_unpackhi_epi8(registerSubPixelwiseLow, registerZero); registerSubPixelwiseLow = _mm_unpacklo_epi8(registerSubPixelwiseLow, registerZero); // Low __m128i registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost + disparityTotal_*(x - 1) + d)); registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseLow), registerAddPixelwiseLow); __m128i registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d)); registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost, _mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + disparityTotal_*x + d))), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d), registerCost); // High registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost + disparityTotal_*(x-1) + d + 8)); registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseHigh), registerAddPixelwiseHigh); registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d + 8)); registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost, _mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + 
disparityTotal_*x + d + 8))), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d + 8), registerAddAggregated); _mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d + 8), registerCost); } } } leftSobelRow += widthStep_; rightSobelRow += widthStep_; leftCensusRow += width_; rightCensusRow += width_; costImageRow += widthStepCost; } }
void mpeg2_idct_add_sse2(int,int16_t* block, uint8_t* dest, const int stride) { __m128i &src0=*(__m128i*)(block+0*16/2); __m128i &src1=*(__m128i*)(block+1*16/2); __m128i &src2=*(__m128i*)(block+2*16/2); __m128i &src3=*(__m128i*)(block+3*16/2); __m128i &src4=*(__m128i*)(block+4*16/2); __m128i &src5=*(__m128i*)(block+5*16/2); __m128i &src6=*(__m128i*)(block+6*16/2); __m128i &src7=*(__m128i*)(block+7*16/2); idct_M128ASM (src0,src1,src2,src3,src4,src5,src6,src7); __m128i zero = _mm_setzero_si128(); __m128i r0 = _mm_load_si128(&src0); __m128i r1 = _mm_load_si128(&src1); __m128i r2 = _mm_load_si128(&src2); __m128i r3 = _mm_load_si128(&src3); __m128i r4 = _mm_load_si128(&src4); __m128i r5 = _mm_load_si128(&src5); __m128i r6 = _mm_load_si128(&src6); __m128i r7 = _mm_load_si128(&src7); __m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]); __m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]); __m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]); __m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]); __m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]); __m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]); __m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]); __m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]); r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero)); r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero)); r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero)); r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero)); r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero)); r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero)); r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero)); r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero)); r0 = _mm_packus_epi16(r0, r1); r1 = _mm_packus_epi16(r2, r3); r2 = _mm_packus_epi16(r4, r5); r3 = _mm_packus_epi16(r6, r7); _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0); _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0); _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1); _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1); _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2); _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2); _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3); _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3); _mm_store_si128(&src0, zero); _mm_store_si128(&src1, zero); _mm_store_si128(&src2, zero); _mm_store_si128(&src3, zero); _mm_store_si128(&src4, zero); _mm_store_si128(&src5, zero); _mm_store_si128(&src6, zero); _mm_store_si128(&src7, zero); }
__m64 interpolvline128_3(__m128i* temp){ __m128i xmm6; __m64 ret; __m128i xmm7 = _mm_setzero_si128(); __m128i xmm0 = _mm_load_si128(temp++); __m128i xmm1 = _mm_load_si128(temp++); __m128i xmm2 = _mm_load_si128(temp++); __m128i xmm3 = _mm_load_si128(temp++); __m128i xmm4 = _mm_load_si128(temp++); __m128i xmm5 = _mm_load_si128(temp); xmm1 = _mm_add_epi16(xmm1,xmm4); xmm0 = _mm_add_epi16(xmm0,xmm5); xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB); xmm4 = _mm_mullo_epi16(xmm1, xmm6); xmm5 = _mm_mulhi_epi16(xmm1, xmm6); xmm1 = _mm_unpacklo_epi16(xmm4, xmm5); xmm6 = _mm_unpackhi_epi16(xmm4, xmm5); xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014); xmm5 = _mm_add_epi16(xmm2,xmm3); xmm4 = _mm_mullo_epi16(xmm5, xmm7); xmm5 = _mm_mulhi_epi16(xmm5, xmm7); xmm7 = _mm_unpacklo_epi16(xmm4, xmm5); xmm4 = _mm_unpackhi_epi16(xmm4, xmm5); xmm7 = _mm_add_epi32(xmm7,xmm1); xmm4 = _mm_add_epi32(xmm4,xmm6); xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001); xmm6 = _mm_mulhi_epi16(xmm0, xmm6); xmm1 = _mm_unpacklo_epi16(xmm0, xmm6); xmm6 = _mm_unpackhi_epi16(xmm0, xmm6); xmm7 = _mm_add_epi32(xmm7,xmm1); xmm4 = _mm_add_epi32(xmm4,xmm6); xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200); xmm7 = _mm_add_epi32(xmm7,xmm1); xmm4 = _mm_add_epi32(xmm4,xmm1); xmm5 = _mm_setzero_si128(); xmm7 = _mm_srli_epi32(xmm7, 10); xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values xmm7 = _mm_slli_epi32(xmm7,16); xmm7 = _mm_srli_epi32(xmm7,16); xmm4 = _mm_srli_epi32(xmm4, 10); xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values xmm4 = _mm_slli_epi32(xmm4,16); xmm4 = _mm_srli_epi32(xmm4,16); xmm6 = _mm_packs_epi32(xmm7, xmm4); xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010); xmm2 = _mm_add_epi16(xmm2,xmm1); xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values xmm2 = _mm_srli_epi16(xmm2,5); xmm3 = _mm_add_epi16(xmm3,xmm1); xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values xmm3 = _mm_srli_epi16(xmm3,5); xmm2 = _mm_packus_epi16(xmm2,xmm5); xmm3 = _mm_packus_epi16(xmm3,xmm5); xmm6 = _mm_packus_epi16(xmm6,xmm5); xmm7 = _mm_unpacklo_epi8(xmm2,xmm6); xmm4 = _mm_unpacklo_epi8(xmm6,xmm3); xmm6 = _mm_avg_epu8(xmm4,xmm7); xmm6 = _mm_srli_epi16(xmm6,8); xmm6 = _mm_packus_epi16(xmm6,xmm5); ret = _mm_movepi64_pi64(xmm6); _mm_empty(); return(ret); }
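The constants above are the 6-tap half-sample weights (1, -5, 20, 20, -5, 1): 0xFFFB is -5 and 0x0014 is 20 in each 16-bit lane, and the +512 rounding with >>10 suggests the six inputs were already filtered once, leaving 10 fractional bits. A scalar sketch of one such output sample, clamped to 8 bits (an assumption about the surrounding pipeline, not taken from the code):

static unsigned char sixtap_half(const short t[6]) {
    int v = (t[0] + t[5]) - 5 * (t[1] + t[4]) + 20 * (t[2] + t[3]);
    v = (v + 512) >> 10;
    if (v < 0)   v = 0;
    if (v > 255) v = 255;
    return (unsigned char)v;
}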
void mpeg2_idct_add_sse2(const int last, int16_t* block, uint8_t* dest, const int stride) { idct_M128ASM(block); /* for(int i = 0; i < 8; i++) { dest[0] = CLIP(block[0] + dest[0]); dest[1] = CLIP(block[1] + dest[1]); dest[2] = CLIP(block[2] + dest[2]); dest[3] = CLIP(block[3] + dest[3]); dest[4] = CLIP(block[4] + dest[4]); dest[5] = CLIP(block[5] + dest[5]); dest[6] = CLIP(block[6] + dest[6]); dest[7] = CLIP(block[7] + dest[7]); memset(block, 0, sizeof(short)*8); dest += stride; block += 8; } */ __m128i* src = (__m128i*)block; __m128i zero = _mm_setzero_si128(); __m128i r0 = _mm_load_si128(&src[0]); __m128i r1 = _mm_load_si128(&src[1]); __m128i r2 = _mm_load_si128(&src[2]); __m128i r3 = _mm_load_si128(&src[3]); __m128i r4 = _mm_load_si128(&src[4]); __m128i r5 = _mm_load_si128(&src[5]); __m128i r6 = _mm_load_si128(&src[6]); __m128i r7 = _mm_load_si128(&src[7]); __m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]); __m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]); __m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]); __m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]); __m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]); __m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]); __m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]); __m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]); r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero)); r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero)); r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero)); r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero)); r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero)); r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero)); r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero)); r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero)); r0 = _mm_packus_epi16(r0, r1); r1 = _mm_packus_epi16(r2, r3); r2 = _mm_packus_epi16(r4, r5); r3 = _mm_packus_epi16(r6, r7); _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0); _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0); _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1); _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1); _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2); _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2); _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3); _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3); _mm_store_si128(&src[0], zero); _mm_store_si128(&src[1], zero); _mm_store_si128(&src[2], zero); _mm_store_si128(&src[3], zero); _mm_store_si128(&src[4], zero); _mm_store_si128(&src[5], zero); _mm_store_si128(&src[6], zero); _mm_store_si128(&src[7], zero); }
/* motion templates */ CV_IMPL void cvUpdateMotionHistory( const void* silhouette, void* mhimg, double timestamp, double mhi_duration ) { CvMat silhstub, *silh = cvGetMat(silhouette, &silhstub); CvMat mhistub, *mhi = cvGetMat(mhimg, &mhistub); if( !CV_IS_MASK_ARR( silh )) CV_Error( CV_StsBadMask, "" ); if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 ) CV_Error( CV_StsUnsupportedFormat, "" ); if( !CV_ARE_SIZES_EQ( mhi, silh )) CV_Error( CV_StsUnmatchedSizes, "" ); CvSize size = cvGetMatSize( mhi ); if( CV_IS_MAT_CONT( mhi->type & silh->type )) { size.width *= size.height; size.height = 1; } float ts = (float)timestamp; float delbound = (float)(timestamp - mhi_duration); int x, y; #if CV_SSE2 volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2); #endif for( y = 0; y < size.height; y++ ) { const uchar* silhData = silh->data.ptr + silh->step*y; float* mhiData = (float*)(mhi->data.ptr + mhi->step*y); x = 0; #if CV_SSE2 if( useSIMD ) { __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound); for( ; x <= size.width - 8; x += 8 ) { __m128i z = _mm_setzero_si128(); __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z); __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z)); __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4); __m128 fz = _mm_setzero_ps(); v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4)); v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4)); __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz)); __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz)); v0 = _mm_xor_ps(v0, m0); v1 = _mm_xor_ps(v1, m1); _mm_storeu_ps(mhiData + x, v0); _mm_storeu_ps(mhiData + x + 4, v1); } } #endif for( ; x < size.width; x++ ) { float val = mhiData[x]; val = silhData[x] ? ts : val < delbound ? 0 : val; mhiData[x] = val; } } }
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p, const unsigned char *_blimit, const unsigned char *_limit, const unsigned char *_thresh) { __m128i mask, hev, flat, flat2; const __m128i zero = _mm_set1_epi16(0); const __m128i one = _mm_set1_epi8(1); __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1; __m128i abs_p1p0; const __m128i thresh = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0])); const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0])); const __m128i blimit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0])); q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p)); q4p4 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p))); q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p)); q3p3 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p))); q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p)); q2p2 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p))); q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p)); q1p1 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p))); p1q1 = _mm_shuffle_epi32(q1p1, 78); q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p)); q0p0 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p))); p0q0 = _mm_shuffle_epi32(q0p0, 78); { __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work; abs_p1p0 = _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1)); abs_q1q0 = _mm_srli_si128(abs_p1p0, 8); fe = _mm_set1_epi8(0xfe); ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0); abs_p0q0 = _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0)); abs_p1q1 = _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1)); flat = _mm_max_epu8(abs_p1p0, abs_q1q0); hev = _mm_subs_epu8(flat, thresh); hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff); abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0); abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1); mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit); mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff); // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2 > blimit) * -1; mask = _mm_max_epu8(abs_p1p0, mask); // mask |= (abs(p1 - p0) > limit) * -1; // mask |= (abs(q1 - q0) > limit) * -1; work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)), _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3))); mask = _mm_max_epu8(work, mask); mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8)); mask = _mm_subs_epu8(mask, limit); mask = _mm_cmpeq_epi8(mask, zero); } // lp filter { const __m128i t4 = _mm_set1_epi8(4); const __m128i t3 = _mm_set1_epi8(3); const __m128i t80 = _mm_set1_epi8(0x80); const __m128i t1 = _mm_set1_epi16(0x1); __m128i qs1ps1 = _mm_xor_si128(q1p1, t80); __m128i qs0ps0 = _mm_xor_si128(q0p0, t80); __m128i qs0 = _mm_xor_si128(p0q0, t80); __m128i qs1 = _mm_xor_si128(p1q1, t80); __m128i filt; __m128i work_a; __m128i filter1, filter2; __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2; __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0; filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev); work_a = _mm_subs_epi8(qs0, qs0ps0); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); filt = _mm_adds_epi8(filt, work_a); /* (vpx_filter + 3 * (qs0 - ps0)) & mask */ filt = _mm_and_si128(filt, mask); filter1 = _mm_adds_epi8(filt, t4); filter2 = _mm_adds_epi8(filt, t3); filter1 = _mm_unpacklo_epi8(zero, filter1); filter1 = _mm_srai_epi16(filter1, 0xB); filter2 = 
_mm_unpacklo_epi8(zero, filter2); filter2 = _mm_srai_epi16(filter2, 0xB); /* Filter1 >> 3 */ filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1)); qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80); /* filt >> 1 */ filt = _mm_adds_epi16(filter1, t1); filt = _mm_srai_epi16(filt, 1); filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8), filt); filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt)); qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80); // loopfilter done { __m128i work; flat = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)), _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3))); flat = _mm_max_epu8(abs_p1p0, flat); flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8)); flat = _mm_subs_epu8(flat, one); flat = _mm_cmpeq_epi8(flat, zero); flat = _mm_and_si128(flat, mask); q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p)); q5p5 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p))); q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p)); q6p6 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p))); flat2 = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)), _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5))); q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p)); q7p7 = _mm_castps_si128( _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p))); work = _mm_max_epu8( _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)), _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7))); flat2 = _mm_max_epu8(work, flat2); flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8)); flat2 = _mm_subs_epu8(flat2, one); flat2 = _mm_cmpeq_epi8(flat2, zero); flat2 = _mm_and_si128(flat2, flat); // flat2 & flat & mask } // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ // flat and wide flat calculations { const __m128i eight = _mm_set1_epi16(8); const __m128i four = _mm_set1_epi16(4); __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16; __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16; __m128i pixelFilter_p, pixelFilter_q; __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0; __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q; p7_16 = _mm_unpacklo_epi8(q7p7, zero); p6_16 = _mm_unpacklo_epi8(q6p6, zero); p5_16 = _mm_unpacklo_epi8(q5p5, zero); p4_16 = _mm_unpacklo_epi8(q4p4, zero); p3_16 = _mm_unpacklo_epi8(q3p3, zero); p2_16 = _mm_unpacklo_epi8(q2p2, zero); p1_16 = _mm_unpacklo_epi8(q1p1, zero); p0_16 = _mm_unpacklo_epi8(q0p0, zero); q0_16 = _mm_unpackhi_epi8(q0p0, zero); q1_16 = _mm_unpackhi_epi8(q1p1, zero); q2_16 = _mm_unpackhi_epi8(q2p2, zero); q3_16 = _mm_unpackhi_epi8(q3p3, zero); q4_16 = _mm_unpackhi_epi8(q4p4, zero); q5_16 = _mm_unpackhi_epi8(q5p5, zero); q6_16 = _mm_unpackhi_epi8(q6p6, zero); q7_16 = _mm_unpackhi_epi8(q7p7, zero); pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16), _mm_add_epi16(p4_16, p3_16)); pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16), _mm_add_epi16(q4_16, q3_16)); pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16)); pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0); pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16)); pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0); pixelFilter_p = _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q)); pixetFilter_p2p1p0 = _mm_add_epi16( four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0)); res_p = _mm_srli_epi16( 
_mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4); flat2_q0p0 = _mm_packus_epi16(res_p, res_q); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3); flat_q0p0 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(p7_16, p7_16); sum_q7 = _mm_add_epi16(q7_16, q7_16); sum_p3 = _mm_add_epi16(p3_16, p3_16); sum_q3 = _mm_add_epi16(q3_16, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4); flat2_q1p1 = _mm_packus_epi16(res_p, res_q); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3); flat_q1p1 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); sum_p3 = _mm_add_epi16(sum_p3, p3_16); sum_q3 = _mm_add_epi16(sum_q3, q3_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4); flat2_q2p2 = _mm_packus_epi16(res_p, res_q); pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16); pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3); res_q = _mm_srli_epi16( _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3); flat_q2p2 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4); flat2_q3p3 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4); flat2_q4p4 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4); flat2_q5p5 = _mm_packus_epi16(res_p, res_q); sum_p7 = _mm_add_epi16(sum_p7, p7_16); sum_q7 = _mm_add_epi16(sum_q7, q7_16); pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16); pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16); res_p = _mm_srli_epi16( _mm_add_epi16(pixelFilter_p, 
_mm_add_epi16(sum_p7, p6_16)), 4); res_q = _mm_srli_epi16( _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4); flat2_q6p6 = _mm_packus_epi16(res_p, res_q); } // wide flat // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ flat = _mm_shuffle_epi32(flat, 68); flat2 = _mm_shuffle_epi32(flat2, 68); q2p2 = _mm_andnot_si128(flat, q2p2); flat_q2p2 = _mm_and_si128(flat, flat_q2p2); q2p2 = _mm_or_si128(q2p2, flat_q2p2); qs1ps1 = _mm_andnot_si128(flat, qs1ps1); flat_q1p1 = _mm_and_si128(flat, flat_q1p1); q1p1 = _mm_or_si128(qs1ps1, flat_q1p1); qs0ps0 = _mm_andnot_si128(flat, qs0ps0); flat_q0p0 = _mm_and_si128(flat, flat_q0p0); q0p0 = _mm_or_si128(qs0ps0, flat_q0p0); q6p6 = _mm_andnot_si128(flat2, q6p6); flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6); q6p6 = _mm_or_si128(q6p6, flat2_q6p6); _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6); _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6)); q5p5 = _mm_andnot_si128(flat2, q5p5); flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5); q5p5 = _mm_or_si128(q5p5, flat2_q5p5); _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5); _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5)); q4p4 = _mm_andnot_si128(flat2, q4p4); flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4); q4p4 = _mm_or_si128(q4p4, flat2_q4p4); _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4); _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4)); q3p3 = _mm_andnot_si128(flat2, q3p3); flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3); q3p3 = _mm_or_si128(q3p3, flat2_q3p3); _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3); _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3)); q2p2 = _mm_andnot_si128(flat2, q2p2); flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2); q2p2 = _mm_or_si128(q2p2, flat2_q2p2); _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2); _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2)); q1p1 = _mm_andnot_si128(flat2, q1p1); flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1); q1p1 = _mm_or_si128(q1p1, flat2_q1p1); _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1); _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1)); q0p0 = _mm_andnot_si128(flat2, q0p0); flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0); q0p0 = _mm_or_si128(q0p0, flat2_q0p0); _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0); _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0)); } }
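/* Scalar form of the 15-tap "wide flat" filter the flat2 branch above applies,
 * shown here for the p0 output only (derived from the intrinsic sequence:
 * pixelFilter_p/q accumulate p0..p6 and q0..q6 plus the rounding 8, then p7 and
 * a second p0 are added before the shift). Helper name and parameter layout are
 * illustrative, not vpx API. */
static unsigned char wide_flat_p0(const unsigned char p[8] /* p0..p7 */,
                                  const unsigned char q[7] /* q0..q6 */)
{
    int sum = p[7] + p[6] + p[5] + p[4] + p[3] + p[2] + p[1] + 2 * p[0] + 8;
    for (int i = 0; i < 7; i++)
        sum += q[i];
    return (unsigned char)(sum >> 4);   /* matches (pixelFilter_p + p7 + p0) >> 4 */
}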
void LW_FUNC_ALIGN convert_lw48_to_yuy2_sse41( int thread_id, int thread_num, void *param1, void *param2 ) { /* LW48 -> YUY2 using SSE4.1 */ COLOR_PROC_INFO *cpip = (COLOR_PROC_INFO *)param1; int start = (cpip->h * thread_id ) / thread_num; int end = (cpip->h * (thread_id + 1)) / thread_num; int w = cpip->w; BYTE *ycp_line = (BYTE *)cpip->ycp + start * cpip->line_size; BYTE *pixel_line = (BYTE *)cpip->pixelp + start * w * 2; __m128i x0, x1, x2, x3, x5, x6, x7; static const char LW_ALIGN(16) SHUFFLE_Y[16] = { 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11 }; for( int y = start; y < end; y++ ) { BYTE *ycp = ycp_line; BYTE *yuy2_ptr = pixel_line; for( int x = 0, i_step = 0; x < w; x += i_step, ycp += i_step*6, yuy2_ptr += i_step*2 ) { x5 = _mm_loadu_si128((__m128i *)(ycp + 0)); x6 = _mm_loadu_si128((__m128i *)(ycp + 16)); x7 = _mm_loadu_si128((__m128i *)(ycp + 32)); x0 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02); x0 = _mm_blend_epi16(x0, x7, 0x20+0x04); x1 = _mm_blend_epi16(x5, x6, 0x40+0x20+0x01); x1 = _mm_blend_epi16(x1, x7, 0x10+0x08); x0 = _mm_shuffle_epi8(x0, _mm_load_si128((__m128i*)SHUFFLE_Y)); x1 = _mm_alignr_epi8(x1, x1, 2); x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,2,3,0)); x0 = _mm_srli_epi16(x0, 8); x1 = _mm_srli_epi16(x1, 8); x5 = _mm_loadu_si128((__m128i *)(ycp + 48)); x6 = _mm_loadu_si128((__m128i *)(ycp + 64)); x7 = _mm_loadu_si128((__m128i *)(ycp + 80)); x2 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02); x2 = _mm_blend_epi16(x2, x7, 0x20+0x04); x3 = _mm_blend_epi16(x5, x6, 0x40+0x20+0x01); x3 = _mm_blend_epi16(x3, x7, 0x10+0x08); x2 = _mm_shuffle_epi8(x2, _mm_load_si128((__m128i*)SHUFFLE_Y)); x3 = _mm_alignr_epi8(x3, x3, 2); x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(1,2,3,0)); x2 = _mm_srli_epi16(x2, 8); x3 = _mm_srli_epi16(x3, 8); x0 = _mm_packus_epi16(x0, x2); x1 = _mm_packus_epi16(x1, x3); _mm_storeu_si128((__m128i*)(yuy2_ptr + 0), _mm_unpacklo_epi8(x0, x1)); _mm_storeu_si128((__m128i*)(yuy2_ptr + 16), _mm_unpackhi_epi8(x0, x1)); int remain = w - x; i_step = (remain >= 16); i_step = (i_step<<4) + (remain & ((~(0-i_step)) & 0x0f)); } ycp_line += cpip->line_size; pixel_line += w*2; } }
void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row, png_const_bytep prev) { /* Paeth tries to predict pixel d using the pixel to the left of it, a, * and two pixels from the previous row, b and c: * prev: c b * row: a d * The Paeth function predicts d to be whichever of a, b, or c is nearest to * p=a+b-c. * * The first pixel has no left context, and so uses an Up filter, p = b. * This works naturally with our main loop's p = a+b-c if we force a and c * to zero. * Here we zero b and d, which become c and a respectively at the start of * the loop. */ png_debug(1, "in png_read_filter_row_paeth3_sse2"); const __m128i zero = _mm_setzero_si128(); __m128i c, b = zero, a, d = zero; int rb = row_info->rowbytes; while (rb >= 4) { /* It's easiest to do this math (particularly, deal with pc) with 16-bit * intermediates. */ c = b; b = _mm_unpacklo_epi8(load4(prev), zero); a = d; d = _mm_unpacklo_epi8(load4(row ), zero); /* (p-a) == (a+b-c - a) == (b-c) */ __m128i pa = _mm_sub_epi16(b,c); /* (p-b) == (a+b-c - b) == (a-c) */ __m128i pb = _mm_sub_epi16(a,c); /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ __m128i pc = _mm_add_epi16(pa,pb); pa = abs_i16(pa); /* |p-a| */ pb = abs_i16(pb); /* |p-b| */ pc = abs_i16(pc); /* |p-c| */ __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); /* Paeth breaks ties favoring a over b over c. */ __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c)); /* Note `_epi8`: we need addition to wrap modulo 255. */ d = _mm_add_epi8(d, nearest); store3(row, _mm_packus_epi16(d,d)); prev += 3; row += 3; rb -= 3; } if (rb > 0) { /* It's easiest to do this math (particularly, deal with pc) with 16-bit * intermediates. */ c = b; b = _mm_unpacklo_epi8(load3(prev), zero); a = d; d = _mm_unpacklo_epi8(load3(row ), zero); /* (p-a) == (a+b-c - a) == (b-c) */ __m128i pa = _mm_sub_epi16(b,c); /* (p-b) == (a+b-c - b) == (a-c) */ __m128i pb = _mm_sub_epi16(a,c); /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */ __m128i pc = _mm_add_epi16(pa,pb); pa = abs_i16(pa); /* |p-a| */ pb = abs_i16(pb); /* |p-b| */ pc = abs_i16(pc); /* |p-c| */ __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb)); /* Paeth breaks ties favoring a over b over c. */ __m128i nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a, if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c)); /* Note `_epi8`: we need addition to wrap modulo 255. */ d = _mm_add_epi8(d, nearest); store3(row, _mm_packus_epi16(d,d)); prev += 3; row += 3; rb -= 3; } }
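#include <stdlib.h> /* abs */

/* Scalar reference of the Paeth predictor the vector code above evaluates a few
 * bytes at a time; the tie-breaking order a, then b, then c follows the comments
 * in the function (helper name is illustrative, not libpng API). */
static unsigned char paeth_predict(int a, int b, int c)
{
    int p  = a + b - c;
    int pa = abs(p - a);   /* == abs(b - c) */
    int pb = abs(p - b);   /* == abs(a - c) */
    int pc = abs(p - c);
    if (pa <= pb && pa <= pc) return (unsigned char)a;
    if (pb <= pc)             return (unsigned char)b;
    return (unsigned char)c;
}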
__m128i test (__m128i s1, __m128i s2) { return _mm_unpacklo_epi8 (s1, s2); }
void matrixCalc(void *inputs, void *outputs, long in_bitRate) { //Validate the input TML::Matrix i(inputs,0); if (i.dims() != 2) throw "Input should be a 2D matrix"; if (!i.isChar()) throw "Input should have character data"; if (i.planes() != 4) throw "Input needs 4 planes"; if (i.dim(0) % 64 != 0) throw "Width needs to be a multiple of 64"; if (i.dim(1) % 2 != 0) throw "Height needs to be a multiple of 2"; if (i.dim(0) != m_cfg.g_w || i.dim(1) != m_cfg.g_h || m_cfg.rc_target_bitrate != in_bitRate) { vpx_img_free(&m_raw); vpx_img_alloc(&m_raw, VPX_IMG_FMT_I420, i.dim(0), i.dim(1), 1); vpx_codec_destroy(&m_codec); vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &m_cfg, 0); m_cfg.rc_target_bitrate = in_bitRate; m_cfg.g_w = i.dim(0); m_cfg.g_h = i.dim(1); vpx_codec_enc_init(&m_codec, vpx_codec_vp8_cx(), &m_cfg, 0); } //ARGB -> YYYY U V int x; const int N = 32; const int Uoffset = i.dim(0)*i.dim(1); const int Voffset = Uoffset + Uoffset/4; const int w = i.dim(0); const int h = i.dim(1); const int sy = i.stride(1); unsigned char *data = (unsigned char*)i.data(); int y; unsigned char *buffer = m_raw.planes[0]; //RRRR __v16qi rShuffle = {1, -1,-1,-1, 5, -1,-1,-1, 9, -1,-1,-1, 13, -1,-1,-1 }; __v16qi gShuffle = {2, -1,-1,-1, 6, -1,-1,-1, 10,-1,-1,-1, 14,-1,-1,-1 }; __v16qi bShuffle = {3,-1,-1,-1, 7,-1,-1,-1, 11,-1,-1,-1, 15,-1,-1,-1 }; //Shuffle so elements are moved to front/back __v16qi _aShuffle = { 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; __v16qi _bShuffle = { -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1 }; __v16qi _cShuffle = { -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1 }; __v16qi _dShuffle = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12 }; __v8hi R2Y = { 27, 27, 27, 27, 27, 27, 27, 27}; __v8hi G2Y = { 91, 91, 91, 91, 91, 91, 91, 91}; __v8hi B2Y = { 9, 9, 9, 9, 9, 9, 9, 9}; __v8hi R2U = { -12,-12,-12,-12,-12,-12,-12,-12}; __v8hi G2U = { -43,-43,-43,-43,-43,-43,-43,-43}; __v8hi B2U = { 55, 55, 55, 55, 55, 55, 55, 55}; __v8hi R2V = { 78, 78, 78, 78, 78, 78, 78, 78}; __v8hi G2V = { -71,-71,-71,-71,-71,-71,-71,-71}; __v8hi B2V = { -7, -7, -7, -7, -7, -7, -7, -7}; __v8hi m127 = {127,127,127,127,127,127,127,127}; __v8hi zero = { 0, 0, 0, 0, 0, 0, 0, 0}; __v8hi two55 = {255,255,255,255,255,255,255,255}; for (y=0; y<h; y+=2) { for (x=0; x<w; x+=N) { __v8hi tY[N/8]; __v8hi bY[N/8]; __v8hi tU[N/8]; __v8hi bU[N/8]; __v8hi tV[N/8]; __v8hi bV[N/8]; //Step 1: Convert to YUV int n; for (n=0; n<N; n+=8) //Read 8x per lane { __v16qi tARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4)); __v16qi tARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + 16)); __v16qi bARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy)); __v16qi bARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy + 16)); // ARGB(1) ARGB(2) ARGB(3) ARGB(4) | ARGB(5) ARGB(6) ARGB(7) ARGB(8) // => AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8) __v16qi tARGBx2_15 = _mm_unpacklo_epi8(tARGBx4_l, tARGBx4_r); __v16qi tARGBx2_26 = _mm_unpackhi_epi8(tARGBx4_l, tARGBx4_r); __v16qi bARGBx2_15 = _mm_unpacklo_epi8(bARGBx4_l, bARGBx4_r); __v16qi bARGBx2_26 = _mm_unpackhi_epi8(bARGBx4_l, bARGBx4_r); // AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8) // => AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8) __v16qi tARGB_1357 = _mm_unpacklo_epi8(tARGBx2_15, tARGBx2_26); __v16qi tARGB_2468 = _mm_unpackhi_epi8(tARGBx2_15, tARGBx2_26); __v16qi bARGB_1357 = _mm_unpacklo_epi8(bARGBx2_15, bARGBx2_26); __v16qi bARGB_2468 = 
_mm_unpackhi_epi8(bARGBx2_15, bARGBx2_26); //AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8) // => AAAAAAAARRRRRRRR | GGGGGGGGBBBBBBBB __v16qi tAARR = _mm_unpacklo_epi8(tARGB_1357, tARGB_2468); __v16qi tGGBB = _mm_unpackhi_epi8(tARGB_1357, tARGB_2468); __v16qi bAARR = _mm_unpacklo_epi8(bARGB_1357, bARGB_2468); __v16qi bGGBB = _mm_unpackhi_epi8(bARGB_1357, bARGB_2468); //Unpack to 8 R's, 8 G's, and 8 B's. __v8hi tRRRR = _mm_unpackhi_epi8(tAARR, zero); __v8hi tGGGG = _mm_unpacklo_epi8(tGGBB, zero); __v8hi tBBBB = _mm_unpackhi_epi8(tGGBB, zero); __v8hi bRRRR = _mm_unpackhi_epi8(bAARR, zero); __v8hi bGGGG = _mm_unpacklo_epi8(bGGBB, zero); __v8hi bBBBB = _mm_unpackhi_epi8(bGGBB, zero); //Convert to YUV (8x parallel) __v8hi tYYYY = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2Y), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2Y), _mm_mullo_epi16(tBBBB, B2Y))); __v8hi tUUUU = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2U), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2U), _mm_mullo_epi16(tBBBB, B2U))); __v8hi tVVVV = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2V), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2V), _mm_mullo_epi16(tBBBB, B2V))); __v8hi bYYYY = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2Y), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2Y), _mm_mullo_epi16(bBBBB, B2Y))); __v8hi bUUUU = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2U), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2U), _mm_mullo_epi16(bBBBB, B2U))); __v8hi bVVVV = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2V), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2V), _mm_mullo_epi16(bBBBB, B2V))); tUUUU = _mm_add_epi16(_mm_srai_epi16(tUUUU, 7), m127); tVVVV = _mm_add_epi16(_mm_srai_epi16(tVVVV, 7), m127); bUUUU = _mm_add_epi16(_mm_srai_epi16(bUUUU, 7), m127); bVVVV = _mm_add_epi16(_mm_srai_epi16(bVVVV, 7), m127); //Remove the fractional portion and clamp in 0...255 tY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(tYYYY,zero), 7), two55); tU[n/8] = _mm_min_epi16(_mm_max_epi16(tUUUU,zero), two55); tV[n/8] = _mm_min_epi16(_mm_max_epi16(tVVVV,zero), two55); bY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(bYYYY,zero), 7), two55); bU[n/8] = _mm_min_epi16(_mm_max_epi16(bUUUU,zero), two55); bV[n/8] = _mm_min_epi16(_mm_max_epi16(bVVVV,zero), two55); } // Step 2 - Write out Luma (part 1) for (n=0; n<N; n+=16) { __v8hi A = tY[n/8]; __v8hi B = tY[n/8+1]; __m128i Y = _mm_packus_epi16(A,B); _mm_storeu_si128((__m128i*)(buffer+y*w+x+n), Y); } for (n=0; n<N; n+=16) { __v8hi A = bY[n/8]; __v8hi B = bY[n/8+1]; __m128i Y = _mm_packus_epi16(A,B); _mm_storeu_si128((__m128i*)(buffer+y*w+x+n+w), Y); } //Step 3 -- U and V data... 
for (n=0; n<N; n+=32) { __m128i U16a = _mm_add_epi16(tU[n/8], bU[n/8]); __m128i U16b = _mm_add_epi16(tU[n/8+1], bU[n/8+1]); __m128i U16c = _mm_add_epi16(tU[n/8+2], bU[n/8+2]); __m128i U16d = _mm_add_epi16(tU[n/8+3], bU[n/8+3]); U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b),2); U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d),2); __m128i U = _mm_packus_epi16(U16a, U16c); _mm_storeu_si128((__m128i*)(buffer+Uoffset+y/2*w/2 + x/2+n/2), U); } for (n=0; n<N; n+=32) { __m128i U16a = _mm_add_epi16(tV[n/8], bV[n/8]); __m128i U16b = _mm_add_epi16(tV[n/8+1], bV[n/8+1]); __m128i U16c = _mm_add_epi16(tV[n/8+2], bV[n/8+2]); __m128i U16d = _mm_add_epi16(tV[n/8+3], bV[n/8+3]); U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b),2); U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d),2); __m128i U = _mm_packus_epi16(U16a, U16c); _mm_storeu_si128((__m128i*)(buffer+Voffset+y/2*w/2 + x/2+n/2), U); } } } m_frameCnt++; vpx_codec_encode(&m_codec, &m_raw, m_frameCnt, 1, 0, VPX_DL_REALTIME); vpx_codec_iter_t iter = NULL; const vpx_codec_cx_pkt_t *pkt; while ((pkt = vpx_codec_get_cx_data(&m_codec, &iter))) { if (pkt->kind == VPX_CODEC_CX_FRAME_PKT) { //Generate output TML::Matrix o(outputs,0); _jit_matrix_info m; memset(&m, 0, sizeof(m)); m.dimcount = 2; m.dim[0] = pkt->data.frame.sz; m.dim[1] = 1; m.dimstride[0] = pkt->data.frame.sz; m.dimstride[1] = 1; m.planecount = 1; m.type = _jit_sym_char; o.resizeTo(&m); memcpy(o.data(), pkt->data.frame.buf, pkt->data.frame.sz); break; } } }
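/* Scalar equivalent of the Q7 fixed-point RGB->YUV step in the loop above, with
 * the coefficients read off the R2Y/G2Y/... constants (27 + 91 + 9 = 127, i.e.
 * roughly 0.21R + 0.71G + 0.07B). Helper names are illustrative; an arithmetic
 * right shift is assumed, matching _mm_srai_epi16. */
static inline unsigned char clamp_u8(int v)
{
    return (unsigned char)(v < 0 ? 0 : v > 255 ? 255 : v);
}

static inline void argb_to_yuv_pixel(unsigned char r, unsigned char g, unsigned char b,
                                     unsigned char *y, unsigned char *u, unsigned char *v)
{
    *y = clamp_u8((  27 * r + 91 * g +  9 * b) >> 7);
    *u = clamp_u8((( -12 * r - 43 * g + 55 * b) >> 7) + 127);
    *v = clamp_u8(((  78 * r - 71 * g -  7 * b) >> 7) + 127);
}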
// Hadamard transform // Returns the difference between the weighted sum of the absolute value of // transformed coefficients. static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB, const uint16_t* const w) { int32_t sum[4]; __m128i tmp_0, tmp_1, tmp_2, tmp_3; const __m128i zero = _mm_setzero_si128(); const __m128i one = _mm_set1_epi16(1); const __m128i three = _mm_set1_epi16(3); // Load, combine and tranpose inputs. { const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]); const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]); const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]); const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]); const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]); const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]); const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]); const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]); // Combine inA and inB (we'll do two transforms in parallel). const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0); const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1); const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2); const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3); // a00 b00 a01 b01 a02 b03 a03 b03 0 0 0 0 0 0 0 0 // a10 b10 a11 b11 a12 b12 a13 b13 0 0 0 0 0 0 0 0 // a20 b20 a21 b21 a22 b22 a23 b23 0 0 0 0 0 0 0 0 // a30 b30 a31 b31 a32 b32 a33 b33 0 0 0 0 0 0 0 0 // Transpose the two 4x4, discarding the filling zeroes. const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2); const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3); // a00 a20 b00 b20 a01 a21 b01 b21 a02 a22 b02 b22 a03 a23 b03 b23 // a10 a30 b10 b30 a11 a31 b11 b31 a12 a32 b12 b32 a13 a33 b13 b33 const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1); // a00 a10 a20 a30 b00 b10 b20 b30 a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 a03 a13 a23 a33 b03 b13 b23 b33 // Convert to 16b. tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero); tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero); tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero); tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Horizontal pass and subsequent transpose. { // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2); const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2); const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2); const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2); // b0_extra = (a0 != 0); const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one); const __m128i b0_base = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); const __m128i b0 = _mm_add_epi16(b0_base, b0_extra); // a00 a01 a02 a03 b00 b01 b02 b03 // a10 a11 a12 a13 b10 b11 b12 b13 // a20 a21 a22 a23 b20 b21 b22 b23 // a30 a31 a32 a33 b30 b31 b32 b33 // Transpose the two 4x4. 
const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1); const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3); const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1); const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3); // a00 a10 a01 a11 a02 a12 a03 a13 // a20 a30 a21 a31 a22 a32 a23 a33 // b00 b10 b01 b11 b02 b12 b03 b13 // b20 b30 b21 b31 b22 b32 b23 b33 const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3); const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3); // a00 a10 a20 a30 a01 a11 a21 a31 // b00 b10 b20 b30 b01 b11 b21 b31 // a02 a12 a22 a32 a03 a13 a23 a33 // b02 b12 a22 b32 b03 b13 b23 b33 tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1); tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1); tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3); tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3); // a00 a10 a20 a30 b00 b10 b20 b30 // a01 a11 a21 a31 b01 b11 b21 b31 // a02 a12 a22 a32 b02 b12 b22 b32 // a03 a13 a23 a33 b03 b13 b23 b33 } // Vertical pass and difference of weighted sums. { // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so // we can use _mm_load_si128 instead of _mm_loadu_si128. const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]); const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]); // Calculate a and b (two 4x4 at once). const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2); const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3); const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3); const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2); const __m128i b0 = _mm_add_epi16(a0, a1); const __m128i b1 = _mm_add_epi16(a3, a2); const __m128i b2 = _mm_sub_epi16(a3, a2); const __m128i b3 = _mm_sub_epi16(a0, a1); // Separate the transforms of inA and inB. __m128i A_b0 = _mm_unpacklo_epi64(b0, b1); __m128i A_b2 = _mm_unpacklo_epi64(b2, b3); __m128i B_b0 = _mm_unpackhi_epi64(b0, b1); __m128i B_b2 = _mm_unpackhi_epi64(b2, b3); { // sign(b) = b >> 15 (0x0000 if positive, 0xffff if negative) const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15); const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15); const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15); const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15); // b = abs(b) = (b ^ sign) - sign A_b0 = _mm_xor_si128(A_b0, sign_A_b0); A_b2 = _mm_xor_si128(A_b2, sign_A_b2); B_b0 = _mm_xor_si128(B_b0, sign_B_b0); B_b2 = _mm_xor_si128(B_b2, sign_B_b2); A_b0 = _mm_sub_epi16(A_b0, sign_A_b0); A_b2 = _mm_sub_epi16(A_b2, sign_A_b2); B_b0 = _mm_sub_epi16(B_b0, sign_B_b0); B_b2 = _mm_sub_epi16(B_b2, sign_B_b2); } // b = abs(b) + 3 A_b0 = _mm_add_epi16(A_b0, three); A_b2 = _mm_add_epi16(A_b2, three); B_b0 = _mm_add_epi16(B_b0, three); B_b2 = _mm_add_epi16(B_b2, three); // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3 // b = (abs(b) + 3) >> 3 A_b0 = _mm_srai_epi16(A_b0, 3); A_b2 = _mm_srai_epi16(A_b2, 3); B_b0 = _mm_srai_epi16(B_b0, 3); B_b2 = _mm_srai_epi16(B_b2, 3); // weighted sums A_b0 = _mm_madd_epi16(A_b0, w_0); A_b2 = _mm_madd_epi16(A_b2, w_8); B_b0 = _mm_madd_epi16(B_b0, w_0); B_b2 = _mm_madd_epi16(B_b2, w_8); A_b0 = _mm_add_epi32(A_b0, A_b2); B_b0 = _mm_add_epi32(B_b0, B_b2); // difference of weighted sums A_b0 = _mm_sub_epi32(A_b0, B_b0); _mm_storeu_si128((__m128i*)&sum[0], A_b0); } return sum[0] + sum[1] + sum[2] + sum[3]; }
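#include <stdint.h>

/* The branchless 16-bit absolute value used in the "sign / xor / subtract"
 * block above, written out in scalar form (helper name is illustrative):
 *   sign = x >> 15;           // 0x0000 for positive, 0xffff for negative
 *   |x|  = (x ^ sign) - sign; // two's-complement negate when the sign is set */
static inline int16_t abs16_branchless(int16_t x)
{
    const int16_t sign = (int16_t)(x >> 15);
    return (int16_t)((x ^ sign) - sign);
}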
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments ) { typedef uchar T; typedef int WT; typedef int MT; Size size = img.size(); int y; MT mom[10] = {0,0,0,0,0,0,0,0,0,0}; bool useSIMD = checkHardwareSupport(CV_CPU_SSE2); for( y = 0; y < size.height; y++ ) { const T* ptr = img.ptr<T>(y); int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0; if( useSIMD ) { __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7); __m128i dx = _mm_set1_epi16(8); __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init; for( ; x <= size.width - 8; x += 8 ) { __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z); qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z)); __m128i px = _mm_mullo_epi16(p, qx); __m128i sx = _mm_mullo_epi16(qx, qx); qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx)); qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx)); qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx)); qx = _mm_add_epi16(qx, dx); } int CV_DECL_ALIGNED(16) buf[4]; _mm_store_si128((__m128i*)buf, qx0); x0 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx1); x1 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx2); x2 = buf[0] + buf[1] + buf[2] + buf[3]; _mm_store_si128((__m128i*)buf, qx3); x3 = buf[0] + buf[1] + buf[2] + buf[3]; } for( ; x < size.width; x++ ) { WT p = ptr[x]; WT xp = x * p, xxp; x0 += p; x1 += xp; xxp = xp * x; x2 += xxp; x3 += xxp * x; } WT py = y * x0, sy = y*y; mom[9] += ((MT)py) * sy; // m03 mom[8] += ((MT)x1) * sy; // m12 mom[7] += ((MT)x2) * y; // m21 mom[6] += x3; // m30 mom[5] += x0 * sy; // m02 mom[4] += x1 * y; // m11 mom[3] += x2; // m20 mom[2] += py; // m01 mom[1] += x1; // m10 mom[0] += x0; // m00 } for(int x = 0; x < 10; x++ ) moments[x] = (double)mom[x]; }
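/* Scalar reference for what the tile loop above accumulates: the raw image
 * moments m_pq = sum over pixels of x^p * y^q * I(x,y) for p + q <= 3, in the
 * same output order as `mom` (m00, m10, m01, m20, m11, m02, m30, m21, m12, m03,
 * per the inline comments). Helper name is illustrative, not OpenCV API. */
static void raw_moments_u8(const unsigned char *img, int w, int h, int stride,
                           double m[10])
{
    for (int i = 0; i < 10; i++) m[i] = 0.0;
    for (int y = 0; y < h; y++) {
        const unsigned char *row = img + y * stride;
        for (int x = 0; x < w; x++) {
            double p = row[x];
            m[0] += p;                        /* m00 */
            m[1] += x * p;                    /* m10 */
            m[2] += y * p;                    /* m01 */
            m[3] += (double)x * x * p;        /* m20 */
            m[4] += (double)x * y * p;        /* m11 */
            m[5] += (double)y * y * p;        /* m02 */
            m[6] += (double)x * x * x * p;    /* m30 */
            m[7] += (double)x * x * y * p;    /* m21 */
            m[8] += (double)x * y * y * p;    /* m12 */
            m[9] += (double)y * y * y * p;    /* m03 */
        }
    }
}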
}bool validate_utf8_sse(const char *src, size_t len) { const char *end = src + len; while (src + 16 < end) { __m128i chunk = _mm_loadu_si128((const __m128i *)(src)); int asciiMask = _mm_movemask_epi8(chunk); if (!asciiMask) { src += 16; continue; } __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80)); __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed); __m128i state = _mm_set1_epi8((char)(0x0 | 0x80)); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2); __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3); __m128i mask3 = _mm_slli_si128(cond3, 1); __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed); // Fall back to the scalar processing if (_mm_movemask_epi8(cond4)) { break; } __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1)); __m128i shifts = count_sub1; shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1)); counts = _mm_add_epi8( counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2)); shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2)); if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4)); if (_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8)); __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8)); shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1 chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1), _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1)); __m128i chunk_right = _mm_slli_si128(chunk, 1); __m128i chunk_low = _mm_blendv_epi8( chunk, _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))), _mm_cmpeq_epi8(counts, _mm_set1_epi8(1))); __m128i chunk_high = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2))); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2), _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2)); chunk_high = _mm_srli_epi32(chunk_high, 2); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4), _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4)); chunk_high = _mm_or_si128( chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)), mask3)); int c = _mm_extract_epi16(counts, 7); int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 
15 : 14; __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8)); if (!_mm_testz_si128( mask3, _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)), _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8))))) return false; shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8), _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8)); chunk_high = _mm_slli_si128(chunk_high, 1); __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); chunk_low = _mm_shuffle_epi8(chunk_low, shuf); chunk_high = _mm_shuffle_epi8(chunk_high, shuf); __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high); __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high); if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) | _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) { return false; } src += source_advance; } return validate_utf8(src, end - src); }
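#include <stdbool.h>
#include <stddef.h>

/* The vector loop above defers the tail (and any 4-byte lead byte) to a scalar
 * validate_utf8() that is not shown here. This is a minimal sketch of one common
 * shape for such a fallback, checking structural validity per RFC 3629 only; the
 * real fallback may apply additional checks (for instance the noncharacter
 * ranges the SIMD path rejects), so treat it as an assumption, not the original. */
static bool utf8_valid_scalar(const unsigned char *s, size_t len)
{
    size_t i = 0;
    while (i < len) {
        unsigned char c = s[i];
        size_t n;                            /* continuation bytes required */
        unsigned char lo = 0x80, hi = 0xBF;  /* range for the first continuation */
        if (c < 0x80)       { i++; continue; }
        else if (c < 0xC2)  return false;                /* stray continuation / overlong */
        else if (c < 0xE0)  n = 1;
        else if (c == 0xE0) { n = 2; lo = 0xA0; }        /* no overlong 3-byte forms */
        else if (c == 0xED) { n = 2; hi = 0x9F; }        /* no UTF-16 surrogates */
        else if (c < 0xF0)  n = 2;
        else if (c == 0xF0) { n = 3; lo = 0x90; }        /* no overlong 4-byte forms */
        else if (c <= 0xF4) { n = 3; if (c == 0xF4) hi = 0x8F; } /* <= U+10FFFF */
        else return false;                               /* 0xF5..0xFF are invalid */
        if (len - i < n + 1) return false;               /* truncated sequence */
        if (s[i + 1] < lo || s[i + 1] > hi) return false;
        for (size_t k = 2; k <= n; k++)
            if (s[i + k] < 0x80 || s[i + k] > 0xBF) return false;
        i += n + 1;
    }
    return true;
}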
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k937 = _mm_set1_epi32(937); const __m128i k1812 = _mm_set1_epi32(1812); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8); const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8); const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352, 2217, 5352, 2217, 5352); const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217, -5352, 2217, -5352, 2217); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. -> 00 01 02 03 00 00 00 00 const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Unpack and shuffle // 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1); const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3); // 00 01 10 11 02 03 12 13 // 20 21 30 31 22 23 32 33 const __m128i shuf01_p = _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1)); const __m128i shuf23_p = _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1)); // 00 01 10 11 03 02 13 12 // 20 21 30 31 23 22 33 32 const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p); const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p); // 00 01 10 11 20 21 30 31 // 03 02 13 12 23 22 33 32 const __m128i a01 = _mm_add_epi16(s01, s32); const __m128i a32 = _mm_sub_epi16(s01, s32); // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ] // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ] const __m128i tmp0 = _mm_madd_epi16(a01, k88p); // [ (a0 + a1) << 3, ... ] const __m128i tmp2 = _mm_madd_epi16(a01, k88m); // [ (a0 - a1) << 3, ... 
] const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p); const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m); const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812); const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937); const __m128i tmp1 = _mm_srai_epi32(tmp1_2, 9); const __m128i tmp3 = _mm_srai_epi32(tmp3_2, 9); const __m128i s03 = _mm_packs_epi32(tmp0, tmp2); const __m128i s12 = _mm_packs_epi32(tmp1, tmp3); const __m128i s_lo = _mm_unpacklo_epi16(s03, s12); // 0 1 0 1 0 1... const __m128i s_hi = _mm_unpackhi_epi16(s03, s12); // 2 3 2 3 2 3 const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi); v01 = _mm_unpacklo_epi32(s_lo, s_hi); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 3 2 3 2 3 2.. } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); const __m128i a01_plus_7 = _mm_add_epi16(a01, seven); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i c0 = _mm_add_epi16(a01_plus_7, a11); const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. // -> f1 = f1 + 1 - (a3 == 0) const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1); const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3); _mm_storeu_si128((__m128i*)&out[0], d0_g1); _mm_storeu_si128((__m128i*)&out[8], d2_f3); } }
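#include <stdint.h>

/* Scalar restatement of the "f1 = f1 + (a3 != 0)" correction described in the
 * comment above: the constant is biased by (1 << 16) up front (k12000_plus_one)
 * so that adding the 0/-1 result of the equality compare yields the same value
 * as adding (a3 != 0). Helper and parameter names are illustrative only. */
static inline int16_t f1_with_nonzero_bias(int32_t b3, int32_t b2, int16_t a3)
{
    const int32_t biased = (b3 * 5352 + b2 * 2217 + 12000 + (1 << 16)) >> 16;
    /* biased == ((b3 * 5352 + b2 * 2217 + 12000) >> 16) + 1 */
    return (int16_t)(biased + (a3 == 0 ? -1 : 0));
}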
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *p3 = p2 + bstride; uint8_t *p4 = p3 + bstride; uint8_t *orig = p0, *end = p4; line_copy8(p0, srcp + 2 * stride , width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 rdiv = _mm_set1_ps((float)ch->rdiv); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix[25]; for (int i = 0; i < 25; i++) { matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t *array[] = { p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 }; for (int x = 0; x < width; x += 16) { __m128i sum[4] = { zero, zero, zero, zero }; for (int i = 0; i < 25; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x)); xmm2 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i])); sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i])); xmm1 = _mm_unpackhi_epi16(xmm2, zero); xmm0 = _mm_unpacklo_epi16(xmm2, zero); sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i])); sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i])); } for (int i = 0; i < 4; i++) { __m128 sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); sumfp = _mm_add_ps(sumfp, bias); if (!ch->saturate) { sumfp = mm_abs_ps(sumfp); } sum[i] = _mm_cvttps_epi32(sumfp); } sum[0] = _mm_packs_epi32(sum[0], sum[1]); sum[1] = _mm_packs_epi32(sum[2], sum[3]); sum[0] = _mm_packus_epi16(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
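#include <math.h>
#include <stdint.h>

/* Per-pixel scalar equivalent of the 5x5 convolution above. The taps are the 25
 * neighbourhood samples in the same order as the `array` pointers; m, rdiv, bias
 * and saturate correspond to the convolution_t fields of the same names. Helper
 * name is illustrative; truncation and the abs-when-not-saturating step mirror
 * _mm_cvttps_epi32 and mm_abs_ps. */
static inline uint8_t convolve5x5_one(const uint8_t taps[25], const int16_t m[25],
                                      float rdiv, float bias, int saturate)
{
    int sum = 0;
    for (int i = 0; i < 25; i++)
        sum += m[i] * taps[i];
    float v = (float)sum * rdiv + bias;
    if (!saturate)
        v = fabsf(v);
    const int out = (int)v;   /* truncate toward zero */
    return (uint8_t)(out < 0 ? 0 : out > 255 ? 255 : out);
}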
//计算TI、SI int tisi(char* ydata,char* prev_ydata,int width,int height,SDLParam sdlparam,float &TI,float &SI) { int nYSize=height*width; float nFrameSize = nYSize*1.5; int realframe_size=(width-2*PADDING)*(height-2*PADDING); unsigned char *pFrame=(unsigned char*)malloc(nYSize); unsigned char *pNextFrame=(unsigned char*)malloc(nYSize); unsigned char *pFrame_0; unsigned char *pFrame_1; unsigned char *pFrame_2; unsigned char *pNextFrame_0; unsigned char *pSobelScreen=(unsigned char*)malloc(realframe_size); unsigned char *pDiffScreen=(unsigned char*)malloc(realframe_size); memset(pSobelScreen,0,realframe_size); memset(pDiffScreen,0,realframe_size); float *frame_sobel=(float*)malloc(realframe_size*sizeof(float)); memset(frame_sobel,0.0f,realframe_size*sizeof(float)); float avg_frame_sobel=0; int index=0; float *frame_diff=(float*)malloc(realframe_size*sizeof(float)); memset(frame_diff,0.0f,realframe_size*sizeof(float)); float avg_frame_diff=0; __m128 sobel_avg_sum=_mm_set1_ps(+0.0f); __m128 sobel_square_sum=_mm_set1_ps(+0.0f); __m128 diff_avg_sum=_mm_set1_ps(+0.0f); __m128 diff_square_sum=_mm_set1_ps(+0.0f); __m128 avg_sobel=_mm_set1_ps(+0.0f); __m128 avg_diff=_mm_set1_ps(+0.0f); int i,j,k; int pad_threshold=0; memcpy(pFrame,prev_ydata,nYSize); memcpy(pNextFrame,ydata,nYSize); pFrame_0=pFrame+width*(PADDING-1); pFrame_1=pFrame+width*PADDING; pFrame_2=pFrame+width*(PADDING+1); pNextFrame_0=pNextFrame+width*(PADDING-1); //Check if(mark_exit==1){ return -1; } for(j = PADDING; j < height-PADDING; j++) { for(i = PADDING; i < width-PADDING; i+=4) { if(i+4>width-PADDING) pad_threshold=1; // load 16 components. (0~6 will be used) __m128i current_0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pFrame_0+i-1)), _mm_setzero_si128()); __m128i current_1 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pFrame_1+i-1)), _mm_setzero_si128()); __m128i current_2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pFrame_2+i-1)), _mm_setzero_si128()); __m128i next_0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pNextFrame_0+i-1)), _mm_setzero_si128()); // pFrame_00 = { pFrame_0[i-1], pFrame_0[i], pFrame_0[i+1], pFrame_0[i+2] } __m128 pFrame_00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(current_0, _mm_setzero_si128())); // pFrame_01 = { pFrame_0[i], pFrame_0[i+1], pFrame_0[i+2], pFrame_0[i+3] } __m128 pFrame_01 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_0, 2), _mm_setzero_si128())); // pFrame_02 = { pFrame_0[i+1], pFrame_0[i+2], pFrame_0[i+3], pFrame_0[i+4] } __m128 pFrame_02 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_0, 4), _mm_setzero_si128())); // pFrame_10 = { pFrame_1[i-1], pFrame_1[i], pFrame_1[i+1], pFrame_1[i+2] } __m128 pFrame_10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(current_1, _mm_setzero_si128())); // pFrame_12 = { pFrame_1[i+1], pFrame_1[i+2], pFrame_1[i+3], pFrame_1[i+4] } __m128 pFrame_12 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_1, 4), _mm_setzero_si128())); // pFrame_20 = { pFrame_2[i-1], pFrame_2[i], pFrame_2[i+1], pFrame_2[i+2] } __m128 pFrame_20 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(current_2, _mm_setzero_si128())); // pFrame_21 = { pFrame_2[i], pFrame_2[i+1], pFrame_2[i+2], pFrame_2[i+3] } __m128 pFrame_21 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_2, 2), _mm_setzero_si128())); // pFrame_22 = { pFrame_2[i+1], pFrame_2[i+2], pFrame_2[i+3], pFrame_2[i+4] } __m128 pFrame_22 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_2, 4), _mm_setzero_si128())); __m128 pNextFrame_00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(next_0, 
_mm_setzero_si128())); __m128 gx=_mm_add_ps(_mm_sub_ps(_mm_add_ps(_mm_add_ps(_mm_sub_ps(_mm_sub_ps(_mm_sub_ps(pFrame_20,pFrame_22),pFrame_12),pFrame_12),pFrame_10),pFrame_10),pFrame_02),pFrame_00); __m128 gy=_mm_sub_ps(_mm_sub_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_sub_ps(pFrame_00,pFrame_20),_mm_sub_ps(pFrame_02,pFrame_22)),pFrame_01),pFrame_01),pFrame_21),pFrame_21); __m128 sobel_result = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gx, gx), _mm_mul_ps(gy,gy))); __m128 diff_result=_mm_sub_ps(pNextFrame_00,pFrame_00); if(!pad_threshold) { frame_sobel[index]=sobel_result.m128_f32[0]; frame_sobel[index+1]=sobel_result.m128_f32[1]; frame_sobel[index+2]=sobel_result.m128_f32[2]; frame_sobel[index+3]=sobel_result.m128_f32[3]; pSobelScreen[index]=(unsigned char)sobel_result.m128_f32[0]; pSobelScreen[index+1]=(unsigned char)sobel_result.m128_f32[1]; pSobelScreen[index+2]=(unsigned char)sobel_result.m128_f32[2]; pSobelScreen[index+3]=(unsigned char)sobel_result.m128_f32[3]; frame_diff[index]=diff_result.m128_f32[0]; frame_diff[index+1]=diff_result.m128_f32[1]; frame_diff[index+2]=diff_result.m128_f32[2]; frame_diff[index+3]=diff_result.m128_f32[3]; pDiffScreen[index]=(unsigned char)abs(diff_result.m128_f32[0]); pDiffScreen[index+1]=(unsigned char)abs(diff_result.m128_f32[1]); pDiffScreen[index+2]=(unsigned char)abs(diff_result.m128_f32[2]); pDiffScreen[index+3]=(unsigned char)abs(diff_result.m128_f32[3]); index+=4; } else { for(k=0;k<width-PADDING-i;k++) { frame_sobel[index+k]=sobel_result.m128_f32[k]; pSobelScreen[index+k]=(unsigned char)sobel_result.m128_f32[k]; frame_diff[index+k]=diff_result.m128_f32[k]; pDiffScreen[index+k]=(unsigned char)abs(diff_result.m128_f32[k]); } index+=width-PADDING-i; } } pFrame_0 += width; pFrame_1 += width; pFrame_2 += width; pNextFrame_0 += width; pad_threshold=0; } //画图 if(sdlparam.graphically_si==true&&sdlparam.isinterval==false){ memcpy(sdlparam.show_YBuffer,pSobelScreen,realframe_size); SDL_Event event; event.type = REFRESH_EVENT; SDL_PushEvent(&event); } //画图 if(sdlparam.graphically_ti==true&&sdlparam.isinterval==false){ memcpy(sdlparam.show_YBuffer,pDiffScreen,realframe_size); SDL_Event event; event.type = REFRESH_EVENT; SDL_PushEvent(&event); } for(i=0;i<index;i+=4) { __m128 sobel_result_0 = _mm_set_ps (frame_sobel[i],frame_sobel[i+1], frame_sobel[i+2], frame_sobel[i+3]); sobel_avg_sum = _mm_add_ps(sobel_avg_sum,sobel_result_0); __m128 diff_result_0 = _mm_set_ps (frame_diff[i],frame_diff[i+1], frame_diff[i+2], frame_diff[i+3]); diff_avg_sum = _mm_add_ps(diff_avg_sum,diff_result_0); } avg_frame_sobel=(sobel_avg_sum.m128_f32[0]+ sobel_avg_sum.m128_f32[1]+ sobel_avg_sum.m128_f32[2]+ sobel_avg_sum.m128_f32[3])/index; avg_sobel = _mm_set_ps (avg_frame_sobel,avg_frame_sobel, avg_frame_sobel, avg_frame_sobel); avg_frame_diff=(diff_avg_sum.m128_f32[0]+ diff_avg_sum.m128_f32[1]+ diff_avg_sum.m128_f32[2]+ diff_avg_sum.m128_f32[3])/index; avg_diff = _mm_set_ps (avg_frame_diff,avg_frame_diff, avg_frame_diff, avg_frame_diff); for(i=0;i<index;i+=4) { __m128 sobel_result_1 = _mm_set_ps (frame_sobel[i],frame_sobel[i+1], frame_sobel[i+2], frame_sobel[i+3]); __m128 sobel_square=_mm_mul_ps(_mm_sub_ps(sobel_result_1,avg_sobel),_mm_sub_ps(sobel_result_1,avg_sobel)); sobel_square_sum = _mm_add_ps(sobel_square_sum,sobel_square); __m128 diff_result_1 = _mm_set_ps (frame_diff[i],frame_diff[i+1], frame_diff[i+2], frame_diff[i+3]); __m128 diff_square=_mm_mul_ps(_mm_sub_ps(diff_result_1,avg_diff),_mm_sub_ps(diff_result_1,avg_diff)); diff_square_sum = _mm_add_ps(diff_square_sum,diff_square); } 
SI=sqrt((sobel_square_sum.m128_f32[0]+sobel_square_sum.m128_f32[1]+sobel_square_sum.m128_f32[2]+sobel_square_sum.m128_f32[3])/index); avg_frame_sobel=0; sobel_avg_sum=_mm_set1_ps(+0.0f); sobel_square_sum=_mm_set1_ps(+0.0f); TI=sqrt((diff_square_sum.m128_f32[0]+diff_square_sum.m128_f32[1]+diff_square_sum.m128_f32[2]+diff_square_sum.m128_f32[3])/index); avg_frame_diff=0; diff_avg_sum=_mm_set1_ps(+0.0f); diff_square_sum=_mm_set1_ps(+0.0f); index=0; /* release the malloc'd buffers with free(), not delete */ free(pFrame); free(pNextFrame); free(pSobelScreen); free(pDiffScreen); free(frame_diff); free(frame_sobel); return 0; }
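#include <math.h>

/* What the function computes for one frame pair, consistent with the per-frame
 * definitions in ITU-T P.910 (the final SI/TI there take a maximum over time):
 *   SI_frame = std-dev over pixels of the Sobel magnitude sqrt(gx^2 + gy^2)
 *   TI_frame = std-dev over pixels of the luma difference F_n(x,y) - F_{n-1}(x,y)
 * A scalar sketch of the mean/variance step both accumulation loops perform: */
static float stddev_f(const float *v, int n)
{
    float mean = 0.0f, var = 0.0f;
    for (int i = 0; i < n; i++) mean += v[i];
    mean /= (float)n;
    for (int i = 0; i < n; i++) var += (v[i] - mean) * (v[i] - mean);
    return sqrtf(var / (float)n);
}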