Example #1
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,
      *m = src + ((y+0) & 7)*sc2lines_width,
      *l = src + ((y+1) & 7)*sc2lines_width;

   for (unsigned i = 0; i < nPix; i += 8) {

      __m64 uu = *(__m64*)(u+i);
      __m64 ll = *(__m64*)(l+i);
      __m64 cmp = _mm_cmpeq_pi8(uu,ll);

      if (_mm_movemask_pi8(cmp) != 0xFF) {

         __m128i mm = _mm_loadu_si128((__m128i*)(m+i-4));
         __m128i uu = _mm_loadu_si128((__m128i*)(u+i-4));
         __m128i ll = _mm_loadu_si128((__m128i*)(l+i-4));

         __m128i md = _mm_slli_si128(mm,1);
         __m128i mf = _mm_srli_si128(mm,1);
         __m128i maskall = _mm_or_si128(_mm_cmpeq_epi8(md,mf), _mm_cmpeq_epi8(uu,ll));

         __m128i e0, e1, v1, v2, v3;

         e0 = _mm_cmpeq_epi8(md,uu);
         e0 = _mm_andnot_si128(maskall, e0);
         e0 = _mm_srli_si128(e0,4);
         e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());

         e1 = _mm_cmpeq_epi8(mf,uu);
         e1 = _mm_andnot_si128(maskall, e1);
         e1 = _mm_srli_si128(e1,4);
         e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);

         e0 = _mm_or_si128(e0, e1);

         v1 = _mm_srli_si128(mm,4);
         v1 = _mm_unpacklo_epi8(v1,v1);
         v2 = _mm_srli_si128(uu,4);
         v2 = _mm_unpacklo_epi8(v2,v2);

         _mm_store_si128((__m128i*)(dst1 + 2*i), _mm_or_si128( _mm_and_si128(e0,v2), _mm_andnot_si128(e0,v1) ) );

         e0 = _mm_cmpeq_epi8(md,ll);
         e0 = _mm_andnot_si128(maskall, e0);
         e0 = _mm_srli_si128(e0,4);
         e0 = _mm_unpacklo_epi8(e0, _mm_setzero_si128());

         e1 = _mm_cmpeq_epi8(mf,ll);
         e1 = _mm_andnot_si128(maskall, e1);
         e1 = _mm_srli_si128(e1,4);
         e1 = _mm_unpacklo_epi8(_mm_setzero_si128(), e1);

         e0 = _mm_or_si128(e0, e1);

         v3 = _mm_srli_si128(ll,4);
         v3 = _mm_unpacklo_epi8(v3,v3);

         _mm_store_si128((__m128i*)(dst2 + 2*i), _mm_or_si128( _mm_and_si128(e0,v3), _mm_andnot_si128(e0,v1) ) );

      } else {

         __m64 v0 = *(__m64*)(m+i);
         __m128i v1 = _mm_movpi64_epi64(v0);
         v1 = _mm_unpacklo_epi8(v1,v1);
         _mm_store_si128((__m128i*)(dst1 + 2*i), v1);
         _mm_store_si128((__m128i*)(dst2 + 2*i), v1);
      }
   }
}
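The fast path above triggers when the pixels directly above and below all match; _mm_unpacklo_epi8(v1, v1) then duplicates each of the 8 middle pixels into 16 output bytes for both destination rows. A minimal scalar sketch of that fallback, assuming plain 8-bit pixels (illustrative names, not from the original):

static void lines_scale2_copy_ref(const unsigned char *m,
                                  unsigned char *dst1, unsigned char *dst2,
                                  unsigned nPix)
{
   /* Plain pixel doubling: each source pixel is written twice to both rows. */
   for (unsigned i = 0; i < nPix; ++i) {
      dst1[2*i] = dst1[2*i + 1] = m[i];
      dst2[2*i] = dst2[2*i + 1] = m[i];
   }
}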
Example #2
// Does one or two inverse transforms.
static void ITransformSSE2(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                           int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two inverse
  // transforms in parallel). In the case of only one inverse transform, the
  // second half of the vectors will just contain random values we'll never
  // use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((__m128i*)&in[0]);
    in1 = _mm_loadl_epi64((__m128i*)&in[4]);
    in2 = _mm_loadl_epi64((__m128i*)&in[8]);
    in3 = _mm_loadl_epi64((__m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((__m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((__m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((__m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((__m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a =  _mm_add_epi16(dc, T2);
    const __m128i b =  _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'ref' and store.
  {
    const __m128i zero = _mm_set1_epi16(0);
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
      ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
      ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
      ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      ref0 = _mm_cvtsi32_si128(*(int*)&ref[0 * BPS]);
      ref1 = _mm_cvtsi32_si128(*(int*)&ref[1 * BPS]);
      ref2 = _mm_cvtsi32_si128(*(int*)&ref[2 * BPS]);
      ref3 = _mm_cvtsi32_si128(*(int*)&ref[3 * BPS]);
    }
    // Convert to 16b.
    ref0 = _mm_unpacklo_epi8(ref0, zero);
    ref1 = _mm_unpacklo_epi8(ref1, zero);
    ref2 = _mm_unpacklo_epi8(ref2, zero);
    ref3 = _mm_unpacklo_epi8(ref3, zero);
    // Add the inverse transform(s).
    ref0 = _mm_add_epi16(ref0, T0);
    ref1 = _mm_add_epi16(ref1, T1);
    ref2 = _mm_add_epi16(ref2, T2);
    ref3 = _mm_add_epi16(ref3, T3);
    // Unsigned saturate to 8b.
    ref0 = _mm_packus_epi16(ref0, ref0);
    ref1 = _mm_packus_epi16(ref1, ref1);
    ref2 = _mm_packus_epi16(ref2, ref2);
    ref3 = _mm_packus_epi16(ref3, ref3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
    } else {
      // Store four bytes/pixels per line.
      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
    }
  }
}
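The _mm_mulhi_epi16 + _mm_add_epi16 pairs above implement the fixed-point trick described in the opening comment. A scalar sketch of that identity (illustrative only; assumes an arithmetic right shift for negative values):

/* With k = K - (1 << 16), (x * K) >> 16 == ((x * k) >> 16) + x,
   and k fits in a signed 16-bit word for K1 = 85627 and K2 = 35468. */
static int mul_fix16_ref(int16_t x, int32_t K) {
  const int32_t k = K - (1 << 16);
  return ((x * k) >> 16) + x;
}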
Example #3
#include "strategyselector.h"


 /**
 * \brief Linear interpolation for 4 pixels. Returns 4 filtered pixels in lowest 32-bits of the register.
 * \param ref_main      Reference pixels
 * \param delta_pos     Fractional pixel precise position of sample displacement
 * \param x             Sample offset in direction x in ref_main array
 */
static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){

  int8_t delta_int = delta_pos >> 5;
  int8_t delta_fract = delta_pos & (32-1);
  __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int]));
  __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1]));
  __m128i pairs = _mm_unpacklo_epi8(sample0, sample1);
  __m128i weight = _mm_set1_epi16( (delta_fract << 8) | (32 - delta_fract) );
  sample0 = _mm_maddubs_epi16(pairs, weight);
  sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16));
  sample0 = _mm_srli_epi16(sample0, 5);
  sample0 = _mm_packus_epi16(sample0, sample0);

  return sample0;
}
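The _mm_maddubs_epi16 with the packed (32 - frac, frac) weight computes (32 - frac) * a + frac * b for each sample pair, which the +16 and >> 5 then round back to 8 bits. A scalar sketch of one output sample (illustrative only):

/* a = ref_main[x + delta_int + i], b = ref_main[x + delta_int + 1 + i]. */
static uint8_t interp_1px_ref(uint8_t a, uint8_t b, int delta_fract) {
  return (uint8_t)(((32 - delta_fract) * a + delta_fract * b + 16) >> 5);
}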

 /**
 * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst.
 * \param dst           Destination buffer
 * \param ref_main      Reference pixels
 * \param sample_disp   Sample displacement per row
 * \param vertical_mode Mode direction, true if vertical
 */
void tuned_ConvertULY4ToRGB(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pYBegin, const uint8_t *pUBegin, const uint8_t *pVBegin, size_t cbWidth, ssize_t scbStride)
{
	const int shift = 13;

	__m128i xy2rgb = _mm_set2_epi16_shift((-16 * C::Y2RGB + 0.5) / 0xff, C::Y2RGB, shift);
	__m128i vu2r = _mm_set2_epi16_shift(C::V2R, 0, shift);
	__m128i vu2g = _mm_set2_epi16_shift(C::V2G, C::U2G, shift);
	__m128i vu2b = _mm_set2_epi16_shift(0, C::U2B, shift);

	auto y = pYBegin;
	auto u = pUBegin;
	auto v = pVBegin;

	for (auto p = pDstBegin; p != pDstEnd; p += scbStride)
	{
		auto pp = p;

		for (; pp <= p + cbWidth - 16; pp += T::BYPP * 4)
		{
			__m128i yy = _mm_cvtsi32_si128(*(const int *)y);
			__m128i uu = _mm_cvtsi32_si128(*(const int *)u);
			__m128i vv = _mm_cvtsi32_si128(*(const int *)v);

			__m128i xy = _mm_unpacklo_epi8(_mm_unpacklo_epi8(yy, _mm_setone_si128()), _mm_setzero_si128()); // 00 ff 00 Y3 00 ff 00 Y2 00 ff 00 Y1 00 ff 00 Y0
			__m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0
			vu = _mm_sub_epi16(vu, _mm_set1_epi16(128));

			__m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb);

			auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i {
				__m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb));
				rgb = _mm_srai_epi32(rgb, shift);
				rgb = _mm_packs_epi32(rgb, rgb);
				rgb = _mm_packus_epi16(rgb, rgb);
				return rgb;
			};
			__m128i rr = xyuv2rgb(vu2r);
			__m128i gg = xyuv2rgb(vu2g);
			__m128i bb = xyuv2rgb(vu2b);

			if (std::is_same<T, CBGRAColorOrder>::value)
			{
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128()));
				_mm_storeu_si128((__m128i *)pp, bgrx);
			}
#ifdef __SSSE3__
			else if (std::is_same<T, CBGRColorOrder>::value)
			{
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, rr));
				__m128i bgr = _mm_shuffle_epi8(bgrx, _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0));
				_mm_storeu_si128((__m128i *)pp, bgr);
			}
#endif
			else if (std::is_same<T, CARGBColorOrder>::value)
			{
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb));
				_mm_storeu_si128((__m128i *)pp, xrgb);
			}
#ifdef __SSSE3__
			else if (std::is_same<T, CRGBColorOrder>::value)
			{
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_setone_si128(), rr), _mm_unpacklo_epi8(gg, bb));
				__m128i rgb = _mm_shuffle_epi8(xrgb, _mm_set_epi8(-1, -1, -1, -1, 15, 14, 13, 11, 10, 9, 7, 6, 5, 3, 2, 1));
				_mm_storeu_si128((__m128i *)pp, rgb);
			}
#endif

			y += 4;
			u += 4;
			v += 4;
		}

		for (; pp < p + cbWidth; pp += T::BYPP)
		{
			__m128i xy = _mm_cvtsi32_si128(*y | 0x00ff0000);
			__m128i uu = _mm_cvtsi32_si128(*u);
			__m128i vv = _mm_cvtsi32_si128(*v);

			__m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0
			vu = _mm_sub_epi16(vu, _mm_set1_epi16(128));

			__m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb);

			auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i {
				__m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb));
				rgb = _mm_srai_epi32(rgb, shift);
				rgb = _mm_packs_epi32(rgb, rgb);
				rgb = _mm_packus_epi16(rgb, rgb);
				return rgb;
			};
			__m128i rr = xyuv2rgb(vu2r);
			__m128i gg = xyuv2rgb(vu2g);
			__m128i bb = xyuv2rgb(vu2b);

			if (std::is_same<T, CBGRAColorOrder>::value)
			{
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128()));
				*(uint32_t *)pp = _mm_cvtsi128_si32(bgrx);
			}
			else if (std::is_same<T, CARGBColorOrder>::value)
			{
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb));
				*(uint32_t *)pp = _mm_cvtsi128_si32(xrgb);
			}
			else if (std::is_same<T, CBGRColorOrder>::value || std::is_same<T, CRGBColorOrder>::value)
			{
				*(pp + T::B) = (uint8_t)_mm_cvtsi128_si32(bb);
				*(pp + T::G) = (uint8_t)_mm_cvtsi128_si32(gg);
				*(pp + T::R) = (uint8_t)_mm_cvtsi128_si32(rr);
			}

			y += 1;
			u += 1;
			v += 1;
		}
	}
}
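Each output channel above is a fixed-point dot product: _mm_madd_epi16 pairs (Y, 0xff) with (Y2RGB, bias) and (V, U) with the channel's coefficients, then the sum is shifted right by 'shift' and saturated. A rough scalar equivalent, assuming the C::* constants are ordinary YCbCr-to-RGB coefficients pre-scaled by 1 << shift (an assumption about _mm_set2_epi16_shift; illustrative only):

static uint8_t yuv_to_channel_ref(int y, int u, int v, int cy, int cu, int cv)
{
	/* cy, cu, cv: assumed fixed-point coefficients scaled by 1 << 13. */
	int t = ((y - 16) * cy + (u - 128) * cu + (v - 128) * cv) >> 13;
	if (t < 0) t = 0;
	if (t > 255) t = 255;
	return (uint8_t)t;
}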
Example #5
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);

  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose.
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // a02 a12 a22 a32   a03 a13 a23 a33
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // 02 12 22 32   03 13 23 33
    // 00 10 20 30   01 11 21 31
    // 03 13 23 33   02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}
Example #6
/* Routine optimized for unshuffling a buffer for a type size of 16 bytes. */
static void
unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
{
  size_t i, j, k;
  size_t neblock, numof16belem;
  __m128i xmm1[16], xmm2[16];

  neblock = size / 16;
  numof16belem = neblock / 16;
  for (i = 0, k = 0; i < numof16belem; i++, k += 16) {
    /* Load 256 bytes (16 x 16-byte elements) into 16 XMM registers */
    for (j = 0; j < 16; j++) {
      xmm1[j] = ((__m128i *)orig)[j*numof16belem+i];
    }
    /* Shuffle bytes */
    for (j = 0; j < 8; j++) {
      /* Compute the low 16 bytes */
      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the high 16 bytes */
      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 8; j++) {
      /* Compute the low 16 bytes */
      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
      /* Compute the high 16 bytes */
      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 16 bytes */
      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
      /* Compute the high 16 bytes */
      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 8-byte qwords */
    for (j = 0; j < 8; j++) {
      /* Compute the low 16 bytes */
      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
      /* Compute the high 16 bytes */
      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Store the result vectors in proper order */
    ((__m128i *)dest)[k+0] = xmm1[0];
    ((__m128i *)dest)[k+1] = xmm1[8];
    ((__m128i *)dest)[k+2] = xmm1[4];
    ((__m128i *)dest)[k+3] = xmm1[12];
    ((__m128i *)dest)[k+4] = xmm1[2];
    ((__m128i *)dest)[k+5] = xmm1[10];
    ((__m128i *)dest)[k+6] = xmm1[6];
    ((__m128i *)dest)[k+7] = xmm1[14];
    ((__m128i *)dest)[k+8] = xmm1[1];
    ((__m128i *)dest)[k+9] = xmm1[9];
    ((__m128i *)dest)[k+10] = xmm1[5];
    ((__m128i *)dest)[k+11] = xmm1[13];
    ((__m128i *)dest)[k+12] = xmm1[3];
    ((__m128i *)dest)[k+13] = xmm1[11];
    ((__m128i *)dest)[k+14] = xmm1[7];
    ((__m128i *)dest)[k+15] = xmm1[15];
  }
}
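The four rounds of unpacks above invert a byte shuffle for 16-byte elements. A minimal scalar reference, assuming the shuffled layout stores byte j of every element contiguously (the usual shuffle convention); illustrative only and much slower than the SSE2 path:

static void unshuffle16_ref(uint8_t* dest, const uint8_t* orig, size_t size)
{
  size_t neblock = size / 16;           /* number of 16-byte elements */
  for (size_t i = 0; i < neblock; i++)
    for (size_t j = 0; j < 16; j++)
      dest[i*16 + j] = orig[j*neblock + i];
}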
Example #7
	IVTCScore ComputeScanImprovement_X8R8G8B8_SSE2(const void *src1, const void *src2, ptrdiff_t srcpitch, uint32 w, uint32 h) {
		IVTCScore score = {0};

		__m128i zero = _mm_setzero_si128();

		uint32 w2 = w >> 1;

		static const __m128i mask = { -1, -1, -1, -1, -1, -1, 0, 0, -1, -1, -1, -1, -1, -1, 0, 0 };

		bool firstfield = true;
		do {
			__m128i var = zero;
			__m128i varshift = zero;

			const uint8 *src1r0 = (const uint8 *)src1;
			const uint8 *src1r1 = src1r0 + srcpitch;
			const uint8 *src1r2 = src1r1 + srcpitch;
			const uint8 *src2r = (const uint8 *)src2 + srcpitch;
			for(uint32 x=0; x<w2; ++x) {
				__m128i rA = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r0), zero);
				__m128i rB = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r1), zero);
				__m128i rC = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src1r2), zero);
				__m128i rE = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)src2r), zero);
				__m128i rAC = _mm_add_epi16(rA, rC);
				__m128i d1 = _mm_sub_epi16(rAC, _mm_add_epi16(rB, rB));		// combing in current frame
				__m128i d3 = _mm_sub_epi16(rAC, _mm_add_epi16(rE, rE));		// combing in merged frame

				d1 = _mm_and_si128(d1, mask);
				d3 = _mm_and_si128(d3, mask);

				var = _mm_add_epi32(var, _mm_madd_epi16(d1, d1));
				varshift = _mm_add_epi32(varshift, _mm_madd_epi16(d3, d3));

				src1r0 += 8;
				src1r1 += 8;
				src1r2 += 8;
				src2r += 8;
			}

			if (w & 1) {
				__m128i rA = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r0), zero);
				__m128i rB = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r1), zero);
				__m128i rC = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src1r2), zero);
				__m128i rE = _mm_unpacklo_epi8(_mm_cvtsi32_si128(*(const int *)src2r), zero);
				__m128i rAC = _mm_add_epi16(rA, rC);
				__m128i d1 = _mm_sub_epi16(rAC, _mm_add_epi16(rB, rB));		// combing in current frame
				__m128i d3 = _mm_sub_epi16(rAC, _mm_add_epi16(rE, rE));		// combing in merged frame

				d1 = _mm_and_si128(d1, mask);
				d3 = _mm_and_si128(d3, mask);

				var = _mm_add_epi32(var, _mm_madd_epi16(d1, d1));
				varshift = _mm_add_epi32(varshift, _mm_madd_epi16(d3, d3));
			}

			src1 = (const uint8 *)src1 + srcpitch;
			src2 = (const uint8 *)src2 + srcpitch;

			var = _mm_add_epi32(var, _mm_shuffle_epi32(var, 0xee));
			varshift = _mm_add_epi32(varshift, _mm_shuffle_epi32(varshift, 0xee));
			var = _mm_add_epi32(var, _mm_shuffle_epi32(var, 0x55));
			varshift = _mm_add_epi32(varshift, _mm_shuffle_epi32(varshift, 0x55));

			uint32 ivar = _mm_cvtsi128_si32(var);
			uint32 ivarshift = _mm_cvtsi128_si32(varshift);

			if (firstfield) {
				score.mVar[0] += ivar;
				score.mVarShift[0] += ivarshift;
			} else {
				score.mVar[1] += ivar;
				score.mVarShift[1] += ivarshift;
			}

			firstfield = !firstfield;
		} while(--h);

		return score;
	}
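The metric accumulated above is the squared second difference across fields, computed once against the current frame's middle row (d1) and once against the merged frame's row (d3), with alpha bytes masked out. A scalar sketch for one component (illustrative only):

static uint32 CombingEnergy(int above, int middle, int below) {
	const int d = above + below - 2 * middle;   /* combing second difference */
	return (uint32)(d * d);
}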
void CColorAdjustment::ProcessY(int Width, int Height, uint8_t *pData, int Pitch)
{
	if (m_Brightness != 0 || m_Contrast != 0) {
		if (m_fUpdateYTable) {
			MakeYTable(m_YTable, m_Brightness, m_Contrast);
			m_fUpdateYTable = false;
		}

#ifdef TVTVIDEODEC_SSE2_SUPPORT
		const bool fSSE2 = IsSSE2Enabled();
#endif

		for (int y = 0; y < Height; y++) {
			uint8_t *p = pData;
			int x = 0;

#ifdef TVTVIDEODEC_SSE2_SUPPORT
			if (fSSE2 && !((uintptr_t)p & 15)) {
				const short c = (short)(min((m_Contrast * 512 / 100) + 512, (1 << 16) - 1));
				const short b = (short)((m_Brightness * 255 / 100) + 16);
				const __m128i bc = _mm_set_epi16(b, c, b, c, b, c, b, c);
				const __m128i zero = _mm_setzero_si128();
				const __m128i w16 = _mm_set1_epi16(16);
				const __m128i w512 = _mm_set1_epi16(512);

				for (; x + 16 <= Width; x += 16) {
					__m128i r = _mm_load_si128((const __m128i*)p);

					__m128i rl = _mm_unpacklo_epi8(r, zero);
					__m128i rh = _mm_unpackhi_epi8(r, zero);

					rl = _mm_subs_epi16(rl, w16);
					rh = _mm_subs_epi16(rh, w16);

					__m128i rll = _mm_unpacklo_epi16(rl, w512);
					__m128i rlh = _mm_unpackhi_epi16(rl, w512);
					__m128i rhl = _mm_unpacklo_epi16(rh, w512);
					__m128i rhh = _mm_unpackhi_epi16(rh, w512);

					rll = _mm_madd_epi16(rll, bc);
					rlh = _mm_madd_epi16(rlh, bc);
					rhl = _mm_madd_epi16(rhl, bc);
					rhh = _mm_madd_epi16(rhh, bc);

					rll = _mm_srai_epi32(rll, 9);
					rlh = _mm_srai_epi32(rlh, 9);
					rhl = _mm_srai_epi32(rhl, 9);
					rhh = _mm_srai_epi32(rhh, 9);

					rl = _mm_packs_epi32(rll, rlh);
					rh = _mm_packs_epi32(rhl, rhh);

					r = _mm_packus_epi16(rl, rh);

					_mm_store_si128((__m128i*)p, r);

					p += 16;
				}
			}
#endif

			for (; x < Width; x++) {
				*p = m_YTable[*p];
				p++;
			}

			pData += Pitch;
		}
	}
}
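A scalar sketch of what the SSE2 loop above computes per luma sample (and thus, presumably, of what m_YTable holds); illustrative only, and it assumes an arithmetic right shift for negative intermediates:

static uint8_t AdjustLuma(uint8_t p, int Brightness, int Contrast)
{
	int c = (Contrast * 512 / 100) + 512;
	if (c > (1 << 16) - 1) c = (1 << 16) - 1;
	const int b = (Brightness * 255 / 100) + 16;
	int v = (((p - 16) * c) >> 9) + b;   /* matches madd with (value-16, 512) x (c, b), >> 9 */
	if (v < 0) v = 0;
	if (v > 255) v = 255;
	return (uint8_t)v;
}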
void fb_sqrm_low(dig_t *c, const dig_t *a) {
	__m128i t0, m0, m1, m2, m3, m4, m5, m6, mask;
	align dig_t t[2*FB_DIGS];

	t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
	mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

	m0 = _mm_load_si128((__m128i *)(a));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m3 = _mm_unpacklo_epi8(m1, m2);
	m4 = _mm_unpackhi_epi8(m1, m2);

	m0 = _mm_load_si128((__m128i *)(a+2));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m5 = _mm_unpacklo_epi8(m1, m2);
	m6 = _mm_unpackhi_epi8(m1, m2);

	m0 = m3;
	m1 = m4;
	m2 = m5;
	m3 = m6;

	_mm_store_si128((__m128i *) t + 0, m0);
	_mm_store_si128((__m128i *) t + 1, m1);
	_mm_store_si128((__m128i *) t + 2, m2);
	_mm_store_si128((__m128i *) t + 3, m3);

	const int ra = 52;
	const int rb = 55;
	const int rc = 57;
	const int rh = 59;
	const int lh = 5;
	const int la = 12;
	const int lb = 9;
	const int lc = 7;

	dig_t d = t[7], a0 = t[0], a1 = t[1], a2 = t[2], a3 = t[3], a4 = t[4];

	a4 ^= (d >> rh);
	a4 ^= (d >> ra);
	a4 ^= (d >> rb);
	a4 ^= (d >> rc);

	a3 ^= (d << lh);
	a3 ^= (d << la);
	a3 ^= (d << lb);
	a3 ^= (d << lc);

	d = t[6];
	a3 ^= (d >> rh);
	a3 ^= (d >> ra);
	a3 ^= (d >> rb);
	a3 ^= (d >> rc);

	a2 ^= (d << lh);
	a2 ^= (d << la);
	a2 ^= (d << lb);
	a2 ^= (d << lc);

	d = t[5];
	a2 ^= (d >> rh);
	a2 ^= (d >> ra);
	a2 ^= (d >> rb);
	a2 ^= (d >> rc);

	a1 ^= (d << lh);
	a1 ^= (d << la);
	a1 ^= (d << lb);
	a1 ^= (d << lc);

	d = a4;
	a1 ^= (d >> rh);
	a1 ^= (d >> ra);
	a1 ^= (d >> rb);
	a1 ^= (d >> rc);

	a0 ^= (d << lh);
	a0 ^= (d << la);
	a0 ^= (d << lb);
	a0 ^= (d << lc);

	d = a3 >> rh;
	a0 ^= d;
	d <<= rh;

	a0 ^= (d >> ra);
	a0 ^= (d >> rb);
	a0 ^= (d >> rc);
	a3 ^= d;

	c[3] = a3;
	c[2] = a2;
	c[1] = a1;
	c[0] = a0;

	return;
}
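The PSHUFB table t0 above squares a binary-field element nibble by nibble: squaring over GF(2) just interleaves zero bits between the operand's bits, and the shift/xor tail performs the modular reduction. A scalar sketch of the per-nibble expansion the table encodes (illustrative only):

/* Maps a 4-bit nibble n to the byte with n's bits spread to the even
   positions, i.e. the table entries 0x00, 0x01, 0x04, 0x05, 0x10, ... */
static uint8_t square_nibble(uint8_t n) {
	uint8_t r = 0;
	for (int i = 0; i < 4; i++)
		r |= (uint8_t)(((n >> i) & 1) << (2 * i));
	return r;
}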
void aom_convolve8_add_src_hip_sse2(const uint8_t *src, ptrdiff_t src_stride,
                                    uint8_t *dst, ptrdiff_t dst_stride,
                                    const int16_t *filter_x, int x_step_q4,
                                    const int16_t *filter_y, int y_step_q4,
                                    int w, int h) {
  const int bd = 8;
  assert(x_step_q4 == 16 && y_step_q4 == 16);
  assert(!(w & 7));
  (void)x_step_q4;
  (void)y_step_q4;

  uint16_t temp[(MAX_SB_SIZE + SUBPEL_TAPS - 1) * MAX_SB_SIZE];
  int intermediate_height = h + SUBPEL_TAPS - 1;
  int i, j;
  const int center_tap = ((SUBPEL_TAPS - 1) / 2);
  const uint8_t *const src_ptr = src - center_tap * src_stride - center_tap;

  const __m128i zero = _mm_setzero_si128();
  // Add an offset to account for the "add_src" part of the convolve function.
  const __m128i offset = _mm_insert_epi16(zero, 1 << FILTER_BITS, 3);

  /* Horizontal filter */
  {
    const __m128i coeffs_x =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_x), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_x, coeffs_x);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_x, coeffs_x);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (FILTER_BITS - EXTRAPREC_BITS - 1)) +
                       (1 << (bd + FILTER_BITS - 1)));

    for (i = 0; i < intermediate_height; ++i) {
      for (j = 0; j < w; j += 8) {
        const __m128i data =
            _mm_loadu_si128((__m128i *)&src_ptr[i * src_stride + j]);

        // Filter even-index pixels
        const __m128i src_0 = _mm_unpacklo_epi8(data, zero);
        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i src_2 = _mm_unpacklo_epi8(_mm_srli_si128(data, 2), zero);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i src_4 = _mm_unpacklo_epi8(_mm_srli_si128(data, 4), zero);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i src_6 = _mm_unpacklo_epi8(_mm_srli_si128(data, 6), zero);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_4),
                                         _mm_add_epi32(res_2, res_6));
        res_even = _mm_srai_epi32(_mm_add_epi32(res_even, round_const),
                                  FILTER_BITS - EXTRAPREC_BITS);

        // Filter odd-index pixels
        const __m128i src_1 = _mm_unpacklo_epi8(_mm_srli_si128(data, 1), zero);
        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i src_3 = _mm_unpacklo_epi8(_mm_srli_si128(data, 3), zero);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i src_5 = _mm_unpacklo_epi8(_mm_srli_si128(data, 5), zero);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i src_7 = _mm_unpacklo_epi8(_mm_srli_si128(data, 7), zero);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_5),
                                        _mm_add_epi32(res_3, res_7));
        res_odd = _mm_srai_epi32(_mm_add_epi32(res_odd, round_const),
                                 FILTER_BITS - EXTRAPREC_BITS);

        // Pack in the column order 0, 2, 4, 6, 1, 3, 5, 7
        __m128i res = _mm_packs_epi32(res_even, res_odd);
        res = _mm_min_epi16(_mm_max_epi16(res, zero),
                            _mm_set1_epi16(EXTRAPREC_CLAMP_LIMIT(bd) - 1));
        _mm_storeu_si128((__m128i *)&temp[i * MAX_SB_SIZE + j], res);
      }
    }
  }

  /* Vertical filter */
  {
    const __m128i coeffs_y =
        _mm_add_epi16(_mm_loadu_si128((__m128i *)filter_y), offset);

    // coeffs 0 1 0 1 2 3 2 3
    const __m128i tmp_0 = _mm_unpacklo_epi32(coeffs_y, coeffs_y);
    // coeffs 4 5 4 5 6 7 6 7
    const __m128i tmp_1 = _mm_unpackhi_epi32(coeffs_y, coeffs_y);

    // coeffs 0 1 0 1 0 1 0 1
    const __m128i coeff_01 = _mm_unpacklo_epi64(tmp_0, tmp_0);
    // coeffs 2 3 2 3 2 3 2 3
    const __m128i coeff_23 = _mm_unpackhi_epi64(tmp_0, tmp_0);
    // coeffs 4 5 4 5 4 5 4 5
    const __m128i coeff_45 = _mm_unpacklo_epi64(tmp_1, tmp_1);
    // coeffs 6 7 6 7 6 7 6 7
    const __m128i coeff_67 = _mm_unpackhi_epi64(tmp_1, tmp_1);

    const __m128i round_const =
        _mm_set1_epi32((1 << (FILTER_BITS + EXTRAPREC_BITS - 1)) -
                       (1 << (bd + FILTER_BITS + EXTRAPREC_BITS - 1)));

    for (i = 0; i < h; ++i) {
      for (j = 0; j < w; j += 8) {
        // Filter even-index pixels
        const uint16_t *data = &temp[i * MAX_SB_SIZE + j];
        const __m128i src_0 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_2 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_4 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_6 =
            _mm_unpacklo_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_0 = _mm_madd_epi16(src_0, coeff_01);
        const __m128i res_2 = _mm_madd_epi16(src_2, coeff_23);
        const __m128i res_4 = _mm_madd_epi16(src_4, coeff_45);
        const __m128i res_6 = _mm_madd_epi16(src_6, coeff_67);

        const __m128i res_even = _mm_add_epi32(_mm_add_epi32(res_0, res_2),
                                               _mm_add_epi32(res_4, res_6));

        // Filter odd-index pixels
        const __m128i src_1 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 0 * MAX_SB_SIZE),
                               *(__m128i *)(data + 1 * MAX_SB_SIZE));
        const __m128i src_3 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 2 * MAX_SB_SIZE),
                               *(__m128i *)(data + 3 * MAX_SB_SIZE));
        const __m128i src_5 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 4 * MAX_SB_SIZE),
                               *(__m128i *)(data + 5 * MAX_SB_SIZE));
        const __m128i src_7 =
            _mm_unpackhi_epi16(*(__m128i *)(data + 6 * MAX_SB_SIZE),
                               *(__m128i *)(data + 7 * MAX_SB_SIZE));

        const __m128i res_1 = _mm_madd_epi16(src_1, coeff_01);
        const __m128i res_3 = _mm_madd_epi16(src_3, coeff_23);
        const __m128i res_5 = _mm_madd_epi16(src_5, coeff_45);
        const __m128i res_7 = _mm_madd_epi16(src_7, coeff_67);

        const __m128i res_odd = _mm_add_epi32(_mm_add_epi32(res_1, res_3),
                                              _mm_add_epi32(res_5, res_7));

        // Rearrange pixels back into the order 0 ... 7
        const __m128i res_lo = _mm_unpacklo_epi32(res_even, res_odd);
        const __m128i res_hi = _mm_unpackhi_epi32(res_even, res_odd);

        const __m128i res_lo_round = _mm_srai_epi32(
            _mm_add_epi32(res_lo, round_const), FILTER_BITS + EXTRAPREC_BITS);
        const __m128i res_hi_round = _mm_srai_epi32(
            _mm_add_epi32(res_hi, round_const), FILTER_BITS + EXTRAPREC_BITS);

        const __m128i res_16bit = _mm_packs_epi32(res_lo_round, res_hi_round);
        __m128i res_8bit = _mm_packus_epi16(res_16bit, res_16bit);

        __m128i *const p = (__m128i *)&dst[i * dst_stride + j];
        _mm_storel_epi64(p, res_8bit);
      }
    }
  }
}
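The offset built with _mm_insert_epi16 above bumps the center tap (index 3) by 1 << FILTER_BITS, so the normalized convolution also adds the source pixel itself, which is the "add_src" behaviour. A scalar sketch, assuming FILTER_BITS == 7 and 8 taps (illustrative only):

static int convolve_add_src_tap_ref(const uint8_t *src, const int16_t *filter) {
  int sum = 0;
  for (int k = 0; k < 8; ++k)
    sum += src[k] * (filter[k] + (k == 3 ? (1 << 7) : 0));
  return sum;  // subsequently rounded and shifted as in the two passes above
}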
        // Unroll 4x, interleave bytes, use pmaddubsw (all_x is small)
        while (count > 3) {
            count -= 4;

            int x0[4];
            int x1[4];
            __m128i all_x, sixteen_minus_x;
            PrepareConstantsTwoPixelPairs(xy, mask_3FFF, mask_000F,
                                          sixteen_8bit, mask_dist_select,
                                          &all_x, &sixteen_minus_x, x0, x1);
            xy += 4;

            // First pair of pixel pairs
            // (4x(x1, 16-x1), 4x(x0, 16-x0))
            __m128i scale_x;
            scale_x = _mm_unpacklo_epi8(sixteen_minus_x, all_x);

            __m128i sum0 = ProcessTwoPixelPairs<has_alpha>(
                row0, row1, x0, x1,
                scale_x, all_y, neg_y, alpha);

            // second pair of pixel pairs
            // (4x (x3, 16-x3), 4x (16-x2, x2))
            scale_x = _mm_unpackhi_epi8(sixteen_minus_x, all_x);

            __m128i sum1 = ProcessTwoPixelPairs<has_alpha>(
                row0, row1, x0 + 2, x1 + 2,
                scale_x, all_y, neg_y, alpha);

            // Do the final packing of the two results
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr,
                                          unsigned int src_pitch,
                                          unsigned char *output_ptr,
                                          unsigned int out_pitch,
                                          unsigned int output_height,
                                          int16_t *filter) {
  __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt2, srcRegFilt3;
  __m128i firstFilters, secondFilters, thirdFilters, forthFilters;
  __m128i srcRegFilt4, srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // converting the 16 bit (short) to  8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg =_mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u));
  // duplicate only the second 16 bits in the filter
  secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u));
  // duplicate only the third 16 bits in the filter
  thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u));
  // duplicate only the forth 16 bits in the filter
  forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u));

  for (i = 0; i < output_height; i++) {
    // load the first 16 bytes
    srcRegFilt1 = _mm_loadu_si128((__m128i *)(src_ptr));
    // load the next 16 bytes in stride of src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*6));
    srcRegFilt4 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*7));

    // merge the result together
    srcRegFilt5 = _mm_unpacklo_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt6 = _mm_unpacklo_epi8(srcRegFilt3, srcRegFilt4);
    srcRegFilt1 = _mm_unpackhi_epi8(srcRegFilt1, srcRegFilt2);
    srcRegFilt3 = _mm_unpackhi_epi8(srcRegFilt3, srcRegFilt4);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters);
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);

    // load the next 16 bytes in stride of two/three src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*2));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*3));

    // merge the result together
    srcRegFilt4 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt6 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, secondFilters);
    srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters);

    // load the next 16 bytes in stride of four/five src_pitch
    srcRegFilt2 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*4));
    srcRegFilt3 = _mm_loadu_si128((__m128i *)(src_ptr+src_pitch*5));

    // merge the result together
    srcRegFilt7 = _mm_unpacklo_epi8(srcRegFilt2, srcRegFilt3);
    srcRegFilt8 = _mm_unpackhi_epi8(srcRegFilt2, srcRegFilt3);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters);
    srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters);

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_min_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_min_epi16(srcRegFilt6, srcRegFilt8));

    // add and saturate the results together
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5,
                                 _mm_max_epi16(srcRegFilt4, srcRegFilt7));
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1,
                                 _mm_max_epi16(srcRegFilt6, srcRegFilt8));
    srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift by 7 bit each 16 bit
    srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7);
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve
    // result
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1);

    src_ptr+=src_pitch;

    // save 16 bytes convolve result
    _mm_store_si128((__m128i*)output_ptr, srcRegFilt1);

    output_ptr+=out_pitch;
  }
}
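A scalar sketch of the vertical 8-tap filter above for a single output pixel, ignoring the min/max ordering of the saturating adds (illustrative only):

static unsigned char filter8_vert_ref(const unsigned char *src_ptr,
                                      unsigned int src_pitch,
                                      const int16_t *filter) {
  int sum = 64;  // rounding term, same role as addFilterReg64
  for (int k = 0; k < 8; ++k)
    sum += src_ptr[k * src_pitch] * filter[k];
  sum >>= 7;
  if (sum < 0) sum = 0;
  if (sum > 255) sum = 255;
  return (unsigned char)sum;
}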
void vp9_iht4x4_16_add_sse2(const int16_t *input, uint8_t *dest, int stride,
                            int tx_type) {
  __m128i in[2];
  const __m128i zero = _mm_setzero_si128();
  const __m128i eight = _mm_set1_epi16(8);

  in[0] = _mm_loadu_si128((const __m128i *)(input));
  in[1] = _mm_loadu_si128((const __m128i *)(input + 8));

  switch (tx_type) {
    case 0:  // DCT_DCT
      idct4_sse2(in);
      idct4_sse2(in);
      break;
    case 1:  // ADST_DCT
      idct4_sse2(in);
      iadst4_sse2(in);
      break;
    case 2:  // DCT_ADST
      iadst4_sse2(in);
      idct4_sse2(in);
      break;
    case 3:  // ADST_ADST
      iadst4_sse2(in);
      iadst4_sse2(in);
      break;
    default:
      assert(0);
      break;
  }

  // Final round and shift
  in[0] = _mm_add_epi16(in[0], eight);
  in[1] = _mm_add_epi16(in[1], eight);

  in[0] = _mm_srai_epi16(in[0], 4);
  in[1] = _mm_srai_epi16(in[1], 4);

  // Reconstruction and Store
  {
    __m128i d0 = _mm_cvtsi32_si128(*(const int *)(dest));
    __m128i d2 = _mm_cvtsi32_si128(*(const int *)(dest + stride * 2));
    d0 = _mm_unpacklo_epi32(d0,
                            _mm_cvtsi32_si128(*(const int *)(dest + stride)));
    d2 = _mm_unpacklo_epi32(
        d2, _mm_cvtsi32_si128(*(const int *)(dest + stride * 3)));
    d0 = _mm_unpacklo_epi8(d0, zero);
    d2 = _mm_unpacklo_epi8(d2, zero);
    d0 = _mm_add_epi16(d0, in[0]);
    d2 = _mm_add_epi16(d2, in[1]);
    d0 = _mm_packus_epi16(d0, d2);
    // store result[0]
    *(int *)dest = _mm_cvtsi128_si32(d0);
    // store result[1]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride) = _mm_cvtsi128_si32(d0);
    // store result[2]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 2) = _mm_cvtsi128_si32(d0);
    // store result[3]
    d0 = _mm_srli_si128(d0, 4);
    *(int *)(dest + stride * 3) = _mm_cvtsi128_si32(d0);
  }
}
Example #14
// input: 8 bytes ABCDEFGH -> output: A0B0C0D0E0F0G0H0
static void LoadEightPixels_SSE2(const uint8_t* const src, __m128i* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A = _mm_loadl_epi64((const __m128i*)(src));  // ABCDEFGH
  *out = _mm_unpacklo_epi8(A, zero);
}
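A hypothetical usage sketch (not from the original source): widen eight pixels to 16 bits, do arithmetic with headroom, then narrow back with unsigned saturation.

static void AddTenToEightPixels(const uint8_t* src, uint8_t* dst) {
  __m128i wide;
  LoadEightPixels_SSE2(src, &wide);                 // A0B0C0D0E0F0G0H0
  wide = _mm_add_epi16(wide, _mm_set1_epi16(10));   // no overflow in 16 bits
  _mm_storel_epi64((__m128i*)dst, _mm_packus_epi16(wide, wide));
}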
Example #15
void SGMStereo::calcRowCosts(unsigned char*& leftSobelRow, int*& leftCensusRow,
							 unsigned char*& rightSobelRow, int*& rightCensusRow,
							 unsigned short* costImageRow)
{
	const int widthStepCost = width_*disparityTotal_;
	const __m128i registerZero = _mm_setzero_si128();

	for (int y = 1; y < height_; ++y) {
		int addRowIndex = y + aggregationWindowRadius_;
		int addRowAggregatedCostIndex = std::min(addRowIndex, height_ - 1)%(aggregationWindowRadius_*2 + 2);
		unsigned short* addRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*addRowAggregatedCostIndex;

		if (addRowIndex < height_) {
			calcPixelwiseSAD(leftSobelRow, rightSobelRow);
			addPixelwiseHamming(leftCensusRow, rightCensusRow);

			memset(addRowAggregatedCost, 0, disparityTotal_*sizeof(unsigned short));
			// x = 0
			for (int x = 0; x <= aggregationWindowRadius_; ++x) {
				int scale = x == 0 ? aggregationWindowRadius_ + 1 : 1;
				for (int d = 0; d < disparityTotal_; ++d) {
					addRowAggregatedCost[d] += static_cast<unsigned short>(pixelwiseCostRow_[disparityTotal_*x + d]*scale);
				}
			}
			// x = 1...width-1
			int subRowAggregatedCostIndex = std::max(y - aggregationWindowRadius_ - 1, 0)%(aggregationWindowRadius_*2 + 2);
			const unsigned short* subRowAggregatedCost = rowAggregatedCost_ + width_*disparityTotal_*subRowAggregatedCostIndex;
			const unsigned short* previousCostRow = costImageRow - widthStepCost;
			for (int x = 1; x < width_; ++x) {
				const unsigned char* addPixelwiseCost = pixelwiseCostRow_
					+ std::min((x + aggregationWindowRadius_)*disparityTotal_, (width_ - 1)*disparityTotal_);
				const unsigned char* subPixelwiseCost = pixelwiseCostRow_
					+ std::max((x - aggregationWindowRadius_ - 1)*disparityTotal_, 0);

				for (int d = 0; d < disparityTotal_; d += 16) {
					__m128i registerAddPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(addPixelwiseCost + d));
					__m128i registerAddPixelwiseHigh = _mm_unpackhi_epi8(registerAddPixelwiseLow, registerZero);
					registerAddPixelwiseLow = _mm_unpacklo_epi8(registerAddPixelwiseLow, registerZero);
					__m128i registerSubPixelwiseLow = _mm_load_si128(reinterpret_cast<const __m128i*>(subPixelwiseCost + d));
					__m128i registerSubPixelwiseHigh = _mm_unpackhi_epi8(registerSubPixelwiseLow, registerZero);
					registerSubPixelwiseLow = _mm_unpacklo_epi8(registerSubPixelwiseLow, registerZero);

					// Low
					__m128i registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost
						+ disparityTotal_*(x - 1) + d));
					registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseLow),
														   registerAddPixelwiseLow);
					__m128i registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d));
					registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost,
						_mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + disparityTotal_*x + d))),
						registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d), registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d), registerCost);

					// High
					registerAddAggregated = _mm_load_si128(reinterpret_cast<const __m128i*>(addRowAggregatedCost + disparityTotal_*(x-1) + d + 8));
					registerAddAggregated = _mm_adds_epi16(_mm_subs_epi16(registerAddAggregated, registerSubPixelwiseHigh),
														   registerAddPixelwiseHigh);
					registerCost = _mm_load_si128(reinterpret_cast<const __m128i*>(previousCostRow + disparityTotal_*x + d + 8));
					registerCost = _mm_adds_epi16(_mm_subs_epi16(registerCost,
						_mm_load_si128(reinterpret_cast<const __m128i*>(subRowAggregatedCost + disparityTotal_*x + d + 8))),
						registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(addRowAggregatedCost + disparityTotal_*x + d + 8), registerAddAggregated);
					_mm_store_si128(reinterpret_cast<__m128i*>(costImageRow + disparityTotal_*x + d + 8), registerCost);
				}
			}
		}

		leftSobelRow += widthStep_;
		rightSobelRow += widthStep_;
		leftCensusRow += width_;
		rightCensusRow += width_;
		costImageRow += widthStepCost;
	}
}
Example #16
void mpeg2_idct_add_sse2(int, int16_t* block, uint8_t* dest, const int stride)
{
 __m128i &src0=*(__m128i*)(block+0*16/2);
 __m128i &src1=*(__m128i*)(block+1*16/2);
 __m128i &src2=*(__m128i*)(block+2*16/2);
 __m128i &src3=*(__m128i*)(block+3*16/2);
 __m128i &src4=*(__m128i*)(block+4*16/2);
 __m128i &src5=*(__m128i*)(block+5*16/2);
 __m128i &src6=*(__m128i*)(block+6*16/2);
 __m128i &src7=*(__m128i*)(block+7*16/2);
 idct_M128ASM(src0, src1, src2, src3, src4, src5, src6, src7);

    __m128i zero = _mm_setzero_si128();

    __m128i r0 = _mm_load_si128(&src0);
    __m128i r1 = _mm_load_si128(&src1);
    __m128i r2 = _mm_load_si128(&src2);
    __m128i r3 = _mm_load_si128(&src3);
    __m128i r4 = _mm_load_si128(&src4);
    __m128i r5 = _mm_load_si128(&src5);
    __m128i r6 = _mm_load_si128(&src6);
    __m128i r7 = _mm_load_si128(&src7);

    __m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]);
    __m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]);
    __m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]);
    __m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]);
    __m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]);
    __m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]);
    __m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]);
    __m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]);

    r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero));
    r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero));
    r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero));
    r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero));
    r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero));
    r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero));
    r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero));
    r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero));

    r0 = _mm_packus_epi16(r0, r1);
    r1 = _mm_packus_epi16(r2, r3);
    r2 = _mm_packus_epi16(r4, r5);
    r3 = _mm_packus_epi16(r6, r7);

    _mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0);
    _mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0);
    _mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1);
    _mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1);
    _mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2);
    _mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2);
    _mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3);
    _mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3);

    _mm_store_si128(&src0, zero);
    _mm_store_si128(&src1, zero);
    _mm_store_si128(&src2, zero);
    _mm_store_si128(&src3, zero);
    _mm_store_si128(&src4, zero);
    _mm_store_si128(&src5, zero);
    _mm_store_si128(&src6, zero);
    _mm_store_si128(&src7, zero);
}
__m64 interpolvline128_3(__m128i* temp){

	__m128i xmm6;
	__m64 ret;
	__m128i xmm7 = _mm_setzero_si128();

	__m128i xmm0 = _mm_load_si128(temp++);
	__m128i xmm1 = _mm_load_si128(temp++);
	__m128i xmm2 = _mm_load_si128(temp++);
	__m128i xmm3 = _mm_load_si128(temp++);
	__m128i xmm4 = _mm_load_si128(temp++);
	__m128i xmm5 = _mm_load_si128(temp);

	xmm1 = _mm_add_epi16(xmm1,xmm4);
	xmm0 = _mm_add_epi16(xmm0,xmm5);

	xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB);

	xmm4 = _mm_mullo_epi16(xmm1, xmm6);
	xmm5 = _mm_mulhi_epi16(xmm1, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014);
	xmm5 = _mm_add_epi16(xmm2,xmm3);

	xmm4 = _mm_mullo_epi16(xmm5, xmm7);
	xmm5 = _mm_mulhi_epi16(xmm5, xmm7);

	xmm7 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm4 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 

	xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001);
	xmm6 = _mm_mulhi_epi16(xmm0, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm0, xmm6);
	xmm6 = _mm_unpackhi_epi16(xmm0, xmm6);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 
	
	xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200);
	
	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm1);
	
	xmm5 = _mm_setzero_si128();

	xmm7 = _mm_srli_epi32(xmm7, 10);
	xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values
	xmm7 = _mm_slli_epi32(xmm7,16);
	xmm7 = _mm_srli_epi32(xmm7,16);

	xmm4 = _mm_srli_epi32(xmm4, 10);

	xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values
	xmm4 = _mm_slli_epi32(xmm4,16);
	xmm4 = _mm_srli_epi32(xmm4,16);

	xmm6 = _mm_packs_epi32(xmm7, xmm4);
	
	xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010);
	xmm2 = _mm_add_epi16(xmm2,xmm1);
	xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values
	xmm2 = _mm_srli_epi16(xmm2,5);

	
	xmm3 = _mm_add_epi16(xmm3,xmm1);
	xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values
	xmm3 = _mm_srli_epi16(xmm3,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm5);
	xmm3 = _mm_packus_epi16(xmm3,xmm5);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);

	xmm7 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm7);

	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);
	ret = _mm_movepi64_pi64(xmm6);

	_mm_empty(); 

	return(ret);
}
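/*
 * Hedged scalar sketch of what interpolvline128_3() above appears to compute:
 * the vertical pass of a 6-tap (1, -5, 20, 20, -5, 1) half-pel filter applied
 * to 16-bit intermediate values, read off the immediates used above
 * (0xFFFB = -5, 0x0014 = 20, +0x200 rounding, >> 10).  The helper names are
 * ours, not part of the original source; the final quarter-pel averaging
 * (_mm_avg_epu8) step is omitted here.
 */
static int clip_0_255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static unsigned char sixtap_vertical(const short row[6])
{
    int acc = (row[0] + row[5])
            - 5 * (row[1] + row[4])
            + 20 * (row[2] + row[3]);
    return (unsigned char)clip_0_255((acc + 512) >> 10);
}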
Exemple #18
0
void mpeg2_idct_add_sse2(const int last, int16_t* block, uint8_t* dest, const int stride)
{
	idct_M128ASM(block);

	/*
	for(int i = 0; i < 8; i++)
	{
		dest[0] = CLIP(block[0] + dest[0]);
		dest[1] = CLIP(block[1] + dest[1]);
		dest[2] = CLIP(block[2] + dest[2]);
		dest[3] = CLIP(block[3] + dest[3]);
		dest[4] = CLIP(block[4] + dest[4]);
		dest[5] = CLIP(block[5] + dest[5]);
		dest[6] = CLIP(block[6] + dest[6]);
		dest[7] = CLIP(block[7] + dest[7]);

		memset(block, 0, sizeof(short)*8);

		dest += stride;
		block += 8;
	}
	*/

	__m128i* src = (__m128i*)block;
	__m128i zero = _mm_setzero_si128();

	__m128i r0 = _mm_load_si128(&src[0]);
	__m128i r1 = _mm_load_si128(&src[1]);
	__m128i r2 = _mm_load_si128(&src[2]);
	__m128i r3 = _mm_load_si128(&src[3]);
	__m128i r4 = _mm_load_si128(&src[4]);
	__m128i r5 = _mm_load_si128(&src[5]);
	__m128i r6 = _mm_load_si128(&src[6]);
	__m128i r7 = _mm_load_si128(&src[7]);

	__m128 q0 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[0*stride]);
	__m128 q1 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[1*stride]);
	__m128 q2 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[2*stride]);
	__m128 q3 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[3*stride]);
	__m128 q4 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[4*stride]);
	__m128 q5 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[5*stride]);
	__m128 q6 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[6*stride]);
	__m128 q7 = _mm_loadl_pi(*(__m128*)&zero, (__m64*)&dest[7*stride]);

	r0 = _mm_adds_epi16(r0, _mm_unpacklo_epi8(*(__m128i*)&q0, zero));
	r1 = _mm_adds_epi16(r1, _mm_unpacklo_epi8(*(__m128i*)&q1, zero));
	r2 = _mm_adds_epi16(r2, _mm_unpacklo_epi8(*(__m128i*)&q2, zero));
	r3 = _mm_adds_epi16(r3, _mm_unpacklo_epi8(*(__m128i*)&q3, zero));
	r4 = _mm_adds_epi16(r4, _mm_unpacklo_epi8(*(__m128i*)&q4, zero));
	r5 = _mm_adds_epi16(r5, _mm_unpacklo_epi8(*(__m128i*)&q5, zero));
	r6 = _mm_adds_epi16(r6, _mm_unpacklo_epi8(*(__m128i*)&q6, zero));
	r7 = _mm_adds_epi16(r7, _mm_unpacklo_epi8(*(__m128i*)&q7, zero));

	r0 = _mm_packus_epi16(r0, r1);
	r1 = _mm_packus_epi16(r2, r3);
	r2 = _mm_packus_epi16(r4, r5);
	r3 = _mm_packus_epi16(r6, r7);

	_mm_storel_pi((__m64*)&dest[0*stride], *(__m128*)&r0);
	_mm_storeh_pi((__m64*)&dest[1*stride], *(__m128*)&r0);
	_mm_storel_pi((__m64*)&dest[2*stride], *(__m128*)&r1);
	_mm_storeh_pi((__m64*)&dest[3*stride], *(__m128*)&r1);
	_mm_storel_pi((__m64*)&dest[4*stride], *(__m128*)&r2);
	_mm_storeh_pi((__m64*)&dest[5*stride], *(__m128*)&r2);
	_mm_storel_pi((__m64*)&dest[6*stride], *(__m128*)&r3);
	_mm_storeh_pi((__m64*)&dest[7*stride], *(__m128*)&r3);

	_mm_store_si128(&src[0], zero);
	_mm_store_si128(&src[1], zero);
	_mm_store_si128(&src[2], zero);
	_mm_store_si128(&src[3], zero);
	_mm_store_si128(&src[4], zero);
	_mm_store_si128(&src[5], zero);
	_mm_store_si128(&src[6], zero);
	_mm_store_si128(&src[7], zero);
}
Exemple #19
0
/* motion templates */
CV_IMPL void
cvUpdateMotionHistory( const void* silhouette, void* mhimg,
                       double timestamp, double mhi_duration )
{
    CvMat  silhstub, *silh = cvGetMat(silhouette, &silhstub);
    CvMat  mhistub, *mhi = cvGetMat(mhimg, &mhistub);

    if( !CV_IS_MASK_ARR( silh ))
        CV_Error( CV_StsBadMask, "" );

    if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 )
        CV_Error( CV_StsUnsupportedFormat, "" );

    if( !CV_ARE_SIZES_EQ( mhi, silh ))
        CV_Error( CV_StsUnmatchedSizes, "" );

    CvSize size = cvGetMatSize( mhi );

    if( CV_IS_MAT_CONT( mhi->type & silh->type ))
    {
        size.width *= size.height;
        size.height = 1;
    }

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - mhi_duration);
    int x, y;
#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif

    for( y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh->data.ptr + silh->step*y;
        float* mhiData = (float*)(mhi->data.ptr + mhi->step*y);
        x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));

                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);

                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
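/*
 * The SSE2 block above is a branchless form of the scalar tail loop:
 *     val = silh ? ts : (val < delbound ? 0 : val);
 * It first zeroes stale values with an AND against the cmpge mask, then uses
 * the xor-select idiom r = b ^ ((b ^ a) & mask), which yields a where the
 * mask is all ones and b where it is all zeros.  A hedged sketch on 32-bit
 * bit patterns (the helper name is ours):
 */
static unsigned xor_select(unsigned mask_all_ones, unsigned a, unsigned b)
{
    /* mask_all_ones must be 0x00000000 or 0xFFFFFFFF */
    return b ^ ((b ^ a) & mask_all_ones);
}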
Exemple #20
0
static void mb_lpf_horizontal_edge_w_avx2_8(unsigned char *s, int p,
                                            const unsigned char *_blimit,
                                            const unsigned char *_limit,
                                            const unsigned char *_thresh) {
  __m128i mask, hev, flat, flat2;
  const __m128i zero = _mm_set1_epi16(0);
  const __m128i one = _mm_set1_epi8(1);
  __m128i q7p7, q6p6, q5p5, q4p4, q3p3, q2p2, q1p1, q0p0, p0q0, p1q1;
  __m128i abs_p1p0;

  const __m128i thresh =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_thresh[0]));
  const __m128i limit = _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_limit[0]));
  const __m128i blimit =
      _mm_broadcastb_epi8(_mm_cvtsi32_si128((int)_blimit[0]));

  q4p4 = _mm_loadl_epi64((__m128i *)(s - 5 * p));
  q4p4 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q4p4), (__m64 *)(s + 4 * p)));
  q3p3 = _mm_loadl_epi64((__m128i *)(s - 4 * p));
  q3p3 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q3p3), (__m64 *)(s + 3 * p)));
  q2p2 = _mm_loadl_epi64((__m128i *)(s - 3 * p));
  q2p2 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q2p2), (__m64 *)(s + 2 * p)));
  q1p1 = _mm_loadl_epi64((__m128i *)(s - 2 * p));
  q1p1 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q1p1), (__m64 *)(s + 1 * p)));
  p1q1 = _mm_shuffle_epi32(q1p1, 78);
  q0p0 = _mm_loadl_epi64((__m128i *)(s - 1 * p));
  q0p0 = _mm_castps_si128(
      _mm_loadh_pi(_mm_castsi128_ps(q0p0), (__m64 *)(s - 0 * p)));
  p0q0 = _mm_shuffle_epi32(q0p0, 78);

  {
    __m128i abs_p1q1, abs_p0q0, abs_q1q0, fe, ff, work;
    abs_p1p0 =
        _mm_or_si128(_mm_subs_epu8(q1p1, q0p0), _mm_subs_epu8(q0p0, q1p1));
    abs_q1q0 = _mm_srli_si128(abs_p1p0, 8);
    fe = _mm_set1_epi8(0xfe);
    ff = _mm_cmpeq_epi8(abs_p1p0, abs_p1p0);
    abs_p0q0 =
        _mm_or_si128(_mm_subs_epu8(q0p0, p0q0), _mm_subs_epu8(p0q0, q0p0));
    abs_p1q1 =
        _mm_or_si128(_mm_subs_epu8(q1p1, p1q1), _mm_subs_epu8(p1q1, q1p1));
    flat = _mm_max_epu8(abs_p1p0, abs_q1q0);
    hev = _mm_subs_epu8(flat, thresh);
    hev = _mm_xor_si128(_mm_cmpeq_epi8(hev, zero), ff);

    abs_p0q0 = _mm_adds_epu8(abs_p0q0, abs_p0q0);
    abs_p1q1 = _mm_srli_epi16(_mm_and_si128(abs_p1q1, fe), 1);
    mask = _mm_subs_epu8(_mm_adds_epu8(abs_p0q0, abs_p1q1), blimit);
    mask = _mm_xor_si128(_mm_cmpeq_epi8(mask, zero), ff);
    // mask |= (abs(p0 - q0) * 2 + abs(p1 - q1) / 2  > blimit) * -1;
    mask = _mm_max_epu8(abs_p1p0, mask);
    // mask |= (abs(p1 - p0) > limit) * -1;
    // mask |= (abs(q1 - q0) > limit) * -1;

    work = _mm_max_epu8(
        _mm_or_si128(_mm_subs_epu8(q2p2, q1p1), _mm_subs_epu8(q1p1, q2p2)),
        _mm_or_si128(_mm_subs_epu8(q3p3, q2p2), _mm_subs_epu8(q2p2, q3p3)));
    mask = _mm_max_epu8(work, mask);
    mask = _mm_max_epu8(mask, _mm_srli_si128(mask, 8));
    mask = _mm_subs_epu8(mask, limit);
    mask = _mm_cmpeq_epi8(mask, zero);
  }

  // lp filter
  {
    const __m128i t4 = _mm_set1_epi8(4);
    const __m128i t3 = _mm_set1_epi8(3);
    const __m128i t80 = _mm_set1_epi8(0x80);
    const __m128i t1 = _mm_set1_epi16(0x1);
    __m128i qs1ps1 = _mm_xor_si128(q1p1, t80);
    __m128i qs0ps0 = _mm_xor_si128(q0p0, t80);
    __m128i qs0 = _mm_xor_si128(p0q0, t80);
    __m128i qs1 = _mm_xor_si128(p1q1, t80);
    __m128i filt;
    __m128i work_a;
    __m128i filter1, filter2;
    __m128i flat2_q6p6, flat2_q5p5, flat2_q4p4, flat2_q3p3, flat2_q2p2;
    __m128i flat2_q1p1, flat2_q0p0, flat_q2p2, flat_q1p1, flat_q0p0;

    filt = _mm_and_si128(_mm_subs_epi8(qs1ps1, qs1), hev);
    work_a = _mm_subs_epi8(qs0, qs0ps0);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    filt = _mm_adds_epi8(filt, work_a);
    /* (vpx_filter + 3 * (qs0 - ps0)) & mask */
    filt = _mm_and_si128(filt, mask);

    filter1 = _mm_adds_epi8(filt, t4);
    filter2 = _mm_adds_epi8(filt, t3);

    filter1 = _mm_unpacklo_epi8(zero, filter1);
    filter1 = _mm_srai_epi16(filter1, 0xB);
    filter2 = _mm_unpacklo_epi8(zero, filter2);
    filter2 = _mm_srai_epi16(filter2, 0xB);

    /* Filter1 >> 3 */
    filt = _mm_packs_epi16(filter2, _mm_subs_epi16(zero, filter1));
    qs0ps0 = _mm_xor_si128(_mm_adds_epi8(qs0ps0, filt), t80);

    /* filt >> 1 */
    filt = _mm_adds_epi16(filter1, t1);
    filt = _mm_srai_epi16(filt, 1);
    filt = _mm_andnot_si128(_mm_srai_epi16(_mm_unpacklo_epi8(zero, hev), 0x8),
                            filt);
    filt = _mm_packs_epi16(filt, _mm_subs_epi16(zero, filt));
    qs1ps1 = _mm_xor_si128(_mm_adds_epi8(qs1ps1, filt), t80);
    // loopfilter done

    {
      __m128i work;
      flat = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q2p2, q0p0), _mm_subs_epu8(q0p0, q2p2)),
          _mm_or_si128(_mm_subs_epu8(q3p3, q0p0), _mm_subs_epu8(q0p0, q3p3)));
      flat = _mm_max_epu8(abs_p1p0, flat);
      flat = _mm_max_epu8(flat, _mm_srli_si128(flat, 8));
      flat = _mm_subs_epu8(flat, one);
      flat = _mm_cmpeq_epi8(flat, zero);
      flat = _mm_and_si128(flat, mask);

      q5p5 = _mm_loadl_epi64((__m128i *)(s - 6 * p));
      q5p5 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q5p5), (__m64 *)(s + 5 * p)));

      q6p6 = _mm_loadl_epi64((__m128i *)(s - 7 * p));
      q6p6 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q6p6), (__m64 *)(s + 6 * p)));

      flat2 = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q4p4, q0p0), _mm_subs_epu8(q0p0, q4p4)),
          _mm_or_si128(_mm_subs_epu8(q5p5, q0p0), _mm_subs_epu8(q0p0, q5p5)));

      q7p7 = _mm_loadl_epi64((__m128i *)(s - 8 * p));
      q7p7 = _mm_castps_si128(
          _mm_loadh_pi(_mm_castsi128_ps(q7p7), (__m64 *)(s + 7 * p)));

      work = _mm_max_epu8(
          _mm_or_si128(_mm_subs_epu8(q6p6, q0p0), _mm_subs_epu8(q0p0, q6p6)),
          _mm_or_si128(_mm_subs_epu8(q7p7, q0p0), _mm_subs_epu8(q0p0, q7p7)));

      flat2 = _mm_max_epu8(work, flat2);
      flat2 = _mm_max_epu8(flat2, _mm_srli_si128(flat2, 8));
      flat2 = _mm_subs_epu8(flat2, one);
      flat2 = _mm_cmpeq_epi8(flat2, zero);
      flat2 = _mm_and_si128(flat2, flat);  // flat2 & flat & mask
    }

    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    // flat and wide flat calculations
    {
      const __m128i eight = _mm_set1_epi16(8);
      const __m128i four = _mm_set1_epi16(4);
      __m128i p7_16, p6_16, p5_16, p4_16, p3_16, p2_16, p1_16, p0_16;
      __m128i q7_16, q6_16, q5_16, q4_16, q3_16, q2_16, q1_16, q0_16;
      __m128i pixelFilter_p, pixelFilter_q;
      __m128i pixetFilter_p2p1p0, pixetFilter_q2q1q0;
      __m128i sum_p7, sum_q7, sum_p3, sum_q3, res_p, res_q;

      p7_16 = _mm_unpacklo_epi8(q7p7, zero);
      p6_16 = _mm_unpacklo_epi8(q6p6, zero);
      p5_16 = _mm_unpacklo_epi8(q5p5, zero);
      p4_16 = _mm_unpacklo_epi8(q4p4, zero);
      p3_16 = _mm_unpacklo_epi8(q3p3, zero);
      p2_16 = _mm_unpacklo_epi8(q2p2, zero);
      p1_16 = _mm_unpacklo_epi8(q1p1, zero);
      p0_16 = _mm_unpacklo_epi8(q0p0, zero);
      q0_16 = _mm_unpackhi_epi8(q0p0, zero);
      q1_16 = _mm_unpackhi_epi8(q1p1, zero);
      q2_16 = _mm_unpackhi_epi8(q2p2, zero);
      q3_16 = _mm_unpackhi_epi8(q3p3, zero);
      q4_16 = _mm_unpackhi_epi8(q4p4, zero);
      q5_16 = _mm_unpackhi_epi8(q5p5, zero);
      q6_16 = _mm_unpackhi_epi8(q6p6, zero);
      q7_16 = _mm_unpackhi_epi8(q7p7, zero);

      pixelFilter_p = _mm_add_epi16(_mm_add_epi16(p6_16, p5_16),
                                    _mm_add_epi16(p4_16, p3_16));
      pixelFilter_q = _mm_add_epi16(_mm_add_epi16(q6_16, q5_16),
                                    _mm_add_epi16(q4_16, q3_16));

      pixetFilter_p2p1p0 = _mm_add_epi16(p0_16, _mm_add_epi16(p2_16, p1_16));
      pixelFilter_p = _mm_add_epi16(pixelFilter_p, pixetFilter_p2p1p0);

      pixetFilter_q2q1q0 = _mm_add_epi16(q0_16, _mm_add_epi16(q2_16, q1_16));
      pixelFilter_q = _mm_add_epi16(pixelFilter_q, pixetFilter_q2q1q0);
      pixelFilter_p =
          _mm_add_epi16(eight, _mm_add_epi16(pixelFilter_p, pixelFilter_q));
      pixetFilter_p2p1p0 = _mm_add_epi16(
          four, _mm_add_epi16(pixetFilter_p2p1p0, pixetFilter_q2q1q0));
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(p7_16, p0_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(q7_16, q0_16)), 4);
      flat2_q0p0 = _mm_packus_epi16(res_p, res_q);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(p3_16, p0_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(q3_16, q0_16)), 3);

      flat_q0p0 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(p7_16, p7_16);
      sum_q7 = _mm_add_epi16(q7_16, q7_16);
      sum_p3 = _mm_add_epi16(p3_16, p3_16);
      sum_q3 = _mm_add_epi16(q3_16, q3_16);

      pixelFilter_q = _mm_sub_epi16(pixelFilter_p, p6_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q6_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p1_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q1_16)), 4);
      flat2_q1p1 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_p2p1p0, p2_16);
      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p1_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q1_16)), 3);
      flat_q1p1 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      sum_p3 = _mm_add_epi16(sum_p3, p3_16);
      sum_q3 = _mm_add_epi16(sum_q3, q3_16);

      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q5_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p5_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p2_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q2_16)), 4);
      flat2_q2p2 = _mm_packus_epi16(res_p, res_q);

      pixetFilter_p2p1p0 = _mm_sub_epi16(pixetFilter_p2p1p0, q1_16);
      pixetFilter_q2q1q0 = _mm_sub_epi16(pixetFilter_q2q1q0, p1_16);

      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_p2p1p0, _mm_add_epi16(sum_p3, p2_16)), 3);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixetFilter_q2q1q0, _mm_add_epi16(sum_q3, q2_16)), 3);
      flat_q2p2 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q4_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p4_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p3_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q3_16)), 4);
      flat2_q3p3 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q3_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p3_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p4_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q4_16)), 4);
      flat2_q4p4 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q2_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p2_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p5_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q5_16)), 4);
      flat2_q5p5 = _mm_packus_epi16(res_p, res_q);

      sum_p7 = _mm_add_epi16(sum_p7, p7_16);
      sum_q7 = _mm_add_epi16(sum_q7, q7_16);
      pixelFilter_p = _mm_sub_epi16(pixelFilter_p, q1_16);
      pixelFilter_q = _mm_sub_epi16(pixelFilter_q, p1_16);
      res_p = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_p, _mm_add_epi16(sum_p7, p6_16)), 4);
      res_q = _mm_srli_epi16(
          _mm_add_epi16(pixelFilter_q, _mm_add_epi16(sum_q7, q6_16)), 4);
      flat2_q6p6 = _mm_packus_epi16(res_p, res_q);
    }
    // wide flat
    // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

    flat = _mm_shuffle_epi32(flat, 68);
    flat2 = _mm_shuffle_epi32(flat2, 68);

    q2p2 = _mm_andnot_si128(flat, q2p2);
    flat_q2p2 = _mm_and_si128(flat, flat_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat_q2p2);

    qs1ps1 = _mm_andnot_si128(flat, qs1ps1);
    flat_q1p1 = _mm_and_si128(flat, flat_q1p1);
    q1p1 = _mm_or_si128(qs1ps1, flat_q1p1);

    qs0ps0 = _mm_andnot_si128(flat, qs0ps0);
    flat_q0p0 = _mm_and_si128(flat, flat_q0p0);
    q0p0 = _mm_or_si128(qs0ps0, flat_q0p0);

    q6p6 = _mm_andnot_si128(flat2, q6p6);
    flat2_q6p6 = _mm_and_si128(flat2, flat2_q6p6);
    q6p6 = _mm_or_si128(q6p6, flat2_q6p6);
    _mm_storel_epi64((__m128i *)(s - 7 * p), q6p6);
    _mm_storeh_pi((__m64 *)(s + 6 * p), _mm_castsi128_ps(q6p6));

    q5p5 = _mm_andnot_si128(flat2, q5p5);
    flat2_q5p5 = _mm_and_si128(flat2, flat2_q5p5);
    q5p5 = _mm_or_si128(q5p5, flat2_q5p5);
    _mm_storel_epi64((__m128i *)(s - 6 * p), q5p5);
    _mm_storeh_pi((__m64 *)(s + 5 * p), _mm_castsi128_ps(q5p5));

    q4p4 = _mm_andnot_si128(flat2, q4p4);
    flat2_q4p4 = _mm_and_si128(flat2, flat2_q4p4);
    q4p4 = _mm_or_si128(q4p4, flat2_q4p4);
    _mm_storel_epi64((__m128i *)(s - 5 * p), q4p4);
    _mm_storeh_pi((__m64 *)(s + 4 * p), _mm_castsi128_ps(q4p4));

    q3p3 = _mm_andnot_si128(flat2, q3p3);
    flat2_q3p3 = _mm_and_si128(flat2, flat2_q3p3);
    q3p3 = _mm_or_si128(q3p3, flat2_q3p3);
    _mm_storel_epi64((__m128i *)(s - 4 * p), q3p3);
    _mm_storeh_pi((__m64 *)(s + 3 * p), _mm_castsi128_ps(q3p3));

    q2p2 = _mm_andnot_si128(flat2, q2p2);
    flat2_q2p2 = _mm_and_si128(flat2, flat2_q2p2);
    q2p2 = _mm_or_si128(q2p2, flat2_q2p2);
    _mm_storel_epi64((__m128i *)(s - 3 * p), q2p2);
    _mm_storeh_pi((__m64 *)(s + 2 * p), _mm_castsi128_ps(q2p2));

    q1p1 = _mm_andnot_si128(flat2, q1p1);
    flat2_q1p1 = _mm_and_si128(flat2, flat2_q1p1);
    q1p1 = _mm_or_si128(q1p1, flat2_q1p1);
    _mm_storel_epi64((__m128i *)(s - 2 * p), q1p1);
    _mm_storeh_pi((__m64 *)(s + 1 * p), _mm_castsi128_ps(q1p1));

    q0p0 = _mm_andnot_si128(flat2, q0p0);
    flat2_q0p0 = _mm_and_si128(flat2, flat2_q0p0);
    q0p0 = _mm_or_si128(q0p0, flat2_q0p0);
    _mm_storel_epi64((__m128i *)(s - 1 * p), q0p0);
    _mm_storeh_pi((__m64 *)(s - 0 * p), _mm_castsi128_ps(q0p0));
  }
}
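/*
 * Hedged scalar sketch of the two averaging filters assembled in the
 * "flat and wide flat" block above, reconstructed from the add/sub/shift
 * sequence (not copied from the library's C reference).  For the p0 output:
 *
 *   narrow (flat):  (p3 + p2 + p1 + 2*p0 + q0 + q1 + q2 + 4) >> 3
 *   wide  (flat2):  (p7 + ... + p1 + 2*p0 + q0 + ... + q6 + 8) >> 4
 *
 * Later outputs reuse the running sums by subtracting one tap and adding
 * another, which is what the repeated _mm_sub_epi16/_mm_add_epi16 pairs do.
 */
static unsigned char wide_flat_p0(const int p[8], const int q[8])
{
    int sum = 8 + 2 * p[0];               /* rounding term; p0 counted twice */
    for (int i = 1; i <= 7; i++)
        sum += p[i] + q[i - 1];           /* p1..p7 and q0..q6               */
    return (unsigned char)(sum >> 4);
}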
Exemple #21
0
void LW_FUNC_ALIGN convert_lw48_to_yuy2_sse41( int thread_id, int thread_num, void *param1, void *param2 )
{
    /* LW48 -> YUY2 using SSE4.1 */
    COLOR_PROC_INFO *cpip = (COLOR_PROC_INFO *)param1;
    int start = (cpip->h *  thread_id     ) / thread_num;
    int end   = (cpip->h * (thread_id + 1)) / thread_num;
    int w     = cpip->w;
    BYTE *ycp_line   = (BYTE *)cpip->ycp    + start * cpip->line_size;
    BYTE *pixel_line = (BYTE *)cpip->pixelp + start * w * 2;
    __m128i x0, x1, x2, x3, x5, x6, x7;
    static const char LW_ALIGN(16) SHUFFLE_Y[16] = { 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11 };
    for( int y = start; y < end; y++ )
    {
        BYTE *ycp      = ycp_line;
        BYTE *yuy2_ptr = pixel_line;
        for( int x = 0, i_step = 0; x < w; x += i_step, ycp += i_step*6, yuy2_ptr += i_step*2 )
        {
            x5 = _mm_loadu_si128((__m128i *)(ycp +  0));
            x6 = _mm_loadu_si128((__m128i *)(ycp + 16));
            x7 = _mm_loadu_si128((__m128i *)(ycp + 32));

            x0 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02);
            x0 = _mm_blend_epi16(x0, x7, 0x20+0x04);

            x1 = _mm_blend_epi16(x5, x6, 0x40+0x20+0x01);
            x1 = _mm_blend_epi16(x1, x7, 0x10+0x08);

            x0 = _mm_shuffle_epi8(x0, _mm_load_si128((__m128i*)SHUFFLE_Y));
            x1 = _mm_alignr_epi8(x1, x1, 2);
            x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,2,3,0));

            x0 = _mm_srli_epi16(x0, 8);
            x1 = _mm_srli_epi16(x1, 8);

            x5 = _mm_loadu_si128((__m128i *)(ycp + 48));
            x6 = _mm_loadu_si128((__m128i *)(ycp + 64));
            x7 = _mm_loadu_si128((__m128i *)(ycp + 80));

            x2 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02);
            x2 = _mm_blend_epi16(x2, x7, 0x20+0x04);

            x3 = _mm_blend_epi16(x5, x6, 0x40+0x20+0x01);
            x3 = _mm_blend_epi16(x3, x7, 0x10+0x08);

            x2 = _mm_shuffle_epi8(x2, _mm_load_si128((__m128i*)SHUFFLE_Y));
            x3 = _mm_alignr_epi8(x3, x3, 2);
            x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(1,2,3,0));

            x2 = _mm_srli_epi16(x2, 8);
            x3 = _mm_srli_epi16(x3, 8);

            x0 = _mm_packus_epi16(x0, x2);
            x1 = _mm_packus_epi16(x1, x3);

            _mm_storeu_si128((__m128i*)(yuy2_ptr +  0), _mm_unpacklo_epi8(x0, x1));
            _mm_storeu_si128((__m128i*)(yuy2_ptr + 16), _mm_unpackhi_epi8(x0, x1));

            int remain = w - x;
            i_step = (remain >= 16);
            i_step = (i_step<<4) + (remain & ((~(0-i_step)) & 0x0f));
        }
        ycp_line   += cpip->line_size;
        pixel_line += w*2;
    }
}
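/*
 * Hedged reading of the branchless step computation at the bottom of the
 * pixel loop above: while at least 16 pixels remain it advances by 16,
 * otherwise it advances by the leftover count so the loop terminates after
 * the final chunk.  That last chunk is still processed at full 16-pixel
 * width, which presumably relies on the line buffers carrying enough padding.
 */
static int next_step(int remain)
{
    return (remain >= 16) ? 16 : remain;
}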
void png_read_filter_row_paeth3_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    * and two pixels from the previous row, b and c:
    *   prev: c b
    *   row:  a d
    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    * p=a+b-c.
    *
    * The first pixel has no left context, and so uses an Up filter, p = b.
    * This works naturally with our main loop's p = a+b-c if we force a and c
    * to zero.
    * Here we zero b and d, which become c and a respectively at the start of
    * the loop.
    */
   png_debug(1, "in png_read_filter_row_paeth3_sse2");
   const __m128i zero = _mm_setzero_si128();
   __m128i c, b = zero,
           a, d = zero;

   int rb = row_info->rowbytes;
   while (rb >= 4) {
      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
       * intermediates.
       */
      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      __m128i pa = _mm_sub_epi16(b,c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      __m128i pb = _mm_sub_epi16(a,c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      __m128i pc = _mm_add_epi16(pa,pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                                     c));

      /* Note `_epi8`: we need addition to wrap modulo 255. */
      d = _mm_add_epi8(d, nearest);
      store3(row, _mm_packus_epi16(d,d));

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
   if (rb > 0) {
      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
       * intermediates.
       */
      c = b; b = _mm_unpacklo_epi8(load3(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load3(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      __m128i pa = _mm_sub_epi16(b,c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      __m128i pb = _mm_sub_epi16(a,c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      __m128i pc = _mm_add_epi16(pa,pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      __m128i smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      __m128i nearest  = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                         if_then_else(_mm_cmpeq_epi16(smallest, pb), b,
                                                                     c));

      /* Note `_epi8`: we need addition to wrap modulo 255. */
      d = _mm_add_epi8(d, nearest);
      store3(row, _mm_packus_epi16(d,d));

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
}
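/*
 * Scalar reference for the Paeth predictor that the SIMD code above
 * vectorises (standard PNG definition; the helper name is ours):
 */
static unsigned char paeth_predict(int a, int b, int c)
{
    const int p  = a + b - c;                 /* initial estimate */
    const int pa = p > a ? p - a : a - p;     /* distance to a    */
    const int pb = p > b ? p - b : b - p;     /* distance to b    */
    const int pc = p > c ? p - c : c - p;     /* distance to c    */
    /* ties are broken in favour of a, then b, then c */
    if (pa <= pb && pa <= pc) return (unsigned char)a;
    if (pb <= pc)             return (unsigned char)b;
    return (unsigned char)c;
}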
Exemple #23
0
__m128i
test (__m128i s1, __m128i s2)
{
  return _mm_unpacklo_epi8 (s1, s2);
}
    void matrixCalc(void *inputs, void *outputs, long in_bitRate)
    {
        //Validate the input
        TML::Matrix i(inputs,0);
        if (i.dims() != 2)
            throw "Input should be a 2D matrix";

        if (!i.isChar())
            throw "Input should have character data";

        if (i.planes() != 4)
            throw "Input needs 4 planes";

        if (i.dim(0) % 64 != 0)
            throw "Width needs to be a multiple of 64";

        if (i.dim(1) % 2 != 0)
            throw "Height needs to be a multiple of 2";

        if (i.dim(0) != m_cfg.g_w || i.dim(1) != m_cfg.g_h || m_cfg.rc_target_bitrate != in_bitRate)
        {
            vpx_img_free(&m_raw);
            vpx_img_alloc(&m_raw, VPX_IMG_FMT_I420, i.dim(0), i.dim(1), 1);

            vpx_codec_destroy(&m_codec);

            vpx_codec_enc_config_default(vpx_codec_vp8_cx(), &m_cfg, 0);

            m_cfg.rc_target_bitrate = in_bitRate;
            m_cfg.g_w = i.dim(0);
            m_cfg.g_h = i.dim(1);

            vpx_codec_enc_init(&m_codec, vpx_codec_vp8_cx(), &m_cfg, 0);
        }


        //ARGB -> YYYY U V
        int x;
        const int N = 32;

        const int Uoffset = i.dim(0)*i.dim(1);
        const int Voffset = Uoffset + Uoffset/4;
        const int w = i.dim(0);
        const int h = i.dim(1);
        const int sy = i.stride(1);
        unsigned char *data = (unsigned char*)i.data();
        int y;

        unsigned char *buffer = m_raw.planes[0];

        //RRRR
        __v16qi rShuffle = {1, -1,-1,-1,
                            5, -1,-1,-1,
                            9, -1,-1,-1,
                            13, -1,-1,-1
                           };

        __v16qi gShuffle = {2, -1,-1,-1,
                            6, -1,-1,-1,
                            10,-1,-1,-1,
                            14,-1,-1,-1
                           };

        __v16qi bShuffle = {3,-1,-1,-1,
                            7,-1,-1,-1,
                            11,-1,-1,-1,
                            15,-1,-1,-1
                           };

        //Shuffle so elements are moved to front/back
        __v16qi _aShuffle = {
            0, 4, 8, 12,
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            -1, -1, -1, -1
        };

        __v16qi _bShuffle = {
            -1, -1, -1, -1,
            0, 4, 8, 12,
            -1, -1, -1, -1,
            -1, -1, -1, -1
        };

        __v16qi _cShuffle = {
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            0, 4, 8, 12,
            -1, -1, -1, -1
        };

        __v16qi _dShuffle = {
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            -1, -1, -1, -1,
            0, 4, 8, 12
        };

        __v8hi R2Y = {	27,	27,	27,	27,	27,	27,	27,	27};
        __v8hi G2Y = {	91,	91,	91,	91,	91,	91,	91,	91};
        __v8hi B2Y = {	9,	9,	9,	9,	9,	9,	9,	9};

        __v8hi R2U = {	-12,-12,-12,-12,-12,-12,-12,-12};
        __v8hi G2U = {	-43,-43,-43,-43,-43,-43,-43,-43};
        __v8hi B2U = {	55,	55,	55,	55,	55,	55,	55,	55};

        __v8hi R2V = {	78,	78,	78,	78,	78,	78,	78,	78};
        __v8hi G2V = {	-71,-71,-71,-71,-71,-71,-71,-71};
        __v8hi B2V = {	-7,	-7,	-7,	-7,	-7,	-7,	-7,	-7};

        __v8hi m127 = {127,127,127,127,127,127,127,127};

        __v8hi zero = {	0,	0,	0,	0,	0,	0,	0,	0};
        __v8hi two55 = {255,255,255,255,255,255,255,255};

        for (y=0; y<h; y+=2)
        {
            for (x=0; x<w; x+=N)
            {
                __v8hi tY[N/8];
                __v8hi bY[N/8];

                __v8hi tU[N/8];
                __v8hi bU[N/8];

                __v8hi tV[N/8];
                __v8hi bV[N/8];

                //Step 1: Convert to YUV
                int n;
                for (n=0; n<N; n+=8)	//Read 8x per lane
                {
                    __v16qi tARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4));
                    __v16qi tARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + 16));
                    __v16qi bARGBx4_l = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy));
                    __v16qi bARGBx4_r = _mm_load_si128((__m128i*)(data + y*w*4 + x*4 + n*4 + sy + 16));

                    // ARGB(1) ARGB(2) ARGB(3) ARGB(4) | ARGB(5) ARGB(6) ARGB(7) ARGB(8)
                    // => AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8)
                    __v16qi tARGBx2_15 = _mm_unpacklo_epi8(tARGBx4_l, tARGBx4_r);
                    __v16qi tARGBx2_26 = _mm_unpackhi_epi8(tARGBx4_l, tARGBx4_r);

                    __v16qi bARGBx2_15 = _mm_unpacklo_epi8(bARGBx4_l, bARGBx4_r);
                    __v16qi bARGBx2_26 = _mm_unpackhi_epi8(bARGBx4_l, bARGBx4_r);

                    // AARRGGBB(1,5) AARRGGBB(2,6) | AARRGGBB(3,7) AARRGGBB(4,8)
                    // => AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8)
                    __v16qi tARGB_1357 = _mm_unpacklo_epi8(tARGBx2_15, tARGBx2_26);
                    __v16qi tARGB_2468 = _mm_unpackhi_epi8(tARGBx2_15, tARGBx2_26);

                    __v16qi bARGB_1357 = _mm_unpacklo_epi8(bARGBx2_15, bARGBx2_26);
                    __v16qi bARGB_2468 = _mm_unpackhi_epi8(bARGBx2_15, bARGBx2_26);

                    //AAAARRRRGGGGBBBB(1,3,5,7) | AAAARRRRGGGGBBBB(2,4,6,8)
                    // => AAAAAAAARRRRRRRR | GGGGGGGGBBBBBBBB
                    __v16qi tAARR = _mm_unpacklo_epi8(tARGB_1357, tARGB_2468);
                    __v16qi tGGBB = _mm_unpackhi_epi8(tARGB_1357, tARGB_2468);

                    __v16qi bAARR = _mm_unpacklo_epi8(bARGB_1357, bARGB_2468);
                    __v16qi bGGBB = _mm_unpackhi_epi8(bARGB_1357, bARGB_2468);

                    //Unpack to 8 R's, 8 G's, and 8 B's.
                    __v8hi tRRRR = _mm_unpackhi_epi8(tAARR, zero);
                    __v8hi tGGGG = _mm_unpacklo_epi8(tGGBB, zero);
                    __v8hi tBBBB = _mm_unpackhi_epi8(tGGBB, zero);

                    __v8hi bRRRR = _mm_unpackhi_epi8(bAARR, zero);
                    __v8hi bGGGG = _mm_unpacklo_epi8(bGGBB, zero);
                    __v8hi bBBBB = _mm_unpackhi_epi8(bGGBB, zero);

                    //Convert to YUV (8x parallel)
                    __v8hi tYYYY = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2Y), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2Y), _mm_mullo_epi16(tBBBB, B2Y)));
                    __v8hi tUUUU = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2U), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2U), _mm_mullo_epi16(tBBBB, B2U)));
                    __v8hi tVVVV = _mm_add_epi16(_mm_mullo_epi16(tRRRR, R2V), _mm_add_epi16(_mm_mullo_epi16(tGGGG, G2V), _mm_mullo_epi16(tBBBB, B2V)));

                    __v8hi bYYYY = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2Y), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2Y), _mm_mullo_epi16(bBBBB, B2Y)));
                    __v8hi bUUUU = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2U), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2U), _mm_mullo_epi16(bBBBB, B2U)));
                    __v8hi bVVVV = _mm_add_epi16(_mm_mullo_epi16(bRRRR, R2V), _mm_add_epi16(_mm_mullo_epi16(bGGGG, G2V), _mm_mullo_epi16(bBBBB, B2V)));

                    tUUUU = _mm_add_epi16(_mm_srai_epi16(tUUUU, 7), m127);
                    tVVVV = _mm_add_epi16(_mm_srai_epi16(tVVVV, 7), m127);

                    bUUUU = _mm_add_epi16(_mm_srai_epi16(bUUUU, 7), m127);
                    bVVVV = _mm_add_epi16(_mm_srai_epi16(bVVVV, 7), m127);

                    //Remove the fractional portion and clamp in 0...255
                    tY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(tYYYY,zero), 7), two55);
                    tU[n/8] = _mm_min_epi16(_mm_max_epi16(tUUUU,zero), two55);
                    tV[n/8] = _mm_min_epi16(_mm_max_epi16(tVVVV,zero), two55);

                    bY[n/8] = _mm_min_epi16(_mm_srai_epi16(_mm_max_epi16(bYYYY,zero), 7), two55);
                    bU[n/8] = _mm_min_epi16(_mm_max_epi16(bUUUU,zero), two55);
                    bV[n/8] = _mm_min_epi16(_mm_max_epi16(bVVVV,zero), two55);
                }


                // Step 2 - Write out Luma (part 1)
                for (n=0; n<N; n+=16)
                {
                    __v8hi A = tY[n/8];
                    __v8hi B = tY[n/8+1];

                    __m128i Y = _mm_packus_epi16(A,B);

                    _mm_storeu_si128((__m128i*)(buffer+y*w+x+n), Y);
                }

                for (n=0; n<N; n+=16)
                {
                    __v8hi A = bY[n/8];
                    __v8hi B = bY[n/8+1];

                    __m128i Y = _mm_packus_epi16(A,B);

                    _mm_storeu_si128((__m128i*)(buffer+y*w+x+n+w), Y);
                }

                //Step 3 -- U and V data...
                for (n=0; n<N; n+=32)
                {
                    __m128i U16a = _mm_add_epi16(tU[n/8], bU[n/8]);
                    __m128i U16b = _mm_add_epi16(tU[n/8+1], bU[n/8+1]);
                    __m128i U16c = _mm_add_epi16(tU[n/8+2], bU[n/8+2]);
                    __m128i U16d = _mm_add_epi16(tU[n/8+3], bU[n/8+3]);

                    U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b),2);
                    U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d),2);

                    __m128i U = _mm_packus_epi16(U16a, U16c);

                    _mm_storeu_si128((__m128i*)(buffer+Uoffset+y/2*w/2 + x/2+n/2), U);
                }

                for (n=0; n<N; n+=32)
                {
                    __m128i U16a = _mm_add_epi16(tV[n/8], bV[n/8]);
                    __m128i U16b = _mm_add_epi16(tV[n/8+1], bV[n/8+1]);
                    __m128i U16c = _mm_add_epi16(tV[n/8+2], bV[n/8+2]);
                    __m128i U16d = _mm_add_epi16(tV[n/8+3], bV[n/8+3]);

                    U16a = _mm_srli_epi16(_mm_hadd_epi16(U16a, U16b),2);
                    U16c = _mm_srli_epi16(_mm_hadd_epi16(U16c, U16d),2);

                    __m128i U = _mm_packus_epi16(U16a, U16c);

                    _mm_storeu_si128((__m128i*)(buffer+Voffset+y/2*w/2 + x/2+n/2), U);
                }
            }
        }

        m_frameCnt++;
        vpx_codec_encode(&m_codec, &m_raw, m_frameCnt, 1, 0, VPX_DL_REALTIME);
        vpx_codec_iter_t iter = NULL;

        const vpx_codec_cx_pkt_t *pkt;

        while ((pkt = vpx_codec_get_cx_data(&m_codec, &iter)))
        {
            if (pkt->kind == VPX_CODEC_CX_FRAME_PKT)
            {

                //Generate output
                TML::Matrix o(outputs,0);

                _jit_matrix_info m;
                memset(&m, 0, sizeof(m));

                m.dimcount = 2;
                m.dim[0] = pkt->data.frame.sz;
                m.dim[1] = 1;
                m.dimstride[0] = pkt->data.frame.sz;
                m.dimstride[1] = 1;
                m.planecount = 1;
                m.type = _jit_sym_char;

                o.resizeTo(&m);

                memcpy(o.data(), pkt->data.frame.buf, pkt->data.frame.sz);

                break;
            }
        }

    }
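/*
 * Hedged scalar sketch of the fixed-point RGB->YUV conversion done in Step 1
 * above, using the same 7-bit weights and +127 chroma offset read off the
 * constant vectors (R2Y = 27, G2Y = 91, B2Y = 9, ...); clamp255() is our
 * helper, not part of the original source.
 */
static int clamp255(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static void argb_to_yuv_fixed(int R, int G, int B, int *Y, int *U, int *V)
{
    *Y = clamp255((  27 * R + 91 * G +  9 * B) >> 7);
    *U = clamp255((( -12 * R - 43 * G + 55 * B) >> 7) + 127);
    *V = clamp255(((  78 * R - 71 * G -  7 * B) >> 7) + 127);
}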
Exemple #25
0
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // a00 b00 a01 b01 a02 b03 a03 b03   0 0 0 0 0 0 0 0
    // a10 b10 a11 b11 a12 b12 a13 b13   0 0 0 0 0 0 0 0
    // a20 b20 a21 b21 a22 b22 a23 b23   0 0 0 0 0 0 0 0
    // a30 b30 a31 b31 a32 b32 a33 b33   0 0 0 0 0 0 0 0

    // Transpose the two 4x4, discarding the filling zeroes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
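/*
 * Hedged scalar sketch of the final accumulation above: with ha[16] and
 * hb[16] the transformed coefficients of the two blocks (including the
 * "+ (a0 != 0)" tweak from the horizontal pass) and w[16] the weights,
 * the routine returns
 *     sum_i w[i]*((|ha[i]| + 3) >> 3)  -  sum_i w[i]*((|hb[i]| + 3) >> 3)
 */
static int weighted_abs_diff(const short ha[16], const short hb[16],
                             const unsigned short w[16])
{
    int sum_a = 0, sum_b = 0;
    for (int i = 0; i < 16; i++) {
        const int aa = ha[i] < 0 ? -ha[i] : ha[i];
        const int ab = hb[i] < 0 ? -hb[i] : hb[i];
        sum_a += w[i] * ((aa + 3) >> 3);
        sum_b += w[i] * ((ab + 3) >> 3);
    }
    return sum_a - sum_b;
}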
Exemple #26
0
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
{
    typedef uchar T;
    typedef int WT;
    typedef int MT;
    Size size = img.size();
    int y;
    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);

    for( y = 0; y < size.height; y++ )
    {
        const T* ptr = img.ptr<T>(y);
        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;

        if( useSIMD )
        {
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                __m128i px = _mm_mullo_epi16(p, qx);
                __m128i sx = _mm_mullo_epi16(qx, qx);
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));

                qx = _mm_add_epi16(qx, dx);
            }
            int CV_DECL_ALIGNED(16) buf[4];
            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];
        }

        for( ; x < size.width; x++ )
        {
            WT p = ptr[x];
            WT xp = x * p, xxp;

            x0 += p;
            x1 += xp;
            xxp = xp * x;
            x2 += xxp;
            x3 += xxp * x;
        }

        WT py = y * x0, sy = y*y;

        mom[9] += ((MT)py) * sy;  // m03
        mom[8] += ((MT)x1) * sy;  // m12
        mom[7] += ((MT)x2) * y;  // m21
        mom[6] += x3;             // m30
        mom[5] += x0 * sy;        // m02
        mom[4] += x1 * y;         // m11
        mom[3] += x2;             // m20
        mom[2] += py;             // m01
        mom[1] += x1;             // m10
        mom[0] += x0;             // m00
    }

    for(int x = 0; x < 10; x++ )
        moments[x] = (double)mom[x];
}
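/*
 * The tile loop above accumulates the ten raw image moments
 *     m_pq = sum over (x, y) of x^p * y^q * I(x, y)
 * in the order m00, m10, m01, m20, m11, m02, m30, m21, m12, m03 (see the
 * mom[] comments).  A hedged scalar sketch for a single moment:
 */
static double raw_moment(const unsigned char* img, int w, int h, int stride,
                         int p, int q)
{
    double m = 0;
    for (int y = 0; y < h; y++)
        for (int x = 0; x < w; x++) {
            double xp = 1, yq = 1;
            for (int i = 0; i < p; i++) xp *= x;   /* x^p */
            for (int i = 0; i < q; i++) yq *= y;   /* y^q */
            m += xp * yq * img[y * stride + x];
        }
    return m;
}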
}
bool validate_utf8_sse(const char *src, size_t len) {
  const char *end = src + len;
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;
      continue;
    }

    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));
    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);
    __m128i mask3 = _mm_slli_si128(cond3, 1);

    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {
      break;
    }

    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false; // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    if (_mm_movemask_epi8(_mm_cmpgt_epi8(
            _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1))))
      return false; // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1

    chunk =
        _mm_andnot_si128(mask, chunk); // from now on, we only have useful bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    __m128i chunk_low = _mm_blendv_epi8(
        chunk,
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
                                          _mm_set1_epi8(0xc0))),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
                                                _mm_set1_epi8(0xf0)),
                                  mask3));
    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            mask3,
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
                                          4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);
    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;
    }

    src += source_advance;
  }
  return validate_utf8(src, end - src);
}
Exemple #28
0
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
                                            2217, 5352, 2217, 5352);
  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                            -5352, 2217, -5352, 2217);
  __m128i v01, v32;


  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference. -> 00 01 02 03 00 00 00 00
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);


    // Unpack and shuffle
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
    // 00 01 10 11 02 03 12 13
    // 20 21 30 31 22 23 32 33
    const __m128i shuf01_p =
        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i shuf23_p =
        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
    // 00 01 10 11 03 02 13 12
    // 20 21 30 31 23 22 33 32
    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
    // 00 01 10 11 20 21 30 31
    // 03 02 13 12 23 22 33 32
    const __m128i a01 = _mm_add_epi16(s01, s32);
    const __m128i a32 = _mm_sub_epi16(s01, s32);
    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    // -> f1 = f1 + 1 - (a3 == 0)
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
    const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
    _mm_storeu_si128((__m128i*)&out[0], d0_g1);
    _mm_storeu_si128((__m128i*)&out[8], d2_f3);
  }
}
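For reference, a plain-C sketch of the column pass that the SSE2 block above implements (function name and the tmp[]/out[] layout are illustrative, not taken from this file); it makes explicit the +7 rounding of the even rows, the 16-bit fixed-point constants of the odd rows, and the extra "+ (a3 != 0)" that the vector code folds into k12000_plus_one:

// Scalar model of the second (column) pass, one column per iteration.
static void FTransformColPass_Scalar(const int16_t tmp[16], int16_t out[16]) {
  for (int i = 0; i < 4; ++i) {
    const int a0 = tmp[0 + i] + tmp[12 + i];
    const int a1 = tmp[4 + i] + tmp[ 8 + i];
    const int a3 = tmp[0 + i] - tmp[12 + i];
    const int a2 = tmp[4 + i] - tmp[ 8 + i];
    out[ 0 + i] = (int16_t)((a0 + a1 + 7) >> 4);
    out[ 4 + i] = (int16_t)(((a3 * 5352 + a2 * 2217 + 12000) >> 16) + (a3 != 0));
    out[ 8 + i] = (int16_t)((a0 - a1 + 7) >> 4);
    out[12 + i] = (int16_t)((a3 * 2217 - a2 * 5352 + 51000) >> 16);
  }
}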
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    line_copy8(p0, srcp + 2 * stride , width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero);
    }
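    // Each matrix[i] holds the i-th kernel coefficient in the even 16-bit
    // lanes and zero in the odd lanes; pairing it with pixels that are
    // zero-extended the same way lets _mm_madd_epi16 produce exact 32-bit
    // products, so the 25-tap accumulation is carried in 32-bit precision.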

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };
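        // array[] enumerates the 5x5 neighbourhood row by row (p0..p4) with
        // column offsets -2..+2, so array[i] + x addresses the 16 pixels that
        // matrix[i] weights for this block of outputs.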

        for (int x = 0; x < width; x += 16) {
            __m128i sum[4] = { zero, zero, zero, zero };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0, xmm1, xmm2;

                xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
            }

            for (int i = 0; i < 4; i++) {
                __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                sumfp = _mm_mul_ps(sumfp, rdiv);
                sumfp = _mm_add_ps(sumfp, bias);
                if (!ch->saturate) {
                    sumfp = mm_abs_ps(sumfp);
                }
                sum[i] = _mm_cvttps_epi32(sumfp);
            }

            sum[0] = _mm_packs_epi32(sum[0], sum[1]);
            sum[1] = _mm_packs_epi32(sum[2], sum[3]);
            sum[0] = _mm_packus_epi16(sum[0], sum[1]);

            _mm_store_si128((__m128i *)(dstp + x), sum[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
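For readability, a minimal scalar model of what the vectorized inner loop above computes for one output pixel (the helper name and exact types are illustrative; rows[0..4] stand for the padded lines p0..p4 and rdiv/bias/saturate for the convolution_t fields used above):

#include <math.h>
#include <stdint.h>

// Scalar model of one output pixel of the 5x5 convolution above.
static uint8_t conv5x5_scalar_px(uint8_t *rows[5], int x, const int16_t m[25],
                                 float rdiv, float bias, int saturate)
{
    int32_t sum = 0;
    for (int i = 0; i < 25; i++) {
        int r = i / 5;               /* source line: p0..p4       */
        int c = i % 5 - 2;           /* column offset: -2..+2     */
        sum += (int32_t)rows[r][x + c] * m[i];
    }
    float val = sum * rdiv + bias;
    if (!saturate)
        val = fabsf(val);            /* mirrors mm_abs_ps() above */
    if (val < 0.0f)   val = 0.0f;    /* packus_epi16 saturation   */
    if (val > 255.0f) val = 255.0f;
    return (uint8_t)val;             /* cvttps truncates, as here */
}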
Exemple #30
0
//Compute TI (temporal information) and SI (spatial information)
int tisi(char* ydata,char* prev_ydata,int width,int height,SDLParam sdlparam,float &TI,float &SI)
{
	int nYSize=height*width;
	float nFrameSize = nYSize*1.5;
	int realframe_size=(width-2*PADDING)*(height-2*PADDING);

	unsigned char *pFrame=(unsigned char*)malloc(nYSize);
	unsigned char *pNextFrame=(unsigned char*)malloc(nYSize);
	unsigned char *pFrame_0;
	unsigned char *pFrame_1;
	unsigned char *pFrame_2;
	unsigned char *pNextFrame_0;
	unsigned char *pSobelScreen=(unsigned char*)malloc(realframe_size);
	unsigned char *pDiffScreen=(unsigned char*)malloc(realframe_size);
	memset(pSobelScreen,0,realframe_size);
	memset(pDiffScreen,0,realframe_size);

	float *frame_sobel=(float*)malloc(realframe_size*sizeof(float));
	memset(frame_sobel,0,realframe_size*sizeof(float));
	float avg_frame_sobel=0;
	int index=0;

	float *frame_diff=(float*)malloc(realframe_size*sizeof(float));
	memset(frame_diff,0,realframe_size*sizeof(float));
	float avg_frame_diff=0;

	__m128 sobel_avg_sum=_mm_set1_ps(+0.0f);
	__m128 sobel_square_sum=_mm_set1_ps(+0.0f);
	__m128 diff_avg_sum=_mm_set1_ps(+0.0f);
	__m128 diff_square_sum=_mm_set1_ps(+0.0f);
	__m128 avg_sobel=_mm_set1_ps(+0.0f);
	__m128 avg_diff=_mm_set1_ps(+0.0f);

	int i,j,k;
	int pad_threshold=0;
	
	memcpy(pFrame,prev_ydata,nYSize);
	memcpy(pNextFrame,ydata,nYSize);

	pFrame_0=pFrame+width*(PADDING-1);
	pFrame_1=pFrame+width*PADDING;
	pFrame_2=pFrame+width*(PADDING+1);
	pNextFrame_0=pNextFrame+width*(PADDING-1);
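	// pFrame_0/1/2 track the rows above/at/below the current scanline of the
	// current frame for the Sobel window; pNextFrame_0 tracks the same row as
	// pFrame_0 in the next frame for the temporal difference. All four are
	// advanced by one line (width) per j iteration.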

	//Check for an external exit request; release the buffers before bailing out.
	if(mark_exit==1){
		free(pFrame); free(pNextFrame);
		free(pSobelScreen); free(pDiffScreen);
		free(frame_sobel); free(frame_diff);
		return -1;
	}

	for(j = PADDING; j < height-PADDING; j++)
	{
		for(i = PADDING; i < width-PADDING; i+=4)
		{
			if(i+4>width-PADDING)
				pad_threshold=1;
			// load 16 bytes; only elements 0~5 (pixels i-1 .. i+4) are actually used
			__m128i current_0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pFrame_0+i-1)), _mm_setzero_si128());
			__m128i current_1 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pFrame_1+i-1)), _mm_setzero_si128());
			__m128i current_2 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pFrame_2+i-1)), _mm_setzero_si128());
			__m128i next_0 = _mm_unpacklo_epi8(_mm_loadu_si128((__m128i*)(pNextFrame_0+i-1)), _mm_setzero_si128());

			// pFrame_00 = { pFrame_0[i-1], pFrame_0[i], pFrame_0[i+1], pFrame_0[i+2] }
			__m128 pFrame_00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(current_0, _mm_setzero_si128()));
			// pFrame_01 = { pFrame_0[i], pFrame_0[i+1], pFrame_0[i+2], pFrame_0[i+3] }
			__m128 pFrame_01 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_0, 2), _mm_setzero_si128()));
			// pFrame_02 = { pFrame_0[i+1], pFrame_0[i+2], pFrame_0[i+3], pFrame_0[i+4] }
			__m128 pFrame_02 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_0, 4), _mm_setzero_si128()));
			// pFrame_10 = { pFrame_1[i-1], pFrame_1[i], pFrame_1[i+1], pFrame_1[i+2] }
			__m128 pFrame_10 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(current_1, _mm_setzero_si128()));
			// pFrame_12 = { pFrame_1[i+1], pFrame_1[i+2], pFrame_1[i+3], pFrame_1[i+4] }
			__m128 pFrame_12 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_1, 4), _mm_setzero_si128()));
			// pFrame_20 = { pFrame_2[i-1], pFrame_2[i], pFrame_2[i+1], pFrame_2[i+2] }
			__m128 pFrame_20 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(current_2, _mm_setzero_si128()));
			// pFrame_21 = { pFrame_2[i], pFrame_2[i+1], pFrame_2[i+2], pFrame_2[i+3] }
			__m128 pFrame_21 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_2, 2), _mm_setzero_si128()));
			// pFrame_22 = { pFrame_2[i+1], pFrame_2[i+2], pFrame_2[i+3], pFrame_2[i+4] }
			__m128 pFrame_22 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(_mm_srli_si128(current_2, 4), _mm_setzero_si128()));
			
			__m128 pNextFrame_00 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(next_0, _mm_setzero_si128()));
			
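			// gx/gy below are the 3x3 Sobel responses, with pFrame_0*/1*/2*
			// supplying the rows above/at/below the current pixel:
			//        | +1  0  -1 |        | +1 +2 +1 |
			//   Gx = | +2  0  -2 |   Gy = |  0  0  0 |
			//        | +1  0  -1 |        | -1 -2 -1 |
			// Only the magnitude sqrt(gx^2 + gy^2) is kept, so the sign
			// convention does not matter.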
			__m128 gx=_mm_add_ps(_mm_sub_ps(_mm_add_ps(_mm_add_ps(_mm_sub_ps(_mm_sub_ps(_mm_sub_ps(pFrame_20,pFrame_22),pFrame_12),pFrame_12),pFrame_10),pFrame_10),pFrame_02),pFrame_00);

			__m128 gy=_mm_sub_ps(_mm_sub_ps(_mm_add_ps(_mm_add_ps(_mm_add_ps(_mm_sub_ps(pFrame_00,pFrame_20),_mm_sub_ps(pFrame_02,pFrame_22)),pFrame_01),pFrame_01),pFrame_21),pFrame_21);

			__m128 sobel_result = _mm_sqrt_ps(_mm_add_ps(_mm_mul_ps(gx, gx), _mm_mul_ps(gy,gy)));

			__m128 diff_result=_mm_sub_ps(pNextFrame_00,pFrame_00);
			
			if(!pad_threshold)
			{
				frame_sobel[index]=sobel_result.m128_f32[0];
				frame_sobel[index+1]=sobel_result.m128_f32[1];
				frame_sobel[index+2]=sobel_result.m128_f32[2];
				frame_sobel[index+3]=sobel_result.m128_f32[3];
				pSobelScreen[index]=(unsigned char)sobel_result.m128_f32[0];
				pSobelScreen[index+1]=(unsigned char)sobel_result.m128_f32[1];
				pSobelScreen[index+2]=(unsigned char)sobel_result.m128_f32[2];
				pSobelScreen[index+3]=(unsigned char)sobel_result.m128_f32[3];
				
				frame_diff[index]=diff_result.m128_f32[0];
				frame_diff[index+1]=diff_result.m128_f32[1];
				frame_diff[index+2]=diff_result.m128_f32[2];
				frame_diff[index+3]=diff_result.m128_f32[3];
				pDiffScreen[index]=(unsigned char)fabsf(diff_result.m128_f32[0]);
				pDiffScreen[index+1]=(unsigned char)fabsf(diff_result.m128_f32[1]);
				pDiffScreen[index+2]=(unsigned char)fabsf(diff_result.m128_f32[2]);
				pDiffScreen[index+3]=(unsigned char)fabsf(diff_result.m128_f32[3]);
				index+=4;
			}
			else
			{
				for(k=0;k<width-PADDING-i;k++)
				{
					frame_sobel[index+k]=sobel_result.m128_f32[k];
					pSobelScreen[index+k]=(unsigned char)sobel_result.m128_f32[k];
					frame_diff[index+k]=diff_result.m128_f32[k];
					pDiffScreen[index+k]=(unsigned char)fabsf(diff_result.m128_f32[k]);
				}
				index+=width-PADDING-i;
			}		
		}
		pFrame_0 += width;
		pFrame_1 += width;
		pFrame_2 += width;
		pNextFrame_0 += width;
		pad_threshold=0;
	}

	//Draw the Sobel (SI) image
	if(sdlparam.graphically_si==true&&sdlparam.isinterval==false){
		memcpy(sdlparam.show_YBuffer,pSobelScreen,realframe_size);
		SDL_Event event;
		event.type = REFRESH_EVENT;
		SDL_PushEvent(&event);
	}

	//Draw the frame-difference (TI) image
	if(sdlparam.graphically_ti==true&&sdlparam.isinterval==false){
		memcpy(sdlparam.show_YBuffer,pDiffScreen,realframe_size);
		SDL_Event event;
		event.type = REFRESH_EVENT;
		SDL_PushEvent(&event);
	}

	for(i=0;i<index;i+=4)
	{
		__m128 sobel_result_0 = _mm_set_ps (frame_sobel[i],frame_sobel[i+1], frame_sobel[i+2], frame_sobel[i+3]);
		sobel_avg_sum = _mm_add_ps(sobel_avg_sum,sobel_result_0);

		__m128 diff_result_0 = _mm_set_ps (frame_diff[i],frame_diff[i+1], frame_diff[i+2], frame_diff[i+3]);
		diff_avg_sum = _mm_add_ps(diff_avg_sum,diff_result_0);
	}		
	avg_frame_sobel=(sobel_avg_sum.m128_f32[0]+
		sobel_avg_sum.m128_f32[1]+
		sobel_avg_sum.m128_f32[2]+
		sobel_avg_sum.m128_f32[3])/index;
	avg_sobel = _mm_set_ps (avg_frame_sobel,avg_frame_sobel, avg_frame_sobel, avg_frame_sobel);

	avg_frame_diff=(diff_avg_sum.m128_f32[0]+
		diff_avg_sum.m128_f32[1]+
		diff_avg_sum.m128_f32[2]+
		diff_avg_sum.m128_f32[3])/index;
	avg_diff = _mm_set_ps (avg_frame_diff,avg_frame_diff, avg_frame_diff, avg_frame_diff);

	for(i=0;i<index;i+=4)
	{
		__m128 sobel_result_1 = _mm_set_ps (frame_sobel[i],frame_sobel[i+1], frame_sobel[i+2], frame_sobel[i+3]);
		__m128 sobel_square=_mm_mul_ps(_mm_sub_ps(sobel_result_1,avg_sobel),_mm_sub_ps(sobel_result_1,avg_sobel));
		sobel_square_sum = _mm_add_ps(sobel_square_sum,sobel_square);

		__m128 diff_result_1 = _mm_set_ps (frame_diff[i],frame_diff[i+1], frame_diff[i+2], frame_diff[i+3]);
		__m128 diff_square=_mm_mul_ps(_mm_sub_ps(diff_result_1,avg_diff),_mm_sub_ps(diff_result_1,avg_diff));
		diff_square_sum = _mm_add_ps(diff_square_sum,diff_square);
	}
	
	SI=sqrt((sobel_square_sum.m128_f32[0]+sobel_square_sum.m128_f32[1]+sobel_square_sum.m128_f32[2]+sobel_square_sum.m128_f32[3])/index);
	avg_frame_sobel=0;
	sobel_avg_sum=_mm_set1_ps(+0.0f);
	sobel_square_sum=_mm_set1_ps(+0.0f);
	
	TI=sqrt((diff_square_sum.m128_f32[0]+diff_square_sum.m128_f32[1]+diff_square_sum.m128_f32[2]+diff_square_sum.m128_f32[3])/index);
	avg_frame_diff=0;
	diff_avg_sum=_mm_set1_ps(+0.0f);
	diff_square_sum=_mm_set1_ps(+0.0f);

	index=0;

	//Release the malloc'd buffers.
	free(pFrame);
	free(pNextFrame);
	free(pSobelScreen);
	free(pDiffScreen);
	free(frame_diff);
	free(frame_sobel);

	return 0;
}
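The two reduction loops above amount to taking the spatial standard deviation of the Sobel magnitudes (SI) and of the inter-frame differences (TI), in the spirit of the ITU-T P.910 definitions. A minimal scalar equivalent (helper name illustrative, not part of the code above):

#include <math.h>

// Spatial standard deviation of n samples; applied to frame_sobel it yields
// SI, and applied to frame_diff it yields TI, matching the reductions above.
static float stddev_scalar(const float *v, int n)
{
	float mean = 0.0f, var = 0.0f;
	for (int i = 0; i < n; i++) mean += v[i];
	mean /= (float)n;
	for (int i = 0; i < n; i++) var += (v[i] - mean) * (v[i] - mean);
	return sqrtf(var / (float)n);
}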