コード例 #1
0
/*
 * Per-byte multiply of two vectors of 16 unsigned 8-bit channels.
 *
 * Each channel is widened to 16 bits, multiplied, biased and shifted right
 * by 8.  The bias is 0xFF for byte positions 0, 2 and 3 of every 4-byte
 * group and 0 for byte position 1 (see the constant below), so position 1
 * is truncated rather than rounded up.
 */
static inline __m128i
v4_mul_color_sse2(__m128i x, __m128i y)
{
   const __m128i zeros = _mm_setzero_si128();
   /* 16-bit lanes, low to high: FF 00 FF FF | FF 00 FF FF */
   const __m128i bias = _mm_set_epi32(0x00FF00FF, 0x000000FF, 0x00FF00FF, 0x000000FF);

   /* Widen the low and high 8 bytes of each operand and multiply. */
   __m128i prod_lo = _mm_mullo_epi16(_mm_unpacklo_epi8(x, zeros),
                                     _mm_unpacklo_epi8(y, zeros));
   __m128i prod_hi = _mm_mullo_epi16(_mm_unpackhi_epi8(x, zeros),
                                     _mm_unpackhi_epi8(y, zeros));

   /* (product + bias) / 256 */
   prod_lo = _mm_srli_epi16(_mm_add_epi16(prod_lo, bias), 8);
   prod_hi = _mm_srli_epi16(_mm_add_epi16(prod_hi, bias), 8);

   /* Narrow back to 16 bytes. */
   return _mm_packus_epi16(prod_lo, prod_hi);
}
コード例 #2
0
// Horizontal sum of the 16 signed 8-bit lanes of acc_diff, returned as int.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i ones = _mm_set1_epi16(1);
  // Duplicate each byte into a 16-bit lane, then arithmetic-shift right by 8
  // to obtain the sign-extended value.
  const __m128i lo_s16 =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i hi_s16 =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  // 8 partial sums (16-bit) -> 4 partial sums (32-bit) via multiply-add by 1.
  const __m128i sum_s16 = _mm_add_epi16(lo_s16, hi_s16);
  const __m128i sum4_s32 = _mm_madd_epi16(sum_s16, ones);
  // Fold the upper 64 bits onto the lower, then the second dword onto the
  // first.
  const __m128i sum2_s32 =
      _mm_add_epi32(sum4_s32, _mm_srli_si128(sum4_s32, 8));
  const __m128i total_s32 =
      _mm_add_epi32(sum2_s32, _mm_srli_si128(sum2_s32, 4));
  return _mm_cvtsi128_si32(total_s32);
}
コード例 #3
0
ファイル: blend_sse42.c プロジェクト: WojciechMula/toys
/*
 * Blends two byte images into `data` using pmaddubsw.
 *
 * Reads the file-scope variables width, height, alpha, imgA and imgB and
 * writes width * height * 4 bytes to data.  For every byte position:
 *
 *     data[k] = (imgA[k] * alpha2 + imgB[k] * (alpha2 ^ 0x7f)) >> 7
 *
 * where alpha2 = alpha / 2 truncated to uint8_t.
 *
 * NOTE(review): assumes alpha is in 0..255 so that both weights fit in a
 * signed byte (the second pmaddubsw operand is signed) -- confirm.
 * NOTE(review): the loop consumes 32 bytes per iteration with aligned
 * loads/stores; the buffers are presumably 16-byte aligned and the byte
 * count a multiple of 32 -- verify against the allocation code.
 */
void SSE42_blend() {
    int n = width * height * 4;

    const uint8_t alpha2 = alpha/2;
    /* Each 16-bit lane holds the weight pair: low byte for A, high byte for B. */
    const __m128i alpha_np = _mm_set1_epi16(alpha2 | ((uint16_t)(alpha2 ^ 0x7f) << 8));

    /* n is converted to size_t explicitly; this is the same conversion the
       comparison performed implicitly before. */
    for (size_t i = 0; i < (size_t)n; i += 32) {
        const __m128i A0 = _mm_load_si128((__m128i*)(imgA + i));
        const __m128i B0 = _mm_load_si128((__m128i*)(imgB + i));

        const __m128i A1 = _mm_load_si128((__m128i*)(imgA + i + 16));
        const __m128i B1 = _mm_load_si128((__m128i*)(imgB + i + 16));

        /* Interleave so every 16-bit lane contains (A byte, B byte). */
        __m128i lo0 = _mm_unpacklo_epi8(A0, B0);
        __m128i hi0 = _mm_unpackhi_epi8(A0, B0);

        __m128i lo1 = _mm_unpacklo_epi8(A1, B1);
        __m128i hi1 = _mm_unpackhi_epi8(A1, B1);

        /* A * alpha2 + B * (alpha2 ^ 0x7f) per lane. */
        lo0 = _mm_maddubs_epi16(lo0, alpha_np);
        lo1 = _mm_maddubs_epi16(lo1, alpha_np);
        hi0 = _mm_maddubs_epi16(hi0, alpha_np);
        hi1 = _mm_maddubs_epi16(hi1, alpha_np);

        /* Scale back down by 128. */
        lo0 = _mm_srli_epi16(lo0, 7);
        lo1 = _mm_srli_epi16(lo1, 7);
        hi0 = _mm_srli_epi16(hi0, 7);
        hi1 = _mm_srli_epi16(hi1, 7);

        _mm_store_si128((__m128i*)(data + i +  0), _mm_packus_epi16(lo0, hi0));
        _mm_store_si128((__m128i*)(data + i + 16), _mm_packus_epi16(lo1, hi1));
    }
}
コード例 #4
0
ファイル: dec_sse2.c プロジェクト: keenliu/cuzySample
// Transposes the four input vectors back to row order and writes them out as
// sixteen 4-byte rows: eight starting at r0 and eight starting at r8.
// *p1, *p0, *q0 and *q1 are overwritten with the transposed data before being
// handed to Store4x4().
static WEBP_INLINE void Store16x4(uint8_t* r0, uint8_t* r8, int stride,
                                  __m128i* p1, __m128i* p0,
                                  __m128i* q0, __m128i* q1) {
    const __m128i in_p1 = *p1;
    const __m128i in_p0 = *p0;
    const __m128i in_q0 = *q0;
    const __m128i in_q1 = *q1;

    // Byte interleave.
    // p_lo = 71 70 61 60 51 50 41 40 31 30 21 20 11 10 01 00
    // p_hi = f1 f0 e1 e0 d1 d0 c1 c0 b1 b0 a1 a0 91 90 81 80
    const __m128i p_lo = _mm_unpacklo_epi8(in_p1, in_p0);
    const __m128i p_hi = _mm_unpackhi_epi8(in_p1, in_p0);
    // q_lo = 73 72 63 62 53 52 43 42 33 32 23 22 13 12 03 02
    // q_hi = f3 f2 e3 e2 d3 d2 c3 c2 b3 b2 a3 a2 93 92 83 82
    const __m128i q_lo = _mm_unpacklo_epi8(in_q0, in_q1);
    const __m128i q_hi = _mm_unpackhi_epi8(in_q0, in_q1);

    // 16-bit interleave: every 32-bit group is now one 4-pixel row.
    // *p0 = rows 0..3, *q0 = rows 4..7, *p1 = rows 8..b, *q1 = rows c..f
    *p0 = _mm_unpacklo_epi16(p_lo, q_lo);
    *q0 = _mm_unpackhi_epi16(p_lo, q_lo);
    *p1 = _mm_unpacklo_epi16(p_hi, q_hi);
    *q1 = _mm_unpackhi_epi16(p_hi, q_hi);

    Store4x4(p0, r0, stride);
    Store4x4(q0, r0 + 4 * stride, stride);

    Store4x4(p1, r8, stride);
    Store4x4(q1, r8 + 4 * stride, stride);
}
コード例 #5
0
/*
 * Vertical convolution pass for 8-bit input, followed by a call to
 * proc_horizontal() for every output row.
 *
 * For each row, (2 * radius + 1) source rows are weighted by kernel[] and
 * summed as floats into buff, 16 pixels per step.  The row window starts
 * mirrored around row 0 (row pointers use |offset| * src_stride) and slides
 * down one row per iteration; once y reaches height - radius - 1 the newest
 * row pointer moves backwards instead of forwards.
 *
 * NOTE(review): the row-pointer table has 17 entries, so radius is
 * presumably limited to 8 by the caller -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit(int radius, float *kernel, const uint8_t *srcp, float *buff,
          float *dstp, int width, int height, int src_stride, int dst_stride)
{
    const int taps = radius * 2 + 1;
    const uint8_t *rows[17];
    for (int t = 0; t < taps; t++) {
        rows[t] = srcp + abs(t - radius) * src_stride;
    }

    const __m128i zero = _mm_setzero_si128();

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 acc0 = _mm_setzero_ps();
            __m128 acc1 = _mm_setzero_ps();
            __m128 acc2 = _mm_setzero_ps();
            __m128 acc3 = _mm_setzero_ps();

            for (int t = 0; t < taps; t++) {
                /* 16 bytes -> 2 x 8 words -> 4 x 4 floats */
                const __m128i px = _mm_load_si128((const __m128i *)(rows[t] + x));
                const __m128i hi16 = _mm_unpackhi_epi8(px, zero);
                const __m128i lo16 = _mm_unpacklo_epi8(px, zero);
                const __m128 v0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(lo16, zero));
                const __m128 v1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(lo16, zero));
                const __m128 v2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(hi16, zero));
                const __m128 v3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(hi16, zero));
                const __m128 w = _mm_set1_ps(kernel[t]);

                acc0 = _mm_add_ps(acc0, _mm_mul_ps(w, v0));
                acc1 = _mm_add_ps(acc1, _mm_mul_ps(w, v1));
                acc2 = _mm_add_ps(acc2, _mm_mul_ps(w, v2));
                acc3 = _mm_add_ps(acc3, _mm_mul_ps(w, v3));
            }
            _mm_store_ps(buff + x,      acc0);
            _mm_store_ps(buff + x +  4, acc1);
            _mm_store_ps(buff + x +  8, acc2);
            _mm_store_ps(buff + x + 12, acc3);
        }
        proc_horizontal(buff, radius, taps, width, kernel, dstp);

        /* Slide the row window down by one. */
        for (int t = 0; t + 1 < taps; t++) {
            rows[t] = rows[t + 1];
        }
        rows[taps - 1] += (y < height - radius - 1 ? 1 : -1) * src_stride;
        dstp += dst_stride;
    }
}
コード例 #6
0
/*
 * 8-bit filter pass: for every pixel, averages eight values gathered through
 * the COORDINATES pointer table and clamps the average to the range
 * [src, src + th] (saturating), 16 pixels per step.
 *
 * buff holds three working rows spaced bstride bytes apart (p0, p1, p2),
 * used as a rotating window; p1 is the row being filtered.
 *
 * NOTE(review): COORDINATES is a macro defined elsewhere; it presumably
 * expands to the eight neighbours of the current pixel built from the local
 * names p0/p1/p2 -- confirm before renaming any of them.
 * NOTE(review): line_copy8() is defined elsewhere; it presumably copies one
 * row into the working buffer with edge padding -- confirm.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, int th)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    /* First and last row slots, used to wrap p2 around at the end of a row. */
    uint8_t *orig = p0, *end = p2;

    /* The row "above" the first row is filled from source row 1. */
    line_copy8(p0, srcp + stride, width, 1);
    line_copy8(p1, srcp, width, 1);

    uint8_t threshold = (uint8_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi8((int8_t)threshold);

    for (int y = 0; y < height; y++) {
        /* Advance to the next source row; on the last row step back instead. */
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy8(p2, srcp, width, 1);
        uint8_t *coordinates[] = COORDINATES;
        for (int x = 0; x < width; x += 16) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            /* Accumulate the eight inputs as 16-bit values. */
            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo  = _mm_add_epi16(sumlo, _mm_unpacklo_epi8(target, zero));
                sumhi  = _mm_add_epi16(sumhi, _mm_unpackhi_epi8(target, zero));
            }

            /* Divide by 8 and narrow back to bytes. */
            sumlo = _mm_srai_epi16(sumlo, 3);
            sumhi = _mm_srai_epi16(sumhi, 3);
            sumlo = _mm_packus_epi16(sumlo, sumhi);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            /* Upper bound: src + th with unsigned saturation. */
            __m128i limit = _mm_adds_epu8(src, xth);

            /* Clamp the average to [src, limit]. */
            sumlo = _mm_max_epu8(sumlo, src);
            sumlo = _mm_min_epu8(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }
        dstp += stride;
        /* Rotate the three-row window. */
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
コード例 #7
0
/*
 * Expands the 256 bits read from a[] by inserting a zero bit after every
 * input bit (table lookup per nibble), runs REDUCE(), and writes four digits
 * to c[0..3], masking c[3] with 0x07FFFFFFFFFFFFFF.
 *
 * NOTE(review): the name suggests squaring in a binary field followed by
 * modular reduction; REDUCE() is a macro defined elsewhere that operates on
 * the local variables by name (presumably consuming m0..m3, possibly using
 * t0/t1/m4../m8/m9 as scratch, and leaving the result in m0 and m1) --
 * confirm before renaming or removing any local.
 * NOTE(review): a and c are accessed with aligned 128-bit loads/stores, so
 * they are presumably 16-byte aligned, and dig_t is presumably 64 bits wide.
 */
void fb_sqrm_low(dig_t *c, const dig_t *a) {
	__m128i t0, t1, m0, m1, m2, m3, m4, m5, m6, m8, m9, mask;
	align dig_t x[2];

	/* Lookup table: entry n (0..15) is n with bit i moved to bit 2*i,
	 * i.e. 0x00, 0x01, 0x04, 0x05, 0x10, ... 0x55. */
	t0 = _mm_set_epi32(0x55545150, 0x45444140, 0x15141110, 0x05040100);
	/* Selects the low nibble of every byte. */
	mask = _mm_set_epi32(0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F, 0x0F0F0F0F);

	/* First 128 bits of the input. */
	m0 = _mm_load_si128((__m128i *)(a));
	/* m1: expanded low nibbles, m2: expanded high nibbles. */
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	/* Interleave so each input byte becomes 16 bits (low nibble first). */
	m3 = _mm_unpacklo_epi8(m1, m2);
	m4 = _mm_unpackhi_epi8(m1, m2);

	/* Second 128 bits of the input, same expansion. */
	m0 = _mm_load_si128((__m128i *)(a+2));
	m1 = _mm_and_si128(m0, mask);
	m1 = _mm_shuffle_epi8(t0, m1);
	m2 = _mm_srli_epi64(m0, 4);
	m2 = _mm_and_si128(m2, mask);
	m2 = _mm_shuffle_epi8(t0, m2);
	m5 = _mm_unpacklo_epi8(m1, m2);
	m6 = _mm_unpackhi_epi8(m1, m2);

	/* Move the four expanded 128-bit words into m0..m3 for REDUCE(). */
	m0 = m3;
	m1 = m4;
	m2 = m5;
	m3 = m6;

	REDUCE();
	_mm_store_si128((__m128i *) c + 0, m0);
	/* The upper half goes through a temporary so the top digit can be masked. */
	_mm_store_si128((__m128i *) x, m1);
	c[2] = x[0];
	/* Keep only the low 59 bits of the top digit. */
	c[3] = x[1] & 0x07FFFFFFFFFFFFFF;
}
コード例 #8
0
ファイル: dec_sse2.c プロジェクト: 8l/insieme
// Applies filter on 6 pixels (p2, p1, p0, q0, q1 and q2)
//
// Positions selected by *mask are split with the "not hev" mask produced by
// GetNotHEV(): positions where not_hev is clear get DoSimpleFilter() on
// p0/q0 only; positions where it is set get three deltas of increasing
// strength applied to (p2,q2), (p1,q1) and (p0,q0) via Update2Pixels().
//
// NOTE(review): GetNotHEV, GetBaseDelta, DoSimpleFilter, Update2Pixels and
// the FLIP_SIGN_BIT* macros are defined elsewhere; the FLIP_SIGN_BIT* macros
// presumably modify their arguments in place and use the local `sign_bit`
// -- confirm before reordering or renaming anything here.
static WEBP_INLINE void DoFilter6(__m128i* const p2, __m128i* const p1,
                                  __m128i* const p0, __m128i* const q0,
                                  __m128i* const q1, __m128i* const q2,
                                  const __m128i* const mask, int hev_thresh) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    __m128i a, not_hev;

    // compute hev mask
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
    FLIP_SIGN_BIT2(*p2, *q2);
    GetBaseDelta(p1, p0, q0, q1, &a);

    {   // do simple filter on pixels with hev
        // m = mask & ~not_hev
        const __m128i m = _mm_andnot_si128(not_hev, *mask);
        __m128i f = _mm_and_si128(a, m);   // insieme: dropped const
        DoSimpleFilter(p0, q0, &f);
    }

    {   // do strong filter on pixels with not hev
        const __m128i k9 = _mm_set1_epi16(0x0900);
        const __m128i k63 = _mm_set1_epi16(63);

        const __m128i m = _mm_and_si128(not_hev, *mask);
        const __m128i f = _mm_and_si128(a, m);

        // Unpacking with zero as the low byte puts each filter byte in the
        // high byte of a 16-bit lane (f << 8); the signed high-half multiply
        // by 0x0900 then yields f * 9 per lane.
        const __m128i f_lo = _mm_unpacklo_epi8(zero, f);
        const __m128i f_hi = _mm_unpackhi_epi8(zero, f);

        const __m128i f9_lo = _mm_mulhi_epi16(f_lo, k9);    // Filter (lo) * 9
        const __m128i f9_hi = _mm_mulhi_epi16(f_hi, k9);    // Filter (hi) * 9

        __m128i a2_lo = _mm_add_epi16(f9_lo, k63);    // Filter * 9 + 63   // insieme: dropped const
        __m128i a2_hi = _mm_add_epi16(f9_hi, k63);    // Filter * 9 + 63   // insieme: dropped const

        __m128i a1_lo = _mm_add_epi16(a2_lo, f9_lo);  // Filter * 18 + 63   // insieme: dropped const
        __m128i a1_hi = _mm_add_epi16(a2_hi, f9_hi);  // Filter * 18 + 63   // insieme: dropped const

        __m128i a0_lo = _mm_add_epi16(a1_lo, f9_lo);  // Filter * 27 + 63   // insieme: dropped const
        __m128i a0_hi = _mm_add_epi16(a1_hi, f9_hi);  // Filter * 27 + 63   // insieme: dropped const

        // Weakest delta on the outermost pair, strongest on the innermost.
        Update2Pixels(p2, q2, &a2_lo, &a2_hi);
        Update2Pixels(p1, q1, &a1_lo, &a1_hi);
        Update2Pixels(p0, q0, &a0_lo, &a0_hi);
    }
}
コード例 #9
0
ファイル: shuffle.c プロジェクト: B-Rich/PyTables
/* Routine optimized for unshuffling a buffer for a type size of 8 bytes.
 *
 * orig holds the shuffled data as 8 consecutive byte planes; dest receives
 * the elements with their 8 bytes contiguous.  With neblock = size / 8,
 * the bytes are laid out as dest[e * 8 + b] = orig[b * neblock + e] when
 * neblock is a multiple of 16.  Only whole groups of 16 elements
 * (neblock / 16 of them) are processed; any remainder is left untouched.
 *
 * Loads and stores are unaligned, so neither buffer needs 16-byte alignment.
 */
static void
unshuffle8(uint8_t* dest, uint8_t* orig, size_t size)
{
  /* Output vector n of each group is interleave result store_order[n]. */
  static const int store_order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
  const size_t neblock = size / 8;
  const size_t numof16belem = neblock / 16;
  size_t i, j;
  __m128i xmm0[8], xmm1[8];

  for (i = 0; i < numof16belem; i++) {
    /* Load 16 bytes from each of the 8 byte planes */
    for (j = 0; j < 8; j++) {
      xmm0[j] = _mm_loadu_si128(
          (const __m128i *)(orig + (j*numof16belem + i) * sizeof(__m128i)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 4; j++) {
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j*2], xmm0[j*2+1]);
      xmm1[4+j] = _mm_unpackhi_epi8(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j*2], xmm1[j*2+1]);
      xmm0[4+j] = _mm_unpackhi_epi16(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Shuffle 4-byte dwords */
    for (j = 0; j < 4; j++) {
      xmm1[j] = _mm_unpacklo_epi32(xmm0[j*2], xmm0[j*2+1]);
      xmm1[4+j] = _mm_unpackhi_epi32(xmm0[j*2], xmm0[j*2+1]);
    }
    /* Store the result vectors in proper order */
    for (j = 0; j < 8; j++) {
      _mm_storeu_si128((__m128i *)(dest + (i*8 + j) * sizeof(__m128i)),
                       xmm1[store_order[j]]);
    }
  }
}
コード例 #10
0
ファイル: sound_sse.cpp プロジェクト: punkkeks/ClanLib
// Converts interleaved unsigned 8-bit stereo samples to two float channels.
// Each byte v becomes (v - 128) / 128; even bytes go to output[0], odd bytes
// to output[1].  `size` is the number of input bytes.  Whole blocks of 16
// bytes use SSE2 (unless CL_DISABLE_SSE2 is defined); the rest is converted
// two bytes at a time.
void SoundSSE::unpack_8bit_stereo(unsigned char *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/16)*16;

	const __m128i zeros = _mm_setzero_si128();
	const __m128 scale = _mm_set1_ps(1.0f/128.0f);
	const __m128i bias = _mm_set1_epi16(128);
	for (int pos = 0; pos < sse_size; pos += 16)
	{
		// Widen to 16 bits and remove the unsigned offset.
		const __m128i raw = _mm_loadu_si128((__m128i*)(input+pos));
		const __m128i lo16 = _mm_sub_epi16(_mm_unpacklo_epi8(raw, zeros), bias);
		const __m128i hi16 = _mm_sub_epi16(_mm_unpackhi_epi8(raw, zeros), bias);

		// Place each word in the top half of a dword and shift back down
		// arithmetically to sign-extend, then convert and scale.
		const __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zeros, lo16), 16)), scale);
		const __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zeros, lo16), 16)), scale);
		const __m128 f2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zeros, hi16), 16)), scale);
		const __m128 f3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zeros, hi16), 16)), scale);

		// De-interleave: even lanes are channel 0, odd lanes are channel 1.
		const __m128 ch0_a = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(2,0,2,0));
		const __m128 ch1_a = _mm_shuffle_ps(f0, f1, _MM_SHUFFLE(3,1,3,1));
		const __m128 ch0_b = _mm_shuffle_ps(f2, f3, _MM_SHUFFLE(2,0,2,0));
		const __m128 ch1_b = _mm_shuffle_ps(f2, f3, _MM_SHUFFLE(3,1,3,1));

		const int frame = pos/2;
		_mm_storeu_ps(output[0]+frame, ch0_a);
		_mm_storeu_ps(output[1]+frame, ch1_a);
		_mm_storeu_ps(output[0]+frame+4, ch0_b);
		_mm_storeu_ps(output[1]+frame+4, ch1_b);
	}
#else
	const int sse_size = 0;
#endif
	// Scalar tail: one stereo frame per iteration.
	for (int pos = sse_size; pos < size; pos += 2)
	{
		output[0][pos/2] = ((float) (input[pos] - 128)) / 128.0f;
		output[1][pos/2] = ((float) (input[pos+1] - 128)) / 128.0f;
	}
}
コード例 #11
0
ファイル: shuffle-sse2.c プロジェクト: BillTheBest/c-blosc2
/* Unshuffle routine specialised for 8-byte elements.
 *
 * src holds 8 byte planes, each total_elements bytes long; dest receives the
 * elements with their bytes contiguous: dest[e * 8 + b] = src[b *
 * total_elements + e].  Elements are processed 16 at a time for
 * e < vectorizable_elements. */
static void
unshuffle8_sse2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 8;
  /* Output vector n of each group is interleave result store_order[n]. */
  static const int store_order[8] = {0, 4, 2, 6, 1, 5, 3, 7};
  __m128i a[8], b[8];
  size_t elem;
  int n;

  for (elem = 0; elem < vectorizable_elements; elem += sizeof(__m128i)) {
    const uint8_t* const plane0 = src + elem;
    uint8_t* const out = dest + (elem * bytesoftype);

    /* One vector (16 elements) from each of the 8 byte planes. */
    for (n = 0; n < 8; n++) {
      a[n] = _mm_loadu_si128((__m128i*)(plane0 + (n * total_elements)));
    }
    /* Interleave bytes. */
    for (n = 0; n < 4; n++) {
      b[n] = _mm_unpacklo_epi8(a[n * 2], a[n * 2 + 1]);
      b[n + 4] = _mm_unpackhi_epi8(a[n * 2], a[n * 2 + 1]);
    }
    /* Interleave 2-byte words. */
    for (n = 0; n < 4; n++) {
      a[n] = _mm_unpacklo_epi16(b[n * 2], b[n * 2 + 1]);
      a[n + 4] = _mm_unpackhi_epi16(b[n * 2], b[n * 2 + 1]);
    }
    /* Interleave 4-byte dwords. */
    for (n = 0; n < 4; n++) {
      b[n] = _mm_unpacklo_epi32(a[n * 2], a[n * 2 + 1]);
      b[n + 4] = _mm_unpackhi_epi32(a[n * 2], a[n * 2 + 1]);
    }
    /* Write the 8 result vectors in element order. */
    for (n = 0; n < 8; n++) {
      _mm_storeu_si128((__m128i*)(out + (n * sizeof(__m128i))), b[store_order[n]]);
    }
  }
}
コード例 #12
0
ファイル: mmintrin64.c プロジェクト: TheRyuu/ffdshow
/* Emulates the MMX punpckhbw operation with SSE2: interleaves bytes 4..7 of
 * _MM1 with bytes 4..7 of _MM2 (the _MM1 byte first in each pair) and
 * returns the resulting 8 bytes. */
__m64 _m_punpckhbw(__m64 _MM1, __m64 _MM2)
{
    __m128i first = {0}, second = {0};
    int n;

    /* Place the upper four bytes of each operand at the start of the
       upper half of an XMM value (byte positions 8..11). */
    for (n = 0; n < 4; n++) {
        first.m128i_i8[8 + n] = _MM1.m64_i8[4 + n];
    }
    for (n = 0; n < 4; n++) {
        second.m128i_i8[8 + n] = _MM2.m64_i8[4 + n];
    }

    /* The interleaved bytes end up in the low 64 bits. */
    first = _mm_unpackhi_epi8(first, second);

    _MM1.m64_i64 = first.m128i_i64[0];
    return _MM1;
}
コード例 #13
0
// Masked SAD for blocks 8 pixels wide, two rows per iteration.
// Each predicted pixel is a * m + b * (max - m), rounded by
// xx_roundn_epu16(..., AOM_BLEND_A64_ROUND_BITS), where max is
// (1 << AOM_BLEND_A64_ROUND_BITS); the absolute differences against src are
// accumulated and the total is returned as (sum + 31) >> 6.
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));
  __m128i sad_acc = _mm_setzero_si128();
  int row = 0;

  while (row < height) {
    // Two 8-pixel rows of source and mask packed into one register each.
    const __m128i src_rows = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)(src_ptr + src_stride)));
    const __m128i mask_rows = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)m_ptr),
        _mm_loadl_epi64((const __m128i *)(m_ptr + m_stride)));
    const __m128i mask_inv = _mm_sub_epi8(mask_max, mask_rows);

    // Row 0: interleave (a, b) pixels with (m, max - m) weights.
    const __m128i ab_row0 =
        _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)a_ptr),
                          _mm_loadl_epi64((const __m128i *)b_ptr));
    const __m128i w_row0 = _mm_unpacklo_epi8(mask_rows, mask_inv);
    const __m128i pred_row0 = xx_roundn_epu16(
        _mm_maddubs_epi16(ab_row0, w_row0), AOM_BLEND_A64_ROUND_BITS);

    // Row 1: its weights are in the upper halves of the mask registers.
    const __m128i ab_row1 = _mm_unpacklo_epi8(
        _mm_loadl_epi64((const __m128i *)(a_ptr + a_stride)),
        _mm_loadl_epi64((const __m128i *)(b_ptr + b_stride)));
    const __m128i w_row1 = _mm_unpackhi_epi8(mask_rows, mask_inv);
    const __m128i pred_row1 = xx_roundn_epu16(
        _mm_maddubs_epi16(ab_row1, w_row1), AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred_rows = _mm_packus_epi16(pred_row0, pred_row1);
    sad_acc = _mm_add_epi32(sad_acc, _mm_sad_epu8(pred_rows, src_rows));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
    row += 2;
  }

  // _mm_sad_epu8 leaves one partial sum in each 64-bit half.
  const int32_t total = _mm_cvtsi128_si32(sad_acc) +
                        _mm_cvtsi128_si32(_mm_srli_si128(sad_acc, 8));
  return (total + 31) >> 6;
}
コード例 #14
0
ファイル: shuffle-sse2.c プロジェクト: BillTheBest/c-blosc2
/* Shuffle routine specialised for 16-byte elements.
 *
 * Splits the elements of src into 16 byte planes in dest, each
 * total_elements bytes long: dest[b * total_elements + e] = src[e * 16 + b].
 * Elements are processed 16 at a time for e < vectorizable_elements. */
static void
shuffle16_sse2(uint8_t* const dest, const uint8_t* const src,
               const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 16;
  __m128i a[16], b[16];
  size_t elem;
  int n, first;

  for (elem = 0; elem < vectorizable_elements; elem += sizeof(__m128i)) {
    const uint8_t* const in = src + (elem * bytesoftype);
    uint8_t* const out = dest + elem;

    /* Fetch 16 elements (256 bytes). */
    for (n = 0; n < 16; n++) {
      a[n] = _mm_loadu_si128((__m128i*)(in + (n * sizeof(__m128i))));
    }
    /* Transpose bytes: adjacent pairs (0,1), (2,3), ... */
    for (n = 0; n < 8; n++) {
      b[n * 2] = _mm_unpacklo_epi8(a[n * 2], a[n * 2 + 1]);
      b[n * 2 + 1] = _mm_unpackhi_epi8(a[n * 2], a[n * 2 + 1]);
    }
    /* Transpose words: pairs two apart, first = 0,1,4,5,8,9,12,13 */
    for (n = 0; n < 8; n++) {
      first = (n / 2) * 4 + (n % 2);
      a[n * 2] = _mm_unpacklo_epi16(b[first], b[first + 2]);
      a[n * 2 + 1] = _mm_unpackhi_epi16(b[first], b[first + 2]);
    }
    /* Transpose double words: pairs four apart, first = 0..3, 8..11 */
    for (n = 0; n < 8; n++) {
      first = (n / 4) * 8 + (n % 4);
      b[n * 2] = _mm_unpacklo_epi32(a[first], a[first + 4]);
      b[n * 2 + 1] = _mm_unpackhi_epi32(a[first], a[first + 4]);
    }
    /* Transpose quad words: pairs eight apart */
    for (n = 0; n < 8; n++) {
      a[n * 2] = _mm_unpacklo_epi64(b[n], b[n + 8]);
      a[n * 2 + 1] = _mm_unpackhi_epi64(b[n], b[n + 8]);
    }
    /* Vector n now holds byte n of the 16 elements; store it in plane n. */
    for (n = 0; n < 16; n++) {
      _mm_storeu_si128((__m128i*)(out + (n * total_elements)), a[n]);
    }
  }
}
コード例 #15
0
// Updates 16 consecutive count/accumulator entries.
//
//   count[i]       = saturating_u16(count[i] + sum[i])
//   accumulator[i] = accumulator[i] + low16(sum[i] * pred[i])
//
// where sum[0..7] comes from sum_0_u16 and sum[8..15] from sum_1_u16.  The
// product is taken as a 16-bit multiply and zero-extended to 32 bits before
// being added, so it is expected to fit in 16 bits.
//
// All widening is done with SSE2 unpack-against-zero, which gives the same
// zero extension as the SSE4.1 pmovzx forms without needing SSE4.1.
static void accumulate_and_store_16(const __m128i sum_0_u16,
                                    const __m128i sum_1_u16,
                                    const uint8_t *pred, uint16_t *count,
                                    uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadu_si128((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_0_u16 = _mm_loadu_si128((const __m128i *)count),
          count_1_u16 = _mm_loadu_si128((const __m128i *)(count + 8));
  // u8 -> u16
  __m128i pred_0_u16 = _mm_unpacklo_epi8(pred_u8, zero),
          pred_1_u16 = _mm_unpackhi_epi8(pred_u8, zero);
  __m128i pred_0_u32, pred_1_u32, pred_2_u32, pred_3_u32;
  __m128i accum_0_u32, accum_1_u32, accum_2_u32, accum_3_u32;

  // Saturating count update.
  count_0_u16 = _mm_adds_epu16(count_0_u16, sum_0_u16);
  _mm_storeu_si128((__m128i *)count, count_0_u16);

  count_1_u16 = _mm_adds_epu16(count_1_u16, sum_1_u16);
  _mm_storeu_si128((__m128i *)(count + 8), count_1_u16);

  // Weighted prediction, low 16 bits of each product.
  pred_0_u16 = _mm_mullo_epi16(sum_0_u16, pred_0_u16);
  pred_1_u16 = _mm_mullo_epi16(sum_1_u16, pred_1_u16);

  // u16 -> u32
  pred_0_u32 = _mm_unpacklo_epi16(pred_0_u16, zero);
  pred_1_u32 = _mm_unpackhi_epi16(pred_0_u16, zero);
  pred_2_u32 = _mm_unpacklo_epi16(pred_1_u16, zero);
  pred_3_u32 = _mm_unpackhi_epi16(pred_1_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));
  accum_2_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 8));
  accum_3_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 12));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);
  accum_2_u32 = _mm_add_epi32(pred_2_u32, accum_2_u32);
  accum_3_u32 = _mm_add_epi32(pred_3_u32, accum_3_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 8), accum_2_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 12), accum_3_u32);
}
コード例 #16
0
ファイル: simd.cpp プロジェクト: hjwhang/Image_Rescale
// Samples a 2x2 neighbourhood starting at pixel ((int)x, (int)y) and returns
// the weighted sum of the four pixels, using the weights from
// CalcWeights(x, y) scaled by CONST_256 and converted to 16-bit integers.
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
 const __m128i zero = _mm_setzero_si128();
 const int stride = img->width;
 const Pixel* topLeft = img->data + (int)x + (int)y * stride; // first of the four pixels

 // Each 64-bit load fetches two horizontally adjacent pixels.
 const __m128i rowTop = _mm_loadl_epi64((const __m128i*)topLeft);
 const __m128i rowBottom = _mm_loadl_epi64((const __m128i*)(topLeft + stride));

 __m128 weightF = CalcWeights(x, y);

 // Regroup RGBA RGBA RGBA RGBA into RRRR GGGG BBBB AAAA (AoS to SoA)
 const __m128i mixed = _mm_unpacklo_epi8(rowTop, rowBottom);
 const __m128i mixedHigh = _mm_unpackhi_epi64(mixed, zero);
 const __m128i planar8 = _mm_unpacklo_epi8(mixed, mixedHigh);

 // Widen every channel byte to 16 bits.
 const __m128i chanRG = _mm_unpacklo_epi8(planar8, zero);
 const __m128i chanBA = _mm_unpackhi_epi8(planar8, zero);

 // Weights as 16-bit integers: w4 w3 w2 w1, repeated in both halves.
 weightF = _mm_mul_ps(weightF, CONST_256);
 const __m128i weight32 = _mm_cvtps_epi32(weightF);
 const __m128i weight16 = _mm_packs_epi32(weight32, weight32);

 // sumRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
 const __m128i sumRG = _mm_madd_epi16(chanRG, weight16);
 // sumBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
 const __m128i sumBA = _mm_madd_epi16(chanBA, weight16);

 // Finish the per-channel sums (32-bit) and divide by 256.
 __m128i result = _mm_srli_epi32(_mm_hadd_epi32(sumRG, sumBA), 8);

 // Narrow 32 -> 16 -> 8 bits with unsigned saturation.
 result = _mm_packus_epi32(result, zero);
 result = _mm_packus_epi16(result, zero);

 return _mm_cvtsi128_si32(result);
}
コード例 #17
0
ファイル: shuffle.c プロジェクト: B-Rich/PyTables
/* Routine optimized for shuffling a buffer for a type size of 16 bytes.
 *
 * Splits the 16-byte elements of src into 16 byte planes in dest.  With
 * numof16belem = size / 256 and neblock = 16 * numof16belem elements,
 * dest[b * neblock + e] = src[e * 16 + b].  Only whole groups of 16
 * elements (256 bytes) are processed; any remainder is left untouched.
 *
 * Loads and stores are unaligned, so neither buffer needs 16-byte alignment.
 */
static void
shuffle16(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t i, k, l;
  const size_t numof16belem = size / (16*16);
  __m128i xmm0[16], xmm1[16];

  for (i = 0; i < numof16belem; i++) {
    const uint8_t* group = src + i * (16*16);

    /* Fetch elements in groups of 256 bytes */
    for (k = 0; k < 16; k++) {
      xmm0[k] = _mm_loadu_si128((const __m128i*)(group + k*16));
    }
    /* Transpose bytes */
    for (k = 0; k < 8; k++) {
      xmm1[k*2] = _mm_unpacklo_epi8(xmm0[k*2], xmm0[k*2+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi8(xmm0[k*2], xmm0[k*2+1]);
    }
    /* Transpose words: l = 0,1,4,5,8,9,12,13 paired with l+2 */
    for (k = 0; k < 8; k++) {
      l = (k/2)*4 + (k%2);
      xmm0[k*2] = _mm_unpacklo_epi16(xmm1[l], xmm1[l+2]);
      xmm0[k*2+1] = _mm_unpackhi_epi16(xmm1[l], xmm1[l+2]);
    }
    /* Transpose double words: l = 0..3, 8..11 paired with l+4 */
    for (k = 0; k < 8; k++) {
      l = (k/4)*8 + (k%4);
      xmm1[k*2] = _mm_unpacklo_epi32(xmm0[l], xmm0[l+4]);
      xmm1[k*2+1] = _mm_unpackhi_epi32(xmm0[l], xmm0[l+4]);
    }
    /* Transpose quad words */
    for (k = 0; k < 8; k++) {
      xmm0[k*2] = _mm_unpacklo_epi64(xmm1[k], xmm1[k+8]);
      xmm0[k*2+1] = _mm_unpackhi_epi64(xmm1[k], xmm1[k+8]);
    }
    /* Store the result vectors: vector k goes to byte plane k */
    for (k = 0; k < 16; k++) {
      _mm_storeu_si128((__m128i*)(dest + (k*numof16belem + i) * 16), xmm0[k]);
    }
  }
}
コード例 #18
0
ファイル: Zoom.cpp プロジェクト: Juju-Dredd/OpenXcom
/**
 *  Optimized 8 bit zoomer for resizing by a factor of 2. Doesn't flip.
 *  Used internally by _zoomSurfaceY() below.
 *  This is an SSE2 version written with Intel intrinsics.
 *
 *  Every source byte is duplicated horizontally (byte interleave of a
 *  register with itself) and each resulting row is written to two
 *  consecutive destination rows.
 *
 *  Source and destination widths must be multiples of 16 bytes for 128-bit
 *  access. The pixel data is read and written through __m128i pointers, so
 *  it must also be 16-byte aligned.
 *
 *  NOTE(review): the source pointer only ever advances by 16 bytes per step
 *  and never uses src->pitch, so this presumably relies on
 *  src->pitch == src->w -- confirm.
 *
 *  Logs a one-time message on first use. Always returns 0.
 */
static int zoomSurface2X_SSE2(SDL_Surface *src, SDL_Surface *dst)
{
	__m128i dataSrc;
	__m128i dataDst;
	Uint8 *pixelSrc = (Uint8*)src->pixels;
	Uint8 *pixelDstRow = (Uint8*)dst->pixels;
	int sx, sy;
	static bool proclaimed = false;
	
	if (!proclaimed)
	{
		proclaimed = true;
		Log(LOG_INFO) << "Using SSE2 2X zoom routine.";
	}

	// One source row produces two destination rows.
	for (sy = 0; sy < src->h; ++sy, pixelDstRow += dst->pitch*2)
	{
		__m128i *pixelDst =  (__m128i*)pixelDstRow;
		__m128i *pixelDst2 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch);

		// 16 source pixels become 32 destination pixels per row.
		for (sx = 0; sx < src->w; sx += 16, pixelSrc += 16)
		{
			dataSrc = *((__m128i*) pixelSrc);

			// Low 8 source bytes, each doubled.
			dataDst = _mm_unpacklo_epi8(dataSrc, dataSrc); 

			// WRITE_DST stores dataDst to both destination rows and advances
			// both pointers. The definition's last line ends with a line
			// continuation, so the blank line after it must stay. The macro
			// remains defined after this function.
#undef WRITE_DST
#define WRITE_DST			*(pixelDst++) = dataDst; \
			*(pixelDst2++) = dataDst; \
			
			WRITE_DST;
			
			// High 8 source bytes, each doubled.
			dataDst = _mm_unpackhi_epi8(dataSrc, dataSrc);
			
			WRITE_DST;
		}
	}
	
	return 0;
}
コード例 #19
0
ファイル: shuffle.c プロジェクト: B-Rich/PyTables
/* Routine optimized for unshuffling a buffer for a type size of 2 bytes.
 *
 * orig holds the shuffled data as 2 consecutive byte planes; dest receives
 * the elements with their 2 bytes contiguous.  With neblock = size / 2,
 * dest[e * 2 + b] = orig[b * neblock + e] when neblock is a multiple of 16.
 * Only whole groups of 16 elements (neblock / 16 of them) are processed;
 * any remainder is left untouched.
 *
 * Loads and stores are unaligned, so neither buffer needs 16-byte alignment.
 */
static void
unshuffle2(uint8_t* dest, uint8_t* orig, size_t size)
{
  const size_t neblock = size / 2;
  const size_t numof16belem = neblock / 16;
  size_t i;

  for (i = 0; i < numof16belem; i++) {
    /* 16 first bytes and the matching 16 second bytes */
    const __m128i plane0 =
        _mm_loadu_si128((const __m128i *)(orig + i * 16));
    const __m128i plane1 =
        _mm_loadu_si128((const __m128i *)(orig + (numof16belem + i) * 16));

    /* Interleave them back into 16 two-byte elements (32 bytes) */
    _mm_storeu_si128((__m128i *)(dest + (2*i) * 16),
                     _mm_unpacklo_epi8(plane0, plane1));
    _mm_storeu_si128((__m128i *)(dest + (2*i + 1) * 16),
                     _mm_unpackhi_epi8(plane0, plane1));
  }
}
コード例 #20
0
ファイル: shuffle-sse2.c プロジェクト: BillTheBest/c-blosc2
/* Unshuffle routine specialised for 2-byte elements.
 *
 * src holds 2 byte planes, each total_elements bytes long; dest receives the
 * elements with their bytes contiguous: dest[e * 2 + b] = src[b *
 * total_elements + e].  Elements are processed 16 at a time for
 * e < vectorizable_elements. */
static void
unshuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
                const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 2;
  size_t elem;

  for (elem = 0; elem < vectorizable_elements; elem += sizeof(__m128i)) {
    uint8_t* const out = dest + (elem * bytesoftype);

    /* 16 first bytes and the matching 16 second bytes. */
    const __m128i plane0 = _mm_loadu_si128((__m128i*)(src + elem));
    const __m128i plane1 = _mm_loadu_si128((__m128i*)(src + elem + total_elements));

    /* Interleave them into 16 complete elements (32 bytes). */
    const __m128i first_half = _mm_unpacklo_epi8(plane0, plane1);
    const __m128i second_half = _mm_unpackhi_epi8(plane0, plane1);

    _mm_storeu_si128((__m128i*)out, first_half);
    _mm_storeu_si128((__m128i*)(out + sizeof(__m128i)), second_half);
  }
}
コード例 #21
0
ファイル: sound_sse.cpp プロジェクト: punkkeks/ClanLib
// Converts unsigned 8-bit mono samples to floats: each byte v becomes
// (v - 128) / 128.  `size` is the number of input bytes.  Whole blocks of 16
// bytes use SSE2 (unless CL_DISABLE_SSE2 is defined); the rest is converted
// one byte at a time.
void SoundSSE::unpack_8bit_mono(unsigned char *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
	int sse_size = (size/16)*16;

	const __m128i zeros = _mm_setzero_si128();
	const __m128 scale = _mm_set1_ps(1.0f/128.0f);
	const __m128i bias = _mm_set1_epi16(128);
	for (int pos = 0; pos < sse_size; pos += 16)
	{
		// Widen to 16 bits and remove the unsigned offset.
		const __m128i raw = _mm_loadu_si128((__m128i*)(input+pos));
		const __m128i lo16 = _mm_sub_epi16(_mm_unpacklo_epi8(raw, zeros), bias);
		const __m128i hi16 = _mm_sub_epi16(_mm_unpackhi_epi8(raw, zeros), bias);

		// Place each word in the top half of a dword and shift back down
		// arithmetically to sign-extend, then convert and scale.
		const __m128 f0 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zeros, lo16), 16)), scale);
		const __m128 f1 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zeros, lo16), 16)), scale);
		const __m128 f2 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zeros, hi16), 16)), scale);
		const __m128 f3 = _mm_mul_ps(_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zeros, hi16), 16)), scale);

		float *dst = output + pos;
		_mm_storeu_ps(dst+0, f0);
		_mm_storeu_ps(dst+4, f1);
		_mm_storeu_ps(dst+8, f2);
		_mm_storeu_ps(dst+12, f3);
	}
#else
	const int sse_size = 0;
#endif
	// Scalar tail.
	for (int pos = sse_size; pos < size; pos++)
	{
		output[pos] = ((float) (input[pos] - 128)) / 128.0f;
	}
}
コード例 #22
0
ファイル: bitshuffle-sse2.c プロジェクト: ASPP/python-blosc
/* Transpose bytes within elements for 32 bit elements.
 *
 * For each full group of 16 elements (64 input bytes), byte b of the 16
 * elements is written as one 16-byte vector at out + b * size + ii.
 * Elements beyond the last full group are handed to
 * bshuf_trans_byte_elem_remainder(), whose return value is returned. */
int64_t bshuf_trans_byte_elem_SSE_32(void* in, void* out, const size_t size) {

    const char* src = (const char*) in;
    char* dst = (char*) out;
    size_t ii;

    for (ii=0; ii + 15 < size; ii += 16) {
        const char* group = src + 4*ii;
        __m128i r0 = _mm_loadu_si128((const __m128i *) (group + 0*16));
        __m128i r1 = _mm_loadu_si128((const __m128i *) (group + 1*16));
        __m128i r2 = _mm_loadu_si128((const __m128i *) (group + 2*16));
        __m128i r3 = _mm_loadu_si128((const __m128i *) (group + 3*16));
        int pass;

        /* Three identical rounds of byte interleaving on the pairs
           (r0, r1) and (r2, r3). */
        for (pass = 0; pass < 3; pass++) {
            const __m128i s0 = _mm_unpacklo_epi8(r0, r1);
            const __m128i s1 = _mm_unpackhi_epi8(r0, r1);
            const __m128i s2 = _mm_unpacklo_epi8(r2, r3);
            const __m128i s3 = _mm_unpackhi_epi8(r2, r3);
            r0 = s0;
            r1 = s1;
            r2 = s2;
            r3 = s3;
        }

        /* Join the 8-byte halves so each vector holds one byte position
           of all 16 elements. */
        _mm_storeu_si128((__m128i *) (dst + 0*size + ii), _mm_unpacklo_epi64(r0, r2));
        _mm_storeu_si128((__m128i *) (dst + 1*size + ii), _mm_unpackhi_epi64(r0, r2));
        _mm_storeu_si128((__m128i *) (dst + 2*size + ii), _mm_unpacklo_epi64(r1, r3));
        _mm_storeu_si128((__m128i *) (dst + 3*size + ii), _mm_unpackhi_epi64(r1, r3));
    }
    return bshuf_trans_byte_elem_remainder(in, out, size, 4,
            size - size % 16);
}
コード例 #23
0
ファイル: ThumbnailProvider.cpp プロジェクト: CheddarB/nbites
void ThumbnailProvider::shrink8x8SSE(const ImageBH& srcImage, ThumbnailBH::ThumbnailImage& destImage)
{
  int scaleFactor = 8;
  int averagedPixels = scaleFactor * scaleFactor;
  ASSERT(srcImage.width % scaleFactor == 0);
  ASSERT(srcImage.height % scaleFactor == 0);

  destImage.setResolution(srcImage.width / scaleFactor, srcImage.height / scaleFactor);
  int height = srcImage.height;
  int width = srcImage.width;

  static const __m128i zero = _mm_setzero_si128();
  __m128i* summs = reinterpret_cast<__m128i*>(SystemCall::alignedMalloc(16 * destImage.width, 16));
  memset(summs, 0, destImage.width * 16);

  const ImageBH::Pixel* pSrc;
  ThumbnailBH::ThumbnailImage::PixelType* pDest;
  __m128i* pSumms;

  __m128i tmp;
  __m128i lower;
  __m128i upper;

  for(int y = 0; y < height; ++y)
  {
    if(y % scaleFactor == 0)
    {
      pDest = destImage[y / scaleFactor];
    }
    pSrc = srcImage[y];
    pSumms = summs;
    for(int x = 0; x < width; x += scaleFactor, pSrc += scaleFactor, ++pSumms)
    {
      tmp = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc));
      lower = _mm_unpacklo_epi8(tmp, zero);
      upper = _mm_unpackhi_epi8(tmp, zero);
      *pSumms = _mm_add_epi16(*pSumms, lower);
      *pSumms = _mm_add_epi16(*pSumms, upper);

      tmp = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + scaleFactor / 2));
      lower = _mm_unpacklo_epi8(tmp, zero);
      upper = _mm_unpackhi_epi8(tmp, zero);
      *pSumms = _mm_add_epi16(*pSumms, lower);
      *pSumms = _mm_add_epi16(*pSumms, upper);
    }

    if(y % scaleFactor == scaleFactor - 1)
    {
      pSumms = summs;
      for(int i = 0; i < destImage.width; ++i, ++pSumms, ++pDest)
      {
        short* ptr = reinterpret_cast<short*>(pSumms);

        short sumY = ptr[offsetof(ImageBH::Pixel, y)] + ptr[offsetof(ImageBH::Pixel, y) + sizeof(ImageBH::Pixel)];
        short sumCb = ptr[offsetof(ImageBH::Pixel, cb)] + ptr[offsetof(ImageBH::Pixel, cb) + sizeof(ImageBH::Pixel)];
        short sumCr = ptr[offsetof(ImageBH::Pixel, cr)] + ptr[offsetof(ImageBH::Pixel, cr) + sizeof(ImageBH::Pixel)];

        pDest->y = static_cast<char>(sumY / averagedPixels);
        pDest->cb = static_cast<char>(sumCb / averagedPixels);
        pDest->cr = static_cast<char>(sumCr / averagedPixels);
      }
      memset(summs, 0, destImage.width * 16);
    }
  }
  SystemCall::alignedFree(summs);
}
コード例 #24
0
ファイル: bitshuffle-sse2.c プロジェクト: ASPP/python-blosc
/* Byte-transpose a buffer of 8-byte elements: byte k of every element is
 * gathered into output plane k, where consecutive planes start `size` bytes
 * apart in `out`.  Elements beyond the last full group of 16 are handed to
 * bshuf_trans_byte_elem_remainder, whose result is returned. */
int64_t bshuf_trans_byte_elem_SSE_64(void* in, void* out, const size_t size) {

    const char* src = (const char*) in;
    char* dst = (char*) out;
    const size_t n_vectorized = size - size % 16;
    size_t elem;

    /* One pass handles 16 elements: 128 input bytes in, 16 bytes out per plane. */
    for (elem = 0; elem + 15 < size; elem += 16) {
        const char* blk = src + 8*elem;

        const __m128i in0 = _mm_loadu_si128((const __m128i *) (blk + 0*16));
        const __m128i in1 = _mm_loadu_si128((const __m128i *) (blk + 1*16));
        const __m128i in2 = _mm_loadu_si128((const __m128i *) (blk + 2*16));
        const __m128i in3 = _mm_loadu_si128((const __m128i *) (blk + 3*16));
        const __m128i in4 = _mm_loadu_si128((const __m128i *) (blk + 4*16));
        const __m128i in5 = _mm_loadu_si128((const __m128i *) (blk + 5*16));
        const __m128i in6 = _mm_loadu_si128((const __m128i *) (blk + 6*16));
        const __m128i in7 = _mm_loadu_si128((const __m128i *) (blk + 7*16));

        /* Round 1: interleave bytes of neighbouring registers. */
        const __m128i p0 = _mm_unpacklo_epi8(in0, in1);
        const __m128i p1 = _mm_unpackhi_epi8(in0, in1);
        const __m128i p2 = _mm_unpacklo_epi8(in2, in3);
        const __m128i p3 = _mm_unpackhi_epi8(in2, in3);
        const __m128i p4 = _mm_unpacklo_epi8(in4, in5);
        const __m128i p5 = _mm_unpackhi_epi8(in4, in5);
        const __m128i p6 = _mm_unpacklo_epi8(in6, in7);
        const __m128i p7 = _mm_unpackhi_epi8(in6, in7);

        /* Round 2: each register now holds 4 consecutive elements' bytes
         * grouped by byte position (4 positions per register). */
        const __m128i q0 = _mm_unpacklo_epi8(p0, p1);
        const __m128i q1 = _mm_unpackhi_epi8(p0, p1);
        const __m128i q2 = _mm_unpacklo_epi8(p2, p3);
        const __m128i q3 = _mm_unpackhi_epi8(p2, p3);
        const __m128i q4 = _mm_unpacklo_epi8(p4, p5);
        const __m128i q5 = _mm_unpackhi_epi8(p4, p5);
        const __m128i q6 = _mm_unpacklo_epi8(p6, p7);
        const __m128i q7 = _mm_unpackhi_epi8(p6, p7);

        /* Round 3: merge 4-byte groups so 8 elements share a byte position. */
        const __m128i s0 = _mm_unpacklo_epi32(q0, q2);
        const __m128i s1 = _mm_unpackhi_epi32(q0, q2);
        const __m128i s2 = _mm_unpacklo_epi32(q1, q3);
        const __m128i s3 = _mm_unpackhi_epi32(q1, q3);
        const __m128i s4 = _mm_unpacklo_epi32(q4, q6);
        const __m128i s5 = _mm_unpackhi_epi32(q4, q6);
        const __m128i s6 = _mm_unpacklo_epi32(q5, q7);
        const __m128i s7 = _mm_unpackhi_epi32(q5, q7);

        /* Round 4: join elements 0-7 with 8-15 to complete each plane row. */
        const __m128i plane0 = _mm_unpacklo_epi64(s0, s4);
        const __m128i plane1 = _mm_unpackhi_epi64(s0, s4);
        const __m128i plane2 = _mm_unpacklo_epi64(s1, s5);
        const __m128i plane3 = _mm_unpackhi_epi64(s1, s5);
        const __m128i plane4 = _mm_unpacklo_epi64(s2, s6);
        const __m128i plane5 = _mm_unpackhi_epi64(s2, s6);
        const __m128i plane6 = _mm_unpacklo_epi64(s3, s7);
        const __m128i plane7 = _mm_unpackhi_epi64(s3, s7);

        _mm_storeu_si128((__m128i *) (dst + 0*size + elem), plane0);
        _mm_storeu_si128((__m128i *) (dst + 1*size + elem), plane1);
        _mm_storeu_si128((__m128i *) (dst + 2*size + elem), plane2);
        _mm_storeu_si128((__m128i *) (dst + 3*size + elem), plane3);
        _mm_storeu_si128((__m128i *) (dst + 4*size + elem), plane4);
        _mm_storeu_si128((__m128i *) (dst + 5*size + elem), plane5);
        _mm_storeu_si128((__m128i *) (dst + 6*size + elem), plane6);
        _mm_storeu_si128((__m128i *) (dst + 7*size + elem), plane7);
    }
    return bshuf_trans_byte_elem_remainder(in, out, size, 8, n_vectorized);
}
コード例 #25
0
ファイル: bitshuffle-sse2.c プロジェクト: ASPP/python-blosc
/* For data organized into a row for each bit (8 * elem_size rows), transpose
 * the bytes.
 *
 * The input is read as `8 * elem_size` rows of `size / 8` bytes each.  Rows
 * are processed in groups of 8: for every byte column jj, the 8 bytes of the
 * group are written contiguously at out[jj * nrows + ii].
 *
 * Returns size * elem_size.  CHECK_MULT_EIGHT(size) runs first; it presumably
 * returns an error code when size is not a multiple of 8 (defined elsewhere
 * -- confirm). */
int64_t bshuf_trans_byte_bitrow_sse2(void* in, void* out, const size_t size,
				     const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    size_t nrows = 8 * elem_size;
    size_t nbyte_row = size / 8;
    size_t ii, jj, kk;

    __m128i a0, b0, c0, d0, e0, f0, g0, h0;
    __m128i a1, b1, c1, d1, e1, f1, g1, h1;

    CHECK_MULT_EIGHT(size);

    for (ii = 0; ii + 7 < nrows; ii += 8) {
        /* Vector path: an 8-row by 16-column tile per iteration. */
        for (jj = 0; jj + 15 < nbyte_row; jj += 16) {
            a0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 0)*nbyte_row + jj]);
            b0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 1)*nbyte_row + jj]);
            c0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 2)*nbyte_row + jj]);
            d0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 3)*nbyte_row + jj]);
            e0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 4)*nbyte_row + jj]);
            f0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 5)*nbyte_row + jj]);
            g0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 6)*nbyte_row + jj]);
            h0 = _mm_loadu_si128((__m128i *) &in_b[(ii + 7)*nbyte_row + jj]);

            /* Pair up rows: a1..d1 hold columns 0-7, e1..h1 columns 8-15. */
            a1 = _mm_unpacklo_epi8(a0, b0);
            b1 = _mm_unpacklo_epi8(c0, d0);
            c1 = _mm_unpacklo_epi8(e0, f0);
            d1 = _mm_unpacklo_epi8(g0, h0);
            e1 = _mm_unpackhi_epi8(a0, b0);
            f1 = _mm_unpackhi_epi8(c0, d0);
            g1 = _mm_unpackhi_epi8(e0, f0);
            h1 = _mm_unpackhi_epi8(g0, h0);

            /* Combine row pairs into groups of 4 rows per column. */
            a0 = _mm_unpacklo_epi16(a1, b1);
            b0 = _mm_unpacklo_epi16(c1, d1);
            c0 = _mm_unpackhi_epi16(a1, b1);
            d0 = _mm_unpackhi_epi16(c1, d1);

            e0 = _mm_unpacklo_epi16(e1, f1);
            f0 = _mm_unpacklo_epi16(g1, h1);
            g0 = _mm_unpackhi_epi16(e1, f1);
            h0 = _mm_unpackhi_epi16(g1, h1);

            /* Combine into all 8 rows per column: each register now holds
             * two complete output columns (low half, high half). */
            a1 = _mm_unpacklo_epi32(a0, b0);
            b1 = _mm_unpackhi_epi32(a0, b0);

            c1 = _mm_unpacklo_epi32(c0, d0);
            d1 = _mm_unpackhi_epi32(c0, d0);

            e1 = _mm_unpacklo_epi32(e0, f0);
            f1 = _mm_unpackhi_epi32(e0, f0);

            g1 = _mm_unpacklo_epi32(g0, h0);
            h1 = _mm_unpackhi_epi32(g0, h0);

            /* SSE2 has no integer "store high 64 bits", so the float
             * half-stores are used.  _mm_castsi128_ps only reinterprets the
             * register type; it generates no instruction. */
            _mm_storel_pi((__m64 *) &out_b[(jj + 0) * nrows + ii], _mm_castsi128_ps(a1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 2) * nrows + ii], _mm_castsi128_ps(b1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 4) * nrows + ii], _mm_castsi128_ps(c1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 6) * nrows + ii], _mm_castsi128_ps(d1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 8) * nrows + ii], _mm_castsi128_ps(e1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 10) * nrows + ii], _mm_castsi128_ps(f1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 12) * nrows + ii], _mm_castsi128_ps(g1));
            _mm_storel_pi((__m64 *) &out_b[(jj + 14) * nrows + ii], _mm_castsi128_ps(h1));

            _mm_storeh_pi((__m64 *) &out_b[(jj + 1) * nrows + ii], _mm_castsi128_ps(a1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 3) * nrows + ii], _mm_castsi128_ps(b1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 5) * nrows + ii], _mm_castsi128_ps(c1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 7) * nrows + ii], _mm_castsi128_ps(d1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 9) * nrows + ii], _mm_castsi128_ps(e1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 11) * nrows + ii], _mm_castsi128_ps(f1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 13) * nrows + ii], _mm_castsi128_ps(g1));
            _mm_storeh_pi((__m64 *) &out_b[(jj + 15) * nrows + ii], _mm_castsi128_ps(h1));
        }
        /* Scalar path for the columns left over after the 16-wide tiles. */
        for (jj = nbyte_row - nbyte_row % 16; jj < nbyte_row; jj ++) {
            for (kk = 0; kk < 8; kk ++) {
                out_b[jj * nrows + ii + kk] = in_b[(ii + kk)*nbyte_row + jj];
            }
        }
    }
    return size * elem_size;
}
コード例 #26
0
mlib_status
mlib_VideoColorYUV2ARGB422_aligned(
	mlib_u8 *argb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 argb_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* 1.1644  * 8192 */
	const __m128i c0 = _mm_set1_epi16(0x2543);
	const mlib_s32 ic0 = 0x2543;

/* 2.0184  * 8192 */
	const __m128i c1 = _mm_set1_epi16(0x4097);
	const mlib_s32 ic1 = 0x4097;

/* abs( -0.3920 * 8192 ) */
	const __m128i c4 = _mm_set1_epi16(0xc8b);
	const mlib_s32 ic4 = 0xc8b;

/* abs( -0.8132 * 8192 ) */
	const __m128i c5 = _mm_set1_epi16(0x1a06);
	const mlib_s32 ic5 = 0x1a06;

/* 1.5966  * 8192 */
	const __m128i c8 = _mm_set1_epi16(0x3317);
	const mlib_s32 ic8 = 0x3317;

/* -276.9856 * 32 */
	const __m128i coff0 = _mm_set1_epi16(0xdd60);
	const mlib_s32 icoff0 = (mlib_s32)0xffffdd60;

/* 135.6352  * 32 */
	const __m128i coff1 = _mm_set1_epi16(0x10f4);
	const mlib_s32 icoff1 = 0x10f4;

/* -222.9952 * 32 */
	const __m128i coff2 = _mm_set1_epi16(0xe420);
	const mlib_s32 icoff2 = (mlib_s32)0xffffe420;

/* loop variable */
	mlib_s32 jH, iW;

/* pointers */
	mlib_u8 *pY, *pU, *pV, *pD, *pdd, *ptemp;
	__m128i *py, *pu, *pv;

/* variables */
	__m128i sy1, sy2, sy3, sy4, su1, su2, sv1, sv2;
	__m128i du0, du1, dv1, dv2;
	__m128i db1, db2, db3, db4, dr1, dr2, dr3, dr4, dg1, dg2, dg3, dg4;
	__m128i ddy1, ddy2, ddy3, ddy4, dzrl, dzrh, dgbl, dgbh, drgbh, drgbl;
	__m128i db_h, db_l, dg_h, dg_l, dr_h, dr_l, temp, bak;
	const __m128i x_zero = _mm_setzero_si128();
	const __m128i x_mask = _mm_set1_epi32(0xff);

/* for 4-pixel computing */
	mlib_s32 iu, iv, ig, ir, ib, iTemp;
	mlib_s32 iu0, iu1, iv1, iv2;

	pY  = (mlib_u8 *)y;
	pU  = (mlib_u8 *)u;
	pV  = (mlib_u8 *)v;
	pD = (mlib_u8 *)argb;

	for (jH = 0; jH < height; jH++) {
		py = (__m128i *)pY;
		pu = (__m128i *)pU;
		pv = (__m128i *)pV;
		pdd = pD;
		iW = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		/* 32 pixels */
		for (; iW <= width - 32; iW += 32) {
			/* load y u v, and expand */
			temp = _mm_load_si128(pu);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			su2 = _mm_unpackhi_epi8(x_zero, temp);
			pu++;
			temp = _mm_load_si128(pv);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			sv2 = _mm_unpackhi_epi8(x_zero, temp);
			pv++;
			temp = _mm_load_si128(py);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			sy2 = _mm_unpackhi_epi8(x_zero, temp);
			py++;
			temp = _mm_load_si128(py);
			sy3 = _mm_unpacklo_epi8(x_zero, temp);
			sy4 = _mm_unpackhi_epi8(x_zero, temp);
			py++;

			/* pre-calc d[r/g/b][1234] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);
			du0 = _mm_mulhi_epu16(su2, c1);
			db_h = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);
			du1 = _mm_mulhi_epu16(su2, c4);
			dv1 = _mm_mulhi_epu16(sv2, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_h = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);
			dv2 = _mm_mulhi_epu16(sv2, c8);
			dr_h = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);
			ddy2 = _mm_mulhi_epu16(sy2, c0);
			ddy3 = _mm_mulhi_epu16(sy3, c0);
			ddy4 = _mm_mulhi_epu16(sy4, c0);

			/* db1/2/3/4 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(db_l, db_l);
			db2 = _mm_add_epi16(ddy2, bak);

			bak = _mm_unpacklo_epi16(db_h, db_h);
			db3 = _mm_add_epi16(ddy3, bak);
			bak = _mm_unpackhi_epi16(db_h, db_h);
			db4 = _mm_add_epi16(ddy4, bak);

			/* dg1/2/3/4 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dg_l, dg_l);
			dg2 = _mm_add_epi16(ddy2, bak);

			bak = _mm_unpacklo_epi16(dg_h, dg_h);
			dg3 = _mm_add_epi16(ddy3, bak);
			bak = _mm_unpackhi_epi16(dg_h, dg_h);
			dg4 = _mm_add_epi16(ddy4, bak);

			/* dr1/2/3/4 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dr_l, dr_l);
			dr2 = _mm_add_epi16(ddy2, bak);

			bak = _mm_unpacklo_epi16(dr_h, dr_h);
			dr3 = _mm_add_epi16(ddy3, bak);
			bak = _mm_unpackhi_epi16(dr_h, dr_h);
			dr4 = _mm_add_epi16(ddy4, bak);

			db1 = _mm_srai_epi16(db1, 5);
			db2 = _mm_srai_epi16(db2, 5);
			db3 = _mm_srai_epi16(db3, 5);
			db4 = _mm_srai_epi16(db4, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dg2 = _mm_srai_epi16(dg2, 5);
			dg3 = _mm_srai_epi16(dg3, 5);
			dg4 = _mm_srai_epi16(dg4, 5);
			dr1 = _mm_srai_epi16(dr1, 5);
			dr2 = _mm_srai_epi16(dr2, 5);
			dr3 = _mm_srai_epi16(dr3, 5);
			dr4 = _mm_srai_epi16(dr4, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, db2);
			db2 = _mm_packus_epi16(db3, db4);
			dr1 = _mm_packus_epi16(dr1, dr2);
			dr2 = _mm_packus_epi16(dr3, dr4);
			dg1 = _mm_packus_epi16(dg1, dg2);
			dg2 = _mm_packus_epi16(dg3, dg4);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dzrh = _mm_unpackhi_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);
			dgbh = _mm_unpackhi_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			drgbl = _mm_unpacklo_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbh);

			/* create rgb sequences : db/dr/dg[2] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr2);
			dzrh = _mm_unpackhi_epi8(x_zero, dr2);
			dgbl = _mm_unpacklo_epi8(dg2, db2);
			dgbh = _mm_unpackhi_epi8(dg2, db2);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			drgbl = _mm_unpacklo_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbh);
		}

		/* 16 pixels */
		if (iW <= width - 16) {
			/* load y u v, and expand */
			temp = _mm_loadl_epi64(pu);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			pu = (__m128i *) (((__m64 *)pu) + 1);
			temp = _mm_loadl_epi64(pv);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			pv = (__m128i *) (((__m64 *)pv) + 1);
			temp = _mm_load_si128(py);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			sy2 = _mm_unpackhi_epi8(x_zero, temp);
			py++;

			/* pre-calc d[r/g/b][12] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);
			ddy2 = _mm_mulhi_epu16(sy2, c0);

			/* db1/2 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(db_l, db_l);
			db2 = _mm_add_epi16(ddy2, bak);

			/* dg1/2 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dg_l, dg_l);
			dg2 = _mm_add_epi16(ddy2, bak);

			/* dr1/2 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dr_l, dr_l);
			dr2 = _mm_add_epi16(ddy2, bak);

			db1 = _mm_srai_epi16(db1, 5);
			db2 = _mm_srai_epi16(db2, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dg2 = _mm_srai_epi16(dg2, 5);
			dr1 = _mm_srai_epi16(dr1, 5);
			dr2 = _mm_srai_epi16(dr2, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, db2);
			dr1 = _mm_packus_epi16(dr1, dr2);
			dg1 = _mm_packus_epi16(dg1, dg2);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dzrh = _mm_unpackhi_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);
			dgbh = _mm_unpackhi_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			drgbl = _mm_unpacklo_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbh);

			iW += 16;
		}

		/* 8 pixels */
		if (iW <= width - 8) {
			/* load y u v, and expand */
			iTemp = *((mlib_s32 *)pu);
			temp = _mm_cvtsi32_si128(iTemp);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			pu = (__m128i *) (((mlib_s32 *)pu) + 1);
			iTemp = *((mlib_s32 *)pv);
			temp = _mm_cvtsi32_si128(iTemp);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			pv = (__m128i *) (((mlib_s32 *)pv) + 1);
			temp = _mm_loadl_epi64(py);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			py = (__m128i *) (((__m64 *)py) + 1);

			/* pre-calc d[r/g/b][1] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);

			/* db1 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);

			/* dg1 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);

			/* dr1 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);

			db1 = _mm_srai_epi16(db1, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dr1 = _mm_srai_epi16(dr1, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, x_zero);
			dr1 = _mm_packus_epi16(dr1, x_zero);
			dg1 = _mm_packus_epi16(dg1, x_zero);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			iW += 8;
		}

		/* 4 pixels */
		if (iW <= width - 4) {
			/* load y u v, and expand */
			iTemp = *((mlib_s16 *)pu);
			temp = _mm_cvtsi32_si128(iTemp);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			pu = (__m128i *) (((mlib_s16 *)pu) + 1);
			iTemp = *((mlib_s16 *)pv);
			temp = _mm_cvtsi32_si128(iTemp);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			pv = (__m128i *) (((mlib_s16 *)pv) + 1);
			iTemp = *((mlib_s32 *)py);
			temp = _mm_cvtsi32_si128(iTemp);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			py = (__m128i *) (((mlib_s32 *)py) + 1);

			/* pre-calc d[r/g/b][1] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);

			/* db1 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);

			/* dg1 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);

			/* dr1 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);

			db1 = _mm_srai_epi16(db1, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dr1 = _mm_srai_epi16(dr1, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, x_zero);
			dr1 = _mm_packus_epi16(dr1, x_zero);
			dg1 = _mm_packus_epi16(dg1, x_zero);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			iW += 4;
		}

		/* 2 pixels */
		if (iW <= width - 2) {
			/* load y u v, and expand */
			iu = *((mlib_u8 *)pu);
			pu = (__m128i *) (((mlib_u8 *)pu) + 1);
			iv = *((mlib_u8 *)pv);
			pv = (__m128i *) (((mlib_u8 *)pv) + 1);
			iTemp = *((mlib_s16 *)py);
			temp = _mm_cvtsi32_si128(iTemp);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			py = (__m128i *) (((mlib_s16 *)py) + 1);

			/* pre-calc d[r/g/b][1] */
			iu0 = (iu * ic1) >> 8;
			ib = icoff0 + iu0;

			iu1 = (iu * ic4) >> 8;
			iv1 = (iv * ic5) >> 8;
			iTemp = iu1 + iv1;
			ig = icoff1 - iTemp;

			iv2 = (iv * ic8) >> 8;
			ir = iv2 + icoff2;

			ddy1 = _mm_mulhi_epu16(sy1, c0);

			/* db1 */
			temp = _mm_set1_epi16(ib);
			db1 = _mm_add_epi16(ddy1, temp);

			/* dg1 */
			temp = _mm_set1_epi16(ig);
			dg1 = _mm_add_epi16(ddy1, temp);

			/* dr1 */
			temp = _mm_set1_epi16(ir);
			dr1 = _mm_add_epi16(ddy1, temp);

			db1 = _mm_srai_epi16(db1, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dr1 = _mm_srai_epi16(dr1, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, x_zero);
			dr1 = _mm_packus_epi16(dr1, x_zero);
			dg1 = _mm_packus_epi16(dg1, x_zero);

			/* create rgb sequences : db/dr/dg */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);

			/* lower half of drgl & dbzl */
			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			ptemp = (mlib_u8*)(&drgbl);
			pdd += 1;
			ptemp += 1;
			*((mlib_s16*)pdd) = *((mlib_s16*)ptemp);
			pdd += 2;
			ptemp += 2;
			*((mlib_u8*)pdd) = *((mlib_u8*)ptemp);
			pdd += 2;
			ptemp += 2;
			*((mlib_s16*)pdd) = *((mlib_s16*)ptemp);
			pdd += 2;
			ptemp += 2;
			*((mlib_u8*)pdd) = *((mlib_u8*)ptemp);
			pdd += 1;

			iW += 2;
		}

		pY += y_stride;
		pU += uv_stride;
		pV += uv_stride;
		pD += argb_stride;
	}
コード例 #27
0
/**
 * Per-byte alpha blend: dest = (alpha * src1 + (255 - alpha) * src2) >> 8.
 *
 * dest is created as CV_8U with src1's size when it is empty. The first
 * area/16 blocks are processed 16 bytes at a time with aligned SSE2
 * loads/stores, the remaining bytes with scalar code.
 * NOTE(review): the aligned intrinsics require the data pointers to be
 * 16-byte aligned, and the flat indexing presumably expects continuous
 * single-channel 8-bit matrices of equal size - confirm against callers.
 */
void alphaBlendSSE_8u(Mat& src1, Mat& src2, Mat& alpha, Mat& dest)
{
	if(dest.empty())dest.create(src1.size(),CV_8U);

	const int blockCount = (src1.size().area()/16);
	uchar* const fg = src1.data;
	uchar* const bg = src2.data;
	uchar* const weight = alpha.data;
	uchar* const out = dest.data;

	const __m128i zero = _mm_setzero_si128();
	const __m128i fullWeight = _mm_set1_epi8(char(255));

	// When blending in place the result is written back to src1 with a
	// non-temporal store; otherwise a regular aligned store into dest is used.
	const bool inPlace = (fg==out);

	int i=0;
	for(;i<blockCount;++i)
	{
		const int offset = i*16;

		const __m128i fgBytes = _mm_load_si128((__m128i*)(fg+offset));
		const __m128i bgBytes = _mm_load_si128((__m128i*)(bg+offset));
		const __m128i wBytes = _mm_load_si128((__m128i*)(weight+offset));
		const __m128i invBytes = _mm_sub_epi8(fullWeight,wBytes);

		// Widen every operand from 8 to 16 bit (low and high 8 bytes).
		const __m128i fgLo = _mm_unpacklo_epi8(fgBytes, zero);
		const __m128i fgHi = _mm_unpackhi_epi8(fgBytes, zero);
		const __m128i bgLo = _mm_unpacklo_epi8(bgBytes, zero);
		const __m128i bgHi = _mm_unpackhi_epi8(bgBytes, zero);
		const __m128i wLo = _mm_unpacklo_epi8(wBytes, zero);
		const __m128i wHi = _mm_unpackhi_epi8(wBytes, zero);
		const __m128i invLo = _mm_unpacklo_epi8(invBytes, zero);
		const __m128i invHi = _mm_unpackhi_epi8(invBytes, zero);

		// The weighted sum can exceed 32767, i.e. look negative as a signed
		// 16-bit value. The arithmetic shift then yields 0xFFxx, which the
		// signed-saturating pack maps to byte xx - the same byte a logical
		// shift followed by an unsigned pack would produce.
		__m128i sumLo = _mm_add_epi16(_mm_mullo_epi16(fgLo,wLo), _mm_mullo_epi16(bgLo,invLo));
		sumLo = _mm_srai_epi16(sumLo,8);

		__m128i sumHi = _mm_add_epi16(_mm_mullo_epi16(fgHi,wHi), _mm_mullo_epi16(bgHi,invHi));
		sumHi = _mm_srai_epi16(sumHi,8);

		const __m128i blended = _mm_packs_epi16(sumLo,sumHi);
		if(inPlace)
		{
			_mm_stream_si128((__m128i*)(fg+offset),blended);
		}
		else
		{
			_mm_store_si128((__m128i*)(out+offset),blended);
		}
	}

	// Scalar tail for the bytes that do not fill a whole 16-byte block.
	for(int n=i*16;n<src1.size().area();n++)
	{
		out[n] = (weight[n]*fg[n] + (255-weight[n])*bg[n])>>8;
	}
}
コード例 #28
0
ファイル: pixops_sse2.cpp プロジェクト: ewmailing/simdtests
/**
 * Cross-fades `src` into `dst` in place, row by row, on 32-bit pixels.
 *
 * Every pixel is widened to 16-bit lanes and combined as
 * (dst * ia + src * a) >> 8, where `a` is built from `alpha` and `ia` from
 * `256 - alpha`.
 * NOTE(review): expand16() is defined elsewhere; it presumably replicates
 * the weight into both 16-bit halves of a 32-bit value - confirm.
 *
 * @param dst        destination pixels, read and written
 * @param dstStride  distance between destination rows in bytes
 * @param src        source pixels, read only (may be unaligned)
 * @param srcStride  distance between source rows in bytes
 * @param w          pixels per row
 * @param h          number of rows
 * @param alpha      weight of the source; the destination gets 256 - alpha
 */
void pixops_crossfade_sse2(void* dst, intptr_t dstStride, const void* src, intptr_t srcStride, uint32_t w, uint32_t h, uint32_t alpha) {
  uint8_t* pDstRow = static_cast<uint8_t*>(dst);
  const uint8_t* pSrcRow = static_cast<const uint8_t*>(src);

  __m128i a  = _mm_shuffle_epi32(_mm_cvtsi32_si128(expand16(alpha      )), _MM_SHUFFLE(0, 0, 0, 0));
  __m128i ia = _mm_shuffle_epi32(_mm_cvtsi32_si128(expand16(256 - alpha)), _MM_SHUFFLE(0, 0, 0, 0));

  for (uint32_t y = h; y > 0; y--, pDstRow += dstStride, pSrcRow += srcStride) {
    uint32_t* pDst = reinterpret_cast<uint32_t*>(pDstRow);
    const uint32_t* pSrc = reinterpret_cast<const uint32_t*>(pSrcRow);

    uint32_t x = w;
    for (;;) {
      // Scalar path: one pixel at a time while fewer than 4 pixels remain or
      // until the CURRENT destination pointer is 16-byte aligned. The check
      // must use pDst (not the base `dst`), and the loop must stop at x == 0;
      // otherwise x wraps around and the buffers are overrun.
      while (x != 0 && (x < 4 || !SimdUtils::isAligned(pDst, 16))) {
        __m128i d = _mm_cvtsi32_si128(*pDst);
        __m128i s = _mm_cvtsi32_si128(*pSrc);

        d = _mm_unpacklo_epi8(d, _mm_setzero_si128());
        s = _mm_unpacklo_epi8(s, _mm_setzero_si128());

        d = _mm_mullo_epi16(d, ia);
        s = _mm_mullo_epi16(s, a);

        d = _mm_add_epi16(d, s);
        d = _mm_srli_epi16(d, 8);
        d = _mm_packus_epi16(d, d);
        *pDst = _mm_cvtsi128_si32(d);

        pDst++;
        pSrc++;
        x--;
      }

      if (x == 0)
        break;

      // From here on pDst is 16-byte aligned and at least 4 pixels remain.
      // 8 pixels per iteration.
      while (x >= 8) {
        __m128i d0 = _mm_load_si128(reinterpret_cast<__m128i*>(pDst + 0));
        __m128i d2 = _mm_load_si128(reinterpret_cast<__m128i*>(pDst + 4));
        __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + 0));
        __m128i s2 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc + 4));

        __m128i d1 = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
        __m128i d3 = _mm_unpackhi_epi8(d2, _mm_setzero_si128());
        __m128i s1 = _mm_unpackhi_epi8(s0, _mm_setzero_si128());
        __m128i s3 = _mm_unpackhi_epi8(s2, _mm_setzero_si128());

        d0 = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
        d2 = _mm_unpacklo_epi8(d2, _mm_setzero_si128());
        s0 = _mm_unpacklo_epi8(s0, _mm_setzero_si128());
        s2 = _mm_unpacklo_epi8(s2, _mm_setzero_si128());

        d0 = _mm_mullo_epi16(d0, ia);
        d1 = _mm_mullo_epi16(d1, ia);
        d2 = _mm_mullo_epi16(d2, ia);
        d3 = _mm_mullo_epi16(d3, ia);

        s0 = _mm_mullo_epi16(s0, a);
        s1 = _mm_mullo_epi16(s1, a);
        s2 = _mm_mullo_epi16(s2, a);
        s3 = _mm_mullo_epi16(s3, a);

        d0 = _mm_add_epi16(d0, s0);
        d1 = _mm_add_epi16(d1, s1);
        d2 = _mm_add_epi16(d2, s2);
        d3 = _mm_add_epi16(d3, s3);

        d0 = _mm_srli_epi16(d0, 8);
        d1 = _mm_srli_epi16(d1, 8);
        d2 = _mm_srli_epi16(d2, 8);
        d3 = _mm_srli_epi16(d3, 8);

        d0 = _mm_packus_epi16(d0, d1);
        d2 = _mm_packus_epi16(d2, d3);
        _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 0), d0);
        _mm_store_si128(reinterpret_cast<__m128i*>(pDst + 4), d2);

        pDst += 8;
        pSrc += 8;
        x -= 8;
      }

      // 4 pixels per iteration.
      while (x >= 4) {
        __m128i d0 = _mm_load_si128(reinterpret_cast<__m128i*>(pDst));
        __m128i s0 = _mm_loadu_si128(reinterpret_cast<const __m128i*>(pSrc));

        __m128i d1 = _mm_unpackhi_epi8(d0, _mm_setzero_si128());
        __m128i s1 = _mm_unpackhi_epi8(s0, _mm_setzero_si128());

        d0 = _mm_unpacklo_epi8(d0, _mm_setzero_si128());
        s0 = _mm_unpacklo_epi8(s0, _mm_setzero_si128());

        d0 = _mm_mullo_epi16(d0, ia);
        d1 = _mm_mullo_epi16(d1, ia);
        s0 = _mm_mullo_epi16(s0, a);
        s1 = _mm_mullo_epi16(s1, a);

        d0 = _mm_add_epi16(d0, s0);
        d1 = _mm_add_epi16(d1, s1);

        d0 = _mm_srli_epi16(d0, 8);
        d1 = _mm_srli_epi16(d1, 8);

        d0 = _mm_packus_epi16(d0, d1);
        _mm_store_si128(reinterpret_cast<__m128i*>(pDst), d0);

        pDst += 4;
        pSrc += 4;
        x -= 4;
      }

      // 1-3 pixels may remain; the next pass handles them in the scalar path.
      if (x == 0)
        break;
    }
  }
}
コード例 #29
0
ファイル: enc_sse2.c プロジェクト: 0309/cocos2d-x
// Hadamard transform
// Runs a 4x4 Hadamard-style transform on the two pixel blocks inA and inB
// (both transforms are carried side by side in the same registers) and
// returns the difference between the two weighted sums of the scaled absolute
// values of the transformed coefficients (sum for inA minus sum for inB).
//   inA, inB: top-left pixel of each 4x4 block; consecutive rows are BPS bytes
//             apart. Rows are fetched with 8-byte loads, so 8 bytes must be
//             readable on each of the 4 rows although only 4 are used.
//   w:        16 weights, read as two vectors w[0..7] and w[8..15] and
//             multiplied as signed 16-bit values by _mm_madd_epi16.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  // Receives the four 32-bit partial sums before the final scalar reduction.
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((__m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((__m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((__m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((__m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((__m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((__m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((__m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((__m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // ('x' = bytes coming from columns 4..7 of the 8-byte loads; unused.)
    // a00 b00 a01 b01 a02 b02 a03 b03   x x x x x x x x
    // a10 b10 a11 b11 a12 b12 a13 b13   x x x x x x x x
    // a20 b20 a21 b21 a22 b22 a23 b23   x x x x x x x x
    // a30 b30 a31 b31 a32 b32 a33 b33   x x x x x x x x

    // Transpose the two 4x4, discarding the unused upper halves (only the low
    // 8 bytes of each operand take part in the unpacklo below).
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b (zero-extend each byte).
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once). The 'a' terms are scaled by 4.
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    // cmpeq yields all-ones where a0 == 0; andnot with 'one' therefore leaves
    // 1 in every lane where a0 is non-zero and 0 elsewhere.
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((__m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((__m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB: the low 64 bits of each b
    // vector belong to inA, the high 64 bits to inB. Each result packs two
    // b rows (b0|b1 and b2|b3) so they line up with w[0..7] and w[8..15].
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums: madd multiplies 16-bit lanes and adds adjacent pairs,
    // giving four 32-bit partial sums per vector.
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  // Horizontal reduction of the four 32-bit lanes.
  return sum[0] + sum[1] + sum[2] + sum[3];
}
コード例 #30
0
ファイル: Zoom.cpp プロジェクト: Juju-Dredd/OpenXcom
/**
 *  Optimized 8 bit zoomer for resizing by a factor of 4. Doesn't flip.
 *  32-bit version.
 *  Used internally by _zoomSurfaceY() below.
 *  source and dest. widths must be multiples of 4 bytes for 32-bit access
 *
 *  Every source byte is replicated into a 4x4 block of destination bytes.
 *  Source rows are stepped by src->pitch and destination rows by dst->pitch,
 *  so surfaces whose rows carry padding are read correctly.
 *
 *  @param src source surface, read as one byte per pixel.
 *  @param dst destination surface; receives 4*src->w bytes on each of
 *             4*src->h rows.
 *  @return always 0.
 */
static int zoomSurface4X_32bit(SDL_Surface *src, SDL_Surface *dst)
{
	Uint8 *pixelSrcRow = (Uint8*)src->pixels;
	Uint8 *pixelDstRow = (Uint8*)dst->pixels;
	static bool proclaimed = false;

	// Announce the selected routine once per process.
	if (!proclaimed)
	{
		proclaimed = true;
		Log(LOG_INFO) << "Using 32-bit 4X zoom routine.";
	}

	for (int sy = 0; sy < src->h; ++sy, pixelSrcRow += src->pitch, pixelDstRow += dst->pitch*4)
	{
		// Restart at the beginning of the source row so that row padding
		// (src->pitch > src->w) is skipped instead of being read as pixels.
		const Uint8 *pixelSrc = pixelSrcRow;

		// One write cursor for each of the four destination rows.
		Uint32 *pixelDst = (Uint32*)pixelDstRow;
		Uint32 *pixelDst2 = (Uint32*)(pixelDstRow + dst->pitch);
		Uint32 *pixelDst3 = (Uint32*)(pixelDstRow + 2*dst->pitch);
		Uint32 *pixelDst4 = (Uint32*)(pixelDstRow + 3*dst->pitch);

		for (int sx = 0; sx < src->w; sx += 4, pixelSrc += 4)
		{
			// Fetch four source pixels at once.
			Uint32 dataSrc = *((const Uint32*) pixelSrc);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
			// boo
			// Bring the first pixel in memory into the low byte.
			dataSrc = SDL_Swap32(dataSrc);
#endif

			for (int i = 0; i < 4; ++i)
			{
				// Replicate the low byte into all four bytes of the word.
				Uint32 dataDst = (dataSrc & 0xFF) | ((dataSrc & 0xFF) << 8) |
					((dataSrc & 0xFF) << 16) | ((dataSrc & 0xFF) << 24);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				dataDst = SDL_Swap32(dataDst);
#endif

				*pixelDst++ = dataDst; // forward 4 bytes!
				*pixelDst2++ = dataDst;
				*pixelDst3++ = dataDst;
				*pixelDst4++ = dataDst;

				// Move on to the next source pixel.
				dataSrc >>= 8;
			}
		}
	}

	return 0;
}
#endif

/**
 *  Optimized 8 bit zoomer for resizing by a factor of 2 along the X axis.
 *  Doesn't flip.
 *  32-bit version.
 *  Used internally by _zoomSurfaceY() below.
 *  source and dest. widths must be multiples of 4 bytes for 32-bit access
 *
 *  The Y axis is scaled by dst->h / src->h using a per-destination-row step
 *  table (say[]): a non-zero entry is the byte offset to advance in the
 *  source after that destination row; rows whose entry is zero are filled
 *  with a copy of the closest preceding row that has a non-zero entry.
 *  NOTE(review): destination rows that precede the first non-zero entry are
 *  never written.
 *
 *  The step table is kept in a function-local static buffer that is reused
 *  between calls.
 *
 *  @param src source surface, read as one byte per pixel.
 *  @param dst destination surface.
 *  @return 0 on success, -1 if the step table cannot be allocated.
 */
static int zoomSurface2X_XAxis_32bit(SDL_Surface *src, SDL_Surface *dst)
{
	Uint8 *pixelDstRow = (Uint8*)dst->pixels;
	Uint8 *pixelSrcRow = (Uint8*)src->pixels;
	static bool proclaimed = false;
	static Uint32 *say = 0;

	// Announce the selected routine once per process.
	if (!proclaimed)
	{
		proclaimed = true;
		Log(LOG_INFO) << "Using mediocre scaling routine due to screen height.";
	}

	// A negative height would make the allocation size meaningless.
	if (dst->h < 0) {
		return (-1);
	}

	// Resize through a temporary so the previous table is not leaked (and
	// 'say' stays valid) when realloc fails.
	Uint32 *newSay = (Uint32 *) realloc(say, (dst->h + 1) * sizeof(Uint32));
	if (newSay == NULL) {
		return (-1);
	}
	say = newSay;

	// Build the step table: for each destination row, how many source rows
	// to advance, expressed as a byte offset.
	int csy = 0;
	for (int y = 0; y < dst->h; y++) {
		Uint32 rows = 0;
		csy += src->h;
		while (csy >= dst->h) {
			csy -= dst->h;
			rows++;
		}
		say[y] = rows * src->pitch;
	}
	// The fill loop below peeks one entry past the last row; give that slot
	// a defined, non-zero value so the peek terminates the loop.
	say[dst->h] = 1;

	for (int dsty = 0; dsty < dst->h; ++dsty, pixelDstRow += dst->pitch)
	{
		// Rows without a source step are written by an earlier row's fill loop.
		if (!say[dsty]) continue;

		Uint32 *pixelDst = (Uint32*)pixelDstRow;
		const Uint8 *pixelSrc = pixelSrcRow;
		pixelSrcRow += say[dsty];

		for (int sx = 0; sx < src->w; sx += 4, pixelSrc += 4)
		{
			// Fetch four source pixels at once.
			Uint32 dataSrc = *((const Uint32*) pixelSrc);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
			// boo
			// Bring the first pixel in memory into the low byte.
			dataSrc = SDL_Swap32(dataSrc);
#endif

			for (int i = 0; i < 2; ++i)
			{
				// Double the two low pixels p0, p1 into p0 p0 p1 p1.
				Uint32 dataDst = (dataSrc & 0xFF) | ((dataSrc & 0xFFFF) << 8) |
					((dataSrc & 0xFF00) << 16);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				dataDst = SDL_Swap32(dataDst);
#endif

				int j = 0;
				do
				{
					if (dsty + j >= dst->h) break;

					*(pixelDst + (dst->pitch/sizeof(Uint32))*j) = dataDst;
				} while(say[dsty + ++j] == 0); // fill in all relevant rows

				dataSrc >>= 16;
				pixelDst++; // forward 4 bytes!
			}
		}
	}

	return 0;
}


/**
 *  Optimized 8 bit zoomer for resizing by a factor of 4 along the X axis.
 *  Doesn't flip.
 *  32-bit version.
 *  Used internally by _zoomSurfaceY() below.
 *  source and dest. widths must be multiples of 4 bytes for 32-bit access
 *
 *  The Y axis is scaled by dst->h / src->h using a per-destination-row step
 *  table (say[]): a non-zero entry is the byte offset to advance in the
 *  source after that destination row; rows whose entry is zero are filled
 *  with a copy of the closest preceding row that has a non-zero entry.
 *  NOTE(review): destination rows that precede the first non-zero entry are
 *  never written.
 *
 *  The step table is kept in a function-local static buffer that is reused
 *  between calls.
 *
 *  @param src source surface, read as one byte per pixel.
 *  @param dst destination surface.
 *  @return 0 on success, -1 if the step table cannot be allocated.
 */
static int zoomSurface4X_XAxis_32bit(SDL_Surface *src, SDL_Surface *dst)
{
	Uint8 *pixelDstRow = (Uint8*)dst->pixels;
	Uint8 *pixelSrcRow = (Uint8*)src->pixels;
	static bool proclaimed = false;
	static Uint32 *say = 0;

	// Announce the selected routine once per process.
	if (!proclaimed)
	{
		proclaimed = true;
		Log(LOG_INFO) << "Using mediocre scaling routine due to screen height.";
	}

	// A negative height would make the allocation size meaningless.
	if (dst->h < 0) {
		return (-1);
	}

	// Resize through a temporary so the previous table is not leaked (and
	// 'say' stays valid) when realloc fails.
	Uint32 *newSay = (Uint32 *) realloc(say, (dst->h + 1) * sizeof(Uint32));
	if (newSay == NULL) {
		return (-1);
	}
	say = newSay;

	// Build the step table: for each destination row, how many source rows
	// to advance, expressed as a byte offset.
	int csy = 0;
	for (int y = 0; y < dst->h; y++) {
		Uint32 rows = 0;
		csy += src->h;
		while (csy >= dst->h) {
			csy -= dst->h;
			rows++;
		}
		say[y] = rows * src->pitch;
	}
	// The fill loop below peeks one entry past the last row; give that slot
	// a defined, non-zero value so the peek terminates the loop.
	say[dst->h] = 1;

	for (int dsty = 0; dsty < dst->h; ++dsty, pixelDstRow += dst->pitch)
	{
		// Rows without a source step are written by an earlier row's fill loop.
		if (!say[dsty]) continue;

		Uint32 *pixelDst = (Uint32*)pixelDstRow;
		const Uint8 *pixelSrc = pixelSrcRow;
		pixelSrcRow += say[dsty];

		for (int sx = 0; sx < src->w; sx += 4, pixelSrc += 4)
		{
			// Fetch four source pixels at once.
			Uint32 dataSrc = *((const Uint32*) pixelSrc);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
			// boo
			// Bring the first pixel in memory into the low byte.
			dataSrc = SDL_Swap32(dataSrc);
#endif

			for (int i = 0; i < 4; ++i)
			{
				// Replicate the low byte into all four bytes of the word.
				Uint32 dataDst = (dataSrc & 0xFF) | ((dataSrc & 0xFF) << 8) |
					((dataSrc & 0xFF) << 16) | ((dataSrc & 0xFF) << 24);
#if SDL_BYTEORDER == SDL_BIG_ENDIAN
				dataDst = SDL_Swap32(dataDst);
#endif

				int j = 0;
				do
				{
					if (dsty + j >= dst->h) break;

					*(pixelDst + (dst->pitch/sizeof(Uint32))*j) = dataDst;
				} while(say[dsty + ++j] == 0); // fill in all relevant rows

				dataSrc >>= 8;
				pixelDst++; // forward 4 bytes!
			}
		}
	}

	return 0;
}

#ifdef __SSE2__
/**
 *  Optimized 8 bit zoomer for resizing by a factor of 4. Doesn't flip.
 *  Used internally by _zoomSurfaceY() below.
 *  This is an SSE2 version written with Intel intrinsics.
 *  source and dest. widths must be multiples of 16 bytes for 128-bit access
 *
 *  Every 16 source bytes become 64 bytes on each of four destination rows.
 *  Source rows are stepped by src->pitch and read with unaligned loads, so
 *  the source needs no particular alignment. Destination rows are written
 *  through __m128i pointers, which require dst->pixels and dst->pitch to be
 *  16-byte aligned.
 *
 *  @param src source surface, read as one byte per pixel.
 *  @param dst destination surface; receives 4*src->w bytes on each of
 *             4*src->h rows.
 *  @return always 0.
 */
static int zoomSurface4X_SSE2(SDL_Surface *src, SDL_Surface *dst)
{
	Uint8 *pixelSrcRow = (Uint8*)src->pixels;
	Uint8 *pixelDstRow = (Uint8*)dst->pixels;
	static bool proclaimed = false;

	// Announce the selected routine once per process.
	if (!proclaimed)
	{
		proclaimed = true;
		Log(LOG_INFO) << "Using SSE2 4X zoom routine.";
	}

/* Stores dataDst to the four destination rows and advances each cursor. */
#define WRITE_DST			*(pixelDst++) = dataDst; \
			*(pixelDst2++) = dataDst; \
			*(pixelDst3++) = dataDst; \
			*(pixelDst4++) = dataDst;

	for (int sy = 0; sy < src->h; ++sy, pixelSrcRow += src->pitch, pixelDstRow += dst->pitch*4)
	{
		// Restart at the beginning of the source row so that row padding
		// (src->pitch > src->w) is skipped instead of being read as pixels.
		const Uint8 *pixelSrc = pixelSrcRow;

		// One write cursor for each of the four destination rows.
		__m128i *pixelDst =  (__m128i*)pixelDstRow;
		__m128i *pixelDst2 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch);
		__m128i *pixelDst3 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch*2);
		__m128i *pixelDst4 = (__m128i*)((Uint8*)pixelDstRow + dst->pitch*3);

		for (int sx = 0; sx < src->w; sx += 16, pixelSrc += 16)
		{
			// Unaligned load: row starts are only as aligned as src->pitch.
			const __m128i dataSrc = _mm_loadu_si128((const __m128i*) pixelSrc);
			__m128i dataDst;

			// Interleaving a vector with itself doubles every byte; doing it
			// twice quadruples it. Low half of the source first...
			__m128i halfDone = _mm_unpacklo_epi8(dataSrc, dataSrc);

			dataDst = _mm_unpacklo_epi8(halfDone, halfDone); // source bytes 0..3
			WRITE_DST;

			dataDst = _mm_unpackhi_epi8(halfDone, halfDone); // source bytes 4..7
			WRITE_DST;

			// ...then the high half.
			halfDone = _mm_unpackhi_epi8(dataSrc, dataSrc);

			dataDst = _mm_unpacklo_epi8(halfDone, halfDone); // source bytes 8..11
			WRITE_DST;

			dataDst = _mm_unpackhi_epi8(halfDone, halfDone); // source bytes 12..15
			WRITE_DST;
		}
	}

	return 0;
}