C++ (Cpp) _mm_shufflehi_epi16の例

コード例 #1

0

ファイルを表示

ファイル: idea_sse2.cpp プロジェクト: louiz/botan

void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   __m128i T0 = _mm_unpacklo_epi64(B0, B1);
   __m128i T1 = _mm_unpacklo_epi64(B2, B3);
   __m128i T2 = _mm_unpackhi_epi64(B0, B1);
   __m128i T3 = _mm_unpackhi_epi64(B2, B3);

   T0 = _mm_shuffle_epi32(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shuffle_epi32(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shuffle_epi32(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shuffle_epi32(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflehi_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflehi_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflehi_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflehi_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   T0 = _mm_shufflelo_epi16(T0, _MM_SHUFFLE(3, 1, 2, 0));
   T1 = _mm_shufflelo_epi16(T1, _MM_SHUFFLE(3, 1, 2, 0));
   T2 = _mm_shufflelo_epi16(T2, _MM_SHUFFLE(3, 1, 2, 0));
   T3 = _mm_shufflelo_epi16(T3, _MM_SHUFFLE(3, 1, 2, 0));

   B0 = _mm_unpacklo_epi32(T0, T1);
   B1 = _mm_unpackhi_epi32(T0, T1);
   B2 = _mm_unpacklo_epi32(T2, T3);
   B3 = _mm_unpackhi_epi32(T2, T3);
   }

コード例 #2

0

ファイルを表示

ファイル: lossless_sse2.c プロジェクト: bovender/XLToolbox

static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i A1 = _mm_loadu_si128(in++);
    const __m128i A2 = _mm_loadu_si128(in++);
    const __m128i B1 = _mm_and_si128(A1, red_blue_mask);     // R 0 B 0
    const __m128i B2 = _mm_and_si128(A2, red_blue_mask);     // R 0 B 0
    const __m128i C1 = _mm_andnot_si128(red_blue_mask, A1);  // 0 G 0 A
    const __m128i C2 = _mm_andnot_si128(red_blue_mask, A2);  // 0 G 0 A
    const __m128i D1 = _mm_shufflelo_epi16(B1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i D2 = _mm_shufflelo_epi16(B2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E1 = _mm_shufflehi_epi16(D1, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i E2 = _mm_shufflehi_epi16(D2, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i F1 = _mm_or_si128(E1, C1);
    const __m128i F2 = _mm_or_si128(E2, C2);
    _mm_storeu_si128(out++, F1);
    _mm_storeu_si128(out++, F2);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}

コード例 #3

0

ファイルを表示

ファイル: sora_ext_lib.cpp プロジェクト: mainland/Ziria

//
// multiplies two complex vectors and returns the real and imaginary parts
// as two 32 bit integers.
//
FORCE_INLINE
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
        struct complex16* x, int len1, struct complex16* y, int len2 )
{
  const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Res = (__m128i*) re;
  __m128i* Ims = (__m128i*) im;
  for (int i = 0; i < len1 / wlen; i++){
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);

    __m128i ms2 = _mm_xor_si128(my, xmm5);
    ms2 = _mm_add_epi32(ms2, xmm4);

    ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    _mm_storeu_si128(&Res[i], _mm_madd_epi16(my, mx));
    _mm_storeu_si128(&Ims[i], _mm_madd_epi16(ms2, mx));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++){
    re[i] = x[i].re * y[i].re + x[i].im * y[i].im ;
    im[i] = x[i].im * y[i].re - x[i].re * y[i].im ;
  }

  return 0;
}

コード例 #4

0

ファイルを表示

ファイル: highbd_intrapred_intrin_ssse3.c プロジェクト: wolfviking0/webcl-webkit

void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}

コード例 #5

0

ファイルを表示

ファイル: highbd_intrapred_intrin_ssse3.c プロジェクト: wolfviking0/webcl-webkit

void vpx_highbd_d45_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  (void)left;
  (void)bd;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  dst += stride;
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
  d45_store_16(&dst, stride, &avg3_0, &avg3_1, &AR);
}

コード例 #6

0

ファイルを表示

ファイル: lossless_enc_sse2.c プロジェクト: 03050903/godot

static void TransformColor(const VP8LMultipliers* const m,
                           uint32_t* argb_data, int num_pixels) {
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b   0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0   0
    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);              // x dr  x  db
    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0  db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}

コード例 #7

0

ファイルを表示

ファイル: PCMBlitterLibX86.cpp プロジェクト: fruitsamples/HAL

static inline __m128i  byteswap32( __m128i v )
{
	//rotate each 32 bit quantity by 16 bits
	// 0xB1 = 10110001 = 2,3,0,1
	v = _mm_shufflehi_epi16( _mm_shufflelo_epi16( v, 0xB1 ), 0xB1 );
	return byteswap16( v );
}

コード例 #8

0

ファイルを表示

ファイル: shuffle-sse2.c プロジェクト: BillTheBest/c-blosc2

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  static const size_t bytesoftype = 2;
  size_t j;
  int k;
  uint8_t* dest_for_jth_element;
  __m128i xmm0[2], xmm1[2];

  for (j = 0; j < vectorizable_elements; j += sizeof(__m128i)) {
    /* Fetch 16 elements (32 bytes) then transpose bytes, words and double words. */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src + (j * bytesoftype) + (k * sizeof(__m128i))));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k * 2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k + 1]);
      xmm1[k * 2 + 1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k + 1]);
    }
    /* Store the result vectors */
    dest_for_jth_element = dest + j;
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i*)(dest_for_jth_element + (k * total_elements)), xmm1[k]);
    }
  }
}

コード例 #9

0

ファイルを表示

ファイル: alpha_processing_sse2.c プロジェクト: 03050903/godot

static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}

コード例 #10

0

ファイルを表示

ファイル: dsp.argb_sse2.c プロジェクト: Antranilan/Sparky

static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
  if (g == r + 1) {  // RGBA input order. Need to swap R and B.
    int i = 0;
    const int len_max = len & ~3;  // max length processed in main loop
    const __m128i red_blue_mask = _mm_set1_epi32(0x00ff00ffu);
    assert(b == r + 2);
    assert(a == r + 3);
    for (; i < len_max; i += 4) {
      const __m128i A = _mm_loadu_si128((const __m128i*)(r + 4 * i));
      const __m128i B = _mm_and_si128(A, red_blue_mask);     // R 0 B 0
      const __m128i C = _mm_andnot_si128(red_blue_mask, A);  // 0 G 0 A
      const __m128i D = _mm_shufflelo_epi16(B, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128i E = _mm_shufflehi_epi16(D, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128i F = _mm_or_si128(E, C);
      _mm_storeu_si128((__m128i*)(out + i), F);
    }
    for (; i < len; ++i) {
      out[i] = MakeARGB32(a[4 * i], r[4 * i], g[4 * i], b[4 * i]);
    }
  } else {
    assert(g == b + 1);
    assert(r == b + 2);
    assert(a == b + 3);
    memcpy(out, b, len * 4);
  }
}

コード例 #11

0

ファイルを表示

ファイル: lossless_sse2.c プロジェクト: bovender/XLToolbox

static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}

コード例 #12

0

ファイルを表示

ファイル: shuffle.c プロジェクト: B-Rich/PyTables

/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[2], xmm1[2];

  numof16belem = size / (16*2);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
    /* Fetch and transpose bytes, words and double words in groups of
       32 bytes */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Transpose quad words */
    for (k = 0; k < 1; k++) {
      xmm1[k*2] = _mm_unpacklo_epi64(xmm0[k], xmm0[k+1]);
      xmm1[k*2+1] = _mm_unpackhi_epi64(xmm0[k], xmm0[k+1]);
    }
    /* Store the result vectors */
    for (k = 0; k < 2; k++) {
      ((__m128i *)dest)[k*numof16belem+i] = xmm1[k];
    }
  }
}

コード例 #13

0

ファイルを表示

ファイル: sse2-builtins.c プロジェクト: lucasmrthomaz/clang

__m128i test_mm_shufflehi_epi16(__m128i A) {
  // DAG-LABEL: test_mm_shufflehi_epi16
  // DAG: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 4, i32 4, i32 4>
  //
  // ASM-LABEL: test_mm_shufflehi_epi16
  // ASM: pshufhw $0,
  return _mm_shufflehi_epi16(A, 0);
}

コード例 #14

0

ファイルを表示

ファイル: highbd_intrapred_intrin_ssse3.c プロジェクト: wolfviking0/webcl-webkit

void vpx_highbd_d63_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(above + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(above + 24));
  const __m128i AR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(AR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(AR, A3, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 30; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    _mm_store_si128((__m128i *)(dst + 16), avg2_2);
    _mm_store_si128((__m128i *)(dst + 24), avg2_3);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    _mm_store_si128((__m128i *)(dst + 16), avg3_2);
    _mm_store_si128((__m128i *)(dst + 24), avg3_3);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(avg2_2, avg2_1, 2);
    avg2_2 = _mm_alignr_epi8(avg2_3, avg2_2, 2);
    avg2_3 = _mm_alignr_epi8(AR, avg2_3, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(avg3_2, avg3_1, 2);
    avg3_2 = _mm_alignr_epi8(avg3_3, avg3_2, 2);
    avg3_3 = _mm_alignr_epi8(AR, avg3_3, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  _mm_store_si128((__m128i *)(dst + 16), avg2_2);
  _mm_store_si128((__m128i *)(dst + 24), avg2_3);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
  _mm_store_si128((__m128i *)(dst + 16), avg3_2);
  _mm_store_si128((__m128i *)(dst + 24), avg3_3);
}

コード例 #15

0

ファイルを表示

ファイル: idct_sse2.cpp プロジェクト: Fluffiest/splayer

static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx,const uint8_t * const esi,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
     xmm0=_mm_shufflelo_epi16(xmm0, 0xD8 );
     xmm1=_mm_shuffle_epi32( xmm0, 0 );
     pmaddwd (xmm1, esi);
     xmm3=_mm_shuffle_epi32( xmm0, 0x55);
     xmm0=_mm_shufflehi_epi16( xmm0, 0xD8 );
     pmaddwd( xmm3, esi+32 );
     xmm2=_mm_shuffle_epi32( xmm0, 0xAA );
     xmm0=_mm_shuffle_epi32( xmm0, 0xFF );
     pmaddwd( xmm2, esi+16 );
     xmm4=_mm_shufflehi_epi16( xmm4, 0xD8 );
     paddd (xmm1, M128_round_inv_row);
     xmm4=_mm_shufflelo_epi16 (xmm4, 0xD8 );
     pmaddwd (xmm0, esi+48 );
     xmm5=_mm_shuffle_epi32( xmm4, 0 );
     xmm6=_mm_shuffle_epi32( xmm4, 0xAA );
     pmaddwd (xmm5, ecx );
     paddd (xmm1, xmm2 );
     movdqa (xmm2, xmm1 );
     xmm7=_mm_shuffle_epi32( xmm4, 0x55 );
     pmaddwd (xmm6, ecx+16 );
     paddd (xmm0, xmm3 );
     xmm4=_mm_shuffle_epi32( xmm4, 0xFF );
     psubd (xmm2, xmm0 );
     pmaddwd (xmm7, ecx+32 );
     paddd (xmm0, xmm1 );
     psrad (xmm2, 12 );
     paddd (xmm5, M128_round_inv_row);
     pmaddwd (xmm4, ecx+48 );
     paddd (xmm5, xmm6 );
     movdqa (xmm6, xmm5 );
     psrad (xmm0, 12 );
     xmm2=_mm_shuffle_epi32( xmm2, 0x1B );
     packssdw (xmm0, xmm2 );
     paddd (xmm4, xmm7 );
     psubd (xmm6, xmm4 );
     paddd (xmm4, xmm5 );
     psrad (xmm6, 12 );
     psrad (xmm4, 12 );
     xmm6=_mm_shuffle_epi32( xmm6, 0x1B );
     packssdw (xmm4, xmm6 );
}

コード例 #16

0

ファイルを表示

ファイル: sora_ext_lib.c プロジェクト: travisfcollins/Ziria

//
// multiplies two complex vectors and returns the real and imaginary parts 
// as two 32 bit integers.
//
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2, 
				struct complex16* x, int len1, struct complex16* y, int len2 )
{

	const int wlen = 4;// sizeof(vcs) / sizeof(complex16);
	const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);		//0x0000FFFF0000FFFF0000FFFF0000FFFF
	const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
	const __m128i xmm4 = _mm_set1_epi32(0x00010000);
	for (int i = 0; i < len1 / wlen; i++){

	/*	vcs *vx = (vcs *)(x + wlen*i);
		vcs *vy = (vcs *)(y + wlen*i);
		vi *reout = (vi *)(re + wlen*i);
		vi *imout = (vi *)(im + wlen*i);

		vcs vs2 = conj0(*vy);

	    vs2 = permutate_low<1, 0, 3, 2>(vs2);
		vs2 = permutate_high<1, 0, 3, 2>(vs2);

		*reout = (vcs)muladd(*vx, *vy);
		*imout = (vcs)muladd(*vx, vs2);*/


		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		__m128i my = _mm_loadu_si128((__m128i *)(y + wlen*i));


		//__m128i ms1 = _mm_sign_epi16(my, conj);
		__m128i ms2 = _mm_xor_si128(my, xmm5);
		ms2 = _mm_add_epi32(ms2, xmm4);


		ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
		ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

		__m128i mre = _mm_madd_epi16(my, mx);
		__m128i mim = _mm_madd_epi16(ms2, mx);

		_mm_storeu_si128((__m128i *) (re + wlen*i), mre);
		_mm_storeu_si128((__m128i *) (im + wlen*i), mim);



	}

	for (int i = (len1 / wlen) * wlen; i < len1; i++){
		re[i] = x[i].re * y[i].re + x[i].im * y[i].im ;
		im[i] = x[i].im * y[i].re - x[i].re * y[i].im ;
	};

	return 0;

}

コード例 #17

0

ファイルを表示

ファイル: lossless_enc_sse2.c プロジェクト: 03050903/godot

static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_sub_epi8(in, C);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(argb_data + i, num_pixels - i);
}

コード例 #18

0

ファイルを表示

void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
#if defined(_M_ARM)
    // Ensure that this function is reported as not implemented for ARM builds because
    // the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#else
    __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (size_t z = 0; z < depth; z++)
    {
        for (size_t y = 0; y < height; y++)
        {
            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);

            size_t x = 0;

            // Make output writes aligned
            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
            {
                uint32_t rgba = source[x];
                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }

            for (; x + 3 < width; x += 4)
            {
                __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
                // Mask out g and a, which don't change
                __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
                // Mask out b and r
                __m128i brComponents = _mm_and_si128(sourceData, brMask);
                // Swap b and r
                __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
                __m128i result = _mm_or_si128(gaComponents, brSwapped);
                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
            }

            // Perform leftover writes
            for (; x < width; x++)
            {
                uint32_t rgba = source[x];
                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }
        }
    }
#endif
}

コード例 #19

0

ファイルを表示

ファイル: highbd_intrapred_intrin_ssse3.c プロジェクト: wolfviking0/webcl-webkit

void vpx_highbd_d63_predictor_8x8_ssse3(uint16_t *dst, ptrdiff_t stride,
                                        const uint16_t *above,
                                        const uint16_t *left, int bd) {
  const __m128i ABCDEFGH = _mm_load_si128((const __m128i *)above);
  const __m128i ABCDHHHH = _mm_shufflehi_epi16(ABCDEFGH, 0xff);
  const __m128i HHHHHHHH = _mm_unpackhi_epi64(ABCDHHHH, ABCDHHHH);
  const __m128i BCDEFGHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 2);
  const __m128i CDEFGHHH = _mm_alignr_epi8(HHHHHHHH, ABCDEFGH, 4);
  __m128i avg3 = avg3_epu16(&ABCDEFGH, &BCDEFGHH, &CDEFGHHH);
  __m128i avg2 = _mm_avg_epu16(ABCDEFGH, BCDEFGHH);
  (void)left;
  (void)bd;
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
  d63_store_4x8(&dst, stride, &avg2, &avg3, &HHHHHHHH);
}

コード例 #20

0

ファイルを表示

ファイル: lossless_sse2.c プロジェクト: bovender/XLToolbox

static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    const __m128i A = _mm_srli_epi16(in, 8);     // 0 a 0 g
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // 0g0g
    const __m128i out = _mm_add_epi8(in, C);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // fallthrough and finish off with plain-C
  if (i != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + i, num_pixels - i, dst + i);
  }
}

コード例 #21

0

ファイルを表示

ファイル: image.c プロジェクト: xaphier/Eternal-Lands

void unpack_rgba8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 16); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source[i * 16]);

		t1 = _mm_and_si128(t0, _mm_set1_epi16(0x00FF));
		t2 = _mm_and_si128(t0, _mm_set1_epi16(0xFF00));
		t1 = _mm_shufflelo_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
		t1 = _mm_shufflehi_epi16(t1, _MM_SHUFFLE(2, 3, 0, 1));
		t1 = _mm_or_si128(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}

コード例 #22

0

ファイルを表示

ファイル: highbd_intrapred_intrin_ssse3.c プロジェクト: wolfviking0/webcl-webkit

void vpx_highbd_d207_predictor_32x32_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i A2 = _mm_load_si128((const __m128i *)(left + 16));
  const __m128i A3 = _mm_load_si128((const __m128i *)(left + 24));
  const __m128i LR0 = _mm_shufflehi_epi16(A3, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(A2, A1, 2);
  const __m128i B2 = _mm_alignr_epi8(A3, A2, 2);
  const __m128i B3 = _mm_alignr_epi8(LR, A3, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(A2, A1, 4);
  const __m128i C2 = _mm_alignr_epi8(A3, A2, 4);
  const __m128i C3 = _mm_alignr_epi8(LR, A3, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg3_2 = avg3_epu16(&A2, &B2, &C2);
  const __m128i avg3_3 = avg3_epu16(&A3, &B3, &C3);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i avg2_2 = _mm_avg_epu16(A2, B2);
  const __m128i avg2_3 = _mm_avg_epu16(A3, B3);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  const __m128i out_e = _mm_unpacklo_epi16(avg2_2, avg3_2);
  const __m128i out_f = _mm_unpackhi_epi16(avg2_2, avg3_2);
  const __m128i out_g = _mm_unpacklo_epi16(avg2_3, avg3_3);
  const __m128i out_h = _mm_unpackhi_epi16(avg2_3, avg3_3);
  (void)above;
  (void)bd;
  d207_store_4x32(&dst, stride, &out_a, &out_b, &out_c, &out_d, &out_e);
  d207_store_4x32(&dst, stride, &out_b, &out_c, &out_d, &out_e, &out_f);
  d207_store_4x32(&dst, stride, &out_c, &out_d, &out_e, &out_f, &out_g);
  d207_store_4x32(&dst, stride, &out_d, &out_e, &out_f, &out_g, &out_h);
  d207_store_4x32(&dst, stride, &out_e, &out_f, &out_g, &out_h, &LR);
  d207_store_4x32(&dst, stride, &out_f, &out_g, &out_h, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_g, &out_h, &LR, &LR, &LR);
  d207_store_4x32(&dst, stride, &out_h, &LR, &LR, &LR, &LR);
}

コード例 #23

0

ファイルを表示

ファイル: picture-avx2.c プロジェクト: damjeux/kvazaar

static INLINE void hor_transform_row_avx2(__m128i* row){
  
  __m128i mask_pos = _mm_set1_epi16(1);
  __m128i mask_neg = _mm_set1_epi16(-1);
  __m128i sign_mask = _mm_unpacklo_epi64(mask_pos, mask_neg);
  __m128i temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(2, 3, 0, 1));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi32(mask_pos, mask_neg);
  temp = _mm_shuffle_epi32(*row, KVZ_PERMUTE(1, 0, 3, 2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);

  sign_mask = _mm_unpacklo_epi16(mask_pos, mask_neg);
  temp = _mm_shufflelo_epi16(*row, KVZ_PERMUTE(1,0,3,2));
  temp = _mm_shufflehi_epi16(temp, KVZ_PERMUTE(1,0,3,2));
  *row = _mm_sign_epi16(*row, sign_mask);
  *row = _mm_add_epi16(*row, temp);
}

コード例 #24

0

ファイルを表示

ファイル: sora_ext_lib.cpp プロジェクト: mainland/Ziria

//
// This was v_mul_complex16_shift but I changed the name for consistency with v_conj_mul
// and the fact that the old v_mul_complex16 was never called
//
FORCE_INLINE
int __ext_v_mul_complex16(struct complex16* out, int lenout,
                struct complex16* x, int len1,
                struct complex16* y, int len2, int shift)
{
  const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16);
  const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);
  const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
  const __m128i xmm4 = _mm_set1_epi32(0x00010000);

  __m128i* Xs = (__m128i*) x;
  __m128i* Ys = (__m128i*) y;
  __m128i* Outs = (__m128i*) out;
  for (int i = 0; i < len1 / wlen; i++){
    __m128i mx = _mm_loadu_si128(&Xs[i]);
    __m128i my = _mm_loadu_si128(&Ys[i]);

    __m128i ms1 = _mm_xor_si128(mx, xmm5);
    ms1 = _mm_add_epi32(ms1, xmm4);

    __m128i ms2 = _mm_shufflehi_epi16(mx, _MM_SHUFFLE(2, 3, 0, 1));
    ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128i mre = _mm_srai_epi32(_mm_madd_epi16(ms1, my), shift);
    __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, my), shift);

    mre = _mm_and_si128(mre,xmm6);
    mim = _mm_and_si128(mim,xmm6);

    mim = _mm_slli_epi32(mim,0x10);

    _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim));
  }

  for (int i = (len1 / wlen) * wlen; i < len1; i++){
    out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift;
    out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift;
  }

  return 0;
}

コード例 #25

0

ファイルを表示

ファイル: TextureSSE2.cpp プロジェクト: hgl888/RuntimeCanvas

void Image::loadRGBAUByteDataSSE2(GLsizei width, GLsizei height,
                                  int inputPitch, const void *input, size_t outputPitch, void *output) const
{
    const unsigned int *source = NULL;
    unsigned int *dest = NULL;
    __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (int y = 0; y < height; y++)
    {
        source = reinterpret_cast<const unsigned int*>(static_cast<const unsigned char*>(input) + y * inputPitch);
        dest = reinterpret_cast<unsigned int*>(static_cast<unsigned char*>(output) + y * outputPitch);
        int x = 0;

        // Make output writes aligned
        for (x = 0; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++)
        {
            unsigned int rgba = source[x];
            dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
        }

        for (; x + 3 < width; x += 4)
        {
            __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
            // Mask out g and a, which don't change
            __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
            // Mask out b and r
            __m128i brComponents = _mm_and_si128(sourceData, brMask);
            // Swap b and r
            __m128i brSwapped = _mm_shufflehi_epi16(_mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
            __m128i result = _mm_or_si128(gaComponents, brSwapped);
            _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
        }

        // Perform leftover writes
        for (; x < width; x++)
        {
            unsigned int rgba = source[x];
            dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
        }
    }
}

コード例 #26

0

ファイルを表示

ファイル: highbd_intrapred_intrin_ssse3.c プロジェクト: wolfviking0/webcl-webkit

void vpx_highbd_d63_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                          const uint16_t *above,
                                          const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)above);
  const __m128i A1 = _mm_load_si128((const __m128i *)(above + 8));
  const __m128i AR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i AR = _mm_unpackhi_epi64(AR0, AR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(AR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(AR, A1, 4);
  __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  int i;
  (void)left;
  (void)bd;
  for (i = 0; i < 14; i += 2) {
    _mm_store_si128((__m128i *)dst, avg2_0);
    _mm_store_si128((__m128i *)(dst + 8), avg2_1);
    dst += stride;
    _mm_store_si128((__m128i *)dst, avg3_0);
    _mm_store_si128((__m128i *)(dst + 8), avg3_1);
    dst += stride;
    avg2_0 = _mm_alignr_epi8(avg2_1, avg2_0, 2);
    avg2_1 = _mm_alignr_epi8(AR, avg2_1, 2);
    avg3_0 = _mm_alignr_epi8(avg3_1, avg3_0, 2);
    avg3_1 = _mm_alignr_epi8(AR, avg3_1, 2);
  }
  _mm_store_si128((__m128i *)dst, avg2_0);
  _mm_store_si128((__m128i *)(dst + 8), avg2_1);
  dst += stride;
  _mm_store_si128((__m128i *)dst, avg3_0);
  _mm_store_si128((__m128i *)(dst + 8), avg3_1);
}

コード例 #27

0

ファイルを表示

ファイル: enc_sse2.c プロジェクト: 0309/cocos2d-x

// Simple quantization
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}

コード例 #28

0

ファイルを表示

ファイル: FileIconDraw.cpp プロジェクト: FMJ-Software/Image-Eye

void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
	if (!pcszText || !*pcszText) return;

	// Find out actual size of text
	int nChars = _tcslen(pcszText);
	uFlags |= DT_NOCLIP;

	int iX = rc.left;
	int iY = rc.top;
	int iXW = (rc.right - iX);
	int iYH = (rc.bottom - iY);

	RECT rcMin = rc;
	if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
		int iMinXW = rcMin.right - rcMin.left;
		int iMinYH = rcMin.bottom - rcMin.top;
		if (iMinXW < iXW) {
			if (uFlags & DT_CENTER) {
				iX += (iXW - iMinXW)/2;
				uFlags &= ~DT_CENTER;
			} else if (uFlags & DT_RIGHT) {
				iX += (iXW - iMinXW);
				uFlags &= ~DT_RIGHT;
			}
			iXW = iMinXW;
		}
		if (iMinYH < iYH) {
			if (uFlags & DT_SINGLELINE) {
				if (uFlags & DT_VCENTER) {
					iY += (iYH - iMinYH)/2;
					uFlags &= ~DT_VCENTER;
				} else if (uFlags & DT_BOTTOM) {
					iY += (iYH - iMinYH);
					uFlags &= ~DT_BOTTOM;
				}
			}
			iYH = iMinYH;
		}
	}

	iXW += 2;	// NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look bette!
	iYH += 2;

	// Ensure we have a big enough DIB to draw the text to
	if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH);
	if (!hbmpTextDIB) return;

	// Select color
	ieBGRA clr;
	switch (eColor) {
	case eFileName:	clr = clrFileName;		break;
	case eComment:	clr = clrComment;		break;
	case eFileInfo:	clr = clrFileInfo;		break;
	default:		clr = ieBGRA(0,0,0);	break;
	}
	clr.A = 0xFF - clrBkg.A;

	// Draw the text to in-memory DIB
	RECT rcTextDIB = { 0, 0, iXW, iYH };
	FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);

	rcTextDIB.left++;
	rcTextDIB.top++;

	DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

	// Modify DIB:
#ifndef __X64__
	if (g_bSSE2) 
#endif
	{
		__m128i r0, r1, r2, r3, r4, r5, r6, r7;

		r7 = _mm_setzero_si128();									// 0
		r6 = _mm_set1_epi32(clr.dw);								// CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB
		r6 = _mm_unpacklo_epi8(r7, r6);								// CA<<8   CR<<8   CG<<8   CB<<8   CA<<8   CR<<8   CG<<8   CB<<8
		r5 = _mm_set1_epi16(1);										// 1       1       1       1       1       1       1       1
		r4 = _mm_set1_epi32(0xFF);									// FF              FF              FF              FF
		r3 = _mm_set1_epi32(clrBkg.dw);								// DA  0   0   0   DA  0   0   0   DA  0   0   0   DA  0   0   0

		ieBGRA *py = pTextDIB;
		for (int y = iYH; y--; py += iTextDIBXW) {
			ieBGRA *px = py;

			for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {

				r0 = _mm_load_si128((__m128i *)px);
				r1 = r0;
				r2 = r0;											// X3  R3  G3  B3  X2  R2  G2  B2  X1  R1  G1  B1  X0  R0  G0  B0 
				r0 = _mm_srli_epi32(r0, 16);						// 0   0   X3  R3  0   0   X2  R2  0   0   X1  R1  0   0   X0  R0 
				r1 = _mm_srli_epi32(r1, 8);							// 0   X3  R3  G3  0   X2  R2  G2  0   X1  R1  G1  0   X0  R0  G0 
				r0 = _mm_max_epu8(r0, r2);
				r0 = _mm_max_epu8(r0, r1);							// x   x   x   A3  x   x   x   A2  x   x   x   A1  x   x   x   A0
				r0 = _mm_and_si128(r0, r4);							// 0       A3      0       A2      0       A1      0       A0
				r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
				r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));	// A3      A3      A2      A2      A1      A1      A0      A0
				r1 = r0;
				r0 = _mm_unpacklo_epi32(r0, r0);					// A1      A1      A1      A1      A0      A0      A0      A0
				r1 = _mm_unpackhi_epi32(r1, r1);					// A3      A3      A3      A3      A2      A2      A2      A2
				r0 = _mm_add_epi16(r0, r5);							// A1'     A1'     A1'     A1'     A0'     A0'     A0'     A0' 
				r1 = _mm_add_epi16(r1, r5);							// A3'     A3'     A3'     A3'     A2'     A2'     A2'     A2' 
				r0 = _mm_mulhi_epu16(r0, r6);						// xA1"    xR1     xG1     xB1     xA0"    xR0     xG0     xB0
				r1 = _mm_mulhi_epu16(r1, r6);						// xA3"    xR3     xG3     xB3     xA2"    xR2     xG2     xB2
				r0 = _mm_packus_epi16(r0, r1);						// xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0
				r0 = _mm_adds_epu8(r0, r3);							// xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0
				_mm_store_si128((__m128i *)px, r0);
			}
		}
	}
#ifndef __X64__
	else {

コード例 #29

0

ファイルを表示

ファイル: enc_sse2.c プロジェクト: haroldma/Universal.WebP

static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i seven = _mm_set1_epi16(7);
    const __m128i k937 = _mm_set1_epi32(937);
    const __m128i k1812 = _mm_set1_epi32(1812);
    const __m128i k51000 = _mm_set1_epi32(51000);
    const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
    const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                               5352,  2217, 5352,  2217);
    const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                               2217, -5352, 2217, -5352);
    const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
    const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
    const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
                                2217, 5352, 2217, 5352);
    const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                -5352, 2217, -5352, 2217);
    __m128i v01, v32;


    // Difference between src and ref and initial transpose.
    {
        // Load src and convert to 16b.
        const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]);
        const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]);
        const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]);
        const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]);
        const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
        const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
        const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
        const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
        // Load ref and convert to 16b.
        const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]);
        const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]);
        const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]);
        const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]);
        const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
        const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
        const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
        const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
        // Compute difference. -> 00 01 02 03 00 00 00 00
        const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
        const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
        const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
        const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);


        // Unpack and shuffle
        // 00 01 02 03   0 0 0 0
        // 10 11 12 13   0 0 0 0
        // 20 21 22 23   0 0 0 0
        // 30 31 32 33   0 0 0 0
        const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
        const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
        // 00 01 10 11 02 03 12 13
        // 20 21 30 31 22 23 32 33
        const __m128i shuf01_p =
            _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
        const __m128i shuf23_p =
            _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
        // 00 01 10 11 03 02 13 12
        // 20 21 30 31 23 22 33 32
        const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
        const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
        // 00 01 10 11 20 21 30 31
        // 03 02 13 12 23 22 33 32
        const __m128i a01 = _mm_add_epi16(s01, s32);
        const __m128i a32 = _mm_sub_epi16(s01, s32);
        // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
        // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

        const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
        const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
        const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
        const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
        const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
        const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
        const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
        const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
        const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
        const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
        const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
        const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
        const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
        v01 = _mm_unpacklo_epi32(s_lo, s_hi);
        v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
    }

    // Second pass
    {
        // Same operations are done on the (0,3) and (1,2) pairs.
        // a0 = v0 + v3
        // a1 = v1 + v2
        // a3 = v0 - v3
        // a2 = v1 - v2
        const __m128i a01 = _mm_add_epi16(v01, v32);
        const __m128i a32 = _mm_sub_epi16(v01, v32);
        const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
        const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
        const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

        // d0 = (a0 + a1 + 7) >> 4;
        // d2 = (a0 - a1 + 7) >> 4;
        const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
        const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
        const __m128i d0 = _mm_srai_epi16(c0, 4);
        const __m128i d2 = _mm_srai_epi16(c2, 4);

        // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
        // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
        const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
        const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
        const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
        const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
        const __m128i d3 = _mm_add_epi32(c3, k51000);
        const __m128i e1 = _mm_srai_epi32(d1, 16);
        const __m128i e3 = _mm_srai_epi32(d3, 16);
        const __m128i f1 = _mm_packs_epi32(e1, e1);
        const __m128i f3 = _mm_packs_epi32(e3, e3);
        // f1 = f1 + (a3 != 0);
        // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
        // desired (0, 1), we add one earlier through k12000_plus_one.
        // -> f1 = f1 + 1 - (a3 == 0)
        const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

        _mm_storel_epi64((__m128i*)&out[ 0], d0);
        _mm_storel_epi64((__m128i*)&out[ 4], g1);
        _mm_storel_epi64((__m128i*)&out[ 8], d2);
        _mm_storel_epi64((__m128i*)&out[12], f3);
    }
}

コード例 #30

0

ファイルを表示

ファイル: prim_alphaComp_opt.c プロジェクト: FreeRDP/FreeRDP

pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1,  UINT32 src1Step,
    const BYTE* pSrc2,  UINT32 src2Step,
    BYTE* pDst,  UINT32 dstStep,
    UINT32 width,  UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*) pSrc1;
	const UINT32* sptr2 = (const UINT32*) pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 y;
	__m128i xmm0, xmm1;

	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;

	if (width < 4)     /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
					       pDst, dstStep, width, height);
	}

	dptr = (UINT32*) pDst;
	linebytes = width * sizeof(UINT32);
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
	xmm0 = _mm_set1_epi32(0);
	xmm1 = _mm_set1_epi16(1);

	for (y = 0; y < height; ++y)
	{
		int pixels = width;
		int count;
		/* Get to the 16-byte boundary now. */
		int leadIn = 0;

		switch ((ULONG_PTR) dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1,
						src1Step, (const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr  += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Bf00Af00Be00Ge00Re00Ae */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5 */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk0ABk */
			/* Must mask off remainders or pack gets confused */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			_mm_store_si128((__m128i*) dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
						(const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr  += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr  += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}