示例#1
0
// Copies 'num_pixels' 32-bit pixels from 'src' to 'dst' while exchanging
// bytes 0 and 2 of every pixel (BGRA -> RGBA byte order). Eight pixels are
// converted per iteration; any remainder is handed to the plain-C routine.
static void ConvertBGRAToRGBA_SSE2(const uint32_t* src,
                                   int num_pixels, uint8_t* dst) {
  const __m128i rb_mask = _mm_set1_epi32(0x00ff00ffu);
  const __m128i* rd = (const __m128i*)src;
  __m128i* wr = (__m128i*)dst;
  int left;
  for (left = num_pixels; left >= 8; left -= 8, rd += 2, wr += 2) {
    // Both input vectors are read before anything is written.
    const __m128i px_lo = _mm_loadu_si128(rd + 0);
    const __m128i px_hi = _mm_loadu_si128(rd + 1);
    // Keep bytes 0 and 2 of each pixel, one per 16-bit word...
    const __m128i rb_lo = _mm_and_si128(px_lo, rb_mask);
    const __m128i rb_hi = _mm_and_si128(px_hi, rb_mask);
    // ...and exchange them by swapping adjacent 16-bit words.
    const __m128i swapped_lo = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(rb_lo, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i swapped_hi = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(rb_hi, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));
    // Bytes 1 and 3 of each pixel pass through untouched.
    const __m128i keep_lo = _mm_andnot_si128(rb_mask, px_lo);
    const __m128i keep_hi = _mm_andnot_si128(rb_mask, px_hi);
    _mm_storeu_si128(wr + 0, _mm_or_si128(swapped_lo, keep_lo));
    _mm_storeu_si128(wr + 1, _mm_or_si128(swapped_hi, keep_hi));
  }
  // Fewer than eight pixels remain: finish with the scalar implementation.
  if (left > 0) {
    VP8LConvertBGRAToRGBA_C((const uint32_t*)rd, left, (uint8_t*)wr);
  }
}
示例#2
0
文件: idea_sse2.cpp 项目: louiz/botan
/*
* Rearranges the 16-bit words of four registers in place.
* The low 64-bit halves of (B0,B1) and (B2,B3) and the high halves of the
* same pairs are gathered first; every gathered register then gets the same
* (3,1,2,0) permutation applied to its dwords, its high four words and its
* low four words; finally the results are interleaved 32 bits at a time.
*/
void transpose_out(__m128i& B0, __m128i& B1, __m128i& B2, __m128i& B3)
   {
   __m128i T[4] = {
      _mm_unpacklo_epi64(B0, B1),
      _mm_unpacklo_epi64(B2, B3),
      _mm_unpackhi_epi64(B0, B1),
      _mm_unpackhi_epi64(B2, B3)
   };

   for(int i = 0; i != 4; ++i)
      {
      __m128i t = _mm_shuffle_epi32(T[i], _MM_SHUFFLE(3, 1, 2, 0));
      t = _mm_shufflehi_epi16(t, _MM_SHUFFLE(3, 1, 2, 0));
      T[i] = _mm_shufflelo_epi16(t, _MM_SHUFFLE(3, 1, 2, 0));
      }

   B0 = _mm_unpacklo_epi32(T[0], T[1]);
   B1 = _mm_unpackhi_epi32(T[0], T[1]);
   B2 = _mm_unpacklo_epi32(T[2], T[3]);
   B3 = _mm_unpackhi_epi32(T[2], T[3]);
   }
示例#3
0
/*
 * Loads bytes [0..7] and bytes [2..9] of one reference row and widens both
 * groups to eight unsigned 16-bit lanes. Reads 10 bytes starting at `row`.
 */
static void CoefsLoadRow(const unsigned char *row, __m128i *taps_0_7, __m128i *taps_2_9)
{
	const __m128i zero = _mm_setzero_si128();
	const __m128i low8 = _mm_loadl_epi64((const __m128i *)row);
	/* 0xf9 selects words 1,2,3,3: bytes 2..7 slide down to the bottom. */
	__m128i shifted = _mm_shufflelo_epi16(low8, 0xf9);
	/* Bytes 8 and 9 are assembled by hand (little-endian, as on every SSE2
	 * target) instead of dereferencing a possibly misaligned unsigned short*. */
	const int tail = row[8] | (row[9] << 8);

	shifted = _mm_insert_epi16(shifted, tail, 3);
	*taps_0_7 = _mm_unpacklo_epi8(low8, zero);
	*taps_2_9 = _mm_unpacklo_epi8(shifted, zero);
}

/*
 * Produces `n` output rows of eight 16-bit values each. Output row i is
 *
 *   (32 + R[i][0..7]*c0 + R[i][2..9]*c1 + R[i+1][0..7]*c2 + R[i+1][2..9]*c3) >> 6
 *
 * computed lane-wise with wrapping 16-bit arithmetic and a logical shift,
 * where R[k] is the reference row at ref_part_ptr + k * ref_part_stride and
 * c0..c3 are the four consecutive 8 x 16-bit vectors stored in coef_buf.
 *
 * current_part_ptr    destination; 16 bytes are written per row, rows are
 *                     current_part_stride bytes apart.
 * ref_part_ptr        source; n + 1 rows are read, 10 bytes from each.
 * coef_buf            64 bytes of coefficients.
 *
 * None of the three buffers needs any particular alignment: all vector
 * accesses use the unaligned load/store forms.
 */
void Coefs(unsigned char *current_part_ptr, int current_part_stride, unsigned char *ref_part_ptr, int ref_part_stride, unsigned char *coef_buf, int n)
{
	const __m128i rounding = _mm_set1_epi16(32);
	const __m128i *coef_ptr = (const __m128i *)coef_buf;
	__m128i row_a, row_b;
	int i;

	/* Prime the pipeline with the first reference row. */
	CoefsLoadRow(ref_part_ptr, &row_a, &row_b);
	ref_part_ptr += ref_part_stride;

	for (i = 0; i < n; i++) {
		__m128i next_a, next_b, acc;

		/* Contribution of the upper row (loaded by the previous iteration). */
		acc = _mm_add_epi16(rounding, _mm_mullo_epi16(row_a, _mm_loadu_si128(coef_ptr + 0)));
		acc = _mm_add_epi16(acc, _mm_mullo_epi16(row_b, _mm_loadu_si128(coef_ptr + 1)));

		/* Contribution of the lower row, which becomes the upper row next time. */
		CoefsLoadRow(ref_part_ptr, &next_a, &next_b);
		ref_part_ptr += ref_part_stride;
		acc = _mm_add_epi16(acc, _mm_mullo_epi16(next_a, _mm_loadu_si128(coef_ptr + 2)));
		acc = _mm_add_epi16(acc, _mm_mullo_epi16(next_b, _mm_loadu_si128(coef_ptr + 3)));

		acc = _mm_srli_epi16(acc, 6);
		_mm_storeu_si128((__m128i *)current_part_ptr, acc);
		current_part_ptr += current_part_stride;

		row_a = next_a;
		row_b = next_b;
	}
}
示例#4
0
// Inverse colour transform over 'num_pixels' pixels read from 'src' and
// written to 'dst', four pixels per iteration:
//   - a delta derived from green (multipliers green_to_red_ / green_to_blue_)
//     is added byte-wise to the red and blue bytes,
//   - a second delta derived from the updated red (multiplier red_to_blue_)
//     is then added to blue,
//   - alpha and green are copied through unchanged.
// Pixels that do not fill a whole vector are handled by
// VP8LTransformColorInverse_C.
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
// Packs HI into the upper and LO into the lower 16 bits of every 32-bit lane.
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    // Duplicate the green word over the neighbouring (alpha) word of each pixel.
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);             // x r'  x   b'
    // Move r' and b' into the high byte of their word for the next multiply.
    const __m128i F = _mm_slli_epi16(E, 8);            // r' 0   b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);    // x db2  0  0
    // Bring the red-derived delta down next to the blue byte.
    const __m128i H = _mm_srli_epi32(G, 8);            // 0  x db2  0
    const __m128i I = _mm_add_epi8(H, F);              // r' x  b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);            // 0  r'  0  b''
    // Recombine the new red/blue with the untouched alpha/green.
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
示例#5
0
// Packs interleaved 8-bit channels into 'len' 32-bit words in 'out'.
// Two layouts are accepted (checked with asserts):
//   - channels start at 'r' (r, g, b, a at byte offsets 0..3): bytes 0 and 2
//     of every pixel are exchanged while copying,
//   - channels start at 'b' (b, g, r, a at byte offsets 0..3): the data is
//     copied verbatim.
static void PackARGB(const uint8_t* a, const uint8_t* r, const uint8_t* g,
                     const uint8_t* b, int len, uint32_t* out) {
  if (g != r + 1) {
    // Already in the byte order 'out' expects: plain copy.
    assert(g == b + 1);
    assert(r == b + 2);
    assert(a == b + 3);
    memcpy(out, b, len * 4);
    return;
  }

  // RGBA input order. Need to swap R and B.
  {
    const int simd_end = len & ~3;  // pixels covered by the vector loop
    const __m128i rb_mask = _mm_set1_epi32(0x00ff00ffu);
    int px = 0;
    assert(b == r + 2);
    assert(a == r + 3);
    while (px < simd_end) {
      const __m128i rgba = _mm_loadu_si128((const __m128i*)(r + 4 * px));
      // Bytes 0 and 2 of each pixel, exchanged by swapping adjacent words.
      const __m128i rb = _mm_and_si128(rgba, rb_mask);
      const __m128i rb_swapped = _mm_shufflehi_epi16(
          _mm_shufflelo_epi16(rb, _MM_SHUFFLE(2, 3, 0, 1)),
          _MM_SHUFFLE(2, 3, 0, 1));
      // Bytes 1 and 3 are kept where they are.
      const __m128i ga = _mm_andnot_si128(rb_mask, rgba);
      _mm_storeu_si128((__m128i*)(out + px), _mm_or_si128(rb_swapped, ga));
      px += 4;
    }
    // Scalar tail for the last (len % 4) pixels.
    while (px < len) {
      out[px] = MakeARGB32(a[4 * px], r[4 * px], g[4 * px], b[4 * px]);
      ++px;
    }
  }
}
// Blends two colours: every 8-bit channel becomes
// (a * alpha + b * (256 - alpha)) / 256.
// The SSE2 path blends all four bytes of the 32-bit value; the scalar path
// rebuilds the result from the R, G and B channels via RGB().
inline COLORREF MakeColor2(COLORREF a, COLORREF b, int alpha)
{
#ifdef USE_SSE2
	// (a * alpha + b * (256 - alpha)) / 256 -> ((a - b) * alpha) / 256 + b
	const __m128i zero = _mm_setzero_si128();
	// Widen every 8-bit channel to a 16-bit lane.
	const __m128i wide_a = _mm_unpacklo_epi8( _mm_cvtsi32_si128( a ), zero );
	const __m128i wide_b = _mm_unpacklo_epi8( _mm_cvtsi32_si128( b ), zero );
	// Broadcast alpha to the four low 16-bit lanes.
	const __m128i weight = _mm_shufflelo_epi16( _mm_cvtsi32_si128( alpha ), 0 );

	__m128i blend = _mm_sub_epi16( wide_a, wide_b );  // (a - b)
	blend = _mm_mullo_epi16( blend, weight );         // (a - b) * alpha
	blend = _mm_srli_epi16( blend, 8 );               // ((a - b) * alpha) / 256
	// Byte-wise add: the low byte of every lane wraps modulo 256.
	blend = _mm_add_epi8( blend, wide_b );            // ((a - b) * alpha) / 256 + b

	return _mm_cvtsi128_si32( _mm_packus_epi16( blend, zero ) );
#else
	const int wa = alpha;
	const int wb = 256 - wa;
	const BYTE red   = (BYTE)((GetRValue(a) * wa + GetRValue(b) * wb) / 256);
	const BYTE green = (BYTE)((GetGValue(a) * wa + GetGValue(b) * wb) / 256);
	const BYTE blue  = (BYTE)((GetBValue(a) * wa + GetBValue(b) * wb) / 256);
	return RGB(red, green, blue);
#endif
}
示例#7
0
文件: shuffle.c 项目: B-Rich/PyTables
/* Routine optimized for shuffling a buffer for a type size of 2 bytes.
 *
 * The input is consumed in groups of 32 bytes (16 two-byte elements); there
 * are size / 32 such groups and any trailing bytes are left unprocessed.
 * For group i, the 16 even-offset bytes are written to dest + 16 * i and the
 * 16 odd-offset bytes to dest + 16 * (numof16belem + i), so the first half
 * of the output collects byte 0 of every element and the second half byte 1.
 *
 * Neither `src` nor `dest` needs to be 16-byte aligned: loads and stores both
 * use the unaligned intrinsics. */
static void
shuffle2(uint8_t* dest, uint8_t* src, size_t size)
{
  size_t i, j, k;
  size_t numof16belem;
  __m128i xmm0[2], xmm1[2];

  numof16belem = size / (16*2);
  for (i = 0, j = 0; i < numof16belem; i++, j += 16*2) {
    /* Fetch and transpose bytes, words and double words in groups of
       32 bytes */
    for (k = 0; k < 2; k++) {
      xmm0[k] = _mm_loadu_si128((__m128i*)(src+j+k*16));
      xmm0[k] = _mm_shufflelo_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shufflehi_epi16(xmm0[k], 0xd8);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi8(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
      xmm1[k] = _mm_shuffle_epi32(xmm0[k], 0x4e);
      xmm0[k] = _mm_unpacklo_epi16(xmm0[k], xmm1[k]);
      xmm0[k] = _mm_shuffle_epi32(xmm0[k], 0xd8);
    }
    /* Each register now holds its even bytes in the low quad word and its
       odd bytes in the high one; transpose the quad words. */
    xmm1[0] = _mm_unpacklo_epi64(xmm0[0], xmm0[1]);
    xmm1[1] = _mm_unpackhi_epi64(xmm0[0], xmm0[1]);
    /* Store the result vectors (unaligned store: dest is a plain byte
       pointer with no alignment guarantee) */
    for (k = 0; k < 2; k++) {
      _mm_storeu_si128((__m128i *)(dest + 16 * (k*numof16belem + i)), xmm1[k]);
    }
  }
}
示例#8
0
/* Shuffle specialised for 2-byte elements.
 * For every run of 16 elements starting at element index e, byte 0 of each
 * element is written to dest + e and byte 1 to dest + total_elements + e.
 * Elements are processed while e < vectorizable_elements, 16 at a time. */
static void
shuffle2_sse2(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  enum { kTypeSize = 2, kVectors = 2 };
  const size_t step = sizeof(__m128i);
  size_t elem;

  for (elem = 0; elem < vectorizable_elements; elem += step) {
    const uint8_t* const in = src + elem * kTypeSize;
    uint8_t* const out = dest + elem;
    __m128i split[kVectors];
    int v;

    /* De-interleave each 16-byte vector: even bytes end up in the low quad
       word, odd bytes in the high quad word. */
    for (v = 0; v < kVectors; v++) {
      __m128i cur = _mm_loadu_si128((const __m128i*)(in + v * step));
      __m128i halves_swapped;
      cur = _mm_shufflehi_epi16(_mm_shufflelo_epi16(cur, 0xd8), 0xd8);
      cur = _mm_shuffle_epi32(cur, 0xd8);
      halves_swapped = _mm_shuffle_epi32(cur, 0x4e);
      cur = _mm_shuffle_epi32(_mm_unpacklo_epi8(cur, halves_swapped), 0xd8);
      halves_swapped = _mm_shuffle_epi32(cur, 0x4e);
      split[v] = _mm_shuffle_epi32(_mm_unpacklo_epi16(cur, halves_swapped), 0xd8);
    }

    /* Combine the low (byte 0) and high (byte 1) quad words of both vectors
       and write each plane to its own region of the output. */
    _mm_storeu_si128((__m128i*)out,
                     _mm_unpacklo_epi64(split[0], split[1]));
    _mm_storeu_si128((__m128i*)(out + total_elements),
                     _mm_unpackhi_epi64(split[0], split[1]));
  }
}
示例#9
0
//
// Element-wise product of x with a sign-flipped copy of y, delivered as
// separate 32-bit arrays:
//   re[i] = x[i].re * y[i].re + x[i].im * y[i].im
//   im[i] = x[i].im * y[i].re - x[i].re * y[i].im
// len1 elements are processed; lenout1, lenout2 and len2 are not consulted.
// Always returns 0.
//
FORCE_INLINE
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
        struct complex16* x, int len1, struct complex16* y, int len2 )
{
  const unum8 wlen = 4;  // complex16 elements per 128-bit vector
  const __m128i flip_hi16 = _mm_set1_epi32(0xFFFF0000);
  const __m128i one_hi16 = _mm_set1_epi32(0x00010000);
  const int nvec = len1 / wlen;

  const __m128i* vx_in = (const __m128i*) x;
  const __m128i* vy_in = (const __m128i*) y;
  __m128i* re_out = (__m128i*) re;
  __m128i* im_out = (__m128i*) im;

  for (int v = 0; v < nvec; v++) {
    const __m128i vx = _mm_loadu_si128(vx_in + v);
    const __m128i vy = _mm_loadu_si128(vy_in + v);

    // Two's-complement negate the upper 16 bits of every 32-bit lane of y
    // (invert, then add one) -- NOTE(review): this matches the scalar tail
    // only if complex16 stores .re in the low and .im in the high half.
    __m128i y_neg_swapped = _mm_add_epi32(_mm_xor_si128(vy, flip_hi16), one_hi16);

    // Swap the two 16-bit halves of every 32-bit lane.
    y_neg_swapped = _mm_shufflelo_epi16(
        _mm_shufflehi_epi16(y_neg_swapped, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));

    // madd multiplies 16-bit pairs and sums each pair into one 32-bit lane.
    _mm_storeu_si128(re_out + v, _mm_madd_epi16(vy, vx));
    _mm_storeu_si128(im_out + v, _mm_madd_epi16(y_neg_swapped, vx));
  }

  // Scalar tail for the elements that do not fill a whole vector.
  for (int i = nvec * wlen; i < len1; i++) {
    re[i] = x[i].re * y[i].re + x[i].im * y[i].im ;
    im[i] = x[i].im * y[i].re - x[i].re * y[i].im ;
  }

  return 0;
}
示例#10
0
// Forward colour transform applied in place to 'num_pixels' pixels of
// 'argb_data', four pixels per iteration: a delta derived from green
// (multipliers green_to_red_ / green_to_blue_) and a delta derived from red
// (multiplier red_to_blue_) are subtracted byte-wise from the red and blue
// bytes; alpha and green are left untouched. The pixels left after the vector
// loop are processed by VP8LTransformColor_C.
// NOTE(review): CST_5b is defined outside this block; presumably it produces
// the sign-extended, pre-shifted 16-bit form of a multiplier -- confirm.
static void TransformColor(const VP8LMultipliers* const m,
                           uint32_t* argb_data, int num_pixels) {
  // Per 32-bit pixel: green_to_red_ in the upper word, green_to_blue_ in the lower.
  const __m128i mults_rb = _mm_set_epi16(
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_),
      CST_5b(m->green_to_red_), CST_5b(m->green_to_blue_));
  // Per 32-bit pixel: red_to_blue_ in the upper word, zero in the lower.
  const __m128i mults_b2 = _mm_set_epi16(
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0,
      CST_5b(m->red_to_blue_), 0, CST_5b(m->red_to_blue_), 0);
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  const __m128i mask_rb = _mm_set1_epi32(0x00ff00ff);  // red-blue masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); // argb
    const __m128i A = _mm_and_si128(in, mask_ag);     // a   0   g   0
    // Duplicate the green word over the neighbouring (alpha) word of each pixel.
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);    // x dr  x db1
    // Move red and blue into the high byte of their word for the multiply.
    const __m128i E = _mm_slli_epi16(in, 8);           // r 0   b   0
    const __m128i F = _mm_mulhi_epi16(E, mults_b2);    // x db2 0   0
    // Bring the red-derived delta down to the blue position.
    const __m128i G = _mm_srli_epi32(F, 16);           // 0 0   x db2
    const __m128i H = _mm_add_epi8(G, D);              // x dr  x  db
    // Only the red and blue bytes carry a delta.
    const __m128i I = _mm_and_si128(H, mask_rb);       // 0 dr  0  db
    const __m128i out = _mm_sub_epi8(in, I);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // fallthrough and finish off with plain-C
  VP8LTransformColor_C(m, argb_data + i, num_pixels - i);
}
示例#11
0
// Reverses the byte order of every 32-bit lane in two steps: the two 16-bit
// halves of each lane are exchanged here, then byteswap16() is applied.
// NOTE(review): byteswap16 is defined elsewhere; presumably it swaps the two
// bytes of every 16-bit lane -- confirm.
static inline __m128i  byteswap32( __m128i v )
{
	// 0xB1 = 10110001 = _MM_SHUFFLE(2,3,0,1): swap adjacent 16-bit words,
	// i.e. rotate each 32-bit quantity by 16 bits.
	const __m128i low_half_rotated = _mm_shufflelo_epi16( v, 0xB1 );
	const __m128i rotated = _mm_shufflehi_epi16( low_half_rotated, 0xB1 );
	return byteswap16( rotated );
}
示例#12
0
// Multiplies the three low bytes of every pixel in 'ptr' by the pixel's top
// byte (alpha, going by the function name), in place. When 'inverse' is zero,
// pixels are processed two at a time with SSE2; the remaining pixel, or the
// whole row when 'inverse' is non-zero, is handled by WebPMultARGBRowC.
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;   // pixels per iteration
    const __m128i zero = _mm_setzero_si128();
    // Rounding term (128) for the three colour lanes; none for the top lane.
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    // 0x0101 for the three colour lanes, 0 for the top lane.
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    // 256 in the top lane of each pixel, 0 elsewhere.
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);   // width rounded down to a multiple of kSpan
    for (x = 0; x < w2; x += kSpan) {
      // Load two pixels and widen every byte to a 16-bit lane.
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      // Broadcast lane 3 (the top byte) of each pixel to all four of its lanes.
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      // Same broadcast with the top lane of each pixel cleared.
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      // The top lane is scaled by 256 so that it survives the final >> 8.
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      // Narrow back to bytes and write the two pixels in place.
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  // Whatever the vector loop did not cover goes to the C implementation.
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
示例#13
0
__m128i test_mm_shufflelo_epi16(__m128i A) {
  // DAG-LABEL: test_mm_shufflelo_epi16
  // DAG: shufflevector <8 x i16> %{{.*}}, <8 x i16> %{{.*}}, <8 x i32> <i32 0, i32 0, i32 0, i32 0, i32 4, i32 5, i32 6, i32 7>
  //
  // ASM-LABEL: test_mm_shufflelo_epi16
  // ASM: pshuflw $0,
  // Immediate 0 copies word 0 into all four low words; the high four words
  // are passed through unchanged.
  const __m128i low_broadcast = _mm_shufflelo_epi16(A, 0);
  return low_broadcast;
}
示例#14
0
void Convert444to420(LPBYTE input, int width, int pitch, int height, int startY, int endY, LPBYTE *output, bool bSSE2Available)
{
    LPBYTE lumPlane     = output[0];
    LPBYTE uPlane       = output[1];
    LPBYTE vPlane       = output[2];
    int  chrPitch       = width>>1;

    if(bSSE2Available)
    {
        __m128i lumMask = _mm_set1_epi32(0x0000FF00);
        __m128i uvMask = _mm_set1_epi16(0x00FF);

        for(int y=startY; y<endY; y+=2)
        {
            int yPos    = y*pitch;
            int chrYPos = ((y>>1)*chrPitch);
            int lumYPos = y*width;

            for(int x=0; x<width; x+=4)
            {
                LPBYTE lpImagePos = input+yPos+(x*4);
                int chrPos  = chrYPos + (x>>1);
                int lumPos0 = lumYPos + x;
                int lumPos1 = lumPos0+width;

                __m128i line1 = _mm_load_si128((__m128i*)lpImagePos);
                __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+pitch));

                //pack lum vals
                {
                    __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1));
                    packVal = _mm_packus_epi16(packVal, packVal);

                    *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0];
                    *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1];
                }

                //do average, pack UV vals
                {
                    __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask));
                    __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2);
                    avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_shufflelo_epi16(avgVal, _MM_SHUFFLE(3, 1, 2, 0));
                    avgVal = _mm_packus_epi16(avgVal, avgVal);

                    DWORD packedVals = avgVal.m128i_u32[0];

                    *(LPWORD)(uPlane+chrPos) = WORD(packedVals);
                    *(LPWORD)(vPlane+chrPos) = WORD(packedVals>>16);
                }
            }
        }
    }
    else
    {
#ifdef _WIN64
        for(int y=startY; y<endY; y+=2)
示例#15
0
// Row pass of an 8-point inverse DCT applied to two rows at once.
// Inputs : xmm0 and xmm4 (one row of eight 16-bit values each, read before
//          being overwritten); 'esi' and 'ecx' point to coefficient tables
//          that are read at byte offsets 0, 16, 32 and 48.
// Outputs: xmm0 and xmm4 hold the packed results; xmm1..xmm3 and xmm5..xmm7
//          are used as scratch and overwritten.
// NOTE(review): pmaddwd, paddd, psubd, psrad, movdqa and packssdw are defined
// outside this block; presumably each wraps the SSE2 instruction of the same
// name with the first argument as destination -- confirm. The two rows are
// interleaved statement by statement, so the order must not be changed.
static __forceinline void DCT_8_INV_ROW(const uint8_t * const ecx,const uint8_t * const esi,__m128i &xmm0,__m128i &xmm1,__m128i &xmm2,__m128i &xmm3,__m128i &xmm4,__m128i &xmm5,__m128i &xmm6,__m128i &xmm7)
{
     // Reorder the words of the first row (0xD8 = order 0,2,1,3 within each half)
     // and broadcast each 32-bit pair so it can be multiplied by a table row.
     xmm0=_mm_shufflelo_epi16(xmm0, 0xD8 );
     xmm1=_mm_shuffle_epi32( xmm0, 0 );
     pmaddwd (xmm1, esi);
     xmm3=_mm_shuffle_epi32( xmm0, 0x55);
     xmm0=_mm_shufflehi_epi16( xmm0, 0xD8 );
     pmaddwd( xmm3, esi+32 );
     xmm2=_mm_shuffle_epi32( xmm0, 0xAA );
     xmm0=_mm_shuffle_epi32( xmm0, 0xFF );
     pmaddwd( xmm2, esi+16 );
     // Same reordering for the second row, interleaved with the first.
     xmm4=_mm_shufflehi_epi16( xmm4, 0xD8 );
     paddd (xmm1, M128_round_inv_row);   // rounding term for the first row
     xmm4=_mm_shufflelo_epi16 (xmm4, 0xD8 );
     pmaddwd (xmm0, esi+48 );
     xmm5=_mm_shuffle_epi32( xmm4, 0 );
     xmm6=_mm_shuffle_epi32( xmm4, 0xAA );
     pmaddwd (xmm5, ecx );
     paddd (xmm1, xmm2 );
     movdqa (xmm2, xmm1 );
     xmm7=_mm_shuffle_epi32( xmm4, 0x55 );
     pmaddwd (xmm6, ecx+16 );
     paddd (xmm0, xmm3 );
     xmm4=_mm_shuffle_epi32( xmm4, 0xFF );
     // First row: difference and sum of the two partial results, scaled by >> 12.
     psubd (xmm2, xmm0 );
     pmaddwd (xmm7, ecx+32 );
     paddd (xmm0, xmm1 );
     psrad (xmm2, 12 );
     paddd (xmm5, M128_round_inv_row);   // rounding term for the second row
     pmaddwd (xmm4, ecx+48 );
     paddd (xmm5, xmm6 );
     movdqa (xmm6, xmm5 );
     psrad (xmm0, 12 );
     // 0x1B reverses the four dwords of the difference before packing.
     xmm2=_mm_shuffle_epi32( xmm2, 0x1B );
     packssdw (xmm0, xmm2 );
     // Second row: same difference/sum, scale, reverse and pack sequence.
     paddd (xmm4, xmm7 );
     psubd (xmm6, xmm4 );
     paddd (xmm4, xmm5 );
     psrad (xmm6, 12 );
     psrad (xmm4, 12 );
     xmm6=_mm_shuffle_epi32( xmm6, 0x1B );
     packssdw (xmm4, xmm6 );
}
示例#16
0
// Returns the weighted sum of the 2x2 block of pixels whose top-left corner
// is at ((int)x, (int)y). The four weights come from CalcWeights(x, y), are
// scaled by CONST_256 and applied in 8.8 fixed point, channel by channel.
// NOTE(review): the 64-bit loads assume Pixel is a 32-bit value (two pixels
// per load) and that the block lies fully inside the image -- confirm.
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
 const __m128i zero = _mm_setzero_si128();
 const int pitch = img->width;
 const Pixel* top_left = img->data + (int)x + (int)y * pitch;

 // Two horizontally adjacent pixels per load: top row, then the row below.
 const __m128i top_raw = _mm_loadl_epi64((const __m128i*)top_left);
 const __m128i bottom_raw = _mm_loadl_epi64((const __m128i*)(top_left + pitch));

 // Weights scaled by 256, rounded to integers and narrowed to 16 bits.
 const __m128 scaled = _mm_mul_ps(CalcWeights(x, y), CONST_256);
 const __m128i w16 = _mm_packs_epi32(_mm_cvtps_epi32(scaled), zero); // w4 w3 w2 w1

 // Widen every 8-bit channel to 16 bits.
 const __m128i top16 = _mm_unpacklo_epi8(top_raw, zero);
 const __m128i bottom16 = _mm_unpacklo_epi8(bottom_raw, zero);

 // Replicate each weight across the four channels of its pixel.
 __m128i w_top = _mm_shufflelo_epi16(w16, _MM_SHUFFLE(1, 1, 0, 0));
 __m128i w_bottom = _mm_shufflelo_epi16(w16, _MM_SHUFFLE(3, 3, 2, 2));
 w_top = _mm_unpacklo_epi16(w_top, w_top);          // w2 w2 w2 w2 w1 w1 w1 w1
 w_bottom = _mm_unpacklo_epi16(w_bottom, w_bottom); // w4 w4 w4 w4 w3 w3 w3 w3

 // Weight all four pixels, add the rows, then add the right pixel to the left.
 const __m128i rows = _mm_add_epi16(_mm_mullo_epi16(top16, w_top),
                                    _mm_mullo_epi16(bottom16, w_bottom));
 const __m128i total = _mm_add_epi16(rows, _mm_shuffle_epi32(rows, _MM_SHUFFLE(3, 2, 3, 2)));

 // Divide by 256 and narrow back to 8 bits per channel.
 const __m128i packed = _mm_packus_epi16(_mm_srli_epi16(total, 8), zero);
 return _mm_cvtsi128_si32(packed);
}
示例#17
0
//
// Multiplies two complex vectors element-wise (the second operand with its
// sign-flipped imaginary part) and returns the real and imaginary parts as
// two arrays of 32 bit integers:
//   re[i] = x[i].re * y[i].re + x[i].im * y[i].im
//   im[i] = x[i].im * y[i].re - x[i].re * y[i].im
// len1 elements are processed; lenout1, lenout2 and len2 are not consulted.
// Always returns 0.
//
int __ext_v_conj_mul_complex16_int32(int32* re, int lenout1, int32* im, int lenout2,
				struct complex16* x, int len1, struct complex16* y, int len2 )
{
	const int wlen = 4;// sizeof(vcs) / sizeof(complex16);
	const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
	const __m128i xmm4 = _mm_set1_epi32(0x00010000);

	// Vector path: four complex16 values (one 128-bit register) per iteration.
	for (int i = 0; i < len1 / wlen; i++){
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		__m128i my = _mm_loadu_si128((__m128i *)(y + wlen*i));

		// Two's-complement negate the upper 16 bits of every 32-bit lane of y
		// (invert, then add one).
		// NOTE(review): this matches the scalar tail only if complex16 stores
		// .re in the low and .im in the high half of each lane -- confirm.
		__m128i ms2 = _mm_xor_si128(my, xmm5);
		ms2 = _mm_add_epi32(ms2, xmm4);

		// Swap the two 16-bit halves of every 32-bit lane.
		ms2 = _mm_shufflehi_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));
		ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

		// madd multiplies 16-bit pairs and sums each pair into one 32-bit lane.
		__m128i mre = _mm_madd_epi16(my, mx);
		__m128i mim = _mm_madd_epi16(ms2, mx);

		_mm_storeu_si128((__m128i *) (re + wlen*i), mre);
		_mm_storeu_si128((__m128i *) (im + wlen*i), mim);
	}

	// Scalar tail for the elements that do not fill a whole vector.
	for (int i = (len1 / wlen) * wlen; i < len1; i++){
		re[i] = x[i].re * y[i].re + x[i].im * y[i].im ;
		im[i] = x[i].im * y[i].re - x[i].re * y[i].im ;
	}

	return 0;
}
示例#18
0
/* Inner product of the first N 16-bit samples of x and y, accumulated in
 * 32 bits. Sixteen samples are consumed per iteration of the main loop, one
 * optional block of eight follows, and the last few samples are accumulated
 * one at a time with silk_SMLABB. */
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    const opus_int blocks_end = N & ~15;
    opus_int pos;
    opus_int32 total;
    __m128i acc_lo = _mm_setzero_si128();
    __m128i acc_hi = _mm_setzero_si128();

    /* Two independent accumulators, eight samples each. */
    for (pos = 0; pos < blocks_end; pos += 16)
    {
        const __m128i x_lo = _mm_loadu_si128((const __m128i *)(x + pos));
        const __m128i y_lo = _mm_loadu_si128((const __m128i *)(y + pos));
        const __m128i x_hi = _mm_loadu_si128((const __m128i *)(x + pos + 8));
        const __m128i y_hi = _mm_loadu_si128((const __m128i *)(y + pos + 8));

        acc_lo = _mm_add_epi32(acc_lo, _mm_madd_epi16(x_lo, y_lo));
        acc_hi = _mm_add_epi32(acc_hi, _mm_madd_epi16(x_hi, y_hi));
    }
    acc_lo = _mm_add_epi32(acc_lo, acc_hi);

    /* One more block of eight samples if at least that many remain. */
    if (N - pos >= 8)
    {
        const __m128i x_rem = _mm_loadu_si128((const __m128i *)(x + pos));
        const __m128i y_rem = _mm_loadu_si128((const __m128i *)(y + pos));

        acc_lo = _mm_add_epi32(acc_lo, _mm_madd_epi16(x_rem, y_rem));
        pos += 8;
    }

    /* Horizontal sum: fold the upper 64 bits onto the lower, then fold
     * dword 1 onto dword 0 (0x0E moves words 2,3 into words 0,1). */
    acc_lo = _mm_add_epi32(acc_lo, _mm_unpackhi_epi64(acc_lo, acc_lo));
    acc_lo = _mm_add_epi32(acc_lo, _mm_shufflelo_epi16(acc_lo, 0x0E));
    total = _mm_cvtsi128_si32(acc_lo);

    /* Scalar tail. */
    for (; pos < N; pos++) {
        total = silk_SMLABB(total, x[pos], y[pos]);
    }

    return total;
}
示例#19
0
// Subtracts, in place and modulo 256, byte 1 of every 32-bit pixel (green in
// the a/r/g/b layout) from bytes 0 and 2 of the same pixel. Four pixels are
// processed per iteration; the rest is passed to the plain-C version.
static void SubtractGreenFromBlueAndRed(uint32_t* argb_data, int num_pixels) {
  uint32_t* px = argb_data;
  int remaining = num_pixels;
  while (remaining >= 4) {
    const __m128i argb = _mm_loadu_si128((__m128i*)px);
    // High byte of every 16-bit word: 0 a 0 g.
    const __m128i high_bytes = _mm_srli_epi16(argb, 8);
    // Copy the green word over the alpha word of each pixel: 0 g 0 g.
    const __m128i green_pairs = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(high_bytes, _MM_SHUFFLE(2, 2, 0, 0)),
        _MM_SHUFFLE(2, 2, 0, 0));
    _mm_storeu_si128((__m128i*)px, _mm_sub_epi8(argb, green_pairs));
    px += 4;
    remaining -= 4;
  }
  // fallthrough and finish off with plain-C
  VP8LSubtractGreenFromBlueAndRed_C(px, remaining);
}
示例#20
0
// Copies a width x height x depth block of 32-bit pixels from 'input' to
// 'output', exchanging bytes 0 and 2 of every pixel (RGBA8 -> BGRA8). Each
// row is handled in three phases: scalar until the destination is 16-byte
// aligned, four pixels at a time with SSE2, then scalar for the remainder.
void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
#if defined(_M_ARM)
    // Ensure that this function is reported as not implemented for ARM builds because
    // the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#else
    const __m128i redBlueMask = _mm_set1_epi32(0x00ff00ff);

    for (size_t slice = 0; slice < depth; slice++)
    {
        for (size_t row = 0; row < height; row++)
        {
            const uint32_t *src = OffsetDataPointer<uint32_t>(input, row, slice, inputRowPitch, inputDepthPitch);
            uint32_t *dst = OffsetDataPointer<uint32_t>(output, row, slice, outputRowPitch, outputDepthPitch);

            size_t col = 0;

            // Phase 1: scalar pixels until the output address is 16-byte aligned.
            while ((reinterpret_cast<intptr_t>(&dst[col]) & 15) != 0 && col < width)
            {
                const uint32_t pixel = src[col];
                dst[col] = (_rotl(pixel, 16) & 0x00ff00ff) | (pixel & 0xff00ff00);
                col++;
            }

            // Phase 2: four pixels per iteration, unaligned load / aligned store.
            while (col + 3 < width)
            {
                const __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&src[col]));
                // Bytes 0 and 2 of each pixel, exchanged by swapping adjacent words.
                const __m128i redBlue = _mm_and_si128(pixels, redBlueMask);
                const __m128i redBlueSwapped = _mm_shufflehi_epi16(
                    _mm_shufflelo_epi16(redBlue, _MM_SHUFFLE(2, 3, 0, 1)), _MM_SHUFFLE(2, 3, 0, 1));
                // Bytes 1 and 3 of each pixel do not move.
                const __m128i greenAlpha = _mm_andnot_si128(redBlueMask, pixels);
                _mm_store_si128(reinterpret_cast<__m128i*>(&dst[col]), _mm_or_si128(greenAlpha, redBlueSwapped));
                col += 4;
            }

            // Phase 3: leftover pixels.
            while (col < width)
            {
                const uint32_t pixel = src[col];
                dst[col] = (_rotl(pixel, 16) & 0x00ff00ff) | (pixel & 0xff00ff00);
                col++;
            }
        }
    }
#endif
}
示例#21
0
文件: image.hpp 项目: Sur3/FLIF
    void store(uint16_t *p) const{
        assert(((uintptr_t)p & 7) == 0);//assert aligned
        //Narrows the four 32-bit lanes of vec to 16 bits each (keeping the low 16 bits
        //of every lane, no saturation) and writes the resulting 8 bytes to p.
        //_mm_packus_epi32 (pack with unsigned saturation) would do this in one step,
        //but it is not in SSE2 (2001); it requires SSE 4.1 (2007).

        //Each letter is one byte, most significant lane on the left:
        //vec:  AAAABBBBCCCCDDDD  input vector
        //slli: AA__BB__CC__DD__  bitshift each lane left by 16
        //srli: __________AA__BB  byteshift the whole register right by 10
        //_or_: AA__BB__CCAADDBB  OR together
        //shuf: AA__BB__AABBCCDD  reshuffle low half: {[2], [0], [3], [1]} : 10 00 11 01 : 0x8D
        //storel:       AABBCCDD  store low half
        const __m128i shifted = _mm_slli_epi32(vec,16);
        const __m128i merged = _mm_or_si128(shifted,_mm_srli_si128(shifted,10));
        const __m128i ordered = _mm_shufflelo_epi16(merged,0x8D);
        _mm_storel_epi64((__m128i*)p,ordered);
    }
示例#22
0
// Adds, modulo 256, byte 1 of every 32-bit pixel of 'src' (green in the
// a/r/g/b layout) to bytes 0 and 2 of the same pixel and writes the result
// to 'dst'. Four pixels are processed per iteration; any remainder is
// passed to the plain-C version.
static void AddGreenToBlueAndRed_SSE2(const uint32_t* const src, int num_pixels,
                                      uint32_t* dst) {
  int done = 0;
  while (num_pixels - done >= 4) {
    const __m128i argb = _mm_loadu_si128((const __m128i*)(src + done));
    // High byte of every 16-bit word: 0 a 0 g.
    const __m128i high_bytes = _mm_srli_epi16(argb, 8);
    // Copy the green word over the alpha word of each pixel: 0 g 0 g.
    const __m128i green_pairs = _mm_shufflehi_epi16(
        _mm_shufflelo_epi16(high_bytes, _MM_SHUFFLE(2, 2, 0, 0)),
        _MM_SHUFFLE(2, 2, 0, 0));
    _mm_storeu_si128((__m128i*)(dst + done), _mm_add_epi8(argb, green_pairs));
    done += 4;
  }
  // fallthrough and finish off with plain-C
  if (done != num_pixels) {
    VP8LAddGreenToBlueAndRed_C(src + done, num_pixels - done, dst + done);
  }
}
示例#23
0
/* Copies `size` bytes of 4-byte pixels from `source` to `dest`, exchanging
 * bytes 0 and 2 of every pixel. Data is handled in whole 16-byte blocks; the
 * trailing size % 16 bytes are not touched. Both pointers must be 16-byte
 * aligned, because an aligned load and a non-temporal (streaming) store are
 * used. */
void unpack_rgba8_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	const Uint32 blocks = size / 16;
	Uint32 block;

	for (block = 0; block < blocks; block++)
	{
		const __m128i pixels = _mm_load_si128((__m128i*)&source[block * 16]);
		/* low byte of every 16-bit word: bytes 0 and 2 of each pixel */
		__m128i swapped = _mm_and_si128(pixels, _mm_set1_epi16(0x00FF));
		/* high byte of every 16-bit word: bytes 1 and 3, kept in place */
		const __m128i kept = _mm_and_si128(pixels, _mm_set1_epi16(0xFF00));

		/* exchange adjacent 16-bit words, i.e. bytes 0 and 2 of each pixel */
		swapped = _mm_shufflehi_epi16(
			_mm_shufflelo_epi16(swapped, _MM_SHUFFLE(2, 3, 0, 1)),
			_MM_SHUFFLE(2, 3, 0, 1));

		_mm_stream_si128((__m128i*)&dest[block * 16], _mm_or_si128(swapped, kept));
	}
}
示例#24
0
/**
 * Sum of absolute transformed differences of two blocks of 16 pixels each
 * (a 4x4 block stored contiguously, going by the function name).
 *
 * Sixteen bytes are read from both `org` and `cur`. The differences
 * cur - org go through four rounds of pairwise horizontal add/subtract,
 * the absolute values of the results are summed, and the function returns
 * (sum + 1) >> 1. The sum is taken from a 16-bit lane.
 *
 * NOTE(review): KVZ_PERMUTE is defined outside this block; presumably it
 * builds a shuffle immediate from four lane indices, lowest lane first --
 * confirm.
 */
static unsigned satd_8bit_4x4_avx2(const kvz_pixel *org, const kvz_pixel *cur)
{

  // First 8 pixels of each block, widened from 8 to 16 bits.
  __m128i original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)org));
  __m128i current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)cur));

  __m128i diff_lo = _mm_sub_epi16(current, original);

  // Remaining 8 pixels.
  original = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(org + 8)));
  current = _mm_cvtepu8_epi16(_mm_loadl_epi64((__m128i*)(cur + 8)));

  __m128i diff_hi = _mm_sub_epi16(current, original);


  //Hor
  __m128i row0 = _mm_hadd_epi16(diff_lo, diff_hi);
  __m128i row1 = _mm_hsub_epi16(diff_lo, diff_hi);

  __m128i row2 = _mm_hadd_epi16(row0, row1);
  __m128i row3 = _mm_hsub_epi16(row0, row1);

  //Ver
  row0 = _mm_hadd_epi16(row2, row3);
  row1 = _mm_hsub_epi16(row2, row3);

  row2 = _mm_hadd_epi16(row0, row1);
  row3 = _mm_hsub_epi16(row0, row1);

  //Abs and sum
  row2 = _mm_abs_epi16(row2);
  row3 = _mm_abs_epi16(row3);

  row3 = _mm_add_epi16(row2, row3);

  // Horizontal reduction of the eight 16-bit partial sums into lane 0.
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(2, 3, 0, 1) ));
  row3 = _mm_add_epi16(row3, _mm_shuffle_epi32(row3, KVZ_PERMUTE(1, 0, 1, 0) ));
  row3 = _mm_add_epi16(row3, _mm_shufflelo_epi16(row3, KVZ_PERMUTE(1, 0, 1, 0) ));

  unsigned sum = _mm_extract_epi16(row3, 0);
  // Halve with rounding.
  unsigned satd = (sum + 1) >> 1;

  return satd;
}
示例#25
0
// Applies three add/subtract stages to the eight 16-bit lanes of *row, in
// place. In every stage the lanes selected by a sign mask are negated and a
// permuted copy of the stage input is added; the masks alternate in groups of
// four, two and one lane respectively.
// NOTE(review): KVZ_PERMUTE is defined outside this block; presumably it
// builds a shuffle immediate from four lane indices, lowest lane first, which
// would make each stage a butterfly between 64-, 32- and 16-bit neighbours --
// confirm.
static INLINE void hor_transform_row_avx2(__m128i* row){

  const __m128i plus_one = _mm_set1_epi16(1);
  const __m128i minus_one = _mm_set1_epi16(-1);
  __m128i v = *row;
  __m128i partner;

  // Stage 1: sign mask is +1 for the low four lanes, -1 for the high four.
  partner = _mm_shuffle_epi32(v, KVZ_PERMUTE(2, 3, 0, 1));
  v = _mm_add_epi16(_mm_sign_epi16(v, _mm_unpacklo_epi64(plus_one, minus_one)), partner);

  // Stage 2: sign mask alternates every two lanes.
  partner = _mm_shuffle_epi32(v, KVZ_PERMUTE(1, 0, 3, 2));
  v = _mm_add_epi16(_mm_sign_epi16(v, _mm_unpacklo_epi32(plus_one, minus_one)), partner);

  // Stage 3: sign mask alternates every lane; permute both halves word-wise.
  partner = _mm_shufflehi_epi16(_mm_shufflelo_epi16(v, KVZ_PERMUTE(1,0,3,2)), KVZ_PERMUTE(1,0,3,2));
  v = _mm_add_epi16(_mm_sign_epi16(v, _mm_unpacklo_epi16(plus_one, minus_one)), partner);

  *row = v;
}
示例#26
0
//
// Element-wise complex multiplication with a right shift of both result
// components:
//   out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift
//   out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift
// len1 elements are processed; lenout and len2 are not consulted.
// Always returns 0.
//
// History: this used to be v_mul_complex16_shift; it was renamed for
// consistency with v_conj_mul, and because the old v_mul_complex16 was never
// called.
//
FORCE_INLINE
int __ext_v_mul_complex16(struct complex16* out, int lenout,
                struct complex16* x, int len1,
                struct complex16* y, int len2, int shift)
{
  const unum8 wlen = 4;  // complex16 elements per 128-bit vector
  const __m128i keep_lo16 = _mm_set1_epi32(0x0000FFFF);
  const __m128i flip_hi16 = _mm_set1_epi32(0xFFFF0000);
  const __m128i one_hi16 = _mm_set1_epi32(0x00010000);
  const int nvec = len1 / wlen;

  const __m128i* vx_in = (const __m128i*) x;
  const __m128i* vy_in = (const __m128i*) y;
  __m128i* v_out = (__m128i*) out;

  for (int v = 0; v < nvec; v++) {
    const __m128i vx = _mm_loadu_si128(vx_in + v);
    const __m128i vy = _mm_loadu_si128(vy_in + v);

    // x with the upper 16 bits of every 32-bit lane negated (invert, add one).
    // NOTE(review): this matches the scalar tail only if complex16 stores
    // .re in the low and .im in the high half of each lane -- confirm.
    const __m128i x_neg_hi = _mm_add_epi32(_mm_xor_si128(vx, flip_hi16), one_hi16);

    // x with the two 16-bit halves of every 32-bit lane exchanged.
    const __m128i x_swapped = _mm_shufflelo_epi16(
        _mm_shufflehi_epi16(vx, _MM_SHUFFLE(2, 3, 0, 1)),
        _MM_SHUFFLE(2, 3, 0, 1));

    // 32-bit products, arithmetically shifted, then truncated to 16 bits.
    const __m128i re16 = _mm_and_si128(
        _mm_srai_epi32(_mm_madd_epi16(x_neg_hi, vy), shift), keep_lo16);
    const __m128i im16 = _mm_and_si128(
        _mm_srai_epi32(_mm_madd_epi16(x_swapped, vy), shift), keep_lo16);

    // Real part in the low half, imaginary part in the high half of each lane.
    _mm_storeu_si128(v_out + v, _mm_or_si128(re16, _mm_slli_epi32(im16, 0x10)));
  }

  // Scalar tail for the elements that do not fill a whole vector.
  for (int i = nvec * wlen; i < len1; i++) {
    out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift;
    out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift;
  }

  return 0;
}
示例#27
0
// Copies `height` rows of `width` 32-bit pixels from `input` to `output`,
// exchanging bytes 0 and 2 of every pixel and leaving bytes 1 and 3 as they
// are. Rows are `inputPitch` / `outputPitch` bytes apart.
//
// Each row is handled in three phases: a scalar prologue that runs until the
// destination address is 16-byte aligned, an SSE2 body converting four pixels
// per iteration (unaligned load, aligned store), and a scalar epilogue for
// whatever is left.
void Image::loadRGBAUByteDataSSE2(GLsizei width, GLsizei height,
                                  int inputPitch, const void *input, size_t outputPitch, void *output) const
{
    const __m128i swapMask = _mm_set1_epi32(0x00ff00ff);
    const unsigned char *const inputBytes = static_cast<const unsigned char*>(input);
    unsigned char *const outputBytes = static_cast<unsigned char*>(output);

    for (int row = 0; row < height; ++row)
    {
        const unsigned int *src = reinterpret_cast<const unsigned int*>(inputBytes + row * inputPitch);
        unsigned int *dst = reinterpret_cast<unsigned int*>(outputBytes + row * outputPitch);
        int col = 0;

        // Prologue: one pixel at a time until dst + col sits on a 16-byte boundary.
        while (col < width && (reinterpret_cast<intptr_t>(dst + col) & 15) != 0)
        {
            const unsigned int pixel = src[col];
            dst[col] = (_rotl(pixel, 16) & 0x00ff00ff) | (pixel & 0xff00ff00);
            ++col;
        }

        // Body: four pixels per iteration.
        while (col + 3 < width)
        {
            const __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(src + col));
            // Bytes 1 and 3 of each pixel pass through unchanged.
            const __m128i kept = _mm_andnot_si128(swapMask, pixels);
            // Bytes 0 and 2 each sit alone in a 16-bit word, so exchanging
            // neighbouring words exchanges them.
            const __m128i movable = _mm_and_si128(pixels, swapMask);
            const __m128i lowSwapped = _mm_shufflelo_epi16(movable, _MM_SHUFFLE(2, 3, 0, 1));
            const __m128i swapped = _mm_shufflehi_epi16(lowSwapped, _MM_SHUFFLE(2, 3, 0, 1));
            _mm_store_si128(reinterpret_cast<__m128i*>(dst + col), _mm_or_si128(kept, swapped));
            col += 4;
        }

        // Epilogue: the remaining (fewer than four) pixels.
        while (col < width)
        {
            const unsigned int pixel = src[col];
            dst[col] = (_rotl(pixel, 16) & 0x00ff00ff) | (pixel & 0xff00ff00);
            ++col;
        }
    }
}
示例#28
0
文件: enc_sse2.c 项目: 0309/cocos2d-x
// Simple quantization of one 16-coefficient block using SSE2.
//
// in:  16 input coefficients; overwritten with the dequantized values
//      (quantized level * mtx->q_), forced to zero wherever the sharpened,
//      clamped magnitude does not exceed mtx->zthresh_.
// out: receives the 16 quantized levels, reordered by the zigzag shuffle
//      sequence near the end of the function.
// n:   when non-zero, the first output level is excluded from the
//      "any non-zero level" test that forms the return value.
// mtx: supplies the sharpen_, iq_, bias_, q_ and zthresh_ tables, each read
//      as 16 consecutive 16-bit entries.
//
// Returns non-zero if at least one considered output level is non-zero.
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs. The "0" variables hold entries 0..7, the "8" variables
  // hold entries 8..15.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // The 16x16 -> 32 bit products are built from the separate high and low
    // 16-bit halves, then interleaved into four vectors of four 32-bit values.
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b (zero-extended)
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b (signed saturation)
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  // 'coeff' here is the sharpened magnitude after the clamp to 2047.
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    // The dequantized values are written back over the input array.
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    // Saturate every level to 8 bits so all 16 fit in one register for the
    // zero test below; a non-zero level stays non-zero after saturation.
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  // Finish the zigzag in memory by exchanging entries 3 and 12, which the
  // shuffles above leave in each other's place.
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      // Clear the lowest packed byte, i.e. ignore the first output level.
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
示例#29
0
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
	if (!pcszText || !*pcszText) return;

	// Find out actual size of text
	int nChars = _tcslen(pcszText);
	uFlags |= DT_NOCLIP;

	int iX = rc.left;
	int iY = rc.top;
	int iXW = (rc.right - iX);
	int iYH = (rc.bottom - iY);

	RECT rcMin = rc;
	if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
		int iMinXW = rcMin.right - rcMin.left;
		int iMinYH = rcMin.bottom - rcMin.top;
		if (iMinXW < iXW) {
			if (uFlags & DT_CENTER) {
				iX += (iXW - iMinXW)/2;
				uFlags &= ~DT_CENTER;
			} else if (uFlags & DT_RIGHT) {
				iX += (iXW - iMinXW);
				uFlags &= ~DT_RIGHT;
			}
			iXW = iMinXW;
		}
		if (iMinYH < iYH) {
			if (uFlags & DT_SINGLELINE) {
				if (uFlags & DT_VCENTER) {
					iY += (iYH - iMinYH)/2;
					uFlags &= ~DT_VCENTER;
				} else if (uFlags & DT_BOTTOM) {
					iY += (iYH - iMinYH);
					uFlags &= ~DT_BOTTOM;
				}
			}
			iYH = iMinYH;
		}
	}

	iXW += 2;	// NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look bette!
	iYH += 2;

	// Ensure we have a big enough DIB to draw the text to
	if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH);
	if (!hbmpTextDIB) return;

	// Select color
	ieBGRA clr;
	switch (eColor) {
	case eFileName:	clr = clrFileName;		break;
	case eComment:	clr = clrComment;		break;
	case eFileInfo:	clr = clrFileInfo;		break;
	default:		clr = ieBGRA(0,0,0);	break;
	}
	clr.A = 0xFF - clrBkg.A;

	// Draw the text to in-memory DIB
	RECT rcTextDIB = { 0, 0, iXW, iYH };
	FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);

	rcTextDIB.left++;
	rcTextDIB.top++;

	DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

	// Modify DIB:
#ifndef __X64__
	if (g_bSSE2) 
#endif
	{
		__m128i r0, r1, r2, r3, r4, r5, r6, r7;

		r7 = _mm_setzero_si128();									// 0
		r6 = _mm_set1_epi32(clr.dw);								// CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB
		r6 = _mm_unpacklo_epi8(r7, r6);								// CA<<8   CR<<8   CG<<8   CB<<8   CA<<8   CR<<8   CG<<8   CB<<8
		r5 = _mm_set1_epi16(1);										// 1       1       1       1       1       1       1       1
		r4 = _mm_set1_epi32(0xFF);									// FF              FF              FF              FF
		r3 = _mm_set1_epi32(clrBkg.dw);								// DA  0   0   0   DA  0   0   0   DA  0   0   0   DA  0   0   0

		ieBGRA *py = pTextDIB;
		for (int y = iYH; y--; py += iTextDIBXW) {
			ieBGRA *px = py;

			for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {

				r0 = _mm_load_si128((__m128i *)px);
				r1 = r0;
				r2 = r0;											// X3  R3  G3  B3  X2  R2  G2  B2  X1  R1  G1  B1  X0  R0  G0  B0 
				r0 = _mm_srli_epi32(r0, 16);						// 0   0   X3  R3  0   0   X2  R2  0   0   X1  R1  0   0   X0  R0 
				r1 = _mm_srli_epi32(r1, 8);							// 0   X3  R3  G3  0   X2  R2  G2  0   X1  R1  G1  0   X0  R0  G0 
				r0 = _mm_max_epu8(r0, r2);
				r0 = _mm_max_epu8(r0, r1);							// x   x   x   A3  x   x   x   A2  x   x   x   A1  x   x   x   A0
				r0 = _mm_and_si128(r0, r4);							// 0       A3      0       A2      0       A1      0       A0
				r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
				r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));	// A3      A3      A2      A2      A1      A1      A0      A0
				r1 = r0;
				r0 = _mm_unpacklo_epi32(r0, r0);					// A1      A1      A1      A1      A0      A0      A0      A0
				r1 = _mm_unpackhi_epi32(r1, r1);					// A3      A3      A3      A3      A2      A2      A2      A2
				r0 = _mm_add_epi16(r0, r5);							// A1'     A1'     A1'     A1'     A0'     A0'     A0'     A0' 
				r1 = _mm_add_epi16(r1, r5);							// A3'     A3'     A3'     A3'     A2'     A2'     A2'     A2' 
				r0 = _mm_mulhi_epu16(r0, r6);						// xA1"    xR1     xG1     xB1     xA0"    xR0     xG0     xB0
				r1 = _mm_mulhi_epu16(r1, r6);						// xA3"    xR3     xG3     xB3     xA2"    xR2     xG2     xB2
				r0 = _mm_packus_epi16(r0, r1);						// xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0
				r0 = _mm_adds_epu8(r0, r3);							// xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0
				_mm_store_si128((__m128i *)px, r0);
			}
		}
	}
#ifndef __X64__
	else {
示例#30
0
/*
 * SSE2 blend of two images of 32-bit pixels, processed row by row.
 *
 * For every 8-bit channel the vector path computes
 *     (((A1 + 1) * (S1 - S2)) >> 8) + S2        (low 8 bits kept)
 * where S1 / S2 are the channel values from pSrc1 / pSrc2 and A1 is the byte
 * at offset 3 of the pSrc1 pixel (the alpha channel, going by the function
 * name - NOTE(review): confirm the byte order against the callers).
 *
 * pSrc1, src1Step   first source image and its row stride in bytes
 * pSrc2, src2Step   second source image and its row stride in bytes
 * pDst, dstStep     destination image and its row stride in bytes
 * width, height     image size in pixels
 *
 * Images narrower than 4 pixels are handed to generic->alphaComp_argb in one
 * call. Otherwise each row is split into a lead-in (generic, until the
 * destination is 16-byte aligned), an SSE2 body of 4 pixels per iteration,
 * and a generic tail for the remaining pixels.
 *
 * Returns PRIMITIVES_SUCCESS, or the first non-success status returned by the
 * generic implementation.
 *
 * The register diagrams in the loop body are the original author's schematic
 * notation; the letter suffix names the pixel each byte belongs to.
 */
pstatus_t sse2_alphaComp_argb(
    const BYTE* pSrc1,  UINT32 src1Step,
    const BYTE* pSrc2,  UINT32 src2Step,
    BYTE* pDst,  UINT32 dstStep,
    UINT32 width,  UINT32 height)
{
	const UINT32* sptr1 = (const UINT32*) pSrc1;
	const UINT32* sptr2 = (const UINT32*) pSrc2;
	UINT32* dptr;
	int linebytes, src1Jump, src2Jump, dstJump;
	UINT32 y;
	__m128i xmm0, xmm1;

	/* Nothing to do for an empty image. */
	if ((width <= 0) || (height <= 0)) return PRIMITIVES_SUCCESS;

	if (width < 4)     /* pointless if too small */
	{
		return generic->alphaComp_argb(pSrc1, src1Step, pSrc2, src2Step,
					       pDst, dstStep, width, height);
	}

	dptr = (UINT32*) pDst;
	linebytes = width * sizeof(UINT32);
	/* Padding between the end of one row and the start of the next,
	 * expressed in pixels, for each of the three buffers. */
	src1Jump = (src1Step - linebytes) / sizeof(UINT32);
	src2Jump = (src2Step - linebytes) / sizeof(UINT32);
	dstJump  = (dstStep  - linebytes) / sizeof(UINT32);
	/* xmm0 = all zeroes (used to widen bytes to 16-bit lanes),
	 * xmm1 = 1 in every 16-bit lane (the "+1" applied to alpha). */
	xmm0 = _mm_set1_epi32(0);
	xmm1 = _mm_set1_epi16(1);

	for (y = 0; y < height; ++y)
	{
		int pixels = width;
		int count;
		/* Get to the 16-byte boundary now. */
		int leadIn = 0;

		/* Number of pixels to process with the generic routine so that
		 * dptr becomes 16-byte aligned for the aligned stores below. */
		switch ((ULONG_PTR) dptr & 0x0f)
		{
			case 0:
				leadIn = 0;
				break;

			case 4:
				leadIn = 3;
				break;

			case 8:
				leadIn = 2;
				break;

			case 12:
				leadIn = 1;
				break;

			default:
				/* We'll never hit a 16-byte boundary, so do the whole
				 * thing the slow way.
				 */
				leadIn = width;
				break;
		}

		if (leadIn)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1,
						src1Step, (const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, leadIn, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += leadIn;
			sptr2 += leadIn;
			dptr  += leadIn;
			pixels -= leadIn;
		}

		/* Use SSE registers to do 4 pixels at a time. */
		count = pixels >> 2;
		pixels -= count << 2;

		while (count--)
		{
			__m128i xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
			/* Only dptr has been aligned by the lead-in; LOAD_SI128 is
			 * presumably a load that tolerates unaligned sources -
			 * NOTE(review): confirm against the macro definition. */
			/* BdGdRdAdBcGcRcAcBbGbRbAbBaGaRaAa */
			xmm2 = LOAD_SI128(sptr1);
			sptr1 += 4;
			/* BhGhRhAhBgGgRgAgBfGfRfAfBeGeReAe */
			xmm3 = LOAD_SI128(sptr2);
			sptr2 += 4;
			/* --- upper two pixels of the register --- */
			/* 00Bb00Gb00Rb00Ab00Ba00Ga00Ra00Aa */
			xmm4 = _mm_unpackhi_epi8(xmm2, xmm0);
			/* 00Bf00Gf00Rf00Af00Be00Ge00Re00Ae */
			xmm5 = _mm_unpackhi_epi8(xmm3, xmm0);
			/* subtract: S1 - S2 per channel */
			xmm6 = _mm_subs_epi16(xmm4, xmm5);
			/* Broadcast 16-bit lane 3 (alpha) across the lower pixel... */
			/* 00Bb00Gb00Rb00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflelo_epi16(xmm4, 0xff);
			/* ...and across the upper pixel. */
			/* 00Ab00Ab00Ab00Ab00Aa00Aa00Aa00Aa */
			xmm4 = _mm_shufflehi_epi16(xmm4, 0xff);
			/* Add one to alphas */
			xmm4 = _mm_adds_epi16(xmm4, xmm1);
			/* Multiply and take low word */
			xmm4 = _mm_mullo_epi16(xmm4, xmm6);
			/* Shift 8 right */
			xmm4 = _mm_srai_epi16(xmm4, 8);
			/* Add xmm5 (the S2 channels) */
			xmm4 = _mm_adds_epi16(xmm4, xmm5);
			/* 00Bj00Gj00Rj00Aj00Bi00Gi00Ri00Ai */
			/* --- lower two pixels of the register, same steps --- */
			/* 00Bd00Gd00Rd00Ad00Bc00Gc00Rc00Ac */
			xmm5 = _mm_unpacklo_epi8(xmm2, xmm0);
			/* 00Bh00Gh00Rh00Ah00Bg00Gg00Rg00Ag */
			xmm6 = _mm_unpacklo_epi8(xmm3, xmm0);
			/* subtract: S1 - S2 per channel */
			xmm7 = _mm_subs_epi16(xmm5, xmm6);
			/* 00Bd00Gd00Rd00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflelo_epi16(xmm5, 0xff);
			/* 00Ad00Ad00Ad00Ad00Ac00Ac00Ac00Ac */
			xmm5 = _mm_shufflehi_epi16(xmm5, 0xff);
			/* Add one to alphas */
			xmm5 = _mm_adds_epi16(xmm5, xmm1);
			/* Multiply and take low word */
			xmm5 = _mm_mullo_epi16(xmm5, xmm7);
			/* Shift 8 right */
			xmm5 = _mm_srai_epi16(xmm5, 8);
			/* Add xmm6 (the S2 channels) */
			xmm5 = _mm_adds_epi16(xmm5, xmm6);
			/* 00Bl00Gl00Rl00Al00Bk00Gk00Rk00Ak */
			/* Must mask off remainders or pack gets confused:
			 * the unsigned-saturating pack would otherwise clamp lanes
			 * whose high byte is non-zero instead of keeping the low byte. */
			xmm3 = _mm_set1_epi16(0x00ffU);
			xmm4 = _mm_and_si128(xmm4, xmm3);
			xmm5 = _mm_and_si128(xmm5, xmm3);
			/* Lower-half results go to the low 8 bytes, upper-half
			 * results to the high 8 bytes, restoring pixel order. */
			/* BlGlRlAlBkGkRkAkBjGjRjAjBiGiRiAi */
			xmm5 = _mm_packus_epi16(xmm5, xmm4);
			/* Aligned store: dptr was aligned by the lead-in above. */
			_mm_store_si128((__m128i*) dptr, xmm5);
			dptr += 4;
		}

		/* Finish off the remainder. */
		if (pixels)
		{
			pstatus_t status;
			status = generic->alphaComp_argb((const BYTE*) sptr1, src1Step,
						(const BYTE*) sptr2, src2Step,
						(BYTE*) dptr, dstStep, pixels, 1);
			if (status != PRIMITIVES_SUCCESS)
				return status;

			sptr1 += pixels;
			sptr2 += pixels;
			dptr  += pixels;
		}

		/* Jump to next row. */
		sptr1 += src1Jump;
		sptr2 += src2Jump;
		dptr  += dstJump;
	}

	return PRIMITIVES_SUCCESS;
}