Пример #1
0
/*
    and   : 5
    or    : 1
    add   : 2
    shift : 0
    mul   : 4
*/
void blend_sse_2() {

    /* Constant-weight blend of two byte streams, 16 bytes per iteration:
     *   data = (imgA * alpha + imgB * (255 - alpha)) >> 8   (per byte, approx.)
     * NOTE(review): relies on file-scope alpha/width/height/imgA/imgB/data;
     * _mm_load_si128/_mm_store_si128 require 16-byte-aligned buffers and a
     * byte count that is a multiple of 16 -- confirm at the definitions. */
    const uint16_t alpha0 = alpha;
    const uint16_t alpha1 = 255 - alpha;

    /* size_t: fixes the old signed/unsigned mismatch against the size_t loop
     * counter and avoids int overflow of width*height*4 for very large
     * images (was 'const int n'). */
    const size_t n = (size_t)width * height * 4;

    /* PMULHUW yields (x * w) >> 16, so pre-shifting each weight left by 8
     * turns it into the per-byte scale (x * w) >> 8. */
    const __m128i v_alpha0 = _mm_set1_epi16(alpha0 << 8);
    const __m128i v_alpha1 = _mm_set1_epi16(alpha1 << 8);
    const __m128i mask00ff = _mm_set1_epi16((int16_t)0x00ff);
    const __m128i maskff00 = _mm_set1_epi16((int16_t)0xff00);

    for (size_t i = 0; i < n; i += 16) {
        __m128i A = _mm_load_si128((__m128i*)(imgA + i));
        __m128i B = _mm_load_si128((__m128i*)(imgB + i));

        /* Split every 16-bit lane into its even (low) and odd (high) byte. */
        __m128i A0 = _mm_and_si128(A, mask00ff);
        __m128i B0 = _mm_and_si128(B, mask00ff);
        __m128i A1 = _mm_and_si128(A, maskff00);
        __m128i B1 = _mm_and_si128(B, maskff00);

        /* Even bytes: (b << 0) * (w << 8) >> 16 == (b * w) >> 8, result lands
         * in the low byte of each lane. */
        A0 = _mm_mulhi_epu16(A0, v_alpha0);
        B0 = _mm_mulhi_epu16(B0, v_alpha1);

        /* Odd bytes: (b << 8) * (w << 8) >> 16 == b * w, whose high byte is
         * the blended value and already sits in the odd-byte position. */
        A1 = _mm_mulhi_epu16(A1, v_alpha0);
        B1 = _mm_mulhi_epu16(B1, v_alpha1);

        /* Sums stay < 65536 because alpha0 + alpha1 == 255, so no lane can
         * carry into its neighbour; the ff00 mask drops the fractional byte
         * of the odd-byte sum before recombining. */
        __m128i R0 = _mm_add_epi16(A0, B0);
        __m128i R1 = _mm_and_si128(_mm_add_epi16(A1, B1), maskff00);

        _mm_store_si128((__m128i*)(data + i), _mm_or_si128(R0, R1));
    }
}
Пример #2
0
// Converts an integer in [0, 99999999] into eight decimal digits held as the
// eight u16 lanes [ a, b, c, d, e, f, g, h ] of the returned __m128i
// (classic SSE2 divmod-by-powers-of-ten itoa technique).
// NOTE(review): kDiv10000Vector, k10000Vector, kDivPowersVector,
// kShiftPowersVector and k10Vector are external constant tables not visible
// here; correctness depends on them matching the reference SSE2 itoa
// constants (e.g. kDiv10000 ~ ceil(2^45 / 10000)) -- confirm.
inline __m128i Convert8DigitsSSE2(uint32_t value) {
	assert(value <= 99999999);

	// abcd, efgh = abcdefgh divmod 10000 
	const __m128i abcdefgh = _mm_cvtsi32_si128(value);
	// Division by 10000 done as a multiply-high: (value * kDiv10000) >> 45.
	const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45);
	// Remainder: value - abcd * 10000.
	const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0]));

	// v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ]
	const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh);

	// v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ]
	// (pre-scale feeding the fixed-point divisions below)
	const __m128i v1a = _mm_slli_epi64(v1, 2);

	// v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ]
	const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a);
	const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a);

	// v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ]
	// (two multiply-high steps so each lane is divided by its own power of 10)
	const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]);
	const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]);

	// v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ]
	const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]);

	// v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ]
	const __m128i v6 = _mm_slli_epi64(v5, 16);

	// v7 = v4 - v6 = { a, b, c, d, e, f, g, h }
	// (subtracting 10 * the previous prefix leaves one digit per lane)
	const __m128i v7 = _mm_sub_epi16(v4, v6);

	return v7;
}
Пример #3
0
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
                       const __m128i mul_constants_0,
                       const __m128i mul_constants_1, const int strength,
                       const int rounding, const int weight) {
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);
  __m128i input_0, input_1;

  input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
  input_0 = _mm_adds_epu16(input_0, rounding_u16);

  input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
  input_1 = _mm_adds_epu16(input_1, rounding_u16);

  input_0 = _mm_srl_epi16(input_0, strength_u128);
  input_1 = _mm_srl_epi16(input_1, strength_u128);

  input_0 = _mm_min_epu16(input_0, sixteen);
  input_1 = _mm_min_epu16(input_1, sixteen);
  input_0 = _mm_sub_epi16(sixteen, input_0);
  input_1 = _mm_sub_epi16(sixteen, input_1);

  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
Пример #4
0
// Unpacks 16-bit 1.5.5.5 RGB5A1 pixels to 8 bits per channel,
// 4 source pixels (8 bytes) -> 16 destination bytes per iteration.
// NOTE(review): _mm_stream_si128 requires 'dest' to be 16-byte aligned, and
// a tail of size % 8 source bytes is not handled here -- confirm call sites.
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);

		// Replicate each 16-bit pixel into four consecutive u16 lanes,
		// one lane per output channel.
		t0 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_unpacklo_epi16(t0, t0);
		// Isolate one field per lane (the alpha bit plus the three 5-bit
		// channels), then shift it to the top of its lane via the mullo.
		t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
		t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
		// Two multiply-high steps scale a top-aligned 5-bit field v to
		// 8 bits: first (v<<11)*0x0260 >> 16 == v*19, then
		// v*19*0x6ED5 >> 16, which together approximate v * 255 / 31.
		// The alpha bit goes through 0x0200 then 0xFF00, mapping to 0/255.
		t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
		t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));
		// Same pipeline for the upper two pixels of the group.
		t2 = _mm_unpackhi_epi16(t0, t0);
		t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00));
		t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002));
		t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260));
		t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5));
		t1 = _mm_packus_epi16(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Пример #5
0
// Horizontal-shrink row import for the WebP rescaler: sums groups of
// 4-channel 8-bit source pixels and emits one fixed-point output pixel into
// 'frow' per x_sub-wide input span, splitting the boundary source pixel
// between neighbouring outputs via fx_scale.
// Falls back to the C implementation when the row is not 4-channel or the
// reduction ratio exceeds 1/128 (16-bit sums would overflow, see below).
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk,
                                         const uint8_t* src) {
  const int x_sub = wrk->x_sub;
  int accum = 0;
  const __m128i zero = _mm_setzero_si128();
  const __m128i mult0 = _mm_set1_epi16(x_sub);
  const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale);
  const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER);
  __m128i sum = zero;
  rescaler_t* frow = wrk->frow;
  const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width;

  if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) {
    WebPRescalerImportRowShrink_C(wrk, src);
    return;
  }
  assert(!WebPRescalerInputDone(wrk));
  assert(!wrk->x_expand);

  for (; frow < frow_end; frow += 4) {
    __m128i base = zero;
    accum += wrk->x_add;
    // Accumulate whole source pixels while this output span is still open.
    while (accum > 0) {
      const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src));
      src += 4;
      base = _mm_unpacklo_epi8(A, zero);
      // To avoid overflow, we need: base * x_add / x_sub < 32768
      // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit.
      sum = _mm_add_epi16(sum, base);
      accum -= x_sub;
    }
    {    // Emit next horizontal pixel.
      // -accum (>= 0 here) is the fraction of 'base' owed to the NEXT output.
      const __m128i mult = _mm_set1_epi16(-accum);
      const __m128i frac0 = _mm_mullo_epi16(base, mult);  // 16b x 16b -> 32b
      const __m128i frac1 = _mm_mulhi_epu16(base, mult);
      const __m128i frac = _mm_unpacklo_epi16(frac0, frac1);  // frac is 32b
      const __m128i A0 = _mm_mullo_epi16(sum, mult0);
      const __m128i A1 = _mm_mulhi_epu16(sum, mult0);
      const __m128i B0 = _mm_unpacklo_epi16(A0, A1);      // sum * x_sub
      const __m128i frow_out = _mm_sub_epi32(B0, frac);   // sum * x_sub - frac
      const __m128i D0 = _mm_srli_epi64(frac, 32);
      const __m128i D1 = _mm_mul_epu32(frac, mult1);      // 32b x 16b -> 64b
      const __m128i D2 = _mm_mul_epu32(D0, mult1);
      const __m128i E1 = _mm_add_epi64(D1, rounder);
      const __m128i E2 = _mm_add_epi64(D2, rounder);
      // Carry (frac * fx_scale + ROUNDER) >> 32 into 'sum' for the next
      // output pixel.
      const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2));
      const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2));
      const __m128i G = _mm_unpacklo_epi32(F1, F2);
      sum = _mm_packs_epi32(G, zero);
      _mm_storeu_si128((__m128i*)frow, frow_out);
    }
  }
  assert(accum == 0);
}
Пример #6
0
// Premultiplies ARGB pixels in place by their own alpha, two pixels (8 bytes)
// per iteration; per colour channel x = (x*a*0x0101 + x + round) >> 8, a
// 16-bit approximation of x * a / 255. The scalar C version handles the tail
// and the whole 'inverse' (un-premultiply) direction.
// NOTE(review): u16 lanes 3 and 7 are treated as the alpha lanes (their
// scale is forced to 0x100 and their rounding to 0, so alpha itself passes
// through unchanged) -- this assumes the byte order _mm_unpacklo_epi8
// produces from little-endian ARGB words; confirm.
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      // Widen the 8 bytes to u16 lanes.
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      // Broadcast each pixel's lane 3 (alpha) across its four lanes.
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);  // a * 0x0101
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      // High and low halves of x * scale, recombined with saturation.
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  // Delegate the remainder (and the inverse case entirely) to the C fallback.
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
Пример #7
0
// Multiplies each byte of ptr[0..width) by the matching alpha value,
// i.e. ptr[i] = ptr[i] * alpha[i] / 255 (rounded), eight bytes at a time.
// The scalar fallback WebPMultRowC handles the tail and the 'inverse'
// (division) direction, which is not vectorized here.
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kBlock = 8;
    const __m128i zero = _mm_setzero_si128();
    const __m128i bias = _mm_set1_epi16(1 << 7);  // rounding term for the >>8
    const int last = width & ~(kBlock - 1);
    for (x = 0; x < last; x += kBlock) {
      // Widen 8 pixel bytes and 8 alpha bytes to u16 lanes.
      const __m128i pix8 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i pix16 = _mm_unpacklo_epi8(pix8, zero);
      const __m128i alp8 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i alp16 = _mm_unpacklo_epi8(alp8, zero);    // a
      const __m128i alp257 = _mm_unpacklo_epi8(alp8, alp8);   // a * 257
      // p*a/255 ~= (p*a + ((p*a*257) >> 16) + 128) >> 8, all in u16 lanes.
      const __m128i hi = _mm_mulhi_epu16(pix16, alp257);
      const __m128i lo = _mm_mullo_epi16(pix16, alp16);
      const __m128i acc = _mm_adds_epu16(_mm_adds_epu16(hi, lo), bias);
      const __m128i res = _mm_srli_epi16(acc, 8);
      _mm_storel_epi64((__m128i*)&ptr[x], _mm_packus_epi16(res, zero));
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
Пример #8
0
// Compiler codegen regression test: _mm_mulhi_epu16 must lower to the
// llvm.x86.sse2.pmulhu.w intrinsic and the PMULHUW instruction.
// The DAG/ASM comment lines below are FileCheck directives -- do not edit.
__m128i test_mm_mulhi_epu16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_mulhi_epu16
  // DAG: call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}})
  //
  // ASM-LABEL: test_mm_mulhi_epu16
  // ASM: pmulhuw
  return _mm_mulhi_epu16(A, B);
}
Пример #9
0
/* Alpha-blends two 16-byte-aligned pixel streams, 4 pixels (16 bytes) per
 * iteration, with one 8-bit alpha value per 4-byte pixel:
 *   dest = source0 * (255 - a) + source1 * a       (per channel, approx.)
 * Duplicating a byte v into both halves of a u16 lane yields v * 257; the
 * product of two such lanes taken with PMULHUW and shifted right by 8
 * approximates (v * w) / 255.
 * NOTE(review): 'size' appears to be the pixel count, and source0/source1/
 * dest must be 16-byte aligned for the aligned load/stream intrinsics --
 * confirm at call sites. */
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
	const Uint8* source1, Uint8* dest)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
		t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
		/* Portable bit-reinterpretation of the 4 alpha bytes loaded as a
		 * float lane: the previous C-style (__m128i) cast of a __m128 is
		 * a GCC vector extension and does not compile under MSVC. */
		t2 = _mm_castps_si128(_mm_load_ss((float*)&alpha[i * 4]));

		/* a -> 0xAAAA per u16 lane (a * 257), one lane per alpha byte. */
		t2 = _mm_unpacklo_epi8(t2, t2);
		t2 = _mm_unpacklo_epi16(t2, t2);

		/* Low 8 source bytes, each duplicated into a u16 lane (v * 257). */
		t3 = _mm_unpacklo_epi8(t0, t0);
		t4 = _mm_unpacklo_epi8(t1, t1);

		/* Broadcast each pixel's alpha to its 4 channel lanes;
		 * 0xFFFF - a*257 == (255 - a) * 257, the complementary weight. */
		t5 = _mm_unpacklo_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		/* Saturating add, then drop the 8 fractional bits. */
		t9 = _mm_adds_epu16(t7, t8);
		t9 = _mm_srli_epi16(t9, 8);

		/* Same pipeline for the high 8 source bytes. */
		t3 = _mm_unpackhi_epi8(t0, t0);
		t4 = _mm_unpackhi_epi8(t1, t1);

		t5 = _mm_unpackhi_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t10 = _mm_adds_epu16(t7, t8);
		t10 = _mm_srli_epi16(t10, 8);

		t10 = _mm_packus_epi16(t9, t10);

		/* Non-temporal store: bypasses the cache for write-once output. */
		_mm_stream_si128((__m128i*)&dest[i * 16], t10);
	}
}
Пример #10
0
// MMX compatibility shim: implements _m_pmulhuw (four u16 lanes, unsigned
// multiply keeping the high 16 bits) on top of the SSE2 _mm_mulhi_epu16 by
// widening both __m64 operands into the low half of an __m128i.
// NOTE(review): the .m128i_i64 / .m64_i64 union members are MSVC-specific;
// this does not build with GCC/Clang intrinsic headers -- confirm the
// intended toolchain.
__m64 _m_pmulhuw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    // Upper 64 bits stay zero; only the low four u16 lanes carry data.
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_mulhi_epu16(lhs, rhs);

    // Return the low half, which holds the four high products.
    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Пример #11
0
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
// Converts 8 samples of 4:4:4 YUV into 16-bit R/G/B lanes using the
// fixed-point BT.601 coefficients documented above.
// NOTE(review): *Y0/*U0/*V0 are expected to carry each 8-bit sample shifted
// into the high byte of its u16 lane (value << 8), so that _mm_mulhi_epu16
// by a constant k yields (sample * k) >> 8 -- confirm against the callers.
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  // Shared luma term 19077 * y.
  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
Пример #12
0
// Unpacks 16-bit RGBA4 (4 bits per channel) pixels to 8 bits per channel,
// 4 source pixels (8 bytes) -> 16 destination bytes per iteration.
// NOTE(review): _mm_stream_si128 requires 'dest' to be 16-byte aligned, and
// a tail of size % 8 source bytes is not handled here -- confirm call sites.
void unpack_rgba4_sse2(const Uint8* source, const Uint32 size, Uint8* dest)
{
	__m128i t0, t1, t2;
	Uint32 i;

	for (i = 0; i < (size / 8); i++)
	{
		t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]);
		// converts 4 bit values to 8 bit values (multiply with 17)
		// Each 16-bit pixel is replicated into four u16 lanes (one per
		// channel); the AND isolates one nibble per lane and the mullo
		// shifts it to the top of the lane (v << 12). Then
		// (v<<12) * 0x0110 >> 16 == v * 17, the exact 4->8 bit expansion.
		t0 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_unpacklo_epi16(t0, t0);
		t1 = _mm_and_si128(t1, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
		t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
		t1 = _mm_mulhi_epu16(t1, _mm_set1_epi16(0x0110));
		// Same pipeline for the upper two pixels of the group.
		t2 = _mm_unpackhi_epi16(t0, t0);
		t2 = _mm_and_si128(t2, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00));
		t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010));
		t2 = _mm_mulhi_epu16(t2, _mm_set1_epi16(0x0110));
		t1 = _mm_packus_epi16(t1, t2);

		_mm_stream_si128((__m128i*)&dest[i * 16], t1);
	}
}
Пример #13
0
//lower - usually target
//upper - usually source; its alpha decides how much of the lower color is visible
//always does the full blending operation, does not optimize based on A=FF being true 90% of the time and A=00 90% of the remainder
static inline uint32_t blend_8888_on_8888(uint32_t argb_lower, uint32_t argb_upper)
{
#ifdef __SSE2__
	//no need to extend this above 128bit, it's complex enough without having to consider multiple pixels at once
	
	uint32_t spx = argb_upper;
	uint32_t tpx = argb_lower;
	
	//contains u16: spx.a, spx.b, spx.g, spx.r, tpx.{a,b,g,r}
	__m128i vals = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, spx, tpx), _mm_setzero_si128());
	
	//contains u16: {sa}*4, {255-sa}*4
	//(for 8-bit sa, sa ^ 255 == 255 - sa, so one XOR builds both weights)
	__m128i alphas = _mm_xor_si128(_mm_set1_epi16(spx>>24), _mm_set_epi16(0,0,0,0, 255,255,255,255));
	//contains u16: pixel contributions times 255
	__m128i newcols255 = _mm_mullo_epi16(vals, alphas);
	
	//ugly magic constants: (u16)*8081>>16>>7 = (u16)/255
	__m128i newcols = _mm_srli_epi16(_mm_mulhi_epu16(newcols255, _mm_set1_epi16(0x8081)), 7);
	
	//contains u8: {don't care}*8, sac (source alpha contribution), sbc, sgc, src, tac, tbc, tgc, trc
	__m128i newpack = _mm_packus_epi16(newcols, _mm_undefined_si128());
	//contains u8: {don't care}*12, sac+tac = result alpha, sbc+tbc, sgc+tgc, src+trc
	//the components are known to not overflow
	//(32/8 == a 4-byte shift, aligning the source contributions over the target ones)
	__m128i newpacksum = _mm_add_epi8(newpack, _mm_srli_si128(newpack, 32/8));
	
	return _mm_cvtsi128_si32(newpacksum);
#else
	//reference scalar path: same math, one channel at a time
	uint8_t sr = argb_upper>>0;
	uint8_t sg = argb_upper>>8;
	uint8_t sb = argb_upper>>16;
	uint8_t sa = argb_upper>>24;
	
	uint8_t tr = argb_lower>>0;
	uint8_t tg = argb_lower>>8;
	uint8_t tb = argb_lower>>16;
	uint8_t ta = argb_lower>>24;
	
	//note: the result alpha is blended like a color channel (sa*sa/255 term),
	//matching the SSE path above
	tr = (sr*sa/255) + (tr*(255-sa)/255);
	tg = (sg*sa/255) + (tg*(255-sa)/255);
	tb = (sb*sa/255) + (tb*(255-sa)/255);
	ta = (sa*sa/255) + (ta*(255-sa)/255);
	
	return ta<<24 | tb<<16 | tg<<8 | tr<<0;
#endif
}
Пример #14
0
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
// by weight.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
                         const int strength, const int rounding,
                         const int weight) {
  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);

  // modifier * 3 / index;
  sum = _mm_mulhi_epu16(sum, mul_constants);

  sum = _mm_adds_epu16(sum, rounding_u16);
  sum = _mm_srl_epi16(sum, strength_u128);

  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
  // So this needs to use the epu16 version which did not come until SSE4.
  sum = _mm_min_epu16(sum, sixteen);

  sum = _mm_sub_epi16(sixteen, sum);

  return _mm_mullo_epi16(sum, weight_u16);
}
mlib_status
mlib_VideoColorYUV2ARGB422_aligned(
	mlib_u8 *argb,
	const mlib_u8 *y,
	const mlib_u8 *u,
	const mlib_u8 *v,
	mlib_s32 width,
	mlib_s32 height,
	mlib_s32 argb_stride,
	mlib_s32 y_stride,
	mlib_s32 uv_stride)
{
/* 1.1644  * 8192 */
	const __m128i c0 = _mm_set1_epi16(0x2543);
	const mlib_s32 ic0 = 0x2543;

/* 2.0184  * 8192 */
	const __m128i c1 = _mm_set1_epi16(0x4097);
	const mlib_s32 ic1 = 0x4097;

/* abs( -0.3920 * 8192 ) */
	const __m128i c4 = _mm_set1_epi16(0xc8b);
	const mlib_s32 ic4 = 0xc8b;

/* abs( -0.8132 * 8192 ) */
	const __m128i c5 = _mm_set1_epi16(0x1a06);
	const mlib_s32 ic5 = 0x1a06;

/* 1.5966  * 8192 */
	const __m128i c8 = _mm_set1_epi16(0x3317);
	const mlib_s32 ic8 = 0x3317;

/* -276.9856 * 32 */
	const __m128i coff0 = _mm_set1_epi16(0xdd60);
	const mlib_s32 icoff0 = (mlib_s32)0xffffdd60;

/* 135.6352  * 32 */
	const __m128i coff1 = _mm_set1_epi16(0x10f4);
	const mlib_s32 icoff1 = 0x10f4;

/* -222.9952 * 32 */
	const __m128i coff2 = _mm_set1_epi16(0xe420);
	const mlib_s32 icoff2 = (mlib_s32)0xffffe420;

/* loop variable */
	mlib_s32 jH, iW;

/* pointers */
	mlib_u8 *pY, *pU, *pV, *pD, *pdd, *ptemp;
	__m128i *py, *pu, *pv;

/* variables */
	__m128i sy1, sy2, sy3, sy4, su1, su2, sv1, sv2;
	__m128i du0, du1, dv1, dv2;
	__m128i db1, db2, db3, db4, dr1, dr2, dr3, dr4, dg1, dg2, dg3, dg4;
	__m128i ddy1, ddy2, ddy3, ddy4, dzrl, dzrh, dgbl, dgbh, drgbh, drgbl;
	__m128i db_h, db_l, dg_h, dg_l, dr_h, dr_l, temp, bak;
	const __m128i x_zero = _mm_setzero_si128();
	const __m128i x_mask = _mm_set1_epi32(0xff);

/* for 4-pixel computing */
	mlib_s32 iu, iv, ig, ir, ib, iTemp;
	mlib_s32 iu0, iu1, iv1, iv2;

	pY  = (mlib_u8 *)y;
	pU  = (mlib_u8 *)u;
	pV  = (mlib_u8 *)v;
	pD = (mlib_u8 *)argb;

	for (jH = 0; jH < height; jH++) {
		py = (__m128i *)pY;
		pu = (__m128i *)pU;
		pv = (__m128i *)pV;
		pdd = pD;
		iW = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		/* 32 pixels */
		for (; iW <= width - 32; iW += 32) {
			/* load y u v, and expand */
			temp = _mm_load_si128(pu);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			su2 = _mm_unpackhi_epi8(x_zero, temp);
			pu++;
			temp = _mm_load_si128(pv);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			sv2 = _mm_unpackhi_epi8(x_zero, temp);
			pv++;
			temp = _mm_load_si128(py);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			sy2 = _mm_unpackhi_epi8(x_zero, temp);
			py++;
			temp = _mm_load_si128(py);
			sy3 = _mm_unpacklo_epi8(x_zero, temp);
			sy4 = _mm_unpackhi_epi8(x_zero, temp);
			py++;

			/* pre-calc d[r/g/b][1234] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);
			du0 = _mm_mulhi_epu16(su2, c1);
			db_h = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);
			du1 = _mm_mulhi_epu16(su2, c4);
			dv1 = _mm_mulhi_epu16(sv2, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_h = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);
			dv2 = _mm_mulhi_epu16(sv2, c8);
			dr_h = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);
			ddy2 = _mm_mulhi_epu16(sy2, c0);
			ddy3 = _mm_mulhi_epu16(sy3, c0);
			ddy4 = _mm_mulhi_epu16(sy4, c0);

			/* db1/2/3/4 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(db_l, db_l);
			db2 = _mm_add_epi16(ddy2, bak);

			bak = _mm_unpacklo_epi16(db_h, db_h);
			db3 = _mm_add_epi16(ddy3, bak);
			bak = _mm_unpackhi_epi16(db_h, db_h);
			db4 = _mm_add_epi16(ddy4, bak);

			/* dg1/2/3/4 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dg_l, dg_l);
			dg2 = _mm_add_epi16(ddy2, bak);

			bak = _mm_unpacklo_epi16(dg_h, dg_h);
			dg3 = _mm_add_epi16(ddy3, bak);
			bak = _mm_unpackhi_epi16(dg_h, dg_h);
			dg4 = _mm_add_epi16(ddy4, bak);

			/* dr1/2/3/4 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dr_l, dr_l);
			dr2 = _mm_add_epi16(ddy2, bak);

			bak = _mm_unpacklo_epi16(dr_h, dr_h);
			dr3 = _mm_add_epi16(ddy3, bak);
			bak = _mm_unpackhi_epi16(dr_h, dr_h);
			dr4 = _mm_add_epi16(ddy4, bak);

			db1 = _mm_srai_epi16(db1, 5);
			db2 = _mm_srai_epi16(db2, 5);
			db3 = _mm_srai_epi16(db3, 5);
			db4 = _mm_srai_epi16(db4, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dg2 = _mm_srai_epi16(dg2, 5);
			dg3 = _mm_srai_epi16(dg3, 5);
			dg4 = _mm_srai_epi16(dg4, 5);
			dr1 = _mm_srai_epi16(dr1, 5);
			dr2 = _mm_srai_epi16(dr2, 5);
			dr3 = _mm_srai_epi16(dr3, 5);
			dr4 = _mm_srai_epi16(dr4, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, db2);
			db2 = _mm_packus_epi16(db3, db4);
			dr1 = _mm_packus_epi16(dr1, dr2);
			dr2 = _mm_packus_epi16(dr3, dr4);
			dg1 = _mm_packus_epi16(dg1, dg2);
			dg2 = _mm_packus_epi16(dg3, dg4);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dzrh = _mm_unpackhi_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);
			dgbh = _mm_unpackhi_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			drgbl = _mm_unpacklo_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbh);

			/* create rgb sequences : db/dr/dg[2] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr2);
			dzrh = _mm_unpackhi_epi8(x_zero, dr2);
			dgbl = _mm_unpacklo_epi8(dg2, db2);
			dgbh = _mm_unpackhi_epi8(dg2, db2);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			drgbl = _mm_unpacklo_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbh);
		}

		/* 16 pixels */
		if (iW <= width - 16) {
			/* load y u v, and expand */
			temp = _mm_loadl_epi64(pu);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			pu = (__m128i *) (((__m64 *)pu) + 1);
			temp = _mm_loadl_epi64(pv);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			pv = (__m128i *) (((__m64 *)pv) + 1);
			temp = _mm_load_si128(py);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			sy2 = _mm_unpackhi_epi8(x_zero, temp);
			py++;

			/* pre-calc d[r/g/b][12] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);
			ddy2 = _mm_mulhi_epu16(sy2, c0);

			/* db1/2 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(db_l, db_l);
			db2 = _mm_add_epi16(ddy2, bak);

			/* dg1/2 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dg_l, dg_l);
			dg2 = _mm_add_epi16(ddy2, bak);

			/* dr1/2 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);
			bak = _mm_unpackhi_epi16(dr_l, dr_l);
			dr2 = _mm_add_epi16(ddy2, bak);

			db1 = _mm_srai_epi16(db1, 5);
			db2 = _mm_srai_epi16(db2, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dg2 = _mm_srai_epi16(dg2, 5);
			dr1 = _mm_srai_epi16(dr1, 5);
			dr2 = _mm_srai_epi16(dr2, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, db2);
			dr1 = _mm_packus_epi16(dr1, dr2);
			dg1 = _mm_packus_epi16(dg1, dg2);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dzrh = _mm_unpackhi_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);
			dgbh = _mm_unpackhi_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			drgbl = _mm_unpacklo_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrh, dgbh);
			SAVE_ARGB1(drgbh);

			iW += 16;
		}

		/* 8 pixels */
		if (iW <= width - 8) {
			/* load y u v, and expand */
			iTemp = *((mlib_s32 *)pu);
			temp = _mm_cvtsi32_si128(iTemp);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			pu = (__m128i *) (((mlib_s32 *)pu) + 1);
			iTemp = *((mlib_s32 *)pv);
			temp = _mm_cvtsi32_si128(iTemp);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			pv = (__m128i *) (((mlib_s32 *)pv) + 1);
			temp = _mm_loadl_epi64(py);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			py = (__m128i *) (((__m64 *)py) + 1);

			/* pre-calc d[r/g/b][1] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);

			/* db1 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);

			/* dg1 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);

			/* dr1 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);

			db1 = _mm_srai_epi16(db1, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dr1 = _mm_srai_epi16(dr1, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, x_zero);
			dr1 = _mm_packus_epi16(dr1, x_zero);
			dg1 = _mm_packus_epi16(dg1, x_zero);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			drgbh = _mm_unpackhi_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbh);

			iW += 8;
		}

		/* 4 pixels */
		if (iW <= width - 4) {
			/* load y u v, and expand */
			iTemp = *((mlib_s16 *)pu);
			temp = _mm_cvtsi32_si128(iTemp);
			su1 = _mm_unpacklo_epi8(x_zero, temp);
			pu = (__m128i *) (((mlib_s16 *)pu) + 1);
			iTemp = *((mlib_s16 *)pv);
			temp = _mm_cvtsi32_si128(iTemp);
			sv1 = _mm_unpacklo_epi8(x_zero, temp);
			pv = (__m128i *) (((mlib_s16 *)pv) + 1);
			iTemp = *((mlib_s32 *)py);
			temp = _mm_cvtsi32_si128(iTemp);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			py = (__m128i *) (((mlib_s32 *)py) + 1);

			/* pre-calc d[r/g/b][1] */
			du0 = _mm_mulhi_epu16(su1, c1);
			db_l = _mm_add_epi16(du0, coff0);

			du1 = _mm_mulhi_epu16(su1, c4);
			dv1 = _mm_mulhi_epu16(sv1, c5);
			temp = _mm_add_epi16(du1, dv1);
			dg_l = _mm_sub_epi16(coff1, temp);

			dv2 = _mm_mulhi_epu16(sv1, c8);
			dr_l = _mm_add_epi16(dv2, coff2);

			ddy1 = _mm_mulhi_epu16(sy1, c0);

			/* db1 */
			bak = _mm_unpacklo_epi16(db_l, db_l);
			db1 = _mm_add_epi16(ddy1, bak);

			/* dg1 */
			bak = _mm_unpacklo_epi16(dg_l, dg_l);
			dg1 = _mm_add_epi16(ddy1, bak);

			/* dr1 */
			bak = _mm_unpacklo_epi16(dr_l, dr_l);
			dr1 = _mm_add_epi16(ddy1, bak);

			db1 = _mm_srai_epi16(db1, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dr1 = _mm_srai_epi16(dr1, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, x_zero);
			dr1 = _mm_packus_epi16(dr1, x_zero);
			dg1 = _mm_packus_epi16(dg1, x_zero);

			/* create rgb sequences : db/dr/dg[1] */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);

			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			SAVE_ARGB1(drgbl);

			iW += 4;
		}

		/* 2 pixels */
		if (iW <= width - 2) {
			/* load y u v, and expand */
			iu = *((mlib_u8 *)pu);
			pu = (__m128i *) (((mlib_u8 *)pu) + 1);
			iv = *((mlib_u8 *)pv);
			pv = (__m128i *) (((mlib_u8 *)pv) + 1);
			iTemp = *((mlib_s16 *)py);
			temp = _mm_cvtsi32_si128(iTemp);
			sy1 = _mm_unpacklo_epi8(x_zero, temp);
			py = (__m128i *) (((mlib_s16 *)py) + 1);

			/* pre-calc d[r/g/b][1] */
			iu0 = (iu * ic1) >> 8;
			ib = icoff0 + iu0;

			iu1 = (iu * ic4) >> 8;
			iv1 = (iv * ic5) >> 8;
			iTemp = iu1 + iv1;
			ig = icoff1 - iTemp;

			iv2 = (iv * ic8) >> 8;
			ir = iv2 + icoff2;

			ddy1 = _mm_mulhi_epu16(sy1, c0);

			/* db1 */
			temp = _mm_set1_epi16(ib);
			db1 = _mm_add_epi16(ddy1, temp);

			/* dg1 */
			temp = _mm_set1_epi16(ig);
			dg1 = _mm_add_epi16(ddy1, temp);

			/* dr1 */
			temp = _mm_set1_epi16(ir);
			dr1 = _mm_add_epi16(ddy1, temp);

			db1 = _mm_srai_epi16(db1, 5);
			dg1 = _mm_srai_epi16(dg1, 5);
			dr1 = _mm_srai_epi16(dr1, 5);

			/* pack: 16=>8 */
			db1 = _mm_packus_epi16(db1, x_zero);
			dr1 = _mm_packus_epi16(dr1, x_zero);
			dg1 = _mm_packus_epi16(dg1, x_zero);

			/* create rgb sequences : db/dr/dg */
			dzrl = _mm_unpacklo_epi8(x_zero, dr1);
			dgbl = _mm_unpacklo_epi8(dg1, db1);

			/* lower half of drgl & dbzl */
			drgbl = _mm_unpacklo_epi16(dzrl, dgbl);
			ptemp = (mlib_u8*)(&drgbl);
			pdd += 1;
			ptemp += 1;
			*((mlib_s16*)pdd) = *((mlib_s16*)ptemp);
			pdd += 2;
			ptemp += 2;
			*((mlib_u8*)pdd) = *((mlib_u8*)ptemp);
			pdd += 2;
			ptemp += 2;
			*((mlib_s16*)pdd) = *((mlib_s16*)ptemp);
			pdd += 2;
			ptemp += 2;
			*((mlib_u8*)pdd) = *((mlib_u8*)ptemp);
			pdd += 1;

			iW += 2;
		}

		pY += y_stride;
		pU += uv_stride;
		pV += uv_stride;
		pD += argb_stride;
	}
Пример #16
0
// Simple quantization
// Quantizes the 16 DCT coefficients of one 4x4 block against 'mtx'.
// Side effects: 'in' is overwritten with the dequantized coefficients
// (quantized level * q_), and 'out' receives the quantized levels in
// zigzag order. Returns non-zero iff any retained level is non-zero.
// NOTE(review): 'n' appears to flag blocks whose DC (first) level must be
// excluded from the all-zero test (see 'tmp[0] &= ~0xff' at the end) --
// confirm against the callers.
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // mullo/mulhi pairs are interleaved to rebuild the full 32-bit unsigned
    // products lane by lane.
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  // scalar swap of the two entries the shuffles cannot place (see above)
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      // mask out the first (DC) level, stored saturated-to-8-bit in the
      // lowest byte of packed_out -- see NOTE in the header comment.
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
Пример #17
0
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags)
{
	if (!pcszText || !*pcszText) return;

	// Find out actual size of text
	int nChars = _tcslen(pcszText);
	uFlags |= DT_NOCLIP;

	int iX = rc.left;
	int iY = rc.top;
	int iXW = (rc.right - iX);
	int iYH = (rc.bottom - iY);

	RECT rcMin = rc;
	if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) {
		int iMinXW = rcMin.right - rcMin.left;
		int iMinYH = rcMin.bottom - rcMin.top;
		if (iMinXW < iXW) {
			if (uFlags & DT_CENTER) {
				iX += (iXW - iMinXW)/2;
				uFlags &= ~DT_CENTER;
			} else if (uFlags & DT_RIGHT) {
				iX += (iXW - iMinXW);
				uFlags &= ~DT_RIGHT;
			}
			iXW = iMinXW;
		}
		if (iMinYH < iYH) {
			if (uFlags & DT_SINGLELINE) {
				if (uFlags & DT_VCENTER) {
					iY += (iYH - iMinYH)/2;
					uFlags &= ~DT_VCENTER;
				} else if (uFlags & DT_BOTTOM) {
					iY += (iYH - iMinYH);
					uFlags &= ~DT_BOTTOM;
				}
			}
			iYH = iMinYH;
		}
	}

	iXW += 2;	// NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look bette!
	iYH += 2;

	// Ensure we have a big enough DIB to draw the text to
	if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH);
	if (!hbmpTextDIB) return;

	// Select color
	ieBGRA clr;
	switch (eColor) {
	case eFileName:	clr = clrFileName;		break;
	case eComment:	clr = clrComment;		break;
	case eFileInfo:	clr = clrFileInfo;		break;
	default:		clr = ieBGRA(0,0,0);	break;
	}
	clr.A = 0xFF - clrBkg.A;

	// Draw the text to in-memory DIB
	RECT rcTextDIB = { 0, 0, iXW, iYH };
	FillRect(hdcTextDIB, &rcTextDIB, hbrBkg);

	rcTextDIB.left++;
	rcTextDIB.top++;

	DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags);

	// Modify DIB:
#ifndef __X64__
	if (g_bSSE2) 
#endif
	{
		__m128i r0, r1, r2, r3, r4, r5, r6, r7;

		r7 = _mm_setzero_si128();									// 0
		r6 = _mm_set1_epi32(clr.dw);								// CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB  CA  CR  CG  CB
		r6 = _mm_unpacklo_epi8(r7, r6);								// CA<<8   CR<<8   CG<<8   CB<<8   CA<<8   CR<<8   CG<<8   CB<<8
		r5 = _mm_set1_epi16(1);										// 1       1       1       1       1       1       1       1
		r4 = _mm_set1_epi32(0xFF);									// FF              FF              FF              FF
		r3 = _mm_set1_epi32(clrBkg.dw);								// DA  0   0   0   DA  0   0   0   DA  0   0   0   DA  0   0   0

		ieBGRA *py = pTextDIB;
		for (int y = iYH; y--; py += iTextDIBXW) {
			ieBGRA *px = py;

			for (int x_4 = (iXW+3)>>2; x_4--; px += 4) {

				r0 = _mm_load_si128((__m128i *)px);
				r1 = r0;
				r2 = r0;											// X3  R3  G3  B3  X2  R2  G2  B2  X1  R1  G1  B1  X0  R0  G0  B0 
				r0 = _mm_srli_epi32(r0, 16);						// 0   0   X3  R3  0   0   X2  R2  0   0   X1  R1  0   0   X0  R0 
				r1 = _mm_srli_epi32(r1, 8);							// 0   X3  R3  G3  0   X2  R2  G2  0   X1  R1  G1  0   X0  R0  G0 
				r0 = _mm_max_epu8(r0, r2);
				r0 = _mm_max_epu8(r0, r1);							// x   x   x   A3  x   x   x   A2  x   x   x   A1  x   x   x   A0
				r0 = _mm_and_si128(r0, r4);							// 0       A3      0       A2      0       A1      0       A0
				r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0));
				r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0));	// A3      A3      A2      A2      A1      A1      A0      A0
				r1 = r0;
				r0 = _mm_unpacklo_epi32(r0, r0);					// A1      A1      A1      A1      A0      A0      A0      A0
				r1 = _mm_unpackhi_epi32(r1, r1);					// A3      A3      A3      A3      A2      A2      A2      A2
				r0 = _mm_add_epi16(r0, r5);							// A1'     A1'     A1'     A1'     A0'     A0'     A0'     A0' 
				r1 = _mm_add_epi16(r1, r5);							// A3'     A3'     A3'     A3'     A2'     A2'     A2'     A2' 
				r0 = _mm_mulhi_epu16(r0, r6);						// xA1"    xR1     xG1     xB1     xA0"    xR0     xG0     xB0
				r1 = _mm_mulhi_epu16(r1, r6);						// xA3"    xR3     xG3     xB3     xA2"    xR2     xG2     xB2
				r0 = _mm_packus_epi16(r0, r1);						// xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0
				r0 = _mm_adds_epu8(r0, r3);							// xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0
				_mm_store_si128((__m128i *)px, r0);
			}
		}
	}
#ifndef __X64__
	else {
 // Per 16-bit lane: (difference^2 * weight) >> 16. Only the low 16 bits of
 // the squared difference are kept before scaling; the high half of the
 // 32-bit product is the result.
 SIMD_INLINE __m128i ShiftedWeightedSquare16(__m128i difference, __m128i weight)
 {
     const __m128i squared = _mm_mullo_epi16(difference, difference);
     return _mm_mulhi_epu16(squared, weight);
 }
Пример #19
0
/*
 * Applies a 5-tap vertical convolution followed by a 5-tap horizontal
 * convolution to one plane of 16-bit samples. Five ring-buffered line
 * buffers (p0..p4, each padded by 2 samples via line_copy16, a project
 * helper) hold the two rows above and below the current row; near the
 * bottom edge the source pointer steps backwards (y >= height - 2).
 * NOTE(review): in the horizontal pass the center sample is taken from
 * dstp (the just-written vertical result) while its four neighbors come
 * from the unfiltered row p2 -- presumably intentional for this "hv"
 * variant; confirm upstream.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    /* byte strides -> 16-bit sample strides */
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* prime the ring buffer: rows above are mirrored from below */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi32(all1, 31);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    /* Taps are stored as magnitudes with separate sign flags because the
     * 16x16 -> 32-bit multiply below uses unsigned mulhi; negative taps are
     * re-applied by negating the 32-bit products. */
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        /* advance source, mirroring at the bottom edge */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            /* [0..4]: vertical taps, [5..9]: horizontal taps (center = dstp) */
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            /* j == 0: vertical pass, j == 1: horizontal pass */
            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);

                    /* widen sample*tap to 32 bits via mullo/mulhi interleave */
                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);

                    if (sign[i]) {
                        /* negate the 32-bit products: ~x + 1 (two's complement) */
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;
                    /* scale in float: sum * rdiv (+ bias, horizontal pass only) */
                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);

                    /* clamp to <= 0xFFFF */
                    temp = _mm_srli_epi32(all1, 16);
                    mask = _mm_cmplt_epi32(sum[i], temp);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask),
                                          _mm_andnot_si128(mask, temp));
                    /* negative results: zero (saturate) or absolute value */
                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]),
                                              _mm_andnot_si128(mask, temp));
                    }
                }

                /* mm_cast_epi32: project helper -- presumably packs the two
                 * 4x32-bit vectors into eight 16-bit samples; verify. */
                sum[0] = mm_cast_epi32(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        /* rotate the ring buffer */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Пример #20
0
// Quantizes the 16 coefficients of one block against 'mtx' (SSSE3 path:
// abs/sign via _mm_abs_epi16/_mm_sign_epi16, zigzag via pshufb).
// 'sharpen' may be NULL, in which case no sharpening offset is added.
// Side effects: 'in' receives the dequantized values (level * q_), 'out'
// the zigzag-ordered levels. Returns 1 iff any quantized level is non-zero.
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
                                       const uint16_t* const sharpen,
                                       const VP8Matrix* const mtx) {
    const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
    const __m128i zero = _mm_setzero_si128();
    __m128i out0, out8;
    __m128i packed_out;

    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so that
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
    __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
    const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
    const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
    const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
    const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

    // coeff = abs(in)
    __m128i coeff0 = _mm_abs_epi16(in0);
    __m128i coeff8 = _mm_abs_epi16(in8);

    // coeff = abs(in) + sharpen
    if (sharpen != NULL) {
        const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
        const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
        coeff0 = _mm_add_epi16(coeff0, sharpen0);
        coeff8 = _mm_add_epi16(coeff8, sharpen8);
    }

    // out = (coeff * iQ + B) >> QFIX
    {
        // doing calculations with 32b precision (QFIX=17)
        // out = (coeff * iQ)
        // mullo/mulhi pairs are interleaved to rebuild the full 32-bit
        // unsigned products lane by lane.
        const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
        const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
        const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
        const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
        __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
        __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
        __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
        __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
        // out = (coeff * iQ + B)
        const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
        const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
        const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
        const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
        out_00 = _mm_add_epi32(out_00, bias_00);
        out_04 = _mm_add_epi32(out_04, bias_04);
        out_08 = _mm_add_epi32(out_08, bias_08);
        out_12 = _mm_add_epi32(out_12, bias_12);
        // out = QUANTDIV(coeff, iQ, B, QFIX)
        out_00 = _mm_srai_epi32(out_00, QFIX);
        out_04 = _mm_srai_epi32(out_04, QFIX);
        out_08 = _mm_srai_epi32(out_08, QFIX);
        out_12 = _mm_srai_epi32(out_12, QFIX);

        // pack result as 16b
        out0 = _mm_packs_epi32(out_00, out_04);
        out8 = _mm_packs_epi32(out_08, out_12);

        // if (coeff > 2047) coeff = 2047
        out0 = _mm_min_epi16(out0, max_coeff_2047);
        out8 = _mm_min_epi16(out8, max_coeff_2047);
    }

    // put sign back
    out0 = _mm_sign_epi16(out0, in0);
    out8 = _mm_sign_epi16(out8, in8);

    // in = out * Q
    in0 = _mm_mullo_epi16(out0, q0);
    in8 = _mm_mullo_epi16(out8, q8);

    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);

    // zigzag the output before storing it. The re-ordering is:
    //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
    // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
    // There's only two misplaced entries ([8] and [7]) that are crossing the
    // reg's boundaries.
    // We use pshufb instead of pshuflo/pshufhi.
    {
        // PSHUFB_CST: project macro building a pshufb control vector
        // (a -1 entry zeroes that lane) -- see its definition.
        const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
        const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
        const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
        const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
        const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
        const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
        const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
        const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
        const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
        const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
        _mm_storeu_si128((__m128i*)&out[0], out_z0);
        _mm_storeu_si128((__m128i*)&out[8], out_z8);
        packed_out = _mm_packs_epi16(out_z0, out_z8);
    }

    // detect if all 'out' values are zeroes or not
    return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
Пример #21
0
// Quantizes the 16 coefficients of one block against 'mtx' (SSE2 path:
// abs/sign emulated with arithmetic shift + xor/sub, zigzag done with
// pshuflw/pshufhw/pshufd plus one scalar swap). 'sharpen' may be NULL.
// Side effects: 'in' receives the dequantized values (level * q_), 'out'
// the zigzag-ordered levels. Returns 1 iff any quantized level is non-zero.
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16],
                                       const uint16_t* const sharpen,
                                       const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

  // extract sign(in)  (0x0000 if positive, 0xffff if negative)
  const __m128i sign0 = _mm_cmpgt_epi16(zero, in0);
  const __m128i sign8 = _mm_cmpgt_epi16(zero, in8);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // mullo/mulhi pairs are interleaved to rebuild the full 32-bit unsigned
    // products lane by lane.
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (coeff > 2047) coeff = 2047
    out0 = _mm_min_epi16(out0, max_coeff_2047);
    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  // scalar swap of the two entries the shuffles cannot place (see above)
  {
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
Пример #22
0
		// Renders 'model' into the software framebuffer by projecting each
		// surface voxel and splatting a depth-tested square point for it.
		// Shading uses a 3x3x3 table of brightness values, one entry per
		// quantized voxel normal, lit by a fixed directional light.
		//
		// Fixes vs. the previous revision of the brightness-table loop:
		//  - the innermost switch dispatched on 'y' instead of 'z', and
		//  - 'd3' was seeded from 'd' instead of 'd2',
		// so the Y and Z light contributions were dropped/garbled.
		void SWModelRenderer::RenderInner(spades::draw::SWModel *model,
									 const client::ModelRenderParam &param) {
			auto& mat = param.matrix;
			auto origin = mat.GetOrigin();
			auto axis1 = mat.GetAxis(0);
			auto axis2 = mat.GetAxis(1);
			auto axis3 = mat.GetAxis(2);
			auto *rawModel = model->GetRawModel();
			auto rawModelOrigin = rawModel->GetOrigin();
			rawModelOrigin += 0.1f;
			origin += axis1 * rawModelOrigin.x;
			origin += axis2 * rawModelOrigin.y;
			origin += axis3 * rawModelOrigin.z;
			
			int w = rawModel->GetWidth();
			int h = rawModel->GetHeight();
			//int d = rawModel->GetDepth();
			
			// evaluate brightness for each normal; index = x + y*3 + z*9,
			// each axis coded 0 = negative, 1 = zero, 2 = positive
			uint8_t brights[3*3*3];
			{
				auto lightVec = MakeVector3(0.f, -0.707f, -0.707f);
				// light projected onto each (normalized) model axis
				float dot1 = Vector3::Dot(axis1, lightVec) * fastRSqrt(axis1.GetPoweredLength());
				float dot2 = Vector3::Dot(axis2, lightVec) * fastRSqrt(axis2.GetPoweredLength());
				float dot3 = Vector3::Dot(axis3, lightVec) * fastRSqrt(axis3.GetPoweredLength());
				for(int x = 0; x < 3; x++){
					float d = 0.f;	// accumulated dot product
					int cnt = 0;	// number of non-zero normal components
					switch(x){
						case 0: d = -dot1; cnt = 1; break;
						case 1: d = 0.f; cnt = 0; break;
						case 2: d = dot1; cnt = 1; break;
					}
					for(int y = 0; y < 3; y++){
						auto d2 = d;
						auto cnt2 = cnt;
						switch(y){
							case 0: d2 -= dot2; cnt2++; break;
							case 1: break;
							case 2: d2 += dot2; cnt2++; break;
						}
						for(int z = 0; z < 3; z++) {
							// FIX: seed from d2 (was 'd', losing the y
							// contribution) and dispatch on z (was 'y')
							auto d3 = d2;
							auto cnt3 = cnt2;
							switch(z){
								case 0: d3 -= dot3; cnt3++; break;
								case 1: break;
								case 2: d3 += dot3; cnt3++; break;
							}
							// renormalize diagonal normals: 1/sqrt(2), 1/sqrt(3)
							switch(cnt3){
								case 2:
									d3 *= 0.707f;
									break;
								case 3:
									d3 *= 0.57735f;
									break;
							}
							// map [-1, 1] to roughly [130, 254]
							d3 = 192.f + d3 * 62.f;
							brights[x + y * 3 + z * 9]
							= static_cast<uint8_t>(d3);
						}
					}
				}
			}
				
			
			// compute center coord. for culling
			{
				auto center = origin;
				auto localCenter = model->GetCenter();
				center += axis1 * localCenter.x;
				center += axis2 * localCenter.y;
				center += axis3 * localCenter.z;
				
				float largestAxis = axis1.GetPoweredLength();
				largestAxis = std::max(largestAxis, axis2.GetPoweredLength());
				largestAxis = std::max(largestAxis, axis3.GetPoweredLength());
				
				if(!r->SphereFrustrumCull(center, model->GetRadius() * sqrtf(largestAxis)))
					return;
			}
			
			Bitmap *fbmp = r->fb;
			auto *fb = fbmp->GetPixels();
			int fw = fbmp->GetWidth();
			int fh = fbmp->GetHeight();
			auto *db = r->depthBuffer.data();
			
			// scale NDC to screen pixels (Y flipped); integer center offsets
			// are added after perspective division
			Matrix4 viewproj = r->GetProjectionViewMatrix();
			Vector4 ndc2scrscale = {fw * 0.5f, -fh * 0.5f, 1.f, 1.f};
			//Vector4 ndc2scroff = {fw * 0.5f, fh * 0.5f, 0.f, 0.f};
			int ndc2scroffX = fw >> 1;
			int ndc2scroffY = fh >> 1;
			
			
			// transform origin/axes once; per-voxel positions are built by
			// stepping with tAxis1..3 (see v1/v2/vv below)
			auto tOrigin = viewproj * MakeVector4(origin.x, origin.y, origin.z, 1.f);
			auto tAxis1 = viewproj * MakeVector4(axis1.x, axis1.y, axis1.z, 0.f);
			auto tAxis2 = viewproj * MakeVector4(axis2.x, axis2.y, axis2.z, 0.f);
			auto tAxis3 = viewproj * MakeVector4(axis3.x, axis3.y, axis3.z, 0.f);
			tOrigin *= ndc2scrscale;
			tAxis1 *= ndc2scrscale;
			tAxis2 *= ndc2scrscale;
			tAxis3 *= ndc2scrscale;
			
			float pointDiameter;// = largestAxis * 0.55f * fh * 0.5f;
			{
				float largestAxis = tAxis1.GetPoweredLength();
				largestAxis = std::max(largestAxis, tAxis2.GetPoweredLength());
				largestAxis = std::max(largestAxis, tAxis3.GetPoweredLength());
				pointDiameter = sqrtf(largestAxis);
			}
			
			// replacement color (packed z | y<<8 | x<<16) for voxels whose
			// stored color is 0
			uint32_t customColor;
			customColor =
			ToFixed8(param.customColor.z) |
			(ToFixed8(param.customColor.y) << 8) |
			(ToFixed8(param.customColor.x) << 16);
			
			auto v1 = tOrigin;
			float zNear = r->sceneDef.zNear;
			for(int x = 0; x < w; x++) {
				auto v2 = v1;
				for(int y = 0; y < h; y++) {
					// renderData: per-column stream of (color | z<<24, normal)
					// pairs, terminated by -1
					auto *mp = &model->renderData
					[model->renderDataAddr[x + y * w]];
					while(*mp != -1) {
						uint32_t data = *(mp++);
						uint32_t normal = *(mp++);
						int z = static_cast<int>(data >> 24);
						//SPAssert(z < d);
						SPAssert(z >= 0);
						
						auto vv = v2 + tAxis3 * zvals[z];
						if(vv.z < zNear) continue;
						
						// save Z value (don't divide this by W!)
						float zval = vv.z;
						
						// use vv.z for point radius to be divided by W
						vv.z = pointDiameter;
						
						// perspective division
						float scl = fastRcp(vv.w);
						vv *= scl;
						
						int ix = static_cast<int>(vv.x) + ndc2scroffX;
						int iy = static_cast<int>(vv.y) + ndc2scroffY;
						int idm = static_cast<int>(vv.z + .99f);
						idm = std::max(1, idm);
						int minX = ix - (idm >> 1);
						int minY = iy - (idm >> 1);
						if(minX >= fw || minY >= fh) continue;
						int maxX = ix + idm;
						int maxY = iy + idm;
						if(maxX <= 0 || maxY <= 0) continue;
						
						minX = std::max(minX, 0);
						minY = std::max(minY, 0);
						maxX = std::min(maxX, fw);
						maxY = std::min(maxY, fh);
						
						auto *fb2 = fb + (minX + minY * fw);
						auto *db2 = db + (minX + minY * fw);
						// renamed from 'w' to avoid shadowing the model width
						int spanW = maxX - minX;
						
						uint32_t color = data & 0xffffff;
						if(color == 0)
							color = customColor;
						
						SPAssert(normal < 27);
						int bright = brights[normal];
#if ENABLE_SSE2
						if(lvl == SWFeatureLevel::SSE2) {
							// scale all four 8-bit channels by bright/256:
							// (c << 8) * bright >> 16 == c * bright >> 8
							auto m = _mm_setr_epi32(color, 0, 0, 0);
							auto f = _mm_set1_epi16(bright << 8);
							
							m = _mm_unpacklo_epi8(m, _mm_setzero_si128());
							m = _mm_mulhi_epu16(m, f);
							m = _mm_packus_epi16(m, m);
							
							_mm_store_ss(reinterpret_cast<float*>(&color),
										 _mm_castsi128_ps(m));
						}else
#endif
						{
							// scalar fallback: scale the middle byte and the
							// outer byte pair separately to avoid carries
							uint32_t c1 = color & 0xff00;
							uint32_t c2 = color & 0xff00ff;
							c1 *= bright;
							c2 *= bright;
							color = ((c1&0xff0000) | (c2&0xff00ff00)) >> 8;
						}
						
						// splat a depth-tested square of spanW x (maxY-minY)
						for(int yy = minY; yy < maxY; yy++){
							auto *fb3 = fb2;
							auto *db3 = db2;
							
							for(int xx = spanW; xx > 0; xx--) {
								if(zval < *db3) {
									*db3 = zval;
									*fb3 = color;
								}
								fb3++; db3++;
							}
							
							fb2 += fw;
							db2 += fw;
						}
						
						
					}
					v2 += tAxis2;
				}
				v1 += tAxis1;
			}
		}
/*
 * Blend one horizontal line of 16-bit intermediate pixels (buffz) into the
 * 8-bit destination row dp, honoring the blend mode and the source /
 * destination channel counts carried in param.
 *
 *   param - work-image descriptor (blend mode, alpha, channel counts,
 *           alpha-channel position alp_ind, current line width)
 *   dp    - destination row of 8-bit pixels, written in place
 *   buffz - source line scaled to 16 bits per channel (8.8 fixed point)
 *   buffd - scratch line buffer used for channel extraction / repacking
 *
 * NOTE(review): several branches delegate the arithmetic to the
 * BLEND34_SRC_OVER / BLEND43_SRC_OVER / BLEND44 macros defined elsewhere
 * in this file; their exact math is not visible here.
 */
void
mlib_s_ImageBlendLine(
    mlib_work_image * param,
    mlib_u8 *dp,
    __m128i * buffz,
    __m128i * buffd)
{
	mlib_blend blend = param->blend;
	mlib_s32 chan_d = param->chan_d;
	mlib_s32 chan_s = param->channels;
	/* global alpha normalized to [0, 1] */
	mlib_d64 alp = (param->alpha) * (1.0 / 255);
	mlib_s32 width = GetElemSubStruct(current, width);
	mlib_u8 *tdp = dp;
	mlib_s32 width2, y_step, next_step = 2;
	mlib_s32 alp_ind = param->alp_ind, mask255;
	__m128i aa, dalp, done;
	__m128i mzero, mask_7fff, mask_8000, amask, amask256, amaskffff;
	__m128i d_rnd;
	mlib_s32 i, j;

	/*
	 * No alpha channel involved: round the 8.8 fixed-point source down
	 * to 8 bits (add 0x0080, shift right 8, saturating pack) and store.
	 * For a 3-channel destination go through buffd and strip the dummy
	 * 4th channel afterwards.
	 */
	if (!alp_ind) {
		d_rnd = _mm_set1_epi16(0x0080);

		tdp = (void *)dp;
		if (chan_d == 3)
			tdp = (void *)buffd;

		for (i = 0; i < width / 2; i++) {
			__m128i dd;

			dd = buffz[i];
			dd = _mm_adds_epu16(dd, d_rnd);
			dd = _mm_srli_epi16(dd, 8);
			dd = _mm_packus_epi16(dd, dd);
			_mm_storel_epi64((void *)(tdp + 8 * i), dd);
		}
		/* odd trailing pixel: same math, 4-byte store */
		if (width & 1) {
			__m128i dd;

			dd = buffz[i];
			dd = _mm_adds_epu16(dd, d_rnd);
			dd = _mm_srli_epi16(dd, 8);
			dd = _mm_packus_epi16(dd, dd);
			*(mlib_s32 *)(tdp + 8 * i) = *(mlib_s32 *)&dd;
		}

		if (chan_d == 3) {
			mlib_s_ImageChannelExtract_U8_43L_D1((void *)buffd, dp,
			    width);
		}
		return;
	}

	width2 = (width + 1) / 2;

	mzero = _mm_setzero_si128();
	mask_7fff = _mm_set1_epi16(0x7FFF);
	mask_8000 = _mm_set1_epi16(0x8000);
	done = _mm_set1_epi16(1 << 15);
	/*
	 * alp_ind == -1 means the alpha byte precedes the color bytes;
	 * otherwise it follows them.  Build per-position alpha masks used
	 * by the GTK_SRC path below.
	 */
	if (alp_ind == -1) {
		mask255 = 0xFF;
		amask = _mm_setr_epi32(0xff00, 0, 0xff00, 0);
		amaskffff = _mm_setr_epi32(0xffff, 0, 0xffff, 0);
		amask256 = _mm_setr_epi32(0x0100, 0, 0x0100, 0);
	} else {
		mask255 = 0xFF000000;
		amask = _mm_setr_epi32(0, 0xff000000, 0, 0xff000000);
		amaskffff = _mm_setr_epi32(0, 0xffff0000, 0, 0xffff0000);
		amask256 = _mm_setr_epi32(0, 0x01000000, 0, 0x01000000);
	}
	/* global alpha in Q15 with round-to-nearest */
	dalp = _mm_set1_epi16((1 << 15) * alp + 0.5);

	if (chan_s == 3) {
		if (chan_d == 3) {
			/*
			 * 3 -> 3: constant-alpha blend
			 *   d = (s * alp + d * (1 - alp)) in Q15 via
			 *   _mm_mulhi_epu16, then >> 7 back to 8 bits.
			 * NOTE: these locals shadow the outer alp / dalp.
			 */
			mlib_d64 alp = (param->alpha) * (1.0 / 255);
			mlib_s32 ialp;
			mlib_u8 *pz;
			__m128i emask;
			__m128i dalp, ralp, ss, dd, s0, s1, d0, d1, dr;

			mlib_s_ImageChannelExtract_S16_43L_D1((void *)buffz,
			    (void *)buffd, width);

			ialp = alp * (1 << 15);
			dalp = _mm_set1_epi16(ialp);
			ralp = _mm_set1_epi16((1 << 15) - ialp);
			/* byte mask selecting the valid tail of the row */
			emask = mlib_emask_m128i[(3 * width) & 15].m128i;

			pz = (void *)buffd;
			tdp = dp;
			for (i = 0; i <= 3 * width - 16; i += 16) {
				s0 = _mm_load_si128((__m128i *) (pz + 2 * i));
				s1 = _mm_load_si128((__m128i *) (pz + 2 * i +
				    16));
				dd = _mm_loadu_si128((__m128i *) (tdp + i));
				/* widen dest bytes into the high byte (d << 8) */
				d0 = _mm_unpacklo_epi8(mzero, dd);
				d1 = _mm_unpackhi_epi8(mzero, dd);
				d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp),
				    _mm_mulhi_epu16(d0, ralp));
				d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp),
				    _mm_mulhi_epu16(d1, ralp));
				d0 = _mm_srli_epi16(d0, 7);
				d1 = _mm_srli_epi16(d1, 7);
				dr = _mm_packus_epi16(d0, d1);
				_mm_storeu_si128((__m128i *) (tdp + i), dr);
			}

			/* tail: blend a full vector, merge under emask */
			if (i < 3 * width) {
				s0 = _mm_load_si128((__m128i *) (pz + 2 * i));
				s1 = _mm_load_si128((__m128i *) (pz + 2 * i +
				    16));
				dd = _mm_loadu_si128((__m128i *) (tdp + i));
				d0 = _mm_unpacklo_epi8(mzero, dd);
				d1 = _mm_unpackhi_epi8(mzero, dd);
				d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp),
				    _mm_mulhi_epu16(d0, ralp));
				d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp),
				    _mm_mulhi_epu16(d1, ralp));
				d0 = _mm_srli_epi16(d0, 7);
				d1 = _mm_srli_epi16(d1, 7);
				dr = _mm_packus_epi16(d0, d1);

				dr = _mm_or_si128(_mm_and_si128(emask, dr),
				    _mm_andnot_si128(emask, dd));

				_mm_storeu_si128((__m128i *) (tdp + i), dr);
			}
		} else if (blend == MLIB_BLEND_GTK_SRC) {
			/*
			 * 3 -> 4 SRC: copy the three color bytes (high byte
			 * of each 16-bit source lane) and force alpha = 255.
			 */
			mlib_u8 *buffi = (mlib_u8 *)buffz + 1;

			for (i = 0; i < width; i++) {
				tdp[0] = buffi[0];
				tdp[1] = buffi[2];
				tdp[2] = buffi[4];
				tdp[alp_ind] = 255;
				tdp += 4;
				buffi += 8;
			}
		} else {
			/* 3 -> 4 SRC_OVER: handled by the BLEND34_SRC_OVER
			 * macro, 4 pixels per iteration; the partial last
			 * vector is staged in buff and copied per pixel. */
			mlib_d64 _w0 = param->alpha;
			mlib_d64 _w1s = 1.0 - _w0 * (1.0 / 255);
			__m128i buff[1];
			__m128i done;
			__m128i dalp, ralp, ss, dd, s0, s1, d0, d1, a0, a1, r0,
			    r1, rr, dr;
			__m128i wi, aa, amask;
			__m128 af, w0, w1, w1s, w, rw, w0r, w1r, scale;

			done = _mm_set1_epi16(1 << 15);
			amask = _mm_set1_epi32(mask255);

			w0 = _mm_set_ps1(_w0);
			w1s = _mm_set_ps1(_w1s);
			scale = _mm_set_ps1(1 << 15);

			if (alp_ind == -1) {
				/* alpha-first layout: bias pointer back one
				 * byte so unaligned stores land correctly */
				tdp--;
				for (i = 0; i < width / 4; i++) {
					BLEND34_SRC_OVER(0);
					_mm_storeu_si128((__m128i *) tdp, dr);
					tdp += 16;
				}
				if (width & 3) {
					BLEND34_SRC_OVER(0);
					buff[0] = dr;
				}
			} else {
				for (i = 0; i < width / 4; i++) {
					BLEND34_SRC_OVER(3);
					_mm_storeu_si128((__m128i *) tdp, dr);
					tdp += 16;
				}
				if (width & 3) {
					BLEND34_SRC_OVER(3);
					buff[0] = dr;
				}
			}
			for (i = 0; i < (width & 3); i++) {
				((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i];
			}
		}
	} else if (chan_d == 3) {
		if (blend != MLIB_BLEND_GTK_SRC) {
			if (alp_ind == -1) {
				tdp--;
			}
			/*
			 * Gather destination pixels as 32-bit words at a
			 * 3-byte stride into buffd for the blend macro.
			 * NOTE(review): the last gather reads one byte past
			 * the 3*width destination data - presumably the
			 * caller guarantees that byte is accessible.
			 */
			for (i = 0; i < width; i++) {
				((mlib_s32 *)buffd)[i] =
				    *(mlib_s32 *)(tdp + 3 * i);
			}

			if (alp_ind == -1) {
				for (i = 0; i < width2; i++) {
					__m128i a0, s0, d0, dd;

					BLEND43_SRC_OVER(0);
				}
				mlib_s_ImageChannelExtract_U8_43R_D1((void *)
				    buffd, dp, width);
			} else {
				for (i = 0; i < width2; i++) {
					__m128i a0, s0, d0, dd;

					BLEND43_SRC_OVER(0xff);
				}
				mlib_s_ImageChannelExtract_U8_43L_D1((void *)
				    buffd, dp, width);
			}
		} else {
			/* 4 -> 3 SRC: drop the alpha byte, copy colors */
			mlib_u8 *buffi = (mlib_u8 *)buffz + 1;

			if (alp_ind == -1)
				buffi += 2;
			for (i = 0; i < width; i++) {
				tdp[0] = buffi[0];
				tdp[1] = buffi[2];
				tdp[2] = buffi[4];
				tdp += 3;
				buffi += 8;
			}
		}
	} else {	/* if (chan_d == 4) */

		if (alp_ind == -1) {
			tdp--;
		}
		if (blend == MLIB_BLEND_GTK_SRC) {
			/*
			 * 4 -> 4 SRC: un-premultiply each pixel by its own
			 * alpha using the 255/alpha reciprocal table, while
			 * forcing the alpha lane multiplier to 256 (i.e.
			 * leave alpha itself unscaled).  Two pixels per
			 * iteration; the 8-byte stores overrun by one pixel,
			 * so the word past the row is saved and restored.
			 */
			mlib_u8 *p_alp = (mlib_u8 *)buffz + 1;
			mlib_s32 tail = ((mlib_s32 *)tdp)[width];

			if (alp_ind != -1)
				p_alp += 6;
			for (i = 0; i < width2; i++) {
				__m128i a0, a1, aa, ss, d0, dd;

				ss = buffz[i];
				a0 = _mm_loadl_epi64((void *)((mlib_d64 *)
				    mlib_m_tbl_255DivAlpha + p_alp[0]));
				a1 = _mm_loadl_epi64((void *)((mlib_d64 *)
				    mlib_m_tbl_255DivAlpha + p_alp[8]));
				aa = _mm_unpacklo_epi64(a0, a1);
				/* replace the alpha-lane factor with 256 */
				aa = _mm_or_si128(amask256,
				    _mm_andnot_si128(amaskffff, aa));
				d0 = _mm_mulhi_epu16(ss, aa);
				dd = _mm_packus_epi16(d0, d0);
				_mm_storel_epi64((void *)(tdp + 8 * i), dd);
				p_alp += 16;
			}

			((mlib_s32 *)tdp)[width] = tail;
		} else {
			/* 4 -> 4 SRC_OVER / SRC_OVER2 via the BLEND44 macro,
			 * 4 pixels per iteration, partial vector staged in
			 * buff as in the 3 -> 4 path above. */
			mlib_blend blend = param->blend;
			mlib_d64 alp = (param->alpha) * (1.0 / 255);
			__m128i buff[1];
			__m128i done;
			__m128i ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr;
			__m128i wi, aa, amask, a16mask, zero_mask_i;
			__m128 dalp, div255, alpha, fone;
			__m128 af, sf, w0, w1, w1s, w, rw, w0r, w1r, scale;
			__m128 zero_mask, f_rnd;
			mlib_m128 s0u, s1u, s2u, s3u;

			done = _mm_set1_epi16(1 << 14);
			amask = _mm_set1_epi32(mask255);
			a16mask = _mm_set1_epi32(0xFFFF);

			dalp = _mm_set_ps1(alp * (1.0 / 256));
			fone = _mm_set_ps1(1.0);
			div255 = _mm_set_ps1(1.0 / 255);
			scale = _mm_set_ps1(1 << 8);
			alpha = _mm_set_ps1((float)(param->alpha) + 0.5);
			f_rnd = _mm_set_ps1(0.6);

			if (blend == MLIB_BLEND_GTK_SRC_OVER2) {
				if (alp_ind == -1) {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER2, 0);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER2, 0);
						buff[0] = dr;
					}
				} else {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER2, 3);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER2, 3);
						buff[0] = dr;
					}
				}
			} else {
				if (alp_ind == -1) {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER, 0);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER, 0);
						buff[0] = dr;
					}
				} else {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER, 3);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER, 3);
						buff[0] = dr;
					}
				}
			}

			for (i = 0; i < (width & 3); i++) {
				((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i];
			}
		}
	}
}
/*
 * Unsigned 16-bit high-half multiply of the eight lanes of s1 and s2
 * (wrapper around the SSE2 PMULHUW intrinsic).
 *
 * Fix: the original declaration omitted the return type, relying on
 * implicit int - invalid since C99 and wrong here, since the function
 * returns an __m128i vector.
 */
__m128i
test (__m128i s1, __m128i s2)
{
  return _mm_mulhi_epu16 (s1, s2);
}
/*
 * Convert a line of n JFIF (full-range) YCbCr samples, given as three
 * separate planes y/cb/cr, into packed ABGR with A forced to 0xFF.
 * "naligned" variant: no alignment is assumed, all vector loads/stores
 * are unaligned.  Processes 16 pixels per SIMD iteration, then 8- and
 * 4-pixel SIMD tails, then a scalar loop for the remainder.
 *
 * Fixed-point scheme (mirrored by the scalar tail below):
 *   coefficients are scaled by 8192 (2^13), offsets by 32 (2^5).
 *   Cb/Cr bytes are unpacked into the HIGH byte of each 16-bit lane
 *   (v << 8), so _mm_mulhi_epu16(v << 8, c) == (v * c) >> 8, leaving a
 *   product scaled by 32 that is reduced with an arithmetic >> 5 after
 *   the offset is added, then Y is added in.
 */
mlib_status
mlib_VideoColorJFIFYCC2ABGR444_naligned(
	mlib_u8 *abgr,
	const mlib_u8 *y,
	const mlib_u8 *cb,
	const mlib_u8 *cr,
	mlib_s32 n)
{
	/* 1.402 * 8192 */
	const __m128i x_c13 = _mm_set1_epi16(0x2cdd);
	const mlib_s32 c13 = 0x2cdd;

	/* abs(-0.34414) * 8192 */
	const __m128i x_c22 = _mm_set1_epi16(0xb03);
	const mlib_s32 c22 = 0xb03;

	/* abs(-0.71414) * 8192 */
	const __m128i x_c23 = _mm_set1_epi16(0x16da);
	const mlib_s32 c23 = 0x16da;

	/* 1.772 * 8192 */
	const __m128i x_c32 = _mm_set1_epi16(0x38b4);
	const mlib_s32 c32 = 0x38b4;

	/* -179.456 * 32 */
	const __m128i x_coff0 = _mm_set1_epi16(0xe991);
	const mlib_s32 coff0 = (mlib_s32)0xffffe991;

	/* 135.45984 * 32 */
	const __m128i x_coff1 = _mm_set1_epi16(0x10ef);
	const mlib_s32 coff1 = 0x10ef;

	/* -226.816 * 32 */
	const __m128i x_coff2 = _mm_set1_epi16(0xe3a6);
	const mlib_s32 coff2 = (mlib_s32)0xffffe3a6;

	/* constant alpha byte */
	const __m128i x_a = _mm_set1_epi8(0xff);
	const __m128i x_zero = _mm_setzero_si128();

	/* __m128i variables */
	__m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_temp;
	__m128i x_y1, x_cb1, x_cr1, x_y2, x_cb2, x_cr2;
	__m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2;
	__m128i x_abgrl, x_abgrh, x_grl, x_grh, x_abl, x_abh;

	/* pointers */
	__m128i *px_y, *px_cb, *px_cr, *px_abgr;
	mlib_u8 *pabgr;

	/* other var */
	mlib_s32 i, iTemp, iy1, icb1, icr1, ir1, ig1, ib1;

	px_y = (__m128i *)y;
	px_cb = (__m128i *)cb;
	px_cr = (__m128i *)cr;
	px_abgr = (__m128i *)abgr;
	i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
	/* main loop: 16 pixels per iteration */
	for (; i <= n - 16; i += 16)	{
		x_y = _mm_loadu_si128(px_y);
		px_y++;
		/* Y goes into the LOW byte (plain zero-extend) ... */
		x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
		x_y2 = _mm_unpackhi_epi8(x_y, x_zero);
		x_cb = _mm_loadu_si128(px_cb);
		px_cb++;
		/* ... while Cb/Cr go into the HIGH byte (v << 8) */
		x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
		x_cb2 = _mm_unpackhi_epi8(x_zero, x_cb);
		x_cr = _mm_loadu_si128(px_cr);
		px_cr++;
		x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);
		x_cr2 = _mm_unpackhi_epi8(x_zero, x_cr);

		/* lower half */
		/* R = Y + ((Cr * c13) >> 8 + coff0) >> 5 */
		x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
		x_r1 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r1, 5);
		x_r1 = _mm_add_epi16(x_temp, x_y1);

		/* G = Y + (coff1 - (Cb * c22 + Cr * c23) >> 8) >> 5 */
		x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
		x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g1);
		x_g1 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g1, 5);
		x_g1 = _mm_add_epi16(x_temp, x_y1);

		/* B = Y + ((Cb * c32) >> 8 + coff2) >> 5 */
		x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
		x_b1 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b1, 5);
		x_b1 = _mm_add_epi16(x_temp, x_y1);

		/* upper half */
		x_temp = _mm_mulhi_epu16(x_cr2, x_c13);
		x_r2 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r2, 5);
		x_r2 = _mm_add_epi16(x_temp, x_y2);

		x_temp = _mm_mulhi_epu16(x_cb2, x_c22);
		x_g2 = _mm_mulhi_epu16(x_cr2, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g2);
		x_g2 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g2, 5);
		x_g2 = _mm_add_epi16(x_temp, x_y2);

		x_temp = _mm_mulhi_epu16(x_cb2, x_c32);
		x_b2 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b2, 5);
		x_b2 = _mm_add_epi16(x_temp, x_y2);

		/* pack with unsigned saturation (clamps to [0, 255]) */
		x_b = _mm_packus_epi16(x_b1, x_b2);
		x_r = _mm_packus_epi16(x_r1, x_r2);
		x_g = _mm_packus_epi16(x_g1, x_g2);

		/* create rgb sequences */
		x_abl = _mm_unpacklo_epi8(x_a, x_b);
		x_abh = _mm_unpackhi_epi8(x_a, x_b);
		x_grl = _mm_unpacklo_epi8(x_g, x_r);
		x_grh = _mm_unpackhi_epi8(x_g, x_r);

		/* save */
		x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		x_abgrh = _mm_unpackhi_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrh);

		x_abgrl = _mm_unpacklo_epi16(x_abh, x_grh);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		x_abgrh = _mm_unpackhi_epi16(x_abh, x_grh);
		_mm_storeu_si128(px_abgr++, x_abgrh);
	}

	/* 8-pixel tail: same math on the lower half only */
	if (i <= n - 8) {
		x_y = _mm_loadl_epi64(px_y);
		px_y = (__m128i *) (((__m64 *)px_y) + 1);
		x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
		x_cb = _mm_loadl_epi64(px_cb);
		px_cb = (__m128i *) (((__m64 *)px_cb) + 1);
		x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
		x_cr = _mm_loadl_epi64(px_cr);
		px_cr = (__m128i *) (((__m64 *)px_cr) + 1);
		x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);

		/* lower half only */
		x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
		x_r1 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r1, 5);
		x_r1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
		x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g1);
		x_g1 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g1, 5);
		x_g1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
		x_b1 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b1, 5);
		x_b1 = _mm_add_epi16(x_temp, x_y1);

		/* pack */
		x_b = _mm_packus_epi16(x_b1, x_zero);
		x_r = _mm_packus_epi16(x_r1, x_zero);
		x_g = _mm_packus_epi16(x_g1, x_zero);

		/* create rgb sequences */
		x_abl = _mm_unpacklo_epi8(x_a, x_b);
		x_grl = _mm_unpacklo_epi8(x_g, x_r);

		/* save */
		x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		x_abgrh = _mm_unpackhi_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrh);

		i += 8;
	}

	/* 4-pixel tail: 32-bit loads, lower half only */
	if (i <= n - 4) {
		iTemp = *((mlib_s32 *)px_y);
		x_y = _mm_cvtsi32_si128(iTemp);
		px_y = (__m128i *) (((mlib_s32 *)px_y) + 1);
		x_y1 = _mm_unpacklo_epi8(x_y, x_zero);

		iTemp = *((mlib_s32 *)px_cb);
		x_cb = _mm_cvtsi32_si128(iTemp);
		px_cb = (__m128i *) (((mlib_s32 *)px_cb) + 1);
		x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);

		iTemp = *((mlib_s32 *)px_cr);
		x_cr = _mm_cvtsi32_si128(iTemp);
		px_cr = (__m128i *) (((mlib_s32 *)px_cr) + 1);
		x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);

		/* 4 pixels, lower half only */
		x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
		x_r1 = _mm_add_epi16(x_temp, x_coff0);
		x_temp = _mm_srai_epi16(x_r1, 5);
		x_r1 = _mm_add_epi16(x_temp, x_y1);

		x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
		x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
		x_temp = _mm_add_epi16(x_temp, x_g1);
		x_g1 = _mm_sub_epi16(x_coff1, x_temp);
		x_temp = _mm_srai_epi16(x_g1, 5);
		x_g1 = _mm_add_epi16(x_temp, x_y1);
		x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
		x_b1 = _mm_add_epi16(x_temp, x_coff2);
		x_temp = _mm_srai_epi16(x_b1, 5);
		x_b1 = _mm_add_epi16(x_temp, x_y1);

		/* pack */
		x_b = _mm_packus_epi16(x_b1, x_zero);
		x_r = _mm_packus_epi16(x_r1, x_zero);
		x_g = _mm_packus_epi16(x_g1, x_zero);

		/* create rgb sequences */
		x_abl = _mm_unpacklo_epi8(x_a, x_b);
		x_grl = _mm_unpacklo_epi8(x_g, x_r);

		/* save */
		x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
		_mm_storeu_si128(px_abgr++, x_abgrl);

		i += 4;
	}

	/* pure C implementation for the last 0-3 pixels */
	pabgr = (mlib_u8 *)px_abgr;
	for (; i < n; i++) {
		iy1 = y[i];
		icb1 = cb[i];
		icr1 = cr[i];

		iTemp = (icr1 * c13) >> 8;
		ir1 = (iTemp + coff0) >> 5;
		ir1 += iy1;

		iTemp = (icb1 * c22) >> 8;
		ig1 = (icr1 * c23) >> 8;
		iTemp += ig1;
		ig1 = coff1 - iTemp;
		iTemp = ig1 >> 5;
		ig1 = iTemp + iy1;

		iTemp = (icb1 * c32) >> 8;
		ib1 = iTemp + coff2;
		iTemp = ib1 >> 5;
		ib1 = iTemp + iy1;

		pabgr[0] = 0xff;
		CLAMP_U8(ib1, pabgr[1]);
		CLAMP_U8(ig1, pabgr[2]);
		CLAMP_U8(ir1, pabgr[3]);

		pabgr += 4;
	}

	return (MLIB_SUCCESS);
}
Пример #26
0
		// Average of three vertically adjacent rows of 16-bit values:
		// sums the three source rows plus a rounding bias of 5, then
		// divides by 9 via an unsigned high multiply with the
		// precomputed reciprocal factor.
		template<bool align> SIMD_INLINE __m128i AverageRow16(const Buffer & buffer, size_t offset)
		{
			const __m128i row0 = Load<align>((__m128i*)(buffer.src0 + offset));
			const __m128i row1 = Load<align>((__m128i*)(buffer.src1 + offset));
			const __m128i row2 = Load<align>((__m128i*)(buffer.src2 + offset));
			const __m128i biasedSum = _mm_add_epi16(_mm_add_epi16(row0, K16_0005), _mm_add_epi16(row1, row2));
			return _mm_mulhi_epu16(K16_DIVISION_BY_9_FACTOR, biasedSum);
		}
Пример #27
0
/*
 * Fetch one destination scanline of bilinearly interpolated pixels for
 * the COVER (no repeat/border) path.  Two horizontally pre-interpolated
 * lines are cached in info->lines; this function (re)fills whichever of
 * the two cached lines is stale, then blends them vertically by the
 * fractional y weight and packs the result into iter->buffer.
 * The mask argument is unused.  Returns iter->buffer.
 */
static uint32_t *
ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask)
{
    pixman_fixed_t fx, ux;
    bilinear_info_t *info = iter->data;
    line_t *line0, *line1;
    int y0, y1;
    int32_t dist_y;
    __m128i vw;
    int i;

    fx = info->x;
    ux = iter->image->common.transform->matrix[0][0];

    y0 = pixman_fixed_to_int (info->y);
    y1 = y0 + 1;

    /* two-entry line cache indexed by y parity */
    line0 = &info->lines[y0 & 0x01];
    line1 = &info->lines[y1 & 0x01];

    if (line0->y != y0)
    {
	ssse3_fetch_horizontal (
	    &iter->image->bits, line0, y0, fx, ux, iter->width);
    }

    if (line1->y != y1)
    {
	ssse3_fetch_horizontal (
	    &iter->image->bits, line1, y1, fx, ux, iter->width);
    }

    /* fractional y weight broadcast to all eight 16-bit lanes */
    dist_y = pixman_fixed_to_bilinear_weight (info->y);
    dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS);

    vw = _mm_set_epi16 (
	dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y);

    /* main loop: 4 output pixels (two __m128i of 16-bit channels) */
    for (i = 0; i + 3 < iter->width; i += 4)
    {
	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
	__m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2));
	__m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2));
	__m128i r0, r1, tmp, p;

	/*
	 * r = top + (bot - top) * w, with the multiply done as an
	 * unsigned high product.  When bot < top the 16-bit difference
	 * wraps (is 2^16 too large as unsigned), which inflates the
	 * high product by exactly w; the cmplt/and/sub below subtracts
	 * w back in precisely those lanes.
	 */
	r0 = _mm_mulhi_epu16 (
	    _mm_sub_epi16 (bot0, top0), vw);
	tmp = _mm_cmplt_epi16 (bot0, top0);
	tmp = _mm_and_si128 (tmp, vw);
	r0 = _mm_sub_epi16 (r0, tmp);
	r0 = _mm_add_epi16 (r0, top0);
	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

	r1 = _mm_mulhi_epu16 (
	    _mm_sub_epi16 (bot1, top1), vw);
	tmp = _mm_cmplt_epi16 (bot1, top1);
	tmp = _mm_and_si128 (tmp, vw);
	r1 = _mm_sub_epi16 (r1, tmp);
	r1 = _mm_add_epi16 (r1, top1);
	r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS);
	r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1));
	/* r1: A3 R3 G3 B3 A2 R2 G2 B2 */

	p = _mm_packus_epi16 (r0, r1);

	_mm_storeu_si128 ((__m128i *)(iter->buffer + i), p);
    }

    /*
     * Tail: 1-3 remaining pixels, same interpolation on one vector.
     * NOTE(review): this still does full 16-byte aligned loads even for
     * a single remaining pixel - presumably the line buffers are padded
     * and aligned by ssse3_fetch_horizontal; confirm before changing.
     */
    while (i < iter->width)
    {
	__m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i));
	__m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i));
	__m128i r0, tmp, p;

	r0 = _mm_mulhi_epu16 (
	    _mm_sub_epi16 (bot0, top0), vw);
	tmp = _mm_cmplt_epi16 (bot0, top0);
	tmp = _mm_and_si128 (tmp, vw);
	r0 = _mm_sub_epi16 (r0, tmp);
	r0 = _mm_add_epi16 (r0, top0);
	r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS);
	/* r0:  A0 R0 A1 R1 G0 B0 G1 B1 */
	r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1));
	/* r0:  A1 R1 G1 B1 A0 R0 G0 B0 */

	p = _mm_packus_epi16 (r0, r0);

	if (iter->width - i == 1)
	{
	    *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p);
	    i++;
	}
	else
	{
	    _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p);
	    i += 2;
	}
    }
    
    /* advance source y by the vertical scale for the next scanline */
    info->y += iter->image->common.transform->matrix[1][1];

    return iter->buffer;
}