Example #1
0
void PPUThread::cpu_task()
{
	//SetHostRoundingMode(FPSCR_RN_NEAR);

	if (custom_task)
	{
		if (check_status()) return;

		return custom_task(*this);
	}

	g_tls_log_prefix = []
	{
		const auto cpu = static_cast<PPUThread*>(get_current_cpu_thread());

		return fmt::format("%s [0x%08x]", cpu->get_name(), cpu->pc);
	};

	const auto base = vm::_ptr<const u8>(0);

	// Select opcode table
	const auto& table = *(
		g_cfg_ppu_decoder.get() == ppu_decoder_type::precise ? &s_ppu_interpreter_precise.get_table() :
		g_cfg_ppu_decoder.get() == ppu_decoder_type::fast ? &s_ppu_interpreter_fast.get_table() :
		throw std::logic_error("Invalid PPU decoder"));

	v128 _op;
	decltype(&ppu_interpreter::UNK) func0, func1, func2, func3;

	while (true)
	{
		if (UNLIKELY(state.load()))
		{
			if (check_status()) return;
		}

		// Reinitialize
		{
			const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
			_op.vi = _ops;
			const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff)));
			func0 = table[_i._u32[0]];
			func1 = table[_i._u32[1]];
			func2 = table[_i._u32[2]];
			func3 = table[_i._u32[3]];
		}

		while (LIKELY(func0(*this, { _op._u32[0] })))
		{
			if (pc += 4, LIKELY(func1(*this, { _op._u32[1] })))
			{
				if (pc += 4, LIKELY(func2(*this, { _op._u32[2] })))
				{
					pc += 4;
					func0 = func3;

					const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc + 4)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3));
					_op.vi = _mm_alignr_epi8(_ops, _op.vi, 12);
					const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff)));
					func1 = table[_i._u32[1]];
					func2 = table[_i._u32[2]];
					func3 = table[_i._u32[3]];

					if (UNLIKELY(state.load()))
					{
						break;
					}
					continue;
				}
				break;
			}
			break;
		}
	}
}
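
The reinitialization blocks above byte-swap four big-endian opcodes at once with _mm_shuffle_epi8 and turn each one into a dispatch-table index in a single step. A scalar sketch of that index computation, assuming the 0x1ffff mask reflects the table size (the function name below is mine, not from the source):

#include <stdint.h>

/* Hypothetical scalar equivalent of the SIMD index computation above:
   rotate the 6-bit primary opcode to the bottom and keep 17 bits in total. */
static inline uint32_t ppu_table_index(uint32_t op /* already byte-swapped */)
{
	return ((op << 6) | (op >> 26)) & 0x1ffff;
}
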
static __m128 mm_pow_ps(__m128 a, __m128 b)
{
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  __m128 log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bits
    //    of the mantissa, putting eight into the biased exponent (to
    //    compensate for the fact that the exponent has been shifted into the
    //    fractional part) and finally getting rid of the implicit leading one
    //    from the mantissa by subtracting it out.
    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END =
        {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END =
        {0x43800000, 0x43800000, 0x43800000, 0x43800000};
    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END =
        {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
    const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask));
    const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n,
        shift_exponent_into_top_mantissa);
    const __m128 n_0 = _mm_or_ps(
        (__m128)n_1, *((__m128 *)eight_biased_exponent));
    const __m128 n   = _mm_sub_ps(n_0,  *((__m128 *)implicit_leading_one));

    // Compute y.
    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END =
        {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END =
        {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
    const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask));
    const __m128 y        = _mm_or_ps(
        mantissa,  *((__m128 *)zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    static const ALIGN16_BEG float ALIGN16_END C5[4] =
        {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
    static const ALIGN16_BEG float ALIGN16_END C4[4] =
        {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
    static const ALIGN16_BEG float ALIGN16_END C3[4] =
        {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
    static const ALIGN16_BEG float ALIGN16_END C2[4] =
        {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
    static const ALIGN16_BEG float ALIGN16_END C1[4] =
        {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
    static const ALIGN16_BEG float ALIGN16_END C0[4] =
        {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
    const __m128 pol5_y_0 = _mm_mul_ps(y,        *((__m128 *)C5));
    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4));
    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3));
    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2));
    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1));
    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
    const __m128 pol5_y   = _mm_add_ps(pol5_y_8, *((__m128 *)C0));
    const __m128 y_minus_one = _mm_sub_ps(
        y, *((__m128 *)zero_biased_exponent_is_one));
    const __m128 log2_y = _mm_mul_ps(y_minus_one ,  pol5_y);

    // Combine parts.
    log2_a = _mm_add_ps(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = _mm_mul_ps(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to (-127, 129].
    static const ALIGN16_BEG float max_input[4] ALIGN16_END =
        {129.f, 129.f, 129.f, 129.f};
    static const ALIGN16_BEG float min_input[4] ALIGN16_END =
        {-126.99999f, -126.99999f, -126.99999f, -126.99999f};
    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input));
    const __m128 x_max = _mm_max_ps(x_min,    *((__m128 *)min_input));
    // Compute n.
    static const ALIGN16_BEG float half[4] ALIGN16_END =
        {0.5f, 0.5f, 0.5f, 0.5f};
    const __m128  x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half));
    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
    // Compute 2^n.
    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END =
        {127, 127, 127, 127};
    static const int float_exponent_shift = 23;
    const __m128i two_n_exponent = _mm_add_epi32(
        x_minus_half_floor, *((__m128i *)float_exponent_bias));
    const __m128  two_n = (__m128)_mm_slli_epi32(
        two_n_exponent, float_exponent_shift);
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    static const ALIGN16_BEG float C2[4] ALIGN16_END =
        {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
    static const ALIGN16_BEG float C1[4] ALIGN16_END =
        {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
    static const ALIGN16_BEG float C0[4] ALIGN16_END =
        {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f};
    const __m128 exp2_y_0 = _mm_mul_ps(y,        *((__m128 *)C2));
    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1));
    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
    const __m128 exp2_y   = _mm_add_ps(exp2_y_2, *((__m128 *)C0));

    // Combine parts.
    a_exp_b = _mm_mul_ps(exp2_y, two_n);
  }
  return a_exp_b;
}
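
For spot-checking the approximation above against the error bounds quoted in its comments, a scalar reference using the same identity (not part of the original source) is handy:

#include <math.h>

/* Scalar reference: a^b = exp2(b * log2(a)), valid for a > 0. */
static float scalar_pow_ref(float a, float b)
{
  return exp2f(b * log2f(a));
}
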
Example #3
0
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i k_dipt1 = _mm_set_epi32(
      0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00);
   const __m128i k_dipt2 = _mm_set_epi32(
      0x12771772, 0xF491F194, 0x86E383E6, 0x60056500);

   const __m128i sb9u = _mm_set_epi32(
      0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600);
   const __m128i sb9t = _mm_set_epi32(
      0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900);

   const __m128i sbeu = _mm_set_epi32(
      0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000);
   const __m128i sbet = _mm_set_epi32(
      0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100);

   const __m128i sbdu = _mm_set_epi32(
      0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200);
   const __m128i sbdt = _mm_set_epi32(
      0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00);

   const __m128i sbbu = _mm_set_epi32(
      0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200);
   const __m128i sbbt = _mm_set_epi32(
      0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700);

   __m128i mc = mc_forward[3];

   __m128i t =
      _mm_shuffle_epi8(k_dipt2,
                       _mm_srli_epi32(
                          _mm_andnot_si128(low_nibs, B),
                          4));

   B = mm_xor3(t, _mm_loadu_si128(keys),
               _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs)));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         const __m128i sbou = _mm_set_epi32(
            0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000);
         const __m128i sbot = _mm_set_epi32(
            0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00);

         __m128i x = _mm_shuffle_epi8(sbou, t5);
         __m128i y = _mm_shuffle_epi8(sbot, t6);
         x = _mm_xor_si128(x, K);
         x = _mm_xor_si128(x, y);

         const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16;
         return _mm_shuffle_epi8(x, sr[which_sr]);
         }

      __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6),
                                 _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K));

      __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc),
                           _mm_shuffle_epi8(sbdu, t5),
                           _mm_shuffle_epi8(sbdt, t6));

      __m128i t12 = _mm_xor_si128(
         _mm_xor_si128(
            _mm_shuffle_epi8(t9, mc),
            _mm_shuffle_epi8(sbbu, t5)),
         _mm_shuffle_epi8(sbbt, t6));

      B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc),
                                      _mm_shuffle_epi8(sbeu, t5)),
                        _mm_shuffle_epi8(sbet, t6));

      mc = _mm_alignr_epi8(mc, mc, 12);
      }
   }
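
The routine above leans on helpers defined elsewhere in the library (low_nibs, k_inv1, k_inv2, mc_forward, sr). One of them, mm_xor3, is presumably just a three-way XOR; a minimal sketch under that assumption:

#include <emmintrin.h>

/* Assumed definition of the mm_xor3 helper used above: XOR of three vectors. */
static inline __m128i mm_xor3(__m128i a, __m128i b, __m128i c)
   {
   return _mm_xor_si128(a, _mm_xor_si128(b, c));
   }
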
Example #4
0
static void transform(hashState *state,int r)
{
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];

  for (;r > 0;--r) {
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x2;
    y1 = x3;
    y2 = x0;
    y3 = x1;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0x4e);
    x5 = _mm_shuffle_epi32(x5,0x4e);
    x6 = _mm_shuffle_epi32(x6,0x4e);
    x7 = _mm_shuffle_epi32(x7,0x4e);
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x1;
    y1 = x0;
    y2 = x3;
    y3 = x2;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0xb1);
    x5 = _mm_shuffle_epi32(x5,0xb1);
    x6 = _mm_shuffle_epi32(x6,0xb1);
    x7 = _mm_shuffle_epi32(x7,0xb1);
  }

  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;
}
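
The slli/srli xor pairs in the round function are 32-bit rotations by 7 and 11; because the two shifted halves never overlap, XOR and OR give the same result. A scalar sketch of the same rotations (names are mine):

#include <stdint.h>

/* Scalar rotate-left equivalents of the shift/xor pairs above. */
static inline uint32_t rotl32_by7(uint32_t x)  { return (x << 7)  | (x >> 25); }
static inline uint32_t rotl32_by11(uint32_t x) { return (x << 11) | (x >> 21); }
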
  template<int pixelFormat> void
  imageFromPixels(vl::Image & image, char unsigned const * rgb, int rowStride)
  {
    vl::ImageShape const & shape = image.getShape() ;
    int blockSizeX ;
    int blockSizeY ;
    int pixelStride ;
    int imagePlaneStride = (int)shape.width * (int)shape.height ;
    __m128i shuffleRgb ;
    __m128i const shuffleL = _mm_set_epi8(0xff, 0xff, 0xff,  3,
                                          0xff, 0xff, 0xff,  2,
                                          0xff, 0xff, 0xff,  1,
                                          0xff, 0xff, 0xff,  0) ;
    __m128i const mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff) ;

    switch (pixelFormat) {
      case pixelFormatL:
        pixelStride = 1 ;
        blockSizeX = 16 ;
        blockSizeY = 4 ;
        break ;
      case pixelFormatBGR:
      case pixelFormatRGB:
        pixelStride = 3 ;
        blockSizeX = 4 ;
        blockSizeY = 4 ;
        assert(shape.depth == 3) ;
        break ;
      case pixelFormatRGBA:
      case pixelFormatBGRA:
      case pixelFormatBGRAasL:
        pixelStride = 4 ;
        blockSizeX = 4 ;
        blockSizeY = 4 ;
        assert(shape.depth == 3) ;
        break ;
      default:
        assert(false) ;
    }

    switch (pixelFormat) {
      case pixelFormatL:
        break ;

      case pixelFormatRGB:
        shuffleRgb = _mm_set_epi8(0xff, 11, 10,  9,
                                  0xff,  8,  7,  6,
                                  0xff,  5,  4,  3,
                                  0xff,  2,  1,  0) ;
        break ;

      case pixelFormatRGBA:
        shuffleRgb = _mm_set_epi8(0xff, 14, 13, 12,
                                  0xff, 10,  9,  8,
                                  0xff,  6,  5,  4,
                                  0xff,  2,  1,  0) ;
        break ;

      case pixelFormatBGR:
        shuffleRgb = _mm_set_epi8(0xff,  9, 10, 11,
                                  0xff,  6,  7,  8,
                                  0xff,  3,  4,  5,
                                  0xff,  0,  1,  2) ;
        break ;

      case pixelFormatBGRA:
        shuffleRgb = _mm_set_epi8(0xff, 12, 13, 14,
                                  0xff,  8,  9, 10,
                                  0xff,  4,  5,  6,
                                  0xff,  0,  1,  2) ;
        break ;

      case pixelFormatBGRAasL:
        shuffleRgb = _mm_set_epi8(0xff, 0xff, 0xff, 12,
                                  0xff, 0xff, 0xff, 8,
                                  0xff, 0xff, 0xff, 4,
                                  0xff, 0xff, 0xff, 0) ;
        break ;
    }

    // We pull these values out of the loop: otherwise the compiler has to
    // assume that the reference &image may be aliased and would recompute
    // these multiplications in the inner loop.
    float *  const __restrict imageMemory = image.getMemory() ;
    int const imageHeight = (int)shape.height ;
    int const imageWidth = (int)shape.width ;

    for (int x = 0 ; x < imageWidth ; x += blockSizeX) {
      int y = 0 ;
      float * __restrict imageMemoryX = imageMemory + x * imageHeight ;
      int bsx = (std::min)(imageWidth - x, blockSizeX) ;
      if (bsx < blockSizeX) goto boundary ;

      for ( ; y < imageHeight - blockSizeY + 1 ; y += blockSizeY) {
        char unsigned const * __restrict pixel = rgb + y * rowStride + x * pixelStride ;
        float * __restrict r = imageMemoryX + y ;
        __m128i p0, p1, p2, p3, T0, T1, T2, T3 ;

        /* convert a blockSizeX x blockSizeY block in the input image */
        switch (pixelFormat) {
          case pixelFormatRGB :
          case pixelFormatRGBA :
          case pixelFormatBGR :
          case pixelFormatBGRA :
          case pixelFormatBGRAasL :
            // load 4x4 RGB pixels
            p0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;
            p1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;
            p2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;
            p3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ;

            // transpose pixels as 32-bit integers (see also below)
            T0 = _mm_unpacklo_epi32(p0, p1);
            T1 = _mm_unpacklo_epi32(p2, p3);
            T2 = _mm_unpackhi_epi32(p0, p1);
            T3 = _mm_unpackhi_epi32(p2, p3);
            p0 = _mm_unpacklo_epi64(T0, T1);
            p1 = _mm_unpackhi_epi64(T0, T1);
            p2 = _mm_unpacklo_epi64(T2, T3);
            p3 = _mm_unpackhi_epi64(T2, T3);

            // store r
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ;

            if (pixelFormat == pixelFormatBGRAasL) break ;

            // store g
            r += (imageWidth - 3) * imageHeight ;
            p0 = _mm_srli_epi32 (p0, 8) ;
            p1 = _mm_srli_epi32 (p1, 8) ;
            p2 = _mm_srli_epi32 (p2, 8) ;
            p3 = _mm_srli_epi32 (p3, 8) ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ;

            // store b
            r += (imageWidth - 3) * imageHeight ;
            p0 = _mm_srli_epi32 (p0, 8) ;
            p1 = _mm_srli_epi32 (p1, 8) ;
            p2 = _mm_srli_epi32 (p2, 8) ;
            p3 = _mm_srli_epi32 (p3, 8) ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ;
            _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ;
            break ;

          case pixelFormatL:
            // load 4x16 L pixels
            p0 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;
            p1 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;
            p2 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;
            p3 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ;

            /*
             Pixels are collected in little-endian order: the first pixel
             is at the `right' (least significant byte) of p0:

             p[0] = a, p[1] = b, ...

             p0: [ ... | ... | ... | d c b a ]
             p1: [ ... | ... | ... | h g f e ]
             p2: [ ... | ... | ... | l k j i ]
             p3: [ ... | ... | ... | p o n m ]

             The goal is to transpose four 4x4 subblocks in the
             4 x 16 pixel array. The first step interleaves individual
             pixels in p0 and p1:

             T0: [ ... | ... | h d g c | f b e a ]
             T1: [ ... | ... | p l o k | n j m i ]
             T2: [ ... | ... | ... | ... ]
             T3: [ ... | ... | ... | ... ]

             The second step interleaves groups of two pixels:

             p0: [pl hd | ok gc | nj fb | mi ea] (pixels in the rightmost 4x4 subblock)
             p1: ...
             p2: ...
             p3: ...

             The third step interleaves groups of four pixels:

             T0: [ ... | njfb | ... | miea ]
             T1: ...
             T2: ...
             T3: ...

             The last step interleaves groups of eight pixels:

             p0: [ ... | ... | ... | miea ]
             p1: [ ... | ... | ... | njfb ]
             p2: [ ... | ... | ... | okgc ]
             p3: [ ... | ... | ... | plhd ]

             */

            T0 = _mm_unpacklo_epi8(p0, p1);
            T1 = _mm_unpacklo_epi8(p2, p3);
            T2 = _mm_unpackhi_epi8(p0, p1);
            T3 = _mm_unpackhi_epi8(p2, p3);
            p0 = _mm_unpacklo_epi16(T0, T1);
            p1 = _mm_unpackhi_epi16(T0, T1);
            p2 = _mm_unpacklo_epi16(T2, T3);
            p3 = _mm_unpackhi_epi16(T2, T3);
            T0 = _mm_unpacklo_epi32(p0, p1);
            T1 = _mm_unpacklo_epi32(p2, p3);
            T2 = _mm_unpackhi_epi32(p0, p1);
            T3 = _mm_unpackhi_epi32(p2, p3);
            p0 = _mm_unpacklo_epi64(T0, T1);
            p1 = _mm_unpackhi_epi64(T0, T1);
            p2 = _mm_unpacklo_epi64(T2, T3);
            p3 = _mm_unpackhi_epi64(T2, T3);

            // store the four 4x4 subblocks
            for (int i = 0 ; i < 4 ; ++i) {
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p0, shuffleL))) ; r += imageHeight ;
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p1, shuffleL))) ; r += imageHeight ;
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p2, shuffleL))) ; r += imageHeight ;
              _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p3, shuffleL))) ; r += imageHeight ;
              p0 = _mm_srli_si128 (p0, 4) ;
              p1 = _mm_srli_si128 (p1, 4) ;
              p2 = _mm_srli_si128 (p2, 4) ;
              p3 = _mm_srli_si128 (p3, 4) ;
            }
            break ;
        }
      } /* next y */

    boundary:
      /* special case if there is not a full 4x4 block to process */
      for ( ; y < imageHeight ; y += blockSizeY) {
        int bsy = (std::min)(imageHeight - y, blockSizeY) ;
        float * __restrict r ;
        float * rend ;
        for (int dx = 0 ; dx < bsx ; ++dx) {
          char unsigned const * __restrict pixel = rgb + y * rowStride + (x + dx) * pixelStride ;
          r = imageMemoryX + y + dx * imageHeight ;
          rend = r + bsy ;
          while (r != rend) {
            switch (pixelFormat) {
              case pixelFormatRGBA:
              case pixelFormatRGB:
                r[0 * imagePlaneStride] = (float) pixel[0] ;
                r[1 * imagePlaneStride] = (float) pixel[1] ;
                r[2 * imagePlaneStride] = (float) pixel[2] ;
                break ;
              case pixelFormatBGR:
              case pixelFormatBGRA:
                r[2 * imagePlaneStride] = (float) pixel[0] ;
                r[1 * imagePlaneStride] = (float) pixel[1] ;
                r[0 * imagePlaneStride] = (float) pixel[2] ;
                break;
              case pixelFormatBGRAasL:
              case pixelFormatL:
                r[0] = (float) pixel[0] ;
                break ;
            }
            r += 1 ;
            pixel += rowStride ;
          }
        }
      }
    }
  }
Example #6
0
void spu_interpreter::CG(SPUThread& CPU, spu_opcode_t op)
{
	const auto a = _mm_xor_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0x7fffffff));
	const auto b = _mm_xor_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0x80000000));
	CPU.GPR[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31);
}
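CG is the SPU "carry generate" operation: each 32-bit lane of rt receives the carry-out of a + b. The two XOR constants bias the operands so that the unsigned test b > ~a can be evaluated with the signed _mm_cmpgt_epi32. A scalar sketch of what each lane computes (my reading of the code, not taken from the original sources):

#include <stdint.h>

/* Per-lane carry-out of a + b: 1 if the 32-bit addition overflows, else 0. */
static inline uint32_t carry_generate(uint32_t a, uint32_t b)
{
	return (uint32_t)(((uint64_t)a + b) >> 32); /* equivalently: b > ~a, unsigned */
}
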
Example #7
0
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, edge_t *eh,
               uint16_t plane_max)
{
    uint8_t* p0 = buff + 16;
    uint8_t* p1 = p0 + bstride;
    uint8_t* p2 = p1 + bstride;
    uint8_t* p3 = p2 + bstride;
    uint8_t* p4 = p3 + bstride;
    uint8_t* orig = p0;
    uint8_t* end = p4;

    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min;
    uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max;

    __m128i zero = _mm_setzero_si128();
    __m128i ab = _mm_set1_epi16(15);
    __m128i max = _mm_set1_epi8((int8_t)th_max);
    __m128i min = _mm_set1_epi8((int8_t)th_min);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint8_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 16) {
            __m128i sumx[2] = {zero, zero};
            __m128i sumy[2] = {zero, zero};

            for (int i = 0; i < 4; i++) {
                __m128i xmm0, xmm1, xmul;
                xmul = _mm_load_si128((__m128i *)ar_mulx[i]);
                xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul));
                sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul));

                xmul = _mm_load_si128((__m128i *)ar_muly[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul));
                sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul));
            }

            for (int i = 0; i < 2; i++) {
                __m128i xmax, xmin, mull, mulh;
                sumx[i] = mm_abs_epi16(sumx[i]);
                sumy[i] = mm_abs_epi16(sumy[i]);
                xmax = _mm_max_epi16(sumx[i], sumy[i]);
                xmin = _mm_min_epi16(sumx[i], sumy[i]);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4);
                xmax = mm_cast_epi32(mull, mulh);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5);
                xmin = mm_cast_epi32(mull, mulh);

                sumx[i] = _mm_adds_epu16(xmax, xmin);
                sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift);
            }

            __m128i out = _mm_packus_epi16(sumx[0], sumx[1]);
            __m128i temp = _mm_min_epu8(out, max);
            temp = _mm_cmpeq_epi8(temp, max);
            out = _mm_or_si128(temp, out);

            temp = _mm_max_epu8(out, min);
            temp = _mm_cmpeq_epi8(temp, min);
            out = _mm_andnot_si128(temp, out);

            _mm_store_si128((__m128i*)(dstp + x), out);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
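
The min/cmpeq/or and max/cmpeq/andnot sequences at the end of the inner loop implement a two-sided threshold: lanes at or above th_max saturate to 255 and lanes at or below th_min are cleared to 0. A scalar sketch of that step (my reading of the code):

#include <stdint.h>

/* Scalar equivalent of the final thresholding above. */
static inline uint8_t edge_threshold(uint8_t v, uint8_t th_min, uint8_t th_max)
{
    if (v >= th_max) v = 0xff;   /* saturate strong edges */
    if (v <= th_min) v = 0x00;   /* suppress weak responses */
    return v;
}
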
Example #8
0
/* Function:  esl_sse_logf()
 * Synopsis:  <r[z] = log x[z]>
 * Incept:    SRE, Fri Dec 14 11:32:54 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = logf(x[z])>.
 *            
 *            Valid in the domain $x_z > 0$ for normalized IEEE754
 *            $x_z$.
 *
 *            For <x> $< 0$, including -0, returns <NaN>. For <x> $==
 *            0$ or subnormal <x>, returns <-inf>. For <x = inf>,
 *            returns <inf>. For <x = NaN>, returns <NaN>.
 *
 * Xref:      J2/71.
 * 
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2 and added handling
 *            of IEEE754 specials.
 */
__m128 
esl_sse_logf(__m128 x) 
{
  static float cephes_p[9] = {  7.0376836292E-2f, -1.1514610310E-1f,  1.1676998740E-1f,
				-1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f,
				2.0000714765E-1f, -2.4999993993E-1f,  3.3333331174E-1f };
  __m128  onev = _mm_set1_ps(1.0f);          /* all elem = 1.0 */
  __m128  v0p5 = _mm_set1_ps(0.5f);          /* all elem = 0.5 */
  __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */
  __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */
  __m128i ei;
  __m128  e;
  __m128  invalid_mask, zero_mask, inf_mask;            /* masks used to handle special IEEE754 inputs */
  __m128  mask;
  __m128  origx;
  __m128  tmp;
  __m128  y;
  __m128  z;

  /* first, split x apart: x = frexpf(x, &e); */
  ei           = _mm_srli_epi32( _mm_castps_si128(x), 23);	                                        /* shift right 23: IEEE754 floats: ei = biased exponents     */
  invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg));  /* mask any elem that's negative; these become NaN           */
  zero_mask    = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128()));                          /* mask any elem zero or subnormal; these become -inf        */
  inf_mask     = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp));  /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN      */
  origx        = x;			                                                                /* store original x, used for log(inf) = inf, log(NaN) = NaN */

  x  = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand        */
  x  = _mm_or_ps (x, v0p5);                                          /* sets hidden bit b[0]                                      */

  ei = _mm_sub_epi32(ei, _mm_set1_epi32(126));                       /* -127 (ei now signed base-2 exponent); then +1             */
  e  = _mm_cvtepi32_ps(ei);

  /* now, calculate the log */
  mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches.           */
  tmp  = _mm_and_ps(x, mask);	                              /* tmp contains x values < 0.707, else 0 */
  x    = _mm_sub_ps(x, onev);
  e    = _mm_sub_ps(e, _mm_and_ps(onev, mask));
  x    = _mm_add_ps(x, tmp);
  z    = _mm_mul_ps(x,x);

  y =               _mm_set1_ps(cephes_p[0]);    y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6]));   y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7]));   y = _mm_mul_ps(y, x);  
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8]));   y = _mm_mul_ps(y, x);
  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
  y   = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, v0p5);
  y   = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);

  /* IEEE754 cleanup: */
  x = esl_sse_select_ps(x, origx,                     inf_mask);  /* log(inf)=inf; log(NaN)      = NaN  */
  x = _mm_or_ps(x, invalid_mask);                                 /* log(x<0, including -0,-inf) = NaN  */
  x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal         = -inf */
  return x;
}
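A minimal usage sketch for the routine above, comparing it against libm's logf (it assumes esl_sse_logf and its esl_sse_select_ps helper are available from the surrounding Easel code):

#include <stdio.h>
#include <math.h>
#include <emmintrin.h>

int
main(void)
{
  union { __m128 v; float f[4]; } x, r;
  x.v = _mm_set_ps(8.0f, 2.718281828f, 1.0f, 0.5f);
  r.v = esl_sse_logf(x.v);
  for (int z = 0; z < 4; z++)
    printf("x = %g  logf(x) = %g  esl_sse_logf(x) = %g\n",
           (double) x.f[z], (double) logf(x.f[z]), (double) r.f[z]);
  return 0;
}
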
bool validate_utf8_sse(const char *src, size_t len) {
  const char *end = src + len;
  while (src + 16 < end) {
    __m128i chunk = _mm_loadu_si128((const __m128i *)(src));

    int asciiMask = _mm_movemask_epi8(chunk);
    if (!asciiMask) {
      src += 16;
      continue;
    }

    __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80));
    __m128i cond2 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed);
    __m128i state = _mm_set1_epi8((char)(0x0 | 0x80));
    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2);

    __m128i cond3 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed);

    state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3);
    __m128i mask3 = _mm_slli_si128(cond3, 1);

    __m128i cond4 =
        _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed);

    // Fall back to the scalar processing
    if (_mm_movemask_epi8(cond4)) {
      break;
    }

    __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7));

    __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1));

    __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1));

    __m128i shifts = count_sub1;
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1));
    counts = _mm_add_epi8(
        counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2));
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2));

    if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0))))
      return false; // error
    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4));

    if (_mm_movemask_epi8(_mm_cmpgt_epi8(
            _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1))))
      return false; // error

    shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8));

    __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8));
    shifts =
        _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1

    chunk =
        _mm_andnot_si128(mask, chunk); // from now on, we only have useful bits

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1));

    __m128i chunk_right = _mm_slli_si128(chunk, 1);

    __m128i chunk_low = _mm_blendv_epi8(
        chunk,
        _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6),
                                          _mm_set1_epi8(0xc0))),
        _mm_cmpeq_epi8(counts, _mm_set1_epi8(1)));

    __m128i chunk_high =
        _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2)));

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2));
    chunk_high = _mm_srli_epi32(chunk_high, 2);

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4));
    chunk_high = _mm_or_si128(
        chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4),
                                                _mm_set1_epi8(0xf0)),
                                  mask3));
    int c = _mm_extract_epi16(counts, 7);
    int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 15 : 14;

    __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8));
    if (!_mm_testz_si128(
            mask3,
            _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)),
                         _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8)))))
      return false;

    shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8),
                             _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8));

    chunk_high = _mm_slli_si128(chunk_high, 1);

    __m128i shuf =
        _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5,
                                          4, 3, 2, 1, 0));

    chunk_low = _mm_shuffle_epi8(chunk_low, shuf);
    chunk_high = _mm_shuffle_epi8(chunk_high, shuf);
    __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high);
    __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high);

    if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) |
        _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8,
                     _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) {
      return false;
    }

    src += source_advance;
  }
  return validate_utf8(src, end - src);
}
Example #10
0
__forceinline void GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2(u32* vm, u32* clut)
{
    __m128i vm_0;
    __m128i vm_1;
    __m128i vm_2;
    __m128i vm_3;
    __m128i clut_0;
    __m128i clut_1;
    __m128i clut_2;
    __m128i clut_3;

    __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);

    // !HIGH_16BITS_VM
    // CSA in 0-15
    // Replace lower 16 bits of clut with lower 16 bits of vm
    // CSA in 16-31
    // Replace higher 16 bits of clut with lower 16 bits of vm

    // HIGH_16BITS_VM
    // CSA in 0-15
    // Replace lower 16 bits of clut with higher 16 bits of vm
    // CSA in 16-31
    // Replace higher 16 bits of clut with higher 16 bits of vm
    if(HIGH_16BITS_VM && CSA_0_15) {
        // move up to low
        vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0
        vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
        vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
        vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
        vm_0 = _mm_srli_epi32(vm_0, 16);
        vm_1 = _mm_srli_epi32(vm_1, 16);
        vm_2 = _mm_srli_epi32(vm_2, 16);
        vm_3 = _mm_srli_epi32(vm_3, 16);
    } else if(HIGH_16BITS_VM && !CSA_0_15) {
        // Remove lower 16 bits
        vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
        vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
        vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
        vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
    } else if(!HIGH_16BITS_VM && CSA_0_15) {
        // Remove higher 16 bits
        vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0
        vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2
        vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4
        vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6
    } else if(!HIGH_16BITS_VM && !CSA_0_15) {
        // move low to high
        vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0
        vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2
        vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4
        vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6
        vm_0 = _mm_slli_epi32(vm_0, 16);
        vm_1 = _mm_slli_epi32(vm_1, 16);
        vm_2 = _mm_slli_epi32(vm_2, 16);
        vm_3 = _mm_slli_epi32(vm_3, 16);
    }

    // Unswizzle the data
    __m128i row_0 = _mm_unpacklo_epi64(vm_0, vm_1); // 3 2 1 0
    __m128i row_1 = _mm_unpacklo_epi64(vm_2, vm_3); // 7 6 5 4
    __m128i row_2 = _mm_unpackhi_epi64(vm_0, vm_1); // 11 10 9 8
    __m128i row_3 = _mm_unpackhi_epi64(vm_2, vm_3); // 15 14 13 12

    // load old data & remove useless part
    if(CSA_0_15) {
        // Remove lower 16 bits
        clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut));
        clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
        clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
        clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
    } else {
        // Remove higher 16 bits
        clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut));
        clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
        clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
        clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
    }

    // Merge old & new data
    clut_0 = _mm_or_si128(clut_0, row_0);
    clut_1 = _mm_or_si128(clut_1, row_1);
    clut_2 = _mm_or_si128(clut_2, row_2);
    clut_3 = _mm_or_si128(clut_3, row_3);

    _mm_store_si128((__m128i*)clut, clut_0);
    _mm_store_si128((__m128i*)clut+1, clut_1);
    _mm_store_si128((__m128i*)clut+2, clut_2);
    _mm_store_si128((__m128i*)clut+3, clut_3);
}
Example #11
0
__forceinline bool Cmp_ClutBuffer_GSMem_core(u16* GSmem, u16* clut)
{
    __m128i GSmem_0;
    __m128i GSmem_1;
    __m128i GSmem_2;
    __m128i GSmem_3;
    __m128i clut_0;
    __m128i clut_1;
    __m128i clut_2;
    __m128i clut_3;

    __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask);

    // !HIGH_16BITS_VM
    // CSA in 0-15
    // cmp lower 16 bits of clut with lower 16 bits of GSmem
    // CSA in 16-31
    // cmp higher 16 bits of clut with lower 16 bits of GSmem

    // HIGH_16BITS_VM
    // CSA in 0-15
    // cmp lower 16 bits of clut with higher 16 bits of GSmem
    // CSA in 16-31
    // cmp higher 16 bits of clut with higher 16 bits of GSmem
    if(HIGH_16BITS_VM && CSA_0_15) {
        // move up to low
        GSmem_0 = _mm_load_si128((__m128i*)GSmem); // 9 8 1 0
        GSmem_1 = _mm_load_si128((__m128i*)GSmem+1); // 11 10 3 2
        GSmem_2 = _mm_load_si128((__m128i*)GSmem+2); // 13 12 5 4
        GSmem_3 = _mm_load_si128((__m128i*)GSmem+3); // 15 14 7 6
        GSmem_0 = _mm_srli_epi32(GSmem_0, 16);
        GSmem_1 = _mm_srli_epi32(GSmem_1, 16);
        GSmem_2 = _mm_srli_epi32(GSmem_2, 16);
        GSmem_3 = _mm_srli_epi32(GSmem_3, 16);
    } else if(HIGH_16BITS_VM && !CSA_0_15) {
        // Remove lower 16 bits
        GSmem_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem)); // 9 8 1 0
        GSmem_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+1)); // 11 10 3 2
        GSmem_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+2)); // 13 12 5 4
        GSmem_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+3)); // 15 14 7 6
    } else if(!HIGH_16BITS_VM && CSA_0_15) {
        // Remove higher 16 bits
        GSmem_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem)); // 9 8 1 0
        GSmem_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+1)); // 11 10 3 2
        GSmem_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+2)); // 13 12 5 4
        GSmem_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+3)); // 15 14 7 6
    } else if(!HIGH_16BITS_VM && !CSA_0_15) {
        // move low to high
        GSmem_0 = _mm_load_si128((__m128i*)GSmem); // 9 8 1 0
        GSmem_1 = _mm_load_si128((__m128i*)GSmem+1); // 11 10 3 2
        GSmem_2 = _mm_load_si128((__m128i*)GSmem+2); // 13 12 5 4
        GSmem_3 = _mm_load_si128((__m128i*)GSmem+3); // 15 14 7 6
        GSmem_0 = _mm_slli_epi32(GSmem_0, 16);
        GSmem_1 = _mm_slli_epi32(GSmem_1, 16);
        GSmem_2 = _mm_slli_epi32(GSmem_2, 16);
        GSmem_3 = _mm_slli_epi32(GSmem_3, 16);
    }

    // Unswizzle the data
    __m128i row_0 = _mm_unpacklo_epi64(GSmem_0, GSmem_1); // 3 2 1 0
    __m128i row_1 = _mm_unpacklo_epi64(GSmem_2, GSmem_3); // 7 6 5 4
    __m128i row_2 = _mm_unpackhi_epi64(GSmem_0, GSmem_1); // 11 10 9 8
    __m128i row_3 = _mm_unpackhi_epi64(GSmem_2, GSmem_3); // 15 14 13 12

    // load old data & remove useless part
    if(!CSA_0_15) {
        // Remove lower 16 bits
        clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut));
        clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
        clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
        clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
    } else {
        // Remove higher 16 bits
        clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut));
        clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1));
        clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2));
        clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3));
    }

    // Do the comparison
    __m128i result = _mm_cmpeq_epi16(row_0, clut_0);
    __m128i result_tmp = _mm_cmpeq_epi16(row_1, clut_1);
    result = _mm_and_si128(result, result_tmp);

    result_tmp = _mm_cmpeq_epi16(row_2, clut_2);
    result = _mm_and_si128(result, result_tmp);

    result_tmp = _mm_cmpeq_epi16(row_3, clut_3);
    result = _mm_and_si128(result, result_tmp);

    u32 result_int = _mm_movemask_epi8(result);
    if(CSA_0_15) {
        // only lower 16bits must be checked
        if ((result_int&0x3333) != 0x3333)
            return true;
    } else {
        // only higher 16bits must be checked
        if ((result_int&0xCCCC) != 0xCCCC)
            return true;
    }

    return false;
}
void tuned_ConvertRGBToULY4(uint8_t *pYBegin, uint8_t *pUBegin, uint8_t *pVBegin, const uint8_t *pSrcBegin, const uint8_t *pSrcEnd, size_t cbWidth, ssize_t scbStride)
{
	const int shift = 14;

	__m128i rb2y, xg2y, rb2u, xg2u, rb2v, xg2v;

	if (std::is_same<T, CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value)
	{
		rb2y = _mm_set2_epi16_shift(C::R2Y, C::B2Y, shift);
		xg2y = _mm_set2_epi16_shift(16.5 / 0xff, C::G2Y, shift);
		rb2u = _mm_set2_epi16_shift(C::R2U, C::B2U, shift);
		xg2u = _mm_set2_epi16_shift(128.5 / 0xff, C::G2U, shift);
		rb2v = _mm_set2_epi16_shift(C::R2V, C::B2V, shift);
		xg2v = _mm_set2_epi16_shift(128.5 / 0xff, C::G2V, shift);
	}
	else
	{
		rb2y = _mm_set2_epi16_shift(C::B2Y, C::R2Y, shift);
		xg2y = _mm_set2_epi16_shift(C::G2Y, 16.5 / 0xff, shift);
		rb2u = _mm_set2_epi16_shift(C::B2U, C::R2U, shift);
		xg2u = _mm_set2_epi16_shift(C::G2U, 128.5 / 0xff, shift);
		rb2v = _mm_set2_epi16_shift(C::B2V, C::R2V, shift);
		xg2v = _mm_set2_epi16_shift(C::G2V, 128.5 / 0xff, shift);
	}

	auto y = pYBegin;
	auto u = pUBegin;
	auto v = pVBegin;

	for (auto p = pSrcBegin; p != pSrcEnd; p += scbStride)
	{
		auto pp = p;

		for (; pp <= p + cbWidth - 16; pp += T::BYPP*4)
		{
			__m128i m = _mm_loadu_si128((const __m128i *)pp);
			__m128i rb, xg;
			if (std::is_same<T, CBGRAColorOrder>::value)
			{
				// m = XX R3 G3 B3 XX R2 G2 B2 XX R1 G1 B1 XX R0 G0 B0
				rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0
				xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0
			}
#ifdef __SSSE3__
			else if (std::is_same<T, CBGRColorOrder>::value)
			{
				// m = XX XX XX XX R3 G3 B3 R2 G2 B2 R1 G1 B1 R0 G0 B0
				rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0
				xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, -1, -1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1)), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0
			}
#endif
			else if (std::is_same<T, CARGBColorOrder>::value)
			{
				// m = B3 G3 R3 XX B2 G2 R2 XX B1 G1 R1 XX B0 G0 R0 XX
				rb = _mm_srli_epi16(m, 8); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0
				xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff
			}
#ifdef __SSSE3__
			else if (std::is_same<T, CRGBColorOrder>::value)
			{
				// m = XX XX XX XX B3 G3 R3 B2 G2 R2 B1 G1 R1 B0 G0 R0
				rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0
				xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1, -1, -1)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff
			}
#endif

			auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint32_t {
				__m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv));
				yuv = _mm_srli_epi32(yuv, shift);
#ifdef __SSSE3__
				if (F >= CODEFEATURE_SSSE3)
				{
					yuv = _mm_shuffle_epi8(yuv, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0));
				}
				else
#endif
				{
					yuv = _mm_packs_epi32(yuv, yuv);
					yuv = _mm_packus_epi16(yuv, yuv);
				}
				return _mm_cvtsi128_si32(yuv);
			};
			*(uint32_t *)y = xrgb2yuv(rb2y, xg2y);
			*(uint32_t *)u = xrgb2yuv(rb2u, xg2u);
			*(uint32_t *)v = xrgb2yuv(rb2v, xg2v);

			y += 4;
			u += 4;
			v += 4;
		}
		for (; pp < p + cbWidth; pp += T::BYPP)
		{
			__m128i m;
			__m128i rb, xg;
			if (std::is_same<T, CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value)
			{
				if (std::is_same<T, CBGRAColorOrder>::value)
				{
					m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0
				}
				else
				{
					m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 XX
					m = _mm_srli_epi32(m, 8);
				}
				rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 R0 00 B0
				xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0
			}
			else if (std::is_same<T, CARGBColorOrder>::value || std::is_same<T, CRGBColorOrder>::value)
			{
				if (std::is_same<T, CARGBColorOrder>::value)
				{
					m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX
				}
				else
				{
					m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX
				}
				rb = _mm_srli_epi16(m, 8); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 B0 00 R0
				xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 00 ff
			}

			auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint8_t {
				__m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv));
				yuv = _mm_srli_epi32(yuv, shift);
				return (uint8_t)_mm_cvtsi128_si32(yuv);
			};
			*y = xrgb2yuv(rb2y, xg2y);
			*u = xrgb2yuv(rb2u, xg2u);
			*v = xrgb2yuv(rb2v, xg2v);

			y++;
			u++;
			v++;
		}
	}
}
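Each _mm_madd_epi16 pair above is a fixed-point dot product over the {R,B} and {offset,G} lane pairs; with shift = 14 the coefficients are presumably 2.14 fixed-point values, and the 0xff lane carries the rounding/offset term. A scalar sketch of the Y computation under that assumption (names and types here are illustrative, not taken from the original template):

#include <stdint.h>

/* Hypothetical scalar equivalent of one Y-plane sample, assuming 2.14
 * fixed-point coefficients and an offset scaled so that
 * 0xff * offset is roughly 16.5 * (1 << 14). */
static uint8_t rgb_to_y_fixed(uint8_t r, uint8_t g, uint8_t b,
                              int16_t r2y, int16_t g2y, int16_t b2y, int16_t offset)
{
	int32_t y = r * r2y + b * b2y + g * g2y + 0xff * offset;
	return (uint8_t)(y >> 14);
}
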
Example #13
0
static inline void
desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4],
	struct rte_mbuf **rx_pkts)
{
	const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer);
	__m128i rearm0, rearm1, rearm2, rearm3;

	__m128i vlan0, vlan1, rss, l3_l4e;

	/* mask everything except RSS, flow director and VLAN flags
	 * bit2 is for VLAN tag, bit11 for flow director indication
	 * bit13:12 for RSS indication.
	 */
	const __m128i rss_vlan_msk = _mm_set_epi32(
			0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804);

	const __m128i cksum_mask = _mm_set_epi32(
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD,
			PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD |
			PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD |
			PKT_RX_EIP_CKSUM_BAD);

	/* map rss and vlan type to rss hash and vlan flag */
	const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED,
			0, 0, 0, 0);

	const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0,
			0, 0, 0, 0,
			PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0,
			0, 0, PKT_RX_FDIR, 0);

	const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			/* shift right 1 bit to make sure it does not exceed 255 */
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD |
			 PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD |
			 PKT_RX_L4_CKSUM_BAD) >> 1,
			(PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1,
			(PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1,
			PKT_RX_IP_CKSUM_BAD >> 1,
			(PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1);

	vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]);
	vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]);
	vlan0 = _mm_unpacklo_epi64(vlan0, vlan1);

	vlan1 = _mm_and_si128(vlan0, rss_vlan_msk);
	vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1);

	rss = _mm_srli_epi32(vlan1, 11);
	rss = _mm_shuffle_epi8(rss_flags, rss);

	l3_l4e = _mm_srli_epi32(vlan1, 22);
	l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e);
	/* then we shift left 1 bit */
	l3_l4e = _mm_slli_epi32(l3_l4e, 1);
	/* we need to mask out the redundant bits */
	l3_l4e = _mm_and_si128(l3_l4e, cksum_mask);

	vlan0 = _mm_or_si128(vlan0, rss);
	vlan0 = _mm_or_si128(vlan0, l3_l4e);

	/*
	 * At this point, we have the 4 sets of flags in the low 16-bits
	 * of each 32-bit value in vlan0.
	 * We want to extract these, and merge them with the mbuf init data
	 * so we can do a single 16-byte write to the mbuf to set the flags
	 * and all the other initialization fields. Extracting the
	 * appropriate flags means that we have to do a shift and blend for
	 * each mbuf before we do the write.
	 */
	rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10);
	rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10);
	rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10);
	rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10);

	/* write the rearm data and the olflags in one write */
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) !=
			offsetof(struct rte_mbuf, rearm_data) + 8);
	RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) !=
			RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16));
	_mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0);
	_mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1);
	_mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2);
	_mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3);
}
__m64 interpolvline128_3(__m128i* temp){

	__m128i xmm6;

	__m64 ret;

	__m128i xmm7 = _mm_setzero_si128();

	__m128i xmm0 = _mm_load_si128(temp++);
	__m128i xmm1 = _mm_load_si128(temp++);
	__m128i xmm2 = _mm_load_si128(temp++);
	__m128i xmm3 = _mm_load_si128(temp++);
	__m128i xmm4 = _mm_load_si128(temp++);
	__m128i xmm5 = _mm_load_si128(temp);

	xmm1 = _mm_add_epi16(xmm1,xmm4);
	xmm0 = _mm_add_epi16(xmm0,xmm5);

	xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB);

	xmm4 = _mm_mullo_epi16(xmm1, xmm6);
	xmm5 = _mm_mulhi_epi16(xmm1, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014);
	xmm5 = _mm_add_epi16(xmm2,xmm3);

	xmm4 = _mm_mullo_epi16(xmm5, xmm7);
	xmm5 = _mm_mulhi_epi16(xmm5, xmm7);

	xmm7 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm4 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 

	xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001);
	xmm6 = _mm_mulhi_epi16(xmm0, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm0, xmm6);
	xmm6 = _mm_unpackhi_epi16(xmm0, xmm6);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 
	
	xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200);
	
	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm1);
	
	xmm5 = _mm_setzero_si128();

	xmm7 = _mm_srli_epi32(xmm7, 10);
	xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values
	xmm7 = _mm_slli_epi32(xmm7,16);
	xmm7 = _mm_srli_epi32(xmm7,16);

	xmm4 = _mm_srli_epi32(xmm4, 10);

	xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values
	xmm4 = _mm_slli_epi32(xmm4,16);
	xmm4 = _mm_srli_epi32(xmm4,16);

	xmm6 = _mm_packs_epi32(xmm7, xmm4);
	
	xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010);
	xmm2 = _mm_add_epi16(xmm2,xmm1);
	xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values
	xmm2 = _mm_srli_epi16(xmm2,5);

	
	xmm3 = _mm_add_epi16(xmm3,xmm1);
	xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values
	xmm3 = _mm_srli_epi16(xmm3,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm5);
	xmm3 = _mm_packus_epi16(xmm3,xmm5);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);

	xmm7 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm7);

	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);
	ret = _mm_movepi64_pi64(xmm6);

	_mm_empty(); 

	return(ret);
}
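/*
 * A scalar sketch of the core 6-tap filter above (assumption: the six input
 * rows a..f hold intermediate 16-bit sums, in the style of H.264 half-pel
 * interpolation with taps 1, -5, 20, 20, -5, 1, rounding by 512, a 10-bit
 * shift and a clamp to [0, 255]).
 */
#include <stdint.h>

static uint8_t six_tap_scalar(int16_t a, int16_t b, int16_t c,
                              int16_t d, int16_t e, int16_t f)
{
	int32_t v = ((a + f) - 5 * (b + e) + 20 * (c + d) + 512) >> 10;
	if (v < 0)   v = 0;      /* mirrors the _mm_max_epi16 with zero */
	if (v > 255) v = 255;    /* mirrors the final _mm_packus_epi16 saturation */
	return (uint8_t)v;
}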
Example #15
0
void spu_interpreter::ROTI(SPUThread& CPU, spu_opcode_t op)
{
	const auto a = CPU.GPR[op.ra].vi;
	const s32 n = op.si7 & 0x1f;
	CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(a, n), _mm_srli_epi32(a, 32 - n));
}
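/*
 * Scalar sketch of the rotate above (assumption: illustrative only): each
 * 32-bit element is rotated left by n = imm & 31. The intrinsic form gets
 * n == 0 for free because _mm_srli_epi32 with a count of 32 returns zero;
 * plain C needs the guard to avoid shifting by 32, which is undefined.
 */
#include <stdint.h>

static uint32_t roti_scalar(uint32_t x, int imm)
{
	unsigned n = (unsigned)imm & 0x1f;
	return n ? (x << n) | (x >> (32 - n)) : x;
}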
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
__m128 log_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

   x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
#else
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(__m128*)_ps_0p5);

#ifndef USE_SSE2
   /* now e=mm0:mm1 contain the real base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
   __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
#else
   emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
   __m128 e = _mm_cvtepi32_ps(emm0);
#endif

   e = _mm_add_ps(e, one);

   /* part2: 
   if( x < SQRTHF ) {
   e -= 1;
   x = x + x - 1.0;
   } else { x = x - 1.0; }
   */
   __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
   __m128 tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);


   __m128 z = _mm_mul_ps(x,x);

   __m128 y = *(__m128*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);

   y = _mm_mul_ps(y, z);


   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);


   tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);

   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
   return x;
}
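/*
 * Scalar sketch of the same decomposition (assumption: illustrative only):
 * write x = m * 2^e with m in [0.5, 1), then ln(x) = ln(m) + e * ln(2).
 * The SSE code above extracts e from the float's exponent bits and replaces
 * logf(m) with a polynomial in (m - 1) using the cephes log_p* coefficients.
 */
#include <math.h>

static float log_sketch(float x)
{
	if (x <= 0.0f)
		return NAN;              /* matches the invalid_mask behaviour */
	int e;
	float m = frexpf(x, &e);     /* m in [0.5, 1), x == m * 2^e */
	return logf(m) + (float)e * 0.69314718f;   /* ln(2) */
}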
Example #17
0
void spu_interpreter::ROTMI(SPUThread& CPU, spu_opcode_t op)
{
	CPU.GPR[op.rt].vi = _mm_srli_epi32(CPU.GPR[op.ra].vi, -op.si7 & 0x3f);
}
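/*
 * Scalar sketch (assumption: illustrative only): ROTMI shifts each word
 * right by (0 - imm) & 0x3f. Counts of 32..63 flush the word to zero,
 * which matches _mm_srli_epi32 returning zero for out-of-range counts.
 */
#include <stdint.h>

static uint32_t rotmi_scalar(uint32_t x, int imm)
{
	unsigned n = (unsigned)(-imm) & 0x3f;
	return n < 32 ? x >> n : 0;
}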
Example #19
0
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t* p0 = (uint16_t *)buff + 8;
    uint16_t* p1 = p0 + bstride;
    uint16_t* p2 = p1 + bstride;
    uint16_t* p3 = p2 + bstride;
    uint16_t* p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 alpha = _mm_set1_ps((float)0.96043387);
    __m128 beta = _mm_set1_ps((float)0.39782473);
    __m128i pmax = _mm_set1_epi32(0xFFFF);
    __m128i min = _mm_set1_epi16((int16_t)eh->min);
    __m128i max = _mm_set1_epi16((int16_t)eh->max);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint16_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 8) {
            __m128 sumx[2] = {(__m128)zero, (__m128)zero};
            __m128 sumy[2] = {(__m128)zero, (__m128)zero};

            for (int i = 0; i < 4; i++) {
                __m128 xmul = _mm_load_ps(ar_mulxf[i]);
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));

                xmul = _mm_load_ps(ar_mulyf[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul));
                sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul));
            }

            __m128i out[2];
            for (int i = 0; i < 2; i++) {
                sumx[i] = mm_abs_ps(sumx[i]);
                sumy[i] = mm_abs_ps(sumy[i]);
                __m128 t0 = _mm_max_ps(sumx[i], sumy[i]);
                __m128 t1 = _mm_min_ps(sumx[i], sumy[i]);
                t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1));
                out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift);
                out[i] = mm_min_epi32(out[i], pmax);
            }
            out[0] = mm_cast_epi32(out[0], out[1]);

            out[1] = MM_MIN_EPU16(out[0], max);
            out[1] = _mm_cmpeq_epi16(out[1], max);
            out[0] = _mm_or_si128(out[1], out[0]);

            out[1] = MM_MAX_EPU16(out[0], min);
            out[1] = _mm_cmpeq_epi16(out[1], min);
            out[0] = _mm_andnot_si128(out[1], out[0]);

            _mm_store_si128((__m128i *)(dstp + x), out[0]);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
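/*
 * Scalar sketch (assumption: illustrative only) of the per-pixel magnitude
 * step above: |gx| and |gy| are combined with the alpha*max + beta*min
 * approximation of sqrt(gx*gx + gy*gy), using the same constants, then
 * right-shifted and clamped (the eh->min / eh->max thresholding is omitted).
 */
#include <math.h>
#include <stdint.h>

static uint16_t edge_magnitude_scalar(float gx, float gy, int rshift,
                                      uint32_t pmax)
{
    float ax = fabsf(gx), ay = fabsf(gy);
    float hi = ax > ay ? ax : ay;
    float lo = ax > ay ? ay : ax;
    uint32_t m = (uint32_t)lrintf(0.96043387f * hi + 0.39782473f * lo) >> rshift;
    return (uint16_t)(m < pmax ? m : pmax);
}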
Example #20
0
OD_SIMD_INLINE od_m256i od_mm256_srli_epi32(od_m256i a, int c) {
  od_m256i r;
  r.lo = _mm_srli_epi32(a.lo, c);
  r.hi = _mm_srli_epi32(a.hi, c);
  return r;
}
Example #21
0
// this function performs precise calculations
void PreOver_SSE2(void* dest, const void* source1, const void* source2, size_t size)
{
	static const size_t stride = sizeof(__m128i)*4;
	static const u32 PSD = 64;

	static const __m128i round = _mm_set1_epi16(128);
	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % stride == 0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i*	   dest128 = reinterpret_cast<__m128i*>(dest);	
	
	__m128i d, s, a, rb, ag, t;

	// TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)

	for(size_t k = 0, length = size/stride; k < length; ++k)	
	{
		// TODO: put prefetch between calculations?(R.N)
		_mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA);
		_mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA);	

		// work on entire cacheline before next prefetch
		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

			// TODO: load entire cacheline at the same time? are there enough registers? 32 bit mode (special compile for 64bit?) (R.N)
			s = _mm_load_si128(source128_1);		// AABBGGRR
			d = _mm_load_si128(source128_2);		// AABBGGRR
						
			// PRELERP(S, D) = S+D - ((((S*D[A]+0x80)>>8) + (S*D[A]+0x80)) >> 8)
			// T = S*D[A]+0x80  =>  PRELERP(S,D) = S+D - (((T>>8)+T)>>8)

			// set alpha to lo16 from dest_
			a = _mm_srli_epi32(d, 24);			// 000000AA	
			rb = _mm_slli_epi32(a, 16);			// 00AA0000
			a = _mm_or_si128(rb, a);			// 00AA00AA

			rb = _mm_and_si128(lomask, s);		// 00BB00RR		
			rb = _mm_mullo_epi16(rb, a);		// BBBBRRRR	
			rb = _mm_add_epi16(rb, round);		// BBBBRRRR
			t = _mm_srli_epi16(rb, 8);			// 00BB00RR	
			t = _mm_add_epi16(t, rb);
			rb = _mm_srli_epi16(t, 8);

			ag = _mm_srli_epi16(s, 8); 			// 00AA00GG		
			ag = _mm_mullo_epi16(ag, a);		// AAAAGGGG		
			ag = _mm_add_epi16(ag, round);
			t = _mm_srli_epi16(ag, 8);
			t = _mm_add_epi16(t, ag);
			ag = _mm_andnot_si128(lomask, t);	// AA00GG00		
					
			rb = _mm_or_si128(rb, ag);			// AABBGGRR		pack
					
			rb = _mm_sub_epi8(s, rb);			// sub S-[(D[A]*S)/255]
			d = _mm_add_epi8(d, rb);			// add D+[S-(D[A]*S)/255]

			_mm_store_si128(dest128, d);
		}
	}		
}
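/*
 * Scalar sketch of the per-channel PRELERP above (assumption: illustrative
 * only): with T = s * d_alpha + 0x80, the blend is d + s - (((T >> 8) + T) >> 8),
 * i.e. an "over" composite of premultiplied 8-bit channels where
 * ((T >> 8) + T) >> 8 approximates (s * d_alpha) / 255.
 */
#include <stdint.h>

static uint8_t prelerp_channel(uint8_t s, uint8_t d, uint8_t d_alpha)
{
	unsigned t   = (unsigned)s * d_alpha + 0x80;
	unsigned mul = ((t >> 8) + t) >> 8;       /* ~ (s * d_alpha) / 255 */
	return (uint8_t)(d + s - mul);            /* wraps like _mm_add_epi8 */
}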
Example #22
0
OD_SIMD_INLINE __m128i od_unbiased_rshift_epi32(__m128i a, int b) {
  return _mm_srai_epi32(_mm_add_epi32(_mm_srli_epi32(a, 32 - b), a), b);
}
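/*
 * Scalar sketch (assumption: illustrative only, 0 < b < 32): the logical
 * shift contributes the top b bits of a (viewed as unsigned) before the
 * arithmetic shift, which for negative inputs offsets the usual
 * round-toward-negative-infinity bias of a plain >>.
 */
#include <stdint.h>

static int32_t unbiased_rshift32_scalar(int32_t a, int b)
{
	return (int32_t)(((uint32_t)a >> (32 - b)) + (uint32_t)a) >> b;
}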
Example #23
0
HashReturn Update(hashState *state, const BitSequence *data,
                  DataLength databitlen)
{
  int r;
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];
    
  while (databitlen >= 8) {
    x0 = _mm_xor_si128(x0,_mm_set_epi32(0,0,0,(int) (unsigned int) *data));
    data += 1;
    databitlen -= 8;
    
    for (r = 0;r < CUBEHASH_ROUNDS;++r) {
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x2;
      y1 = x3;
      y2 = x0;
      y3 = x1;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0x4e);
      x5 = _mm_shuffle_epi32(x5,0x4e);
      x6 = _mm_shuffle_epi32(x6,0x4e);
      x7 = _mm_shuffle_epi32(x7,0x4e);
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x1;
      y1 = x0;
      y2 = x3;
      y3 = x2;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0xb1);
      x5 = _mm_shuffle_epi32(x5,0xb1);
      x6 = _mm_shuffle_epi32(x6,0xb1);
      x7 = _mm_shuffle_epi32(x7,0xb1);
    }
  }
  
  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;

  if (databitlen > 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    state->pos += databitlen;
  }
  return SUCCESS;
}
Example #24
0
 static inline __m128 gen_05(void)
 {
    __m128i ones = (__m128i)gen_ones();
     return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 26), 24);
 }
Example #25
0
void CubeHash::transform ( int r )
{
#ifdef __SSE2__

	__m128i x0, x1, x2, x3, x4, x5, x6, x7;
	__m128i y0, y1, y2, y3;

	x0 = m_x[0];   x1 = m_x[1];   x2 = m_x[2];   x3 = m_x[3];
	x4 = m_x[4];   x5 = m_x[5];   x6 = m_x[6];   x7 = m_x[7];

	for( ; r > 0; --r )
	{
		x4 = _mm_add_epi32( x0, x4 );
		x5 = _mm_add_epi32( x1, x5 );
		x6 = _mm_add_epi32( x2, x6 );
		x7 = _mm_add_epi32( x3, x7 );

		y0 = x2;
		y1 = x3;
		y2 = x0;
		y3 = x1;

		x0 = _mm_xor_si128( _mm_slli_epi32(y0,7), _mm_srli_epi32(y0,25) );
		x1 = _mm_xor_si128( _mm_slli_epi32(y1,7), _mm_srli_epi32(y1,25) );
		x2 = _mm_xor_si128( _mm_slli_epi32(y2,7), _mm_srli_epi32(y2,25) );
		x3 = _mm_xor_si128( _mm_slli_epi32(y3,7), _mm_srli_epi32(y3,25) );

		x0 = _mm_xor_si128( x0, x4 );
		x1 = _mm_xor_si128( x1, x5 );
		x2 = _mm_xor_si128( x2, x6 );
		x3 = _mm_xor_si128( x3, x7 );

		x4 = _mm_shuffle_epi32( x4, 0x4e );
		x5 = _mm_shuffle_epi32( x5, 0x4e );
		x6 = _mm_shuffle_epi32( x6, 0x4e );
		x7 = _mm_shuffle_epi32( x7, 0x4e );

		x4 = _mm_add_epi32( x0, x4 );
		x5 = _mm_add_epi32( x1, x5 );
		x6 = _mm_add_epi32( x2, x6 );
		x7 = _mm_add_epi32( x3, x7 );

		y0 = x1;
		y1 = x0;
		y2 = x3;
		y3 = x2;

		x0 = _mm_xor_si128( _mm_slli_epi32(y0,11), _mm_srli_epi32(y0,21) );
		x1 = _mm_xor_si128( _mm_slli_epi32(y1,11), _mm_srli_epi32(y1,21) );
		x2 = _mm_xor_si128( _mm_slli_epi32(y2,11), _mm_srli_epi32(y2,21) );
		x3 = _mm_xor_si128( _mm_slli_epi32(y3,11), _mm_srli_epi32(y3,21) );

		x0 = _mm_xor_si128( x0, x4 );
		x1 = _mm_xor_si128( x1, x5 );
		x2 = _mm_xor_si128( x2, x6 );
		x3 = _mm_xor_si128( x3, x7 );

		x4 = _mm_shuffle_epi32( x4, 0xb1 );
		x5 = _mm_shuffle_epi32( x5, 0xb1 );
		x6 = _mm_shuffle_epi32( x6, 0xb1 );
		x7 = _mm_shuffle_epi32( x7, 0xb1 );
	}

	m_x[0] = x0;   m_x[1] = x1;   m_x[2] = x2;   m_x[3] = x3;
	m_x[4] = x4;   m_x[5] = x5;   m_x[6] = x6;   m_x[7] = x7;

#else // non SSE2

	int i;
	uint32_t y[16];

	for( ; r > 0; --r )
	{
		for( i = 0; i < 16; ++i )  m_x[i + 16] += m_x[i];
		for( i = 0; i < 16; ++i )  y[i ^ 8] = m_x[i];
		for( i = 0; i < 16; ++i )  m_x[i] = ROTATE(y[i],7);
		for( i = 0; i < 16; ++i )  m_x[i] ^= m_x[i + 16];
		for( i = 0; i < 16; ++i )  y[i ^ 2] = m_x[i + 16];
		for( i = 0; i < 16; ++i )  m_x[i + 16] = y[i];
		for( i = 0; i < 16; ++i )  m_x[i + 16] += m_x[i];
		for( i = 0; i < 16; ++i )  y[i ^ 4] = m_x[i];
		for( i = 0; i < 16; ++i )  m_x[i] = ROTATE(y[i],11);
		for( i = 0; i < 16; ++i )  m_x[i] ^= m_x[i + 16];
		for( i = 0; i < 16; ++i )  y[i ^ 1] = m_x[i + 16];
		for( i = 0; i < 16; ++i )  m_x[i + 16] = y[i];
	}

#endif
}
Example #26
0
 static inline __m128 gen_abs_mask(void)
 {
     __m128i x = _mm_setzero_si128();
     __m128i ones = _mm_cmpeq_epi32(x, x);
     return (__m128)_mm_srli_epi32 (_mm_slli_epi32(ones, 1), 1);
 }
Example #27
0
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
   {
   const __m128i sb2u = _mm_set_epi32(
      0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
   const __m128i sb2t = _mm_set_epi32(
      0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);

   const __m128i sbou = _mm_set_epi32(
      0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
   const __m128i sbot = _mm_set_epi32(
      0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);

   const __m128i mc_backward[4] = {
      _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
      _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
      _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
      _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
   };

   B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
               _mm_shuffle_epi8(k_ipt2,
                                _mm_srli_epi32(
                                   _mm_andnot_si128(low_nibs, B),
                                   4)),
               _mm_loadu_si128(keys));

   for(size_t r = 1; ; ++r)
      {
      const __m128i K = _mm_loadu_si128(keys + r);

      __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

      B = _mm_and_si128(low_nibs, B);

      __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

      B = _mm_xor_si128(B, t);

      __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
      __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));

      __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
      __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

      if(r == rounds)
         {
         B = _mm_shuffle_epi8(
            mm_xor3(_mm_shuffle_epi8(sbou, t5),
                    _mm_shuffle_epi8(sbot, t6),
                    K),
            sr[r % 4]);

         return B;
         }

      __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
                           _mm_shuffle_epi8(sb1u, t5),
                           K);

      __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
                           _mm_shuffle_epi8(sb2u, t5),
                           _mm_shuffle_epi8(t7, mc_forward[r % 4]));

      B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
                  _mm_shuffle_epi8(t7, mc_backward[r % 4]),
                  t8);
      }
   }
Example #28
0
 static inline __m128 gen_05(void)
 {
     __m128i x = _mm_setzero_si128();
     __m128i ones = _mm_cmpeq_epi32(x, x);
     return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 26), 24);
 }
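/*
 * Quick scalar check (assumption: illustrative only) of the two constants
 * built above from an all-ones register:
 *   (0xFFFFFFFF >> 26) << 24 == 0x3F000000, the bit pattern of 0.5f
 *   (0xFFFFFFFF << 1) >> 1   == 0x7FFFFFFF, the sign-clearing abs() mask
 */
#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    uint32_t half_bits = (0xFFFFFFFFu >> 26) << 24;
    uint32_t abs_mask  = (0xFFFFFFFFu << 1) >> 1;
    float f;
    memcpy(&f, &half_bits, sizeof f);
    assert(f == 0.5f);
    assert(abs_mask == 0x7FFFFFFFu);
    return 0;
}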
Example #29
0
uint32_t
br_chacha20_sse2_run(const void *key,
	const void *iv, uint32_t cc, void *data, size_t len)
{
	unsigned char *buf;
	uint32_t ivtmp[4];
	__m128i kw0, kw1;
	__m128i iw, cw;
	__m128i one;

	static const uint32_t CW[] = {
		0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
	};

	buf = data;
	kw0 = _mm_loadu_si128(key);
	kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
	ivtmp[0] = cc;
	memcpy(ivtmp + 1, iv, 12);
	iw = _mm_loadu_si128((const void *)ivtmp);
	cw = _mm_loadu_si128((const void *)CW);
	one = _mm_set_epi32(0, 0, 0, 1);

	while (len > 0) {
		/*
		 * sj contains state words 4*j to 4*j+3.
		 */
		__m128i s0, s1, s2, s3;
		int i;

		s0 = cw;
		s1 = kw0;
		s2 = kw1;
		s3 = iw;
		for (i = 0; i < 10; i ++) {
			/*
			 * The even round applies directly to the state
			 * words in their current order.
			 */
			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * For the odd round, we must rotate some state
			 * words so that the computations apply on the
			 * right combinations of words.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x39);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x93);

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 16),
				_mm_srli_epi32(s3, 16));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 12),
				_mm_srli_epi32(s1, 20));

			s0 = _mm_add_epi32(s0, s1);
			s3 = _mm_xor_si128(s3, s0);
			s3 = _mm_or_si128(
				_mm_slli_epi32(s3, 8),
				_mm_srli_epi32(s3, 24));

			s2 = _mm_add_epi32(s2, s3);
			s1 = _mm_xor_si128(s1, s2);
			s1 = _mm_or_si128(
				_mm_slli_epi32(s1, 7),
				_mm_srli_epi32(s1, 25));

			/*
			 * After the odd round, we rotate back the values
			 * to undo the rotate at the start of the odd round.
			 */
			s1 = _mm_shuffle_epi32(s1, 0x93);
			s2 = _mm_shuffle_epi32(s2, 0x4E);
			s3 = _mm_shuffle_epi32(s3, 0x39);
		}

		/*
		 * Addition with the initial state.
		 */
		s0 = _mm_add_epi32(s0, cw);
		s1 = _mm_add_epi32(s1, kw0);
		s2 = _mm_add_epi32(s2, kw1);
		s3 = _mm_add_epi32(s3, iw);

		/*
		 * Increment block counter.
		 */
		iw = _mm_add_epi32(iw, one);

		/*
		 * XOR final state with the data.
		 */
		if (len < 64) {
			unsigned char tmp[64];
			size_t u;

			_mm_storeu_si128((void *)(tmp +  0), s0);
			_mm_storeu_si128((void *)(tmp + 16), s1);
			_mm_storeu_si128((void *)(tmp + 32), s2);
			_mm_storeu_si128((void *)(tmp + 48), s3);
			for (u = 0; u < len; u ++) {
				buf[u] ^= tmp[u];
			}
			break;
		} else {
			__m128i b0, b1, b2, b3;

			b0 = _mm_loadu_si128((const void *)(buf +  0));
			b1 = _mm_loadu_si128((const void *)(buf + 16));
			b2 = _mm_loadu_si128((const void *)(buf + 32));
			b3 = _mm_loadu_si128((const void *)(buf + 48));
			b0 = _mm_xor_si128(b0, s0);
			b1 = _mm_xor_si128(b1, s1);
			b2 = _mm_xor_si128(b2, s2);
			b3 = _mm_xor_si128(b3, s3);
			_mm_storeu_si128((void *)(buf +  0), b0);
			_mm_storeu_si128((void *)(buf + 16), b1);
			_mm_storeu_si128((void *)(buf + 32), b2);
			_mm_storeu_si128((void *)(buf + 48), b3);
			buf += 64;
			len -= 64;
		}
	}

	/*
	 * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
	 * raw SSE2, thus we use _mm_extract_epi16().
	 */
	return (uint32_t)_mm_extract_epi16(iw, 0)
		| ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
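/*
 * Scalar quarter-round sketch (assumption: illustrative only), mirroring the
 * add / xor / rotate-by-16,12,8,7 pattern that the SSE2 code above applies
 * to whole rows of the state at once.
 */
#include <stdint.h>

#define ROTL32(x, n) (((x) << (n)) | ((x) >> (32 - (n))))

static void chacha_quarter_round(uint32_t *a, uint32_t *b,
                                 uint32_t *c, uint32_t *d)
{
	*a += *b; *d ^= *a; *d = ROTL32(*d, 16);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 12);
	*a += *b; *d ^= *a; *d = ROTL32(*d, 8);
	*c += *d; *b ^= *c; *b = ROTL32(*b, 7);
}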
Example #30
0
sse2_tests (void)
{
  /* psraw */
  c128.v = _mm_srai_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_srai_epi16", c128);
  c128.v = _mm_sra_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_sra_epi16", c128);

  /* psrad */
  c128.v = _mm_srai_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_srai_epi32", c128);
  c128.v = _mm_sra_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_sra_epi32", c128);

  /* psrlw */
  c128.v = _mm_srli_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_srli_epi16", c128);
  c128.v = _mm_srl_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_srl_epi16", c128);

  /* psrld */
  c128.v = _mm_srli_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_srli_epi32", c128);
  c128.v = _mm_srl_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_srl_epi32", c128);

  /* psrlq */
  c128.v = _mm_srli_epi64 (m128_64, SHIFT);
  dump128_64 (buf, "_mm_srli_epi64", c128);
  c128.v = _mm_srl_epi64 (m128_64, s128);
  dump128_64 (buf, "_mm_srl_epi64", c128);

  /* psrldq */
  c128.v = _mm_srli_si128 (m128_128, SHIFT);
  dump128_128 (buf, "_mm_srli_si128 (byte shift) ", c128);

  /* psllw */
  c128.v = _mm_slli_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_slli_epi16", c128);
  c128.v = _mm_sll_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_sll_epi16", c128);

  /* pslld */
  c128.v = _mm_slli_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_slli_epi32", c128);
  c128.v = _mm_sll_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_sll_epi32", c128);

  /* psllq */
  c128.v = _mm_slli_epi64 (m128_64, SHIFT);
  dump128_64 (buf, "_mm_slli_epi64", c128);
  c128.v = _mm_sll_epi64 (m128_64, s128);
  dump128_64 (buf, "_mm_sll_epi64", c128);

  /* pslldq */
  c128.v = _mm_slli_si128 (m128_128, SHIFT);
  dump128_128 (buf, "_mm_sll_si128 (byte shift)", c128);

  /* Shuffle constant 0x1b == 0b_00_01_10_11, e.g. swap words: ABCD => DCBA.  */

  /* pshufd */
  c128.v = _mm_shuffle_epi32 (m128_128, 0x1b);
  dump128_32 (buf, "_mm_shuffle_epi32", c128);

  /* pshuflw */
  c128.v = _mm_shufflelo_epi16 (m128_128, 0x1b);
  dump128_16 (buf, "_mm_shuffelo_epi16", c128);

  /* pshufhw */
  c128.v = _mm_shufflehi_epi16 (m128_128, 0x1b);
  dump128_16 (buf, "_mm_shuffehi_epi16", c128);
}