void PPUThread::cpu_task() { //SetHostRoundingMode(FPSCR_RN_NEAR); if (custom_task) { if (check_status()) return; return custom_task(*this); } g_tls_log_prefix = [] { const auto cpu = static_cast<PPUThread*>(get_current_cpu_thread()); return fmt::format("%s [0x%08x]", cpu->get_name(), cpu->pc); }; const auto base = vm::_ptr<const u8>(0); // Select opcode table const auto& table = *( g_cfg_ppu_decoder.get() == ppu_decoder_type::precise ? &s_ppu_interpreter_precise.get_table() : g_cfg_ppu_decoder.get() == ppu_decoder_type::fast ? &s_ppu_interpreter_fast.get_table() : throw std::logic_error("Invalid PPU decoder")); v128 _op; decltype(&ppu_interpreter::UNK) func0, func1, func2, func3; while (true) { if (UNLIKELY(state.load())) { if (check_status()) return; } // Reinitialize { const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); _op.vi = _ops; const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff))); func0 = table[_i._u32[0]]; func1 = table[_i._u32[1]]; func2 = table[_i._u32[2]]; func3 = table[_i._u32[3]]; } while (LIKELY(func0(*this, { _op._u32[0] }))) { if (pc += 4, LIKELY(func1(*this, { _op._u32[1] }))) { if (pc += 4, LIKELY(func2(*this, { _op._u32[2] }))) { pc += 4; func0 = func3; const auto _ops = _mm_shuffle_epi8(_mm_lddqu_si128(reinterpret_cast<const __m128i*>(base + pc + 4)), _mm_set_epi8(12, 13, 14, 15, 8, 9, 10, 11, 4, 5, 6, 7, 0, 1, 2, 3)); _op.vi = _mm_alignr_epi8(_ops, _op.vi, 12); const v128 _i = v128::fromV(_mm_and_si128(_mm_or_si128(_mm_slli_epi32(_op.vi, 6), _mm_srli_epi32(_op.vi, 26)), _mm_set1_epi32(0x1ffff))); func1 = table[_i._u32[1]]; func2 = table[_i._u32[2]]; func3 = table[_i._u32[3]]; if (UNLIKELY(state.load())) { break; } continue; } break; } break; } } }
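/*
 * Side note (not part of the original source): the dispatch loop above byte-swaps
 * four big-endian instruction words at once (the _mm_shuffle_epi8 with the
 * 12,13,14,15,... pattern) and derives each handler index as
 * (op << 6 | op >> 26) & 0x1ffff, i.e. the 6-bit primary opcode in the low bits
 * plus the low 11 opcode bits above it. A minimal scalar sketch of that index
 * computation, assuming a hypothetical 0x20000-entry handler table:
 */
#include <stdint.h>

static inline uint32_t ppu_table_index(uint32_t op) /* op already byte-swapped to host order */
{
    return ((op << 6) | (op >> 26)) & 0x1ffff; /* 17-bit index: primary + extended opcode bits */
}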
static __m128 mm_pow_ps(__m128 a, __m128 b) { // a^b = exp2(b * log2(a)) // exp2(x) and log2(x) are calculated using polynomial approximations. __m128 log2_a, b_log2_a, a_exp_b; // Calculate log2(x), x = a. { // To calculate log2(x), we decompose x like this: // x = y * 2^n // n is an integer // y is in the [1.0, 2.0) range // // log2(x) = log2(y) + n // n can be evaluated by playing with float representation. // log2(y) in a small range can be approximated, this code uses an order // five polynomial approximation. The coefficients have been // estimated with the Remez algorithm and the resulting // polynomial has a maximum relative error of 0.00086%. // Compute n. // This is done by masking the exponent, shifting it into the top bit of // the mantissa, putting eight into the biased exponent (to shift/ // compensate the fact that the exponent has been shifted in the top/ // fractional part and finally getting rid of the implicit leading one // from the mantissa by substracting it out. static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000}; static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {0x43800000, 0x43800000, 0x43800000, 0x43800000}; static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000}; static const int shift_exponent_into_top_mantissa = 8; const __m128 two_n = _mm_and_ps(a, *((__m128 *)float_exponent_mask)); const __m128 n_1 = (__m128)_mm_srli_epi32((__m128i)two_n, shift_exponent_into_top_mantissa); const __m128 n_0 = _mm_or_ps( (__m128)n_1, *((__m128 *)eight_biased_exponent)); const __m128 n = _mm_sub_ps(n_0, *((__m128 *)implicit_leading_one)); // Compute y. static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF}; static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000}; const __m128 mantissa = _mm_and_ps(a, *((__m128 *)mantissa_mask)); const __m128 y = _mm_or_ps( mantissa, *((__m128 *)zero_biased_exponent_is_one)); // Approximate log2(y) ~= (y - 1) * pol5(y). 
// pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0 static const ALIGN16_BEG float ALIGN16_END C5[4] = {-3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f}; static const ALIGN16_BEG float ALIGN16_END C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f}; static const ALIGN16_BEG float ALIGN16_END C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f}; static const ALIGN16_BEG float ALIGN16_END C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f}; static const ALIGN16_BEG float ALIGN16_END C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f}; static const ALIGN16_BEG float ALIGN16_END C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f}; const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128 *)C5)); const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128 *)C4)); const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y); const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128 *)C3)); const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y); const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128 *)C2)); const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y); const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128 *)C1)); const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y); const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128 *)C0)); const __m128 y_minus_one = _mm_sub_ps( y, *((__m128 *)zero_biased_exponent_is_one)); const __m128 log2_y = _mm_mul_ps(y_minus_one , pol5_y); // Combine parts. log2_a = _mm_add_ps(n, log2_y); } // b * log2(a) b_log2_a = _mm_mul_ps(b, log2_a); // Calculate exp2(x), x = b * log2(a). { // To calculate 2^x, we decompose x like this: // x = n + y // n is an integer, the value of x - 0.5 rounded down, therefore // y is in the [0.5, 1.5) range // // 2^x = 2^n * 2^y // 2^n can be evaluated by playing with float representation. // 2^y in a small range can be approximated, this code uses an order two // polynomial approximation. The coefficients have been estimated // with the Remez algorithm and the resulting polynomial has a // maximum relative error of 0.17%. // To avoid over/underflow, we reduce the range of input to ]-127, 129]. static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f, 129.f, 129.f}; static const ALIGN16_BEG float min_input[4] ALIGN16_END = {-126.99999f, -126.99999f, -126.99999f, -126.99999f}; const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128 *)max_input)); const __m128 x_max = _mm_max_ps(x_min, *((__m128 *)min_input)); // Compute n. static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f, 0.5f, 0.5f}; const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128 *)half)); const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half); // Compute 2^n. static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {127, 127, 127, 127}; static const int float_exponent_shift = 23; const __m128i two_n_exponent = _mm_add_epi32( x_minus_half_floor, *((__m128i *)float_exponent_bias)); const __m128 two_n = (__m128)_mm_slli_epi32( two_n_exponent, float_exponent_shift); // Compute y. const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor)); // Approximate 2^y ~= C2 * y^2 + C1 * y + C0. 
static const ALIGN16_BEG float C2[4] ALIGN16_END = {3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f}; static const ALIGN16_BEG float C1[4] ALIGN16_END = {6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f}; static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f, 1.0017247f, 1.0017247f}; const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128 *)C2)); const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128 *)C1)); const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y); const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128 *)C0)); // Combine parts. a_exp_b = _mm_mul_ps(exp2_y, two_n); } return a_exp_b; }
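/*
 * Scalar reference (not part of the original source) for the decomposition used
 * by mm_pow_ps above: a^b = exp2(b * log2(a)), valid for a > 0. Handy as a
 * ground truth when checking the polynomial approximations lane by lane.
 */
#include <math.h>

static float scalar_pow_ref(float a, float b)
{
    return exp2f(b * log2f(a)); /* the same identity the SSE code approximates */
}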
__m128i aes_ssse3_decrypt(__m128i B, const __m128i* keys, size_t rounds) { const __m128i k_dipt1 = _mm_set_epi32( 0x154A411E, 0x114E451A, 0x0F505B04, 0x0B545F00); const __m128i k_dipt2 = _mm_set_epi32( 0x12771772, 0xF491F194, 0x86E383E6, 0x60056500); const __m128i sb9u = _mm_set_epi32( 0xCAD51F50, 0x4F994CC9, 0x851C0353, 0x9A86D600); const __m128i sb9t = _mm_set_epi32( 0x725E2C9E, 0xB2FBA565, 0xC03B1789, 0xECD74900); const __m128i sbeu = _mm_set_epi32( 0x22426004, 0x64B4F6B0, 0x46F29296, 0x26D4D000); const __m128i sbet = _mm_set_epi32( 0x9467F36B, 0x98593E32, 0x0C55A6CD, 0xFFAAC100); const __m128i sbdu = _mm_set_epi32( 0xF56E9B13, 0x882A4439, 0x7D57CCDF, 0xE6B1A200); const __m128i sbdt = _mm_set_epi32( 0x2931180D, 0x15DEEFD3, 0x3CE2FAF7, 0x24C6CB00); const __m128i sbbu = _mm_set_epi32( 0x602646F6, 0xB0F2D404, 0xD0226492, 0x96B44200); const __m128i sbbt = _mm_set_epi32( 0xF3FF0C3E, 0x3255AA6B, 0xC19498A6, 0xCD596700); __m128i mc = mc_forward[3]; __m128i t = _mm_shuffle_epi8(k_dipt2, _mm_srli_epi32( _mm_andnot_si128(low_nibs, B), 4)); B = mm_xor3(t, _mm_loadu_si128(keys), _mm_shuffle_epi8(k_dipt1, _mm_and_si128(B, low_nibs))); for(size_t r = 1; ; ++r) { const __m128i K = _mm_loadu_si128(keys + r); t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4); B = _mm_and_si128(low_nibs, B); __m128i t2 = _mm_shuffle_epi8(k_inv2, B); B = _mm_xor_si128(B, t); __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t)); __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B)); __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3)); __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4)); if(r == rounds) { const __m128i sbou = _mm_set_epi32( 0xC7AA6DB9, 0xD4943E2D, 0x1387EA53, 0x7EF94000); const __m128i sbot = _mm_set_epi32( 0xCA4B8159, 0xD8C58E9C, 0x12D7560F, 0x93441D00); __m128i x = _mm_shuffle_epi8(sbou, t5); __m128i y = _mm_shuffle_epi8(sbot, t6); x = _mm_xor_si128(x, K); x = _mm_xor_si128(x, y); const uint32_t which_sr = ((((rounds - 1) << 4) ^ 48) & 48) / 16; return _mm_shuffle_epi8(x, sr[which_sr]); } __m128i t8 = _mm_xor_si128(_mm_shuffle_epi8(sb9t, t6), _mm_xor_si128(_mm_shuffle_epi8(sb9u, t5), K)); __m128i t9 = mm_xor3(_mm_shuffle_epi8(t8, mc), _mm_shuffle_epi8(sbdu, t5), _mm_shuffle_epi8(sbdt, t6)); __m128i t12 = _mm_xor_si128( _mm_xor_si128( _mm_shuffle_epi8(t9, mc), _mm_shuffle_epi8(sbbu, t5)), _mm_shuffle_epi8(sbbt, t6)); B = _mm_xor_si128(_mm_xor_si128(_mm_shuffle_epi8(t12, mc), _mm_shuffle_epi8(sbeu, t5)), _mm_shuffle_epi8(sbet, t6)); mc = _mm_alignr_epi8(mc, mc, 12); } }
static void transform(hashState *state,int r) { __m128i x0; __m128i x1; __m128i x2; __m128i x3; __m128i x4; __m128i x5; __m128i x6; __m128i x7; __m128i y0; __m128i y1; __m128i y2; __m128i y3; x0 = state->x[0]; x1 = state->x[1]; x2 = state->x[2]; x3 = state->x[3]; x4 = state->x[4]; x5 = state->x[5]; x6 = state->x[6]; x7 = state->x[7]; for (;r > 0;--r) { x4 = _mm_add_epi32(x0,x4); x5 = _mm_add_epi32(x1,x5); x6 = _mm_add_epi32(x2,x6); x7 = _mm_add_epi32(x3,x7); y0 = x2; y1 = x3; y2 = x0; y3 = x1; x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25)); x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25)); x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25)); x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25)); x0 = _mm_xor_si128(x0,x4); x1 = _mm_xor_si128(x1,x5); x2 = _mm_xor_si128(x2,x6); x3 = _mm_xor_si128(x3,x7); x4 = _mm_shuffle_epi32(x4,0x4e); x5 = _mm_shuffle_epi32(x5,0x4e); x6 = _mm_shuffle_epi32(x6,0x4e); x7 = _mm_shuffle_epi32(x7,0x4e); x4 = _mm_add_epi32(x0,x4); x5 = _mm_add_epi32(x1,x5); x6 = _mm_add_epi32(x2,x6); x7 = _mm_add_epi32(x3,x7); y0 = x1; y1 = x0; y2 = x3; y3 = x2; x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21)); x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21)); x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21)); x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21)); x0 = _mm_xor_si128(x0,x4); x1 = _mm_xor_si128(x1,x5); x2 = _mm_xor_si128(x2,x6); x3 = _mm_xor_si128(x3,x7); x4 = _mm_shuffle_epi32(x4,0xb1); x5 = _mm_shuffle_epi32(x5,0xb1); x6 = _mm_shuffle_epi32(x6,0xb1); x7 = _mm_shuffle_epi32(x7,0xb1); } state->x[0] = x0; state->x[1] = x1; state->x[2] = x2; state->x[3] = x3; state->x[4] = x4; state->x[5] = x5; state->x[6] = x6; state->x[7] = x7; }
template<int pixelFormat> void imageFromPixels(vl::Image & image, char unsigned const * rgb, int rowStride) { vl::ImageShape const & shape = image.getShape() ; int blockSizeX ; int blockSizeY ; int pixelStride ; int imagePlaneStride = (int)shape.width * (int)shape.height ; __m128i shuffleRgb ; __m128i const shuffleL = _mm_set_epi8(0xff, 0xff, 0xff, 3, 0xff, 0xff, 0xff, 2, 0xff, 0xff, 0xff, 1, 0xff, 0xff, 0xff, 0) ; __m128i const mask = _mm_set_epi32(0xff, 0xff, 0xff, 0xff) ; switch (pixelFormat) { case pixelFormatL: pixelStride = 1 ; blockSizeX = 16 ; blockSizeY = 4 ; break ; case pixelFormatBGR: case pixelFormatRGB: pixelStride = 3 ; blockSizeX = 4 ; blockSizeY = 4 ; assert(shape.depth == 3) ; break ; case pixelFormatRGBA: case pixelFormatBGRA: case pixelFormatBGRAasL: pixelStride = 4 ; blockSizeX = 4 ; blockSizeY = 4 ; assert(shape.depth == 3) ; break ; default: assert(false) ; } switch (pixelFormat) { case pixelFormatL: break ; case pixelFormatRGB: shuffleRgb = _mm_set_epi8(0xff, 11, 10, 9, 0xff, 8, 7, 6, 0xff, 5, 4, 3, 0xff, 2, 1, 0) ; break ; case pixelFormatRGBA: shuffleRgb = _mm_set_epi8(0xff, 14, 13, 12, 0xff, 10, 9, 8, 0xff, 6, 5, 4, 0xff, 2, 1, 0) ; break ; case pixelFormatBGR: shuffleRgb = _mm_set_epi8(0xff, 9, 10, 11, 0xff, 6, 7, 8, 0xff, 3, 4, 4, 0xff, 0, 1, 2) ; break ; case pixelFormatBGRA: shuffleRgb = _mm_set_epi8(0xff, 12, 13, 14, 0xff, 8, 9, 10, 0xff, 4, 5, 6, 0xff, 0, 1, 2) ; break ; case pixelFormatBGRAasL: shuffleRgb = _mm_set_epi8(0xff, 0xff, 0xff, 12, 0xff, 0xff, 0xff, 8, 0xff, 0xff, 0xff, 4, 0xff, 0xff, 0xff, 0) ; break ; } // we pull out these values as otherwise the compiler // will assume that the reference &image can be aliased // and recompute silly multiplications in the inner loop float * const __restrict imageMemory = image.getMemory() ; int const imageHeight = (int)shape.height ; int const imageWidth = (int)shape.width ; for (int x = 0 ; x < imageWidth ; x += blockSizeX) { int y = 0 ; float * __restrict imageMemoryX = imageMemory + x * imageHeight ; int bsx = (std::min)(imageWidth - x, blockSizeX) ; if (bsx < blockSizeX) goto boundary ; for ( ; y < imageHeight - blockSizeY + 1 ; y += blockSizeY) { char unsigned const * __restrict pixel = rgb + y * rowStride + x * pixelStride ; float * __restrict r = imageMemoryX + y ; __m128i p0, p1, p2, p3, T0, T1, T2, T3 ; /* convert a blockSizeX x blockSizeY block in the input image */ switch (pixelFormat) { case pixelFormatRGB : case pixelFormatRGBA : case pixelFormatBGR : case pixelFormatBGRA : case pixelFormatBGRAasL : // load 4x4 RGB pixels p0 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p1 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p2 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; p3 = _mm_shuffle_epi8(_mm_loadu_si128((__m128i*)pixel), shuffleRgb) ; pixel += rowStride ; // transpose pixels as 32-bit integers (see also below) T0 = _mm_unpacklo_epi32(p0, p1); T1 = _mm_unpacklo_epi32(p2, p3); T2 = _mm_unpackhi_epi32(p0, p1); T3 = _mm_unpackhi_epi32(p2, p3); p0 = _mm_unpacklo_epi64(T0, T1); p1 = _mm_unpackhi_epi64(T0, T1); p2 = _mm_unpacklo_epi64(T2, T3); p3 = _mm_unpackhi_epi64(T2, T3); // store r _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) 
; if (pixelFormat == pixelFormatBGRAasL) break ; // store g r += (imageWidth - 3) * imageHeight ; p0 = _mm_srli_epi32 (p0, 8) ; p1 = _mm_srli_epi32 (p1, 8) ; p2 = _mm_srli_epi32 (p2, 8) ; p3 = _mm_srli_epi32 (p3, 8) ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ; // store b r += (imageWidth - 3) * imageHeight ; p0 = _mm_srli_epi32 (p0, 8) ; p1 = _mm_srli_epi32 (p1, 8) ; p2 = _mm_srli_epi32 (p2, 8) ; p3 = _mm_srli_epi32 (p3, 8) ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p0, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p1, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p2, mask))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_and_si128(p3, mask))) ; break ; case pixelFormatL: // load 4x16 L pixels p0 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p1 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p2 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; p3 = _mm_loadu_si128((__m128i*)pixel) ; pixel += rowStride ; /* Pixels are collected in little-endian order: the first pixel is at the `right' (least significant byte of p0: p[0] = a, p[1] = b, ... p0: [ ... | ... | ... | d c b a ] p1: [ ... | ... | ... | h g f e ] p2: [ ... | ... | ... | l k j i ] p3: [ ... | ... | ... | p o n m ] The goal is to transpose four 4x4 subblocks in the 4 x 16 pixel array. The first step interlaves individual pixels in p0 and p1: T0: [ ... | ... | h d g c | f b e a ] T1: [ ... | ... | p l o k | n j m i ] T2: [ ... | ... | ... | ... ] T3: [ ... | ... | ... | ... ] The second step interleaves groups of two pixels: p0: [pl hd | ok gc | nj fb | mi ea] (pixels in the rightmost 4x4 subblock) p1: ... p2: ... p3: ... The third step interlevaes groups of four pixels: T0: [ ... | njfb | ... | miea ] T1: ... T2: ... T3: ... The last step interleaves groups of eight pixels: p0: [ ... | ... | ... | miea ] p1: [ ... | ... | ... | njfb ] p2: [ ... | ... | ... | okgc ] p3: [ ... | ... | ... 
| dklp ] */ T0 = _mm_unpacklo_epi8(p0, p1); T1 = _mm_unpacklo_epi8(p2, p3); T2 = _mm_unpackhi_epi8(p0, p1); T3 = _mm_unpackhi_epi8(p2, p3); p0 = _mm_unpacklo_epi16(T0, T1); p1 = _mm_unpackhi_epi16(T0, T1); p2 = _mm_unpacklo_epi16(T2, T3); p3 = _mm_unpackhi_epi16(T2, T3); T0 = _mm_unpacklo_epi32(p0, p1); T1 = _mm_unpacklo_epi32(p2, p3); T2 = _mm_unpackhi_epi32(p0, p1); T3 = _mm_unpackhi_epi32(p2, p3); p0 = _mm_unpacklo_epi64(T0, T1); p1 = _mm_unpackhi_epi64(T0, T1); p2 = _mm_unpacklo_epi64(T2, T3); p3 = _mm_unpackhi_epi64(T2, T3); // store four 4x4 subblock for (int i = 0 ; i < 4 ; ++i) { _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p0, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p1, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p2, shuffleL))) ; r += imageHeight ; _mm_storeu_ps(r, _mm_cvtepi32_ps(_mm_shuffle_epi8(p3, shuffleL))) ; r += imageHeight ; p0 = _mm_srli_si128 (p0, 4) ; p1 = _mm_srli_si128 (p1, 4) ; p2 = _mm_srli_si128 (p2, 4) ; p3 = _mm_srli_si128 (p3, 4) ; } break ; } } /* next y */ boundary: /* special case if there is not a full 4x4 block to process */ for ( ; y < imageHeight ; y += blockSizeY) { int bsy = (std::min)(imageHeight - y, blockSizeY) ; float * __restrict r ; float * rend ; for (int dx = 0 ; dx < bsx ; ++dx) { char unsigned const * __restrict pixel = rgb + y * rowStride + (x + dx) * pixelStride ; r = imageMemoryX + y + dx * imageHeight ; rend = r + bsy ; while (r != rend) { switch (pixelFormat) { case pixelFormatRGBA: case pixelFormatRGB: r[0 * imagePlaneStride] = (float) pixel[0] ; r[1 * imagePlaneStride] = (float) pixel[1] ; r[2 * imagePlaneStride] = (float) pixel[2] ; break ; case pixelFormatBGR: case pixelFormatBGRA: r[2 * imagePlaneStride] = (float) pixel[0] ; r[1 * imagePlaneStride] = (float) pixel[1] ; r[0 * imagePlaneStride] = (float) pixel[2] ; break; case pixelFormatBGRAasL: case pixelFormatL: r[0] = (float) pixel[0] ; break ; } r += 1 ; pixel += rowStride ; } } } } }
void spu_interpreter::CG(SPUThread& CPU, spu_opcode_t op)
{
    const auto a = _mm_xor_si128(CPU.GPR[op.ra].vi, _mm_set1_epi32(0x7fffffff));
    const auto b = _mm_xor_si128(CPU.GPR[op.rb].vi, _mm_set1_epi32(0x80000000));
    CPU.GPR[op.rt].vi = _mm_srli_epi32(_mm_cmpgt_epi32(b, a), 31);
}
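/*
 * Scalar sketch (not from the original source) of what CG computes per 32-bit
 * lane: the carry out of an unsigned add. The XOR constants above bias the
 * operands so that the signed compare performs the unsigned test "b > ~a",
 * which is true exactly when a + b overflows 32 bits.
 */
#include <stdint.h>

static inline uint32_t carry_generate(uint32_t a, uint32_t b)
{
    return (uint32_t)(((uint64_t)a + b) >> 32); /* 1 if the add carries, else 0 */
}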
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp, edge_t *eh, uint16_t plane_max) { uint8_t* p0 = buff + 16; uint8_t* p1 = p0 + bstride; uint8_t* p2 = p1 + bstride; uint8_t* p3 = p2 + bstride; uint8_t* p4 = p3 + bstride; uint8_t* orig = p0; uint8_t* end = p4; line_copy8(p0, srcp + 2 * stride, width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min; uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max; __m128i zero = _mm_setzero_si128(); __m128i ab = _mm_set1_epi16(15); __m128i max = _mm_set1_epi8((int8_t)th_max); __m128i min = _mm_set1_epi8((int8_t)th_min); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2}; uint8_t* posv[] = {p0, p1, p3, p4}; for (int x = 0; x < width; x += 16) { __m128i sumx[2] = {zero, zero}; __m128i sumy[2] = {zero, zero}; for (int i = 0; i < 4; i++) { __m128i xmm0, xmm1, xmul; xmul = _mm_load_si128((__m128i *)ar_mulx[i]); xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x)); xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul)); sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul)); xmul = _mm_load_si128((__m128i *)ar_muly[i]); xmm0 = _mm_load_si128((__m128i *)(posv[i] + x)); xmm1 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul)); sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul)); } for (int i = 0; i < 2; i++) { __m128i xmax, xmin, mull, mulh; sumx[i] = mm_abs_epi16(sumx[i]); sumy[i] = mm_abs_epi16(sumy[i]); xmax = _mm_max_epi16(sumx[i], sumy[i]); xmin = _mm_min_epi16(sumx[i], sumy[i]); mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4); mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4); xmax = mm_cast_epi32(mull, mulh); mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5); mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5); xmin = mm_cast_epi32(mull, mulh); sumx[i] = _mm_adds_epu16(xmax, xmin); sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift); } __m128i out = _mm_packus_epi16(sumx[0], sumx[1]); __m128i temp = _mm_min_epu8(out, max); temp = _mm_cmpeq_epi8(temp, max); out = _mm_or_si128(temp, out); temp = _mm_max_epu8(out, min); temp = _mm_cmpeq_epi8(temp, min); out = _mm_andnot_si128(temp, out); _mm_store_si128((__m128i*)(dstp + x), out); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
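/*
 * Scalar sketch (not from the original source) of the magnitude estimate the
 * 8-bit path uses: sqrt(gx^2 + gy^2) is approximated as
 * max(|gx|,|gy|) * 15/16 + min(|gx|,|gy|) * 15/32, which is what the
 * _mm_madd_epi16-by-15 followed by right shifts of 4 and 5 computes per pixel.
 */
#include <stdlib.h>

static inline int gradient_magnitude_approx(int gx, int gy)
{
    int ax = abs(gx), ay = abs(gy);
    int hi = ax > ay ? ax : ay;
    int lo = ax > ay ? ay : ax;
    return (hi * 15) / 16 + (lo * 15) / 32; /* cheap alpha-max-plus-beta-min estimate */
}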
/* Function: esl_sse_logf() * Synopsis: <r[z] = log x[z]> * Incept: SRE, Fri Dec 14 11:32:54 2007 [Janelia] * * Purpose: Given a vector <x> containing four floats, returns a * vector <r> in which each element <r[z] = logf(x[z])>. * * Valid in the domain $x_z > 0$ for normalized IEEE754 * $x_z$. * * For <x> $< 0$, including -0, returns <NaN>. For <x> $== * 0$ or subnormal <x>, returns <-inf>. For <x = inf>, * returns <inf>. For <x = NaN>, returns <NaN>. For * subnormal <x>, returns <-inf>. * * Xref: J2/71. * * Note: Derived from an SSE1 implementation by Julian * Pommier. Converted to SSE2 and added handling * of IEEE754 specials. */ __m128 esl_sse_logf(__m128 x) { static float cephes_p[9] = { 7.0376836292E-2f, -1.1514610310E-1f, 1.1676998740E-1f, -1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f, 2.0000714765E-1f, -2.4999993993E-1f, 3.3333331174E-1f }; __m128 onev = _mm_set1_ps(1.0f); /* all elem = 1.0 */ __m128 v0p5 = _mm_set1_ps(0.5f); /* all elem = 0.5 */ __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */ __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */ __m128i ei; __m128 e; __m128 invalid_mask, zero_mask, inf_mask; /* masks used to handle special IEEE754 inputs */ __m128 mask; __m128 origx; __m128 tmp; __m128 y; __m128 z; /* first, split x apart: x = frexpf(x, &e); */ ei = _mm_srli_epi32( _mm_castps_si128(x), 23); /* shift right 23: IEEE754 floats: ei = biased exponents */ invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg)); /* mask any elem that's negative; these become NaN */ zero_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128())); /* mask any elem zero or subnormal; these become -inf */ inf_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp)); /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN */ origx = x; /* store original x, used for log(inf) = inf, log(NaN) = NaN */ x = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand */ x = _mm_or_ps (x, v0p5); /* sets hidden bit b[0] */ ei = _mm_sub_epi32(ei, _mm_set1_epi32(126)); /* -127 (ei now signed base-2 exponent); then +1 */ e = _mm_cvtepi32_ps(ei); /* now, calculate the log */ mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches. 
*/ tmp = _mm_and_ps(x, mask); /* tmp contains x values < 0.707, else 0 */ x = _mm_sub_ps(x, onev); e = _mm_sub_ps(e, _mm_and_ps(onev, mask)); x = _mm_add_ps(x, tmp); z = _mm_mul_ps(x,x); y = _mm_set1_ps(cephes_p[0]); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7])); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8])); y = _mm_mul_ps(y, x); y = _mm_mul_ps(y, z); tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f)); y = _mm_add_ps(y, tmp); tmp = _mm_mul_ps(z, v0p5); y = _mm_sub_ps(y, tmp); tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f)); x = _mm_add_ps(x, y); x = _mm_add_ps(x, tmp); /* IEEE754 cleanup: */ x = esl_sse_select_ps(x, origx, inf_mask); /* log(inf)=inf; log(NaN) = NaN */ x = _mm_or_ps(x, invalid_mask); /* log(x<0, including -0,-inf) = NaN */ x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal = -inf */ return x; }
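/*
 * Scalar reference (not part of Easel) for the decomposition esl_sse_logf uses,
 * valid for normal positive x: split x into a mantissa m in [0.5, 1) and an
 * exponent e with frexpf, fold m below 1/sqrt(2) back up so the polynomial
 * argument stays near 1, then log(x) = log(m) + e * ln(2). Here logf(m) stands
 * in for the Cephes polynomial in (m - 1).
 */
#include <math.h>

static float scalar_log_ref(float x)
{
    int e;
    float m = frexpf(x, &e);        /* x = m * 2^e, m in [0.5, 1) */
    if (m < 0.70710678118654752f) { /* keep the polynomial argument near 1 */
        m += m;
        e -= 1;
    }
    return logf(m) + (float)e * 0.69314718055994531f;
}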
}bool validate_utf8_sse(const char *src, size_t len) { const char *end = src + len; while (src + 16 < end) { __m128i chunk = _mm_loadu_si128((const __m128i *)(src)); int asciiMask = _mm_movemask_epi8(chunk); if (!asciiMask) { src += 16; continue; } __m128i chunk_signed = _mm_add_epi8(chunk, _mm_set1_epi8(0x80)); __m128i cond2 = _mm_cmplt_epi8(_mm_set1_epi8(0xc2 - 1 - 0x80), chunk_signed); __m128i state = _mm_set1_epi8((char)(0x0 | 0x80)); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x2 | 0xc0)), cond2); __m128i cond3 = _mm_cmplt_epi8(_mm_set1_epi8(0xe0 - 1 - 0x80), chunk_signed); state = _mm_blendv_epi8(state, _mm_set1_epi8((char)(0x3 | 0xe0)), cond3); __m128i mask3 = _mm_slli_si128(cond3, 1); __m128i cond4 = _mm_cmplt_epi8(_mm_set1_epi8(0xf0 - 1 - 0x80), chunk_signed); // Fall back to the scalar processing if (_mm_movemask_epi8(cond4)) { break; } __m128i count = _mm_and_si128(state, _mm_set1_epi8(0x7)); __m128i count_sub1 = _mm_subs_epu8(count, _mm_set1_epi8(0x1)); __m128i counts = _mm_add_epi8(count, _mm_slli_si128(count_sub1, 1)); __m128i shifts = count_sub1; shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 1)); counts = _mm_add_epi8( counts, _mm_slli_si128(_mm_subs_epu8(counts, _mm_set1_epi8(0x2)), 2)); shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 2)); if (asciiMask ^ _mm_movemask_epi8(_mm_cmpgt_epi8(counts, _mm_set1_epi8(0)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 4)); if (_mm_movemask_epi8(_mm_cmpgt_epi8( _mm_sub_epi8(_mm_slli_si128(counts, 1), counts), _mm_set1_epi8(1)))) return false; // error shifts = _mm_add_epi8(shifts, _mm_slli_si128(shifts, 8)); __m128i mask = _mm_and_si128(state, _mm_set1_epi8(0xf8)); shifts = _mm_and_si128(shifts, _mm_cmplt_epi8(counts, _mm_set1_epi8(2))); // <=1 chunk = _mm_andnot_si128(mask, chunk); // from now on, we only have usefull bits shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 1), _mm_srli_si128(_mm_slli_epi16(shifts, 7), 1)); __m128i chunk_right = _mm_slli_si128(chunk, 1); __m128i chunk_low = _mm_blendv_epi8( chunk, _mm_or_si128(chunk, _mm_and_si128(_mm_slli_epi16(chunk_right, 6), _mm_set1_epi8(0xc0))), _mm_cmpeq_epi8(counts, _mm_set1_epi8(1))); __m128i chunk_high = _mm_and_si128(chunk, _mm_cmpeq_epi8(counts, _mm_set1_epi8(2))); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 2), _mm_srli_si128(_mm_slli_epi16(shifts, 6), 2)); chunk_high = _mm_srli_epi32(chunk_high, 2); shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 4), _mm_srli_si128(_mm_slli_epi16(shifts, 5), 4)); chunk_high = _mm_or_si128( chunk_high, _mm_and_si128(_mm_and_si128(_mm_slli_epi32(chunk_right, 4), _mm_set1_epi8(0xf0)), mask3)); int c = _mm_extract_epi16(counts, 7); int source_advance = !(c & 0x0200) ? 16 : !(c & 0x02) ? 
15 : 14; __m128i high_bits = _mm_and_si128(chunk_high, _mm_set1_epi8(0xf8)); if (!_mm_testz_si128( mask3, _mm_or_si128(_mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0x00)), _mm_cmpeq_epi8(high_bits, _mm_set1_epi8(0xd8))))) return false; shifts = _mm_blendv_epi8(shifts, _mm_srli_si128(shifts, 8), _mm_srli_si128(_mm_slli_epi16(shifts, 4), 8)); chunk_high = _mm_slli_si128(chunk_high, 1); __m128i shuf = _mm_add_epi8(shifts, _mm_set_epi8(15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0)); chunk_low = _mm_shuffle_epi8(chunk_low, shuf); chunk_high = _mm_shuffle_epi8(chunk_high, shuf); __m128i utf16_low = _mm_unpacklo_epi8(chunk_low, chunk_high); __m128i utf16_high = _mm_unpackhi_epi8(chunk_low, chunk_high); if (_mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_high, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES) | _mm_cmpestrc(_mm_cvtsi64_si128(0xfdeffdd0fffffffe), 4, utf16_low, 8, _SIDD_UWORD_OPS | _SIDD_CMP_RANGES)) { return false; } src += source_advance; } return validate_utf8(src, end - src); }
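/*
 * Hedged sketch (not the original project's code) of the kind of scalar
 * fallback that validate_utf8_sse defers to for the trailing bytes and for
 * 4-byte sequences. It checks sequence lengths, continuation bytes, overlong
 * encodings, UTF-16 surrogates and the U+10FFFF upper bound; the name and
 * exact behaviour of the real validate_utf8() are assumptions.
 */
#include <stdbool.h>
#include <stddef.h>

static bool validate_utf8_scalar(const char *src, size_t len)
{
    const unsigned char *s = (const unsigned char *)src;
    size_t i = 0;
    while (i < len) {
        unsigned char c = s[i];
        unsigned cp;
        size_t n; /* number of continuation bytes */
        if (c < 0x80)                { i += 1; continue; }
        else if ((c & 0xe0) == 0xc0) { n = 1; cp = c & 0x1fu; }
        else if ((c & 0xf0) == 0xe0) { n = 2; cp = c & 0x0fu; }
        else if ((c & 0xf8) == 0xf0) { n = 3; cp = c & 0x07u; }
        else return false;                    /* stray continuation or invalid lead byte */
        if (len - i <= n) return false;       /* truncated sequence */
        for (size_t k = 1; k <= n; k++) {
            if ((s[i + k] & 0xc0) != 0x80) return false;
            cp = (cp << 6) | (s[i + k] & 0x3fu);
        }
        if ((n == 1 && cp < 0x80) ||          /* overlong 2-byte */
            (n == 2 && cp < 0x800) ||         /* overlong 3-byte */
            (n == 3 && cp < 0x10000) ||       /* overlong 4-byte */
            (cp >= 0xd800 && cp <= 0xdfff) || /* UTF-16 surrogates */
            cp > 0x10ffff)                    /* beyond Unicode range */
            return false;
        i += n + 1;
    }
    return true;
}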
__forceinline void GSMem_to_ClutBuffer__T16_I4_CSM1_core_sse2(u32* vm, u32* clut) { __m128i vm_0; __m128i vm_1; __m128i vm_2; __m128i vm_3; __m128i clut_0; __m128i clut_1; __m128i clut_2; __m128i clut_3; __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask); // !HIGH_16BITS_VM // CSA in 0-15 // Replace lower 16 bits of clut with lower 16 bits of vm // CSA in 16-31 // Replace higher 16 bits of clut with lower 16 bits of vm // HIGH_16BITS_VM // CSA in 0-15 // Replace lower 16 bits of clut with higher 16 bits of vm // CSA in 16-31 // Replace higher 16 bits of clut with higher 16 bits of vm if(HIGH_16BITS_VM && CSA_0_15) { // move up to low vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0 vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2 vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4 vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6 vm_0 = _mm_srli_epi32(vm_0, 16); vm_1 = _mm_srli_epi32(vm_1, 16); vm_2 = _mm_srli_epi32(vm_2, 16); vm_3 = _mm_srli_epi32(vm_3, 16); } else if(HIGH_16BITS_VM && !CSA_0_15) { // Remove lower 16 bits vm_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0 vm_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2 vm_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4 vm_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6 } else if(!HIGH_16BITS_VM && CSA_0_15) { // Remove higher 16 bits vm_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm)); // 9 8 1 0 vm_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+1)); // 11 10 3 2 vm_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+2)); // 13 12 5 4 vm_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)vm+3)); // 15 14 7 6 } else if(!HIGH_16BITS_VM && !CSA_0_15) { // move low to high vm_0 = _mm_load_si128((__m128i*)vm); // 9 8 1 0 vm_1 = _mm_load_si128((__m128i*)vm+1); // 11 10 3 2 vm_2 = _mm_load_si128((__m128i*)vm+2); // 13 12 5 4 vm_3 = _mm_load_si128((__m128i*)vm+3); // 15 14 7 6 vm_0 = _mm_slli_epi32(vm_0, 16); vm_1 = _mm_slli_epi32(vm_1, 16); vm_2 = _mm_slli_epi32(vm_2, 16); vm_3 = _mm_slli_epi32(vm_3, 16); } // Unsizzle the data __m128i row_0 = _mm_unpacklo_epi64(vm_0, vm_1); // 3 2 1 0 __m128i row_1 = _mm_unpacklo_epi64(vm_2, vm_3); // 7 6 5 4 __m128i row_2 = _mm_unpackhi_epi64(vm_0, vm_1); // 11 10 9 8 __m128i row_3 = _mm_unpackhi_epi64(vm_2, vm_3); // 15 14 13 12 // load old data & remove useless part if(CSA_0_15) { // Remove lower 16 bits clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut)); clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1)); clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2)); clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3)); } else { // Remove higher 16 bits clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut)); clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1)); clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2)); clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3)); } // Merge old & new data clut_0 = _mm_or_si128(clut_0, row_0); clut_1 = _mm_or_si128(clut_1, row_1); clut_2 = _mm_or_si128(clut_2, row_2); clut_3 = _mm_or_si128(clut_3, row_3); _mm_store_si128((__m128i*)clut, clut_0); _mm_store_si128((__m128i*)clut+1, clut_1); _mm_store_si128((__m128i*)clut+2, clut_2); _mm_store_si128((__m128i*)clut+3, clut_3); }
__forceinline bool Cmp_ClutBuffer_GSMem_core(u16* GSmem, u16* clut) { __m128i GSmem_0; __m128i GSmem_1; __m128i GSmem_2; __m128i GSmem_3; __m128i clut_0; __m128i clut_1; __m128i clut_2; __m128i clut_3; __m128i clut_mask = _mm_load_si128((__m128i*)s_clut_16bits_mask); // !HIGH_16BITS_VM // CSA in 0-15 // cmp lower 16 bits of clut with lower 16 bits of GSmem // CSA in 16-31 // cmp higher 16 bits of clut with lower 16 bits of GSmem // HIGH_16BITS_VM // CSA in 0-15 // cmp lower 16 bits of clut with higher 16 bits of GSmem // CSA in 16-31 // cmp higher 16 bits of clut with higher 16 bits of GSmem if(HIGH_16BITS_VM && CSA_0_15) { // move up to low GSmem_0 = _mm_load_si128((__m128i*)GSmem); // 9 8 1 0 GSmem_1 = _mm_load_si128((__m128i*)GSmem+1); // 11 10 3 2 GSmem_2 = _mm_load_si128((__m128i*)GSmem+2); // 13 12 5 4 GSmem_3 = _mm_load_si128((__m128i*)GSmem+3); // 15 14 7 6 GSmem_0 = _mm_srli_epi32(GSmem_0, 16); GSmem_1 = _mm_srli_epi32(GSmem_1, 16); GSmem_2 = _mm_srli_epi32(GSmem_2, 16); GSmem_3 = _mm_srli_epi32(GSmem_3, 16); } else if(HIGH_16BITS_VM && !CSA_0_15) { // Remove lower 16 bits GSmem_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem)); // 9 8 1 0 GSmem_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+1)); // 11 10 3 2 GSmem_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+2)); // 13 12 5 4 GSmem_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+3)); // 15 14 7 6 } else if(!HIGH_16BITS_VM && CSA_0_15) { // Remove higher 16 bits GSmem_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem)); // 9 8 1 0 GSmem_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+1)); // 11 10 3 2 GSmem_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+2)); // 13 12 5 4 GSmem_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)GSmem+3)); // 15 14 7 6 } else if(!HIGH_16BITS_VM && !CSA_0_15) { // move low to high GSmem_0 = _mm_load_si128((__m128i*)GSmem); // 9 8 1 0 GSmem_1 = _mm_load_si128((__m128i*)GSmem+1); // 11 10 3 2 GSmem_2 = _mm_load_si128((__m128i*)GSmem+2); // 13 12 5 4 GSmem_3 = _mm_load_si128((__m128i*)GSmem+3); // 15 14 7 6 GSmem_0 = _mm_slli_epi32(GSmem_0, 16); GSmem_1 = _mm_slli_epi32(GSmem_1, 16); GSmem_2 = _mm_slli_epi32(GSmem_2, 16); GSmem_3 = _mm_slli_epi32(GSmem_3, 16); } // Unsizzle the data __m128i row_0 = _mm_unpacklo_epi64(GSmem_0, GSmem_1); // 3 2 1 0 __m128i row_1 = _mm_unpacklo_epi64(GSmem_2, GSmem_3); // 7 6 5 4 __m128i row_2 = _mm_unpackhi_epi64(GSmem_0, GSmem_1); // 11 10 9 8 __m128i row_3 = _mm_unpackhi_epi64(GSmem_2, GSmem_3); // 15 14 13 12 // load old data & remove useless part if(!CSA_0_15) { // Remove lower 16 bits clut_0 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut)); clut_1 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+1)); clut_2 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+2)); clut_3 = _mm_andnot_si128(clut_mask, _mm_load_si128((__m128i*)clut+3)); } else { // Remove higher 16 bits clut_0 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut)); clut_1 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+1)); clut_2 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+2)); clut_3 = _mm_and_si128(clut_mask, _mm_load_si128((__m128i*)clut+3)); } // Do the comparaison __m128i result = _mm_cmpeq_epi16(row_0, clut_0); __m128i result_tmp = _mm_cmpeq_epi16(row_1, clut_1); result = _mm_and_si128(result, result_tmp); result_tmp = _mm_cmpeq_epi16(row_2, clut_2); result = _mm_and_si128(result, result_tmp); result_tmp = _mm_cmpeq_epi16(row_3, clut_3); 
result = _mm_and_si128(result, result_tmp); u32 result_int = _mm_movemask_epi8(result); if(CSA_0_15) { // only lower 16bits must be checked if ((result_int&0x3333) != 0x3333) return true; } else { // only higher 16bits must be checked if ((result_int&0xCCCC) != 0xCCCC) return true; } return false; }
void tuned_ConvertRGBToULY4(uint8_t *pYBegin, uint8_t *pUBegin, uint8_t *pVBegin, const uint8_t *pSrcBegin, const uint8_t *pSrcEnd, size_t cbWidth, ssize_t scbStride) { const int shift = 14; __m128i rb2y, xg2y, rb2u, xg2u, rb2v, xg2v; if (std::is_same<T, CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value) { rb2y = _mm_set2_epi16_shift(C::R2Y, C::B2Y, shift); xg2y = _mm_set2_epi16_shift(16.5 / 0xff, C::G2Y, shift); rb2u = _mm_set2_epi16_shift(C::R2U, C::B2U, shift); xg2u = _mm_set2_epi16_shift(128.5 / 0xff, C::G2U, shift); rb2v = _mm_set2_epi16_shift(C::R2V, C::B2V, shift); xg2v = _mm_set2_epi16_shift(128.5 / 0xff, C::G2V, shift); } else { rb2y = _mm_set2_epi16_shift(C::B2Y, C::R2Y, shift); xg2y = _mm_set2_epi16_shift(C::G2Y, 16.5 / 0xff, shift); rb2u = _mm_set2_epi16_shift(C::B2U, C::R2U, shift); xg2u = _mm_set2_epi16_shift(C::G2U, 128.5 / 0xff, shift); rb2v = _mm_set2_epi16_shift(C::B2V, C::R2V, shift); xg2v = _mm_set2_epi16_shift(C::G2V, 128.5 / 0xff, shift); } auto y = pYBegin; auto u = pUBegin; auto v = pVBegin; for (auto p = pSrcBegin; p != pSrcEnd; p += scbStride) { auto pp = p; for (; pp <= p + cbWidth - 16; pp += T::BYPP*4) { __m128i m = _mm_loadu_si128((const __m128i *)pp); __m128i rb, xg; if (std::is_same<T, CBGRAColorOrder>::value) { // m = XX R3 G3 B3 XX R2 G2 B2 XX R1 G1 B1 XX R0 G0 B0 rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0 xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 } #ifdef __SSSE3__ else if (std::is_same<T, CBGRColorOrder>::value) { // m = XX XX XX XX R3 G3 B3 R2 G2 B2 R1 G1 B1 R0 G0 B0 rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0 xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, -1, -1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1)), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 } #endif else if (std::is_same<T, CARGBColorOrder>::value) { // m = B3 G3 R3 XX B2 G2 R2 XX B1 G1 R1 XX B0 G0 R0 XX rb = _mm_srli_epi16(m, 8); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0 xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff } #ifdef __SSSE3__ else if (std::is_same<T, CRGBColorOrder>::value) { // m = XX XX XX XX B3 G3 R3 B2 G2 R2 B1 G1 R1 B0 G0 R0 rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0 xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1, -1, -1)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff } #endif auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint32_t { __m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv)); yuv = _mm_srli_epi32(yuv, shift); #ifdef __SSSE3__ if (F >= CODEFEATURE_SSSE3) { yuv = _mm_shuffle_epi8(yuv, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)); } else #endif { yuv = _mm_packs_epi32(yuv, yuv); yuv = _mm_packus_epi16(yuv, yuv); } return _mm_cvtsi128_si32(yuv); }; *(uint32_t *)y = xrgb2yuv(rb2y, xg2y); *(uint32_t *)u = xrgb2yuv(rb2u, xg2u); *(uint32_t *)v = xrgb2yuv(rb2v, xg2v); y += 4; u += 4; v += 4; } for (; pp < p + cbWidth; pp += T::BYPP) { __m128i m; __m128i rb, xg; if (std::is_same<T, 
CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value) { if (std::is_same<T, CBGRAColorOrder>::value) { m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 } else { m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 XX m = _mm_srli_epi32(m, 8); } rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 R0 00 B0 xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 } else if (std::is_same<T, CARGBColorOrder>::value || std::is_same<T, CRGBColorOrder>::value) { if (std::is_same<T, CARGBColorOrder>::value) { m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX } else { m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX } rb = _mm_srli_epi16(m, 8); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 B0 00 R0 xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 00 ff } auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint8_t { __m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv)); yuv = _mm_srli_epi32(yuv, shift); return (uint8_t)_mm_cvtsi128_si32(yuv); }; *y = xrgb2yuv(rb2y, xg2y); *u = xrgb2yuv(rb2u, xg2u); *v = xrgb2yuv(rb2v, xg2v); y++; u++; v++; } } }
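/*
 * Scalar sketch (not from the original source, and the behaviour of
 * _mm_set2_epi16_shift is inferred) of the fixed-point dot product the
 * xrgb2yuv lambdas above evaluate: each coefficient is stored as
 * round(c * 2^14), the constant 0xff channel multiplied by 16.5/0xff (or
 * 128.5/0xff) supplies the +16 / +128 bias plus 0.5 for rounding, and the sum
 * is shifted right by 14.
 */
#include <stdint.h>

static inline uint8_t rgb_to_plane_sample(int r, int g, int b,
                                          int32_t cr, int32_t cg, int32_t cb, /* coefficients scaled by 1 << 14 */
                                          int32_t bias)                       /* e.g. (int32_t)(16.5 * (1 << 14)) */
{
    return (uint8_t)((cr * r + cg * g + cb * b + bias) >> 14);
}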
static inline void desc_to_olflags_v(struct i40e_rx_queue *rxq, __m128i descs[4], struct rte_mbuf **rx_pkts) { const __m128i mbuf_init = _mm_set_epi64x(0, rxq->mbuf_initializer); __m128i rearm0, rearm1, rearm2, rearm3; __m128i vlan0, vlan1, rss, l3_l4e; /* mask everything except RSS, flow director and VLAN flags * bit2 is for VLAN tag, bit11 for flow director indication * bit13:12 for RSS indication. */ const __m128i rss_vlan_msk = _mm_set_epi32( 0x1c03804, 0x1c03804, 0x1c03804, 0x1c03804); const __m128i cksum_mask = _mm_set_epi32( PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD, PKT_RX_IP_CKSUM_GOOD | PKT_RX_IP_CKSUM_BAD | PKT_RX_L4_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD | PKT_RX_EIP_CKSUM_BAD); /* map rss and vlan type to rss hash and vlan flag */ const __m128i vlan_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, PKT_RX_VLAN_PKT | PKT_RX_VLAN_STRIPPED, 0, 0, 0, 0); const __m128i rss_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, PKT_RX_RSS_HASH | PKT_RX_FDIR, PKT_RX_RSS_HASH, 0, 0, 0, 0, PKT_RX_FDIR, 0); const __m128i l3_l4e_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, /* shift right 1 bit to make sure it not exceed 255 */ (PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD | PKT_RX_L4_CKSUM_BAD) >> 1, (PKT_RX_EIP_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_EIP_CKSUM_BAD) >> 1, (PKT_RX_L4_CKSUM_BAD | PKT_RX_IP_CKSUM_BAD) >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_BAD) >> 1, PKT_RX_IP_CKSUM_BAD >> 1, (PKT_RX_IP_CKSUM_GOOD | PKT_RX_L4_CKSUM_GOOD) >> 1); vlan0 = _mm_unpackhi_epi32(descs[0], descs[1]); vlan1 = _mm_unpackhi_epi32(descs[2], descs[3]); vlan0 = _mm_unpacklo_epi64(vlan0, vlan1); vlan1 = _mm_and_si128(vlan0, rss_vlan_msk); vlan0 = _mm_shuffle_epi8(vlan_flags, vlan1); rss = _mm_srli_epi32(vlan1, 11); rss = _mm_shuffle_epi8(rss_flags, rss); l3_l4e = _mm_srli_epi32(vlan1, 22); l3_l4e = _mm_shuffle_epi8(l3_l4e_flags, l3_l4e); /* then we shift left 1 bit */ l3_l4e = _mm_slli_epi32(l3_l4e, 1); /* we need to mask out the reduntant bits */ l3_l4e = _mm_and_si128(l3_l4e, cksum_mask); vlan0 = _mm_or_si128(vlan0, rss); vlan0 = _mm_or_si128(vlan0, l3_l4e); /* * At this point, we have the 4 sets of flags in the low 16-bits * of each 32-bit value in vlan0. * We want to extract these, and merge them with the mbuf init data * so we can do a single 16-byte write to the mbuf to set the flags * and all the other initialization fields. Extracting the * appropriate flags means that we have to do a shift and blend for * each mbuf before we do the write. 
*/ rearm0 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 8), 0x10); rearm1 = _mm_blend_epi16(mbuf_init, _mm_slli_si128(vlan0, 4), 0x10); rearm2 = _mm_blend_epi16(mbuf_init, vlan0, 0x10); rearm3 = _mm_blend_epi16(mbuf_init, _mm_srli_si128(vlan0, 4), 0x10); /* write the rearm data and the olflags in one write */ RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, ol_flags) != offsetof(struct rte_mbuf, rearm_data) + 8); RTE_BUILD_BUG_ON(offsetof(struct rte_mbuf, rearm_data) != RTE_ALIGN(offsetof(struct rte_mbuf, rearm_data), 16)); _mm_store_si128((__m128i *)&rx_pkts[0]->rearm_data, rearm0); _mm_store_si128((__m128i *)&rx_pkts[1]->rearm_data, rearm1); _mm_store_si128((__m128i *)&rx_pkts[2]->rearm_data, rearm2); _mm_store_si128((__m128i *)&rx_pkts[3]->rearm_data, rearm3); }
__m64 interpolvline128_3(__m128i* temp){ __m128i xmm6; __m64 ret; __m128i xmm7 = _mm_setzero_si128(); __m128i xmm0 = _mm_load_si128(temp++); __m128i xmm1 = _mm_load_si128(temp++); __m128i xmm2 = _mm_load_si128(temp++); __m128i xmm3 = _mm_load_si128(temp++); __m128i xmm4 = _mm_load_si128(temp++); __m128i xmm5 = _mm_load_si128(temp); xmm1 = _mm_add_epi16(xmm1,xmm4); xmm0 = _mm_add_epi16(xmm0,xmm5); xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB); xmm4 = _mm_mullo_epi16(xmm1, xmm6); xmm5 = _mm_mulhi_epi16(xmm1, xmm6); xmm1 = _mm_unpacklo_epi16(xmm4, xmm5); xmm6 = _mm_unpackhi_epi16(xmm4, xmm5); xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014); xmm5 = _mm_add_epi16(xmm2,xmm3); xmm4 = _mm_mullo_epi16(xmm5, xmm7); xmm5 = _mm_mulhi_epi16(xmm5, xmm7); xmm7 = _mm_unpacklo_epi16(xmm4, xmm5); xmm4 = _mm_unpackhi_epi16(xmm4, xmm5); xmm7 = _mm_add_epi32(xmm7,xmm1); xmm4 = _mm_add_epi32(xmm4,xmm6); xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001); xmm6 = _mm_mulhi_epi16(xmm0, xmm6); xmm1 = _mm_unpacklo_epi16(xmm0, xmm6); xmm6 = _mm_unpackhi_epi16(xmm0, xmm6); xmm7 = _mm_add_epi32(xmm7,xmm1); xmm4 = _mm_add_epi32(xmm4,xmm6); xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200); xmm7 = _mm_add_epi32(xmm7,xmm1); xmm4 = _mm_add_epi32(xmm4,xmm1); xmm5 = _mm_setzero_si128(); xmm7 = _mm_srli_epi32(xmm7, 10); xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values xmm7 = _mm_slli_epi32(xmm7,16); xmm7 = _mm_srli_epi32(xmm7,16); xmm4 = _mm_srli_epi32(xmm4, 10); xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values xmm4 = _mm_slli_epi32(xmm4,16); xmm4 = _mm_srli_epi32(xmm4,16); xmm6 = _mm_packs_epi32(xmm7, xmm4); xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010); xmm2 = _mm_add_epi16(xmm2,xmm1); xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values xmm2 = _mm_srli_epi16(xmm2,5); xmm3 = _mm_add_epi16(xmm3,xmm1); xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values xmm3 = _mm_srli_epi16(xmm3,5); xmm2 = _mm_packus_epi16(xmm2,xmm5); xmm3 = _mm_packus_epi16(xmm3,xmm5); xmm6 = _mm_packus_epi16(xmm6,xmm5); xmm7 = _mm_unpacklo_epi8(xmm2,xmm6); xmm4 = _mm_unpacklo_epi8(xmm6,xmm3); xmm6 = _mm_avg_epu8(xmm4,xmm7); xmm6 = _mm_srli_epi16(xmm6,8); xmm6 = _mm_packus_epi16(xmm6,xmm5); ret = _mm_movepi64_pi64(xmm6); _mm_empty(); return(ret); }
void spu_interpreter::ROTI(SPUThread& CPU, spu_opcode_t op)
{
    const auto a = CPU.GPR[op.ra].vi;
    const s32 n = op.si7 & 0x1f;
    CPU.GPR[op.rt].vi = _mm_or_si128(_mm_slli_epi32(a, n), _mm_srli_epi32(a, 32 - n));
}
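/*
 * Scalar sketch (not from the original source) of the per-lane operation ROTI
 * performs: a rotate left by n = si7 & 31. The intrinsic pair above relies on
 * _mm_srli_epi32 returning zero for a shift count of 32, so n == 0 still
 * yields the unrotated value; plain C needs the usual masked form instead.
 */
#include <stdint.h>

static inline uint32_t rotl32(uint32_t x, unsigned n)
{
    n &= 31;
    return (x << n) | (x >> ((32 - n) & 31)); /* avoids the undefined x >> 32 */
}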
/* natural logarithm computed for 4 simultaneous float return NaN for x <= 0 */ __m128 log_ps(v4sfu *xPtr) { __m128 x=*((__m128 *)xPtr); #ifdef USE_SSE2 __m128i emm0; #else __m64 mm0, mm1; #endif __m128 one = *(__m128*)_ps_1; __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps()); x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos); /* cut off denormalized stuff */ #ifndef USE_SSE2 /* part 1: x = frexpf(x, &e); */ COPY_XMM_TO_MM(x, mm0, mm1); mm0 = _mm_srli_pi32(mm0, 23); mm1 = _mm_srli_pi32(mm1, 23); #else emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23); #endif /* keep only the fractional part */ x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask); x = _mm_or_ps(x, *(__m128*)_ps_0p5); #ifndef USE_SSE2 /* now e=mm0:mm1 contain the really base-2 exponent */ mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f); mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f); __m128 e = _mm_cvtpi32x2_ps(mm0, mm1); _mm_empty(); /* bye bye mmx */ #else emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f); __m128 e = _mm_cvtepi32_ps(emm0); #endif e = _mm_add_ps(e, one); /* part2: if( x < SQRTHF ) { e -= 1; x = x + x - 1.0; } else { x = x - 1.0; } */ __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF); __m128 tmp = _mm_and_ps(x, mask); x = _mm_sub_ps(x, one); e = _mm_sub_ps(e, _mm_and_ps(one, mask)); x = _mm_add_ps(x, tmp); __m128 z = _mm_mul_ps(x,x); __m128 y = *(__m128*)_ps_cephes_log_p0; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8); y = _mm_mul_ps(y, x); y = _mm_mul_ps(y, z); tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1); y = _mm_add_ps(y, tmp); tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5); y = _mm_sub_ps(y, tmp); tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2); x = _mm_add_ps(x, y); x = _mm_add_ps(x, tmp); x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN return x; }
void spu_interpreter::ROTMI(SPUThread& CPU, spu_opcode_t op)
{
    CPU.GPR[op.rt].vi = _mm_srli_epi32(CPU.GPR[op.ra].vi, -op.si7 & 0x3f);
}
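/*
 * Scalar sketch (not from the original source) of ROTMI per 32-bit lane: a
 * logical right shift by (-si7) & 0x3f. _mm_srli_epi32 already returns zero
 * for counts of 32 or more, which matches the SPU behaviour; scalar C has to
 * test for that explicitly.
 */
#include <stdint.h>

static inline uint32_t rotmi_lane(uint32_t x, int si7)
{
    unsigned n = (unsigned)(-si7) & 0x3f;
    return n < 32 ? (x >> n) : 0;
}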
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s, edge_t *eh, uint16_t plane_max) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t* p0 = (uint16_t *)buff + 8; uint16_t* p1 = p0 + bstride; uint16_t* p2 = p1 + bstride; uint16_t* p3 = p2 + bstride; uint16_t* p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 alpha = _mm_set1_ps((float)0.96043387); __m128 beta = _mm_set1_ps((float)0.39782473); __m128i pmax = _mm_set1_epi32(0xFFFF); __m128i min = _mm_set1_epi16((int16_t)eh->min); __m128i max = _mm_set1_epi16((int16_t)eh->max); for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); uint16_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2}; uint16_t* posv[] = {p0, p1, p3, p4}; for (int x = 0; x < width; x += 8) { __m128 sumx[2] = {(__m128)zero, (__m128)zero}; __m128 sumy[2] = {(__m128)zero, (__m128)zero}; for (int i = 0; i < 4; i++) { __m128 xmul = _mm_load_ps(ar_mulxf[i]); __m128i xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x)); __m128i xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sumx[0] = _mm_add_ps(sumx[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul)); sumx[1] = _mm_add_ps(sumx[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul)); xmul = _mm_load_ps(ar_mulyf[i]); xmm0 = _mm_load_si128((__m128i *)(posv[i] + x)); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sumy[0] = _mm_add_ps(sumy[0], _mm_mul_ps(_mm_cvtepi32_ps(xmm0), xmul)); sumy[1] = _mm_add_ps(sumy[1], _mm_mul_ps(_mm_cvtepi32_ps(xmm1), xmul)); } __m128i out[2]; for (int i = 0; i < 2; i++) { sumx[i] = mm_abs_ps(sumx[i]); sumy[i] = mm_abs_ps(sumy[i]); __m128 t0 = _mm_max_ps(sumx[i], sumy[i]); __m128 t1 = _mm_min_ps(sumx[i], sumy[i]); t0 = _mm_add_ps(_mm_mul_ps(alpha, t0), _mm_mul_ps(beta, t1)); out[i] = _mm_srli_epi32(_mm_cvtps_epi32(t0), eh->rshift); out[i] = mm_min_epi32(out[i], pmax); } out[0] = mm_cast_epi32(out[0], out[1]); out[1] = MM_MIN_EPU16(out[0], max); out[1] = _mm_cmpeq_epi16(out[1], max); out[0] = _mm_or_si128(out[1], out[0]); out[1] = MM_MAX_EPU16(out[0], min); out[1] = _mm_cmpeq_epi16(out[1], min); out[0] = _mm_andnot_si128(out[1], out[0]); _mm_store_si128((__m128i *)(dstp + x), out[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
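/*
 * Scalar sketch (not from the original source) of the float magnitude estimate
 * used in the 16-bit path above: sqrt(gx^2 + gy^2) is approximated by
 * alpha * max(|gx|,|gy|) + beta * min(|gx|,|gy|) with alpha = 0.96043387 and
 * beta = 0.39782473, the classic alpha-max-plus-beta-min approximation.
 */
#include <math.h>

static inline float magnitude_approx_f(float gx, float gy)
{
    float ax = fabsf(gx), ay = fabsf(gy);
    float hi = ax > ay ? ax : ay;
    float lo = ax > ay ? ay : ax;
    return 0.96043387f * hi + 0.39782473f * lo;
}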
OD_SIMD_INLINE od_m256i od_mm256_srli_epi32(od_m256i a, int c) {
  od_m256i r;
  r.lo = _mm_srli_epi32(a.lo, c);
  r.hi = _mm_srli_epi32(a.hi, c);
  return r;
}
// this function performs precise calculations void PreOver_SSE2(void* dest, const void* source1, const void* source2, size_t size) { static const size_t stride = sizeof(__m128i)*4; static const u32 PSD = 64; static const __m128i round = _mm_set1_epi16(128); static const __m128i lomask = _mm_set1_epi32(0x00FF00FF); assert(source1 != NULL && source2 != NULL && dest != NULL); assert(size % stride == 0); const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1); const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2); __m128i* dest128 = reinterpret_cast<__m128i*>(dest); __m128i d, s, a, rb, ag, t; // TODO: dynamic prefetch schedluing distance? needs to be optimized (R.N) for(size_t k = 0, length = size/stride; k < length; ++k) { // TODO: put prefetch between calculations?(R.N) _mm_prefetch(reinterpret_cast<const s8*>(source128_1+PSD), _MM_HINT_NTA); _mm_prefetch(reinterpret_cast<const s8*>(source128_2+PSD), _MM_HINT_NTA); // work on entire cacheline before next prefetch for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2) { // TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/ // TODO: load entire cacheline at the same time? are there enough registers? 32 bit mode (special compile for 64bit?) (R.N) s = _mm_load_si128(source128_1); // AABGGRR d = _mm_load_si128(source128_2); // AABGGRR // PRELERP(S, D) = S+D - ((S*D[A]+0x80)>>8)+(S*D[A]+0x80))>>8 // T = S*D[A]+0x80 => PRELERP(S,D) = S+D - ((T>>8)+T)>>8 // set alpha to lo16 from dest_ a = _mm_srli_epi32(d, 24); // 000000AA rb = _mm_slli_epi32(a, 16); // 00AA0000 a = _mm_or_si128(rb, a); // 00AA00AA rb = _mm_and_si128(lomask, s); // 00BB00RR rb = _mm_mullo_epi16(rb, a); // BBBBRRRR rb = _mm_add_epi16(rb, round); // BBBBRRRR t = _mm_srli_epi16(rb, 8); // 00BB00RR t = _mm_add_epi16(t, rb); rb = _mm_srli_epi16(t, 8); ag = _mm_srli_epi16(s, 8); // 00AA00GG ag = _mm_mullo_epi16(ag, a); // AAAAGGGG ag = _mm_add_epi16(ag, round); t = _mm_srli_epi16(ag, 8); t = _mm_add_epi16(t, ag); ag = _mm_andnot_si128(lomask, t); // AA00GG00 rb = _mm_or_si128(rb, ag); // AABGGRR pack rb = _mm_sub_epi8(s, rb); // sub S-[(D[A]*S)/255] d = _mm_add_epi8(d, rb); // add D+[S-(D[A]*S)/255] _mm_store_si128(dest128, d); } } }
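/*
 * Scalar per-channel sketch (not from the original source) of the PRELERP
 * blend computed above: result = D + S - (S * Da) / 255, i.e. the source
 * attenuated by (1 - destination alpha) and added to the destination, where
 * the division by 255 uses the usual (t + (t >> 8)) >> 8 form with a +128
 * rounding bias.
 */
#include <stdint.h>

static inline uint8_t prelerp_channel(uint8_t s, uint8_t d, uint8_t dest_alpha)
{
    unsigned t = (unsigned)s * dest_alpha + 128;
    unsigned s_scaled = (t + (t >> 8)) >> 8; /* approximately (s * dest_alpha) / 255 */
    return (uint8_t)(d + s - s_scaled);
}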
OD_SIMD_INLINE __m128i od_unbiased_rshift_epi32(__m128i a, int b) { return _mm_srai_epi32(_mm_add_epi32(_mm_srli_epi32(a, 32 - b), a), b); }
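// Scalar sketch of what od_unbiased_rshift_epi32 computes per 32-bit lane: a
// right shift that rounds toward zero instead of toward negative infinity,
// i.e. a truncating division by 2^b. The bias term (uint32_t)a >> (32 - b) is
// nonzero only for negative inputs. The helper name is ours; valid for
// 1 <= b <= 31 and it assumes arithmetic >> on negative ints, matching the
// psrad-based vector version.
#include <cstdint>
#include <cstdio>

static int32_t unbiased_rshift32(int32_t a, int b) {
    return (int32_t)(((uint32_t)a >> (32 - b)) + (uint32_t)a) >> b;
}

int main() {
    // Plain >> floors: -1 >> 4 == -1; the unbiased form truncates: -1 / 16 == 0.
    std::printf("%d %d\n", -1 >> 4, unbiased_rshift32(-1, 4));    // -1 0
    std::printf("%d %d\n", -17 >> 4, unbiased_rshift32(-17, 4));  // -2 -1
    return 0;
}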
HashReturn Update(hashState *state, const BitSequence *data, DataLength databitlen)
{
  int r;
  __m128i x0, x1, x2, x3, x4, x5, x6, x7;
  __m128i y0, y1, y2, y3;

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];

  while (databitlen >= 8) {
    x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, (int) (unsigned int) *data));
    data += 1;
    databitlen -= 8;

    for (r = 0; r < CUBEHASH_ROUNDS; ++r) {
      x4 = _mm_add_epi32(x0, x4);
      x5 = _mm_add_epi32(x1, x5);
      x6 = _mm_add_epi32(x2, x6);
      x7 = _mm_add_epi32(x3, x7);
      y0 = x2;
      y1 = x3;
      y2 = x0;
      y3 = x1;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0, 7), _mm_srli_epi32(y0, 25));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1, 7), _mm_srli_epi32(y1, 25));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2, 7), _mm_srli_epi32(y2, 25));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3, 7), _mm_srli_epi32(y3, 25));
      x0 = _mm_xor_si128(x0, x4);
      x1 = _mm_xor_si128(x1, x5);
      x2 = _mm_xor_si128(x2, x6);
      x3 = _mm_xor_si128(x3, x7);
      x4 = _mm_shuffle_epi32(x4, 0x4e);
      x5 = _mm_shuffle_epi32(x5, 0x4e);
      x6 = _mm_shuffle_epi32(x6, 0x4e);
      x7 = _mm_shuffle_epi32(x7, 0x4e);
      x4 = _mm_add_epi32(x0, x4);
      x5 = _mm_add_epi32(x1, x5);
      x6 = _mm_add_epi32(x2, x6);
      x7 = _mm_add_epi32(x3, x7);
      y0 = x1;
      y1 = x0;
      y2 = x3;
      y3 = x2;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0, 11), _mm_srli_epi32(y0, 21));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1, 11), _mm_srli_epi32(y1, 21));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2, 11), _mm_srli_epi32(y2, 21));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3, 11), _mm_srli_epi32(y3, 21));
      x0 = _mm_xor_si128(x0, x4);
      x1 = _mm_xor_si128(x1, x5);
      x2 = _mm_xor_si128(x2, x6);
      x3 = _mm_xor_si128(x3, x7);
      x4 = _mm_shuffle_epi32(x4, 0xb1);
      x5 = _mm_shuffle_epi32(x5, 0xb1);
      x6 = _mm_shuffle_epi32(x6, 0xb1);
      x7 = _mm_shuffle_epi32(x7, 0xb1);
    }
  }

  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;

  if (databitlen > 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    state->pos += databitlen;
  }

  return SUCCESS;
}
static inline __m128 gen_05(void) { __m128i ones = (__m128i)gen_ones(); return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 26), 24); }
void CubeHash::transform( int r )
{
#ifdef __SSE2__
    __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    __m128i y0, y1, y2, y3;

    x0 = m_x[0]; x1 = m_x[1]; x2 = m_x[2]; x3 = m_x[3];
    x4 = m_x[4]; x5 = m_x[5]; x6 = m_x[6]; x7 = m_x[7];

    for( ; r > 0; --r )
    {
        x4 = _mm_add_epi32( x0, x4 );
        x5 = _mm_add_epi32( x1, x5 );
        x6 = _mm_add_epi32( x2, x6 );
        x7 = _mm_add_epi32( x3, x7 );
        y0 = x2; y1 = x3; y2 = x0; y3 = x1;
        x0 = _mm_xor_si128( _mm_slli_epi32(y0,7), _mm_srli_epi32(y0,25) );
        x1 = _mm_xor_si128( _mm_slli_epi32(y1,7), _mm_srli_epi32(y1,25) );
        x2 = _mm_xor_si128( _mm_slli_epi32(y2,7), _mm_srli_epi32(y2,25) );
        x3 = _mm_xor_si128( _mm_slli_epi32(y3,7), _mm_srli_epi32(y3,25) );
        x0 = _mm_xor_si128( x0, x4 );
        x1 = _mm_xor_si128( x1, x5 );
        x2 = _mm_xor_si128( x2, x6 );
        x3 = _mm_xor_si128( x3, x7 );
        x4 = _mm_shuffle_epi32( x4, 0x4e );
        x5 = _mm_shuffle_epi32( x5, 0x4e );
        x6 = _mm_shuffle_epi32( x6, 0x4e );
        x7 = _mm_shuffle_epi32( x7, 0x4e );
        x4 = _mm_add_epi32( x0, x4 );
        x5 = _mm_add_epi32( x1, x5 );
        x6 = _mm_add_epi32( x2, x6 );
        x7 = _mm_add_epi32( x3, x7 );
        y0 = x1; y1 = x0; y2 = x3; y3 = x2;
        x0 = _mm_xor_si128( _mm_slli_epi32(y0,11), _mm_srli_epi32(y0,21) );
        x1 = _mm_xor_si128( _mm_slli_epi32(y1,11), _mm_srli_epi32(y1,21) );
        x2 = _mm_xor_si128( _mm_slli_epi32(y2,11), _mm_srli_epi32(y2,21) );
        x3 = _mm_xor_si128( _mm_slli_epi32(y3,11), _mm_srli_epi32(y3,21) );
        x0 = _mm_xor_si128( x0, x4 );
        x1 = _mm_xor_si128( x1, x5 );
        x2 = _mm_xor_si128( x2, x6 );
        x3 = _mm_xor_si128( x3, x7 );
        x4 = _mm_shuffle_epi32( x4, 0xb1 );
        x5 = _mm_shuffle_epi32( x5, 0xb1 );
        x6 = _mm_shuffle_epi32( x6, 0xb1 );
        x7 = _mm_shuffle_epi32( x7, 0xb1 );
    }

    m_x[0] = x0; m_x[1] = x1; m_x[2] = x2; m_x[3] = x3;
    m_x[4] = x4; m_x[5] = x5; m_x[6] = x6; m_x[7] = x7;
#else // non SSE2
    int i;
    uint32_t y[16];

    for( ; r > 0; --r )
    {
        for( i = 0; i < 16; ++i ) m_x[i + 16] += m_x[i];
        for( i = 0; i < 16; ++i ) y[i ^ 8] = m_x[i];
        for( i = 0; i < 16; ++i ) m_x[i] = ROTATE(y[i],7);
        for( i = 0; i < 16; ++i ) m_x[i] ^= m_x[i + 16];
        for( i = 0; i < 16; ++i ) y[i ^ 2] = m_x[i + 16];
        for( i = 0; i < 16; ++i ) m_x[i + 16] = y[i];
        for( i = 0; i < 16; ++i ) m_x[i + 16] += m_x[i];
        for( i = 0; i < 16; ++i ) y[i ^ 4] = m_x[i];
        for( i = 0; i < 16; ++i ) m_x[i] = ROTATE(y[i],11);
        for( i = 0; i < 16; ++i ) m_x[i] ^= m_x[i + 16];
        for( i = 0; i < 16; ++i ) y[i ^ 1] = m_x[i + 16];
        for( i = 0; i < 16; ++i ) m_x[i + 16] = y[i];
    }
#endif
}
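// Small sketch of the two shuffle constants used in the CubeHash rounds above:
// _mm_shuffle_epi32(x, 0x4e) swaps the two 64-bit halves of a register and
// 0xb1 swaps adjacent 32-bit words, which appears to correspond to the
// xor-with-2 / xor-with-1 index permutations of the scalar path. Demo only.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    __m128i v = _mm_set_epi32(3, 2, 1, 0);          // lanes low..high: 0 1 2 3
    __m128i swap64 = _mm_shuffle_epi32(v, 0x4e);    // -> 2 3 0 1
    __m128i swap32 = _mm_shuffle_epi32(v, 0xb1);    // -> 1 0 3 2
    uint32_t a[4], b[4];
    _mm_storeu_si128((__m128i *)a, swap64);
    _mm_storeu_si128((__m128i *)b, swap32);
    std::printf("0x4e: %u %u %u %u\n", a[0], a[1], a[2], a[3]);
    std::printf("0xb1: %u %u %u %u\n", b[0], b[1], b[2], b[3]);
    return 0;
}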
static inline __m128 gen_abs_mask(void) { __m128i x = _mm_setzero_si128(); __m128i ones = _mm_cmpeq_epi32(x, x); return (__m128)_mm_srli_epi32 (_mm_slli_epi32(ones, 1), 1); }
__m128i aes_ssse3_encrypt(__m128i B, const __m128i* keys, size_t rounds)
{
    // Note: mm_xor3, low_nibs, k_ipt1/k_ipt2, k_inv1/k_inv2, sb1u/sb1t, sr[]
    // and mc_forward[] are file-scope helpers and constants defined elsewhere
    // in the original source; only the tables used exclusively here appear below.
    const __m128i sb2u = _mm_set_epi32(
        0x5EB7E955, 0xBC982FCD, 0xE27A93C6, 0x0B712400);
    const __m128i sb2t = _mm_set_epi32(
        0xC2A163C8, 0xAB82234A, 0x69EB8840, 0x0AE12900);

    const __m128i sbou = _mm_set_epi32(
        0x15AABF7A, 0xC502A878, 0xD0D26D17, 0x6FBDC700);
    const __m128i sbot = _mm_set_epi32(
        0x8E1E90D1, 0x412B35FA, 0xCFE474A5, 0x5FBB6A00);

    const __m128i mc_backward[4] = {
        _mm_set_epi32(0x0E0D0C0F, 0x0A09080B, 0x06050407, 0x02010003),
        _mm_set_epi32(0x0A09080B, 0x06050407, 0x02010003, 0x0E0D0C0F),
        _mm_set_epi32(0x06050407, 0x02010003, 0x0E0D0C0F, 0x0A09080B),
        _mm_set_epi32(0x02010003, 0x0E0D0C0F, 0x0A09080B, 0x06050407),
    };

    B = mm_xor3(_mm_shuffle_epi8(k_ipt1, _mm_and_si128(low_nibs, B)),
                _mm_shuffle_epi8(k_ipt2,
                                 _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4)),
                _mm_loadu_si128(keys));

    for(size_t r = 1; ; ++r)
    {
        const __m128i K = _mm_loadu_si128(keys + r);

        __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);

        B = _mm_and_si128(low_nibs, B);

        __m128i t2 = _mm_shuffle_epi8(k_inv2, B);

        B = _mm_xor_si128(B, t);

        __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
        __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, B));
        __m128i t5 = _mm_xor_si128(B, _mm_shuffle_epi8(k_inv1, t3));
        __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

        if(r == rounds)
        {
            B = _mm_shuffle_epi8(
                mm_xor3(_mm_shuffle_epi8(sbou, t5),
                        _mm_shuffle_epi8(sbot, t6),
                        K),
                sr[r % 4]);
            return B;
        }

        __m128i t7 = mm_xor3(_mm_shuffle_epi8(sb1t, t6),
                             _mm_shuffle_epi8(sb1u, t5),
                             K);

        __m128i t8 = mm_xor3(_mm_shuffle_epi8(sb2t, t6),
                             _mm_shuffle_epi8(sb2u, t5),
                             _mm_shuffle_epi8(t7, mc_forward[r % 4]));

        B = mm_xor3(_mm_shuffle_epi8(t8, mc_forward[r % 4]),
                    _mm_shuffle_epi8(t7, mc_backward[r % 4]),
                    t8);
    }
}
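// Sketch of the nibble-splitting idiom the encrypt routine above relies on.
// It assumes low_nibs is _mm_set1_epi8(0x0f), as in the surrounding file (not
// shown here): the low nibbles index one 16-entry pshufb table, while the high
// nibbles, masked first and then moved down with _mm_srli_epi32, index another.
// Demonstration only, not the AES code itself.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    const __m128i low_nibs = _mm_set1_epi8(0x0f);
    __m128i B  = _mm_set1_epi8((char)0xA7);
    __m128i lo = _mm_and_si128(low_nibs, B);                        // 0x07 per byte
    __m128i hi = _mm_srli_epi32(_mm_andnot_si128(low_nibs, B), 4);  // 0x0A per byte
    uint8_t l[16], h[16];
    _mm_storeu_si128((__m128i *)l, lo);
    _mm_storeu_si128((__m128i *)h, hi);
    std::printf("lo=%x hi=%x\n", l[0], h[0]);   // lo=7 hi=a
    return 0;
}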
static inline __m128 gen_05(void) { __m128i x = _mm_setzero_si128(); __m128i ones = _mm_cmpeq_epi32(x, x); return (__m128)_mm_slli_epi32 (_mm_srli_epi32(ones, 26), 24); }
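// Quick check of the register-only constants built by gen_05 and gen_abs_mask
// above: all-ones >> 26 << 24 yields 0x3F000000 (the IEEE-754 bit pattern of
// 0.5f), and all-ones << 1 >> 1 yields 0x7FFFFFFF (the sign-clearing mask used
// for fabs). Verification sketch only; memcpy stands in for the vector cast.
#include <cstdint>
#include <cstring>
#include <cstdio>

int main() {
    uint32_t ones = 0xFFFFFFFFu;
    uint32_t half_bits = (ones >> 26) << 24;   // 0x3F000000
    uint32_t abs_mask  = (ones << 1) >> 1;     // 0x7FFFFFFF
    float half;
    std::memcpy(&half, &half_bits, sizeof half);
    std::printf("0x%08X -> %f, abs mask 0x%08X\n", half_bits, half, abs_mask);
    return 0;
}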
uint32_t
br_chacha20_sse2_run(const void *key,
    const void *iv, uint32_t cc, void *data, size_t len)
{
    unsigned char *buf;
    uint32_t ivtmp[4];
    __m128i kw0, kw1;
    __m128i iw, cw;
    __m128i one;
    static const uint32_t CW[] = {
        0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
    };

    buf = data;
    kw0 = _mm_loadu_si128(key);
    kw1 = _mm_loadu_si128((const void *)((const unsigned char *)key + 16));
    ivtmp[0] = cc;
    memcpy(ivtmp + 1, iv, 12);
    iw = _mm_loadu_si128((const void *)ivtmp);
    cw = _mm_loadu_si128((const void *)CW);
    one = _mm_set_epi32(0, 0, 0, 1);

    while (len > 0) {
        /*
         * sj contains state words 4*j to 4*j+3.
         */
        __m128i s0, s1, s2, s3;
        int i;

        s0 = cw;
        s1 = kw0;
        s2 = kw1;
        s3 = iw;
        for (i = 0; i < 10; i ++) {
            /*
             * Even round is straightforward application on
             * the state words.
             */
            s0 = _mm_add_epi32(s0, s1);
            s3 = _mm_xor_si128(s3, s0);
            s3 = _mm_or_si128(
                _mm_slli_epi32(s3, 16), _mm_srli_epi32(s3, 16));
            s2 = _mm_add_epi32(s2, s3);
            s1 = _mm_xor_si128(s1, s2);
            s1 = _mm_or_si128(
                _mm_slli_epi32(s1, 12), _mm_srli_epi32(s1, 20));
            s0 = _mm_add_epi32(s0, s1);
            s3 = _mm_xor_si128(s3, s0);
            s3 = _mm_or_si128(
                _mm_slli_epi32(s3, 8), _mm_srli_epi32(s3, 24));
            s2 = _mm_add_epi32(s2, s3);
            s1 = _mm_xor_si128(s1, s2);
            s1 = _mm_or_si128(
                _mm_slli_epi32(s1, 7), _mm_srli_epi32(s1, 25));

            /*
             * For the odd round, we must rotate some state
             * words so that the computations apply on the
             * right combinations of words.
             */
            s1 = _mm_shuffle_epi32(s1, 0x39);
            s2 = _mm_shuffle_epi32(s2, 0x4E);
            s3 = _mm_shuffle_epi32(s3, 0x93);
            s0 = _mm_add_epi32(s0, s1);
            s3 = _mm_xor_si128(s3, s0);
            s3 = _mm_or_si128(
                _mm_slli_epi32(s3, 16), _mm_srli_epi32(s3, 16));
            s2 = _mm_add_epi32(s2, s3);
            s1 = _mm_xor_si128(s1, s2);
            s1 = _mm_or_si128(
                _mm_slli_epi32(s1, 12), _mm_srli_epi32(s1, 20));
            s0 = _mm_add_epi32(s0, s1);
            s3 = _mm_xor_si128(s3, s0);
            s3 = _mm_or_si128(
                _mm_slli_epi32(s3, 8), _mm_srli_epi32(s3, 24));
            s2 = _mm_add_epi32(s2, s3);
            s1 = _mm_xor_si128(s1, s2);
            s1 = _mm_or_si128(
                _mm_slli_epi32(s1, 7), _mm_srli_epi32(s1, 25));

            /*
             * After the odd round, we rotate back the values
             * to undo the rotate at the start of the odd round.
             */
            s1 = _mm_shuffle_epi32(s1, 0x93);
            s2 = _mm_shuffle_epi32(s2, 0x4E);
            s3 = _mm_shuffle_epi32(s3, 0x39);
        }

        /*
         * Addition with the initial state.
         */
        s0 = _mm_add_epi32(s0, cw);
        s1 = _mm_add_epi32(s1, kw0);
        s2 = _mm_add_epi32(s2, kw1);
        s3 = _mm_add_epi32(s3, iw);

        /*
         * Increment block counter.
         */
        iw = _mm_add_epi32(iw, one);

        /*
         * XOR final state with the data.
         */
        if (len < 64) {
            unsigned char tmp[64];
            size_t u;

            _mm_storeu_si128((void *)(tmp + 0), s0);
            _mm_storeu_si128((void *)(tmp + 16), s1);
            _mm_storeu_si128((void *)(tmp + 32), s2);
            _mm_storeu_si128((void *)(tmp + 48), s3);
            for (u = 0; u < len; u ++) {
                buf[u] ^= tmp[u];
            }
            break;
        } else {
            __m128i b0, b1, b2, b3;

            b0 = _mm_loadu_si128((const void *)(buf + 0));
            b1 = _mm_loadu_si128((const void *)(buf + 16));
            b2 = _mm_loadu_si128((const void *)(buf + 32));
            b3 = _mm_loadu_si128((const void *)(buf + 48));
            b0 = _mm_xor_si128(b0, s0);
            b1 = _mm_xor_si128(b1, s1);
            b2 = _mm_xor_si128(b2, s2);
            b3 = _mm_xor_si128(b3, s3);
            _mm_storeu_si128((void *)(buf + 0), b0);
            _mm_storeu_si128((void *)(buf + 16), b1);
            _mm_storeu_si128((void *)(buf + 32), b2);
            _mm_storeu_si128((void *)(buf + 48), b3);
            buf += 64;
            len -= 64;
        }
    }

    /*
     * _mm_extract_epi32() requires SSE4.1. We prefer to stick to
     * raw SSE2, thus we use _mm_extract_epi16().
     */
    return (uint32_t)_mm_extract_epi16(iw, 0)
        | ((uint32_t)_mm_extract_epi16(iw, 1) << 16);
}
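// Scalar sketch of the ChaCha quarter-round that each SSE2 column step above
// performs on four columns in parallel: add, xor, then left rotations by 16,
// 12, 8 and 7 bits, with each rotation written as (x << n) | (x >> (32 - n))
// just like the _mm_slli_epi32/_mm_srli_epi32 pairs. Illustration only; the
// helper names are ours.
#include <cstdint>

static inline uint32_t rotl32(uint32_t x, int n) {
    return (x << n) | (x >> (32 - n));
}

static void chacha_quarter_round(uint32_t &a, uint32_t &b, uint32_t &c, uint32_t &d) {
    a += b; d ^= a; d = rotl32(d, 16);
    c += d; b ^= c; b = rotl32(b, 12);
    a += b; d ^= a; d = rotl32(d, 8);
    c += d; b ^= c; b = rotl32(b, 7);
}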
sse2_tests (void)
{
  /* psraw */
  c128.v = _mm_srai_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_srai_epi16", c128);
  c128.v = _mm_sra_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_sra_epi16", c128);

  /* psrad */
  c128.v = _mm_srai_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_srai_epi32", c128);
  c128.v = _mm_sra_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_sra_epi32", c128);

  /* psrlw */
  c128.v = _mm_srli_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_srli_epi16", c128);
  c128.v = _mm_srl_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_srl_epi16", c128);

  /* psrld */
  c128.v = _mm_srli_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_srli_epi32", c128);
  c128.v = _mm_srl_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_srl_epi32", c128);

  /* psrlq */
  c128.v = _mm_srli_epi64 (m128_64, SHIFT);
  dump128_64 (buf, "_mm_srli_epi64", c128);
  c128.v = _mm_srl_epi64 (m128_64, s128);
  dump128_64 (buf, "_mm_srl_epi64", c128);

  /* psrldq */
  c128.v = _mm_srli_si128 (m128_128, SHIFT);
  dump128_128 (buf, "_mm_srli_si128 (byte shift)", c128);

  /* psllw */
  c128.v = _mm_slli_epi16 (m128_16, SHIFT);
  dump128_16 (buf, "_mm_slli_epi16", c128);
  c128.v = _mm_sll_epi16 (m128_16, s128);
  dump128_16 (buf, "_mm_sll_epi16", c128);

  /* pslld */
  c128.v = _mm_slli_epi32 (m128_32, SHIFT);
  dump128_32 (buf, "_mm_slli_epi32", c128);
  c128.v = _mm_sll_epi32 (m128_32, s128);
  dump128_32 (buf, "_mm_sll_epi32", c128);

  /* psllq */
  c128.v = _mm_slli_epi64 (m128_64, SHIFT);
  dump128_64 (buf, "_mm_slli_epi64", c128);
  c128.v = _mm_sll_epi64 (m128_64, s128);
  dump128_64 (buf, "_mm_sll_epi64", c128);

  /* pslldq */
  c128.v = _mm_slli_si128 (m128_128, SHIFT);
  dump128_128 (buf, "_mm_slli_si128 (byte shift)", c128);

  /* Shuffle constant 0x1b == 0b_00_01_10_11, e.g. swap words: ABCD => DCBA. */
  /* pshufd */
  c128.v = _mm_shuffle_epi32 (m128_128, 0x1b);
  dump128_32 (buf, "_mm_shuffle_epi32", c128);
  /* pshuflw */
  c128.v = _mm_shufflelo_epi16 (m128_128, 0x1b);
  dump128_16 (buf, "_mm_shufflelo_epi16", c128);
  /* pshufhw */
  c128.v = _mm_shufflehi_epi16 (m128_128, 0x1b);
  dump128_16 (buf, "_mm_shufflehi_epi16", c128);
}
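// The test above dumps both the arithmetic (_mm_srai_*/_mm_sra_*) and logical
// (_mm_srli_*/_mm_srl_*) shift families; the difference only shows up on
// negative lanes. A minimal sketch, assuming nothing beyond SSE2: srai
// replicates the sign bit, srli shifts in zeros.
#include <emmintrin.h>
#include <cstdint>
#include <cstdio>

int main() {
    __m128i v = _mm_set1_epi32(-16);
    int32_t a[4];
    uint32_t l[4];
    _mm_storeu_si128((__m128i *)a, _mm_srai_epi32(v, 2));  // -16 >> 2 == -4
    _mm_storeu_si128((__m128i *)l, _mm_srli_epi32(v, 2));  // 0xFFFFFFF0 >> 2 == 0x3FFFFFFC
    std::printf("srai: %d  srli: 0x%08X\n", a[0], l[0]);
    return 0;
}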