static inline __m128i exclusion_byte_SSE2(const __m128i& sc, const __m128i& dc, const __m128i&, __m128i&) { __m128i tmp1 = _mm_mullo_epi16(_mm_set1_epi32(255), sc); // 255 * sc __m128i tmp2 = _mm_mullo_epi16(_mm_set1_epi32(255), dc); // 255 * dc tmp1 = _mm_add_epi32(tmp1, tmp2); tmp2 = _mm_mullo_epi16(sc, dc); // sc * dc tmp2 = _mm_slli_epi32(tmp2, 1); // 2 * sc * dc __m128i r = _mm_sub_epi32(tmp1, tmp2); return clamp_div255round_SSE2(r); }
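For reference, a scalar sketch of the per-channel exclusion formula computed above, using Skia's usual div255 rounding (a hypothetical helper, not part of the original file; for byte inputs the expression already stays within [0, 255*255], so no extra clamp is needed here):

static inline int exclusion_scalar(int sc, int dc) {
    int r = 255 * sc + 255 * dc - 2 * sc * dc;  /* in [0, 255*255] for sc, dc in [0, 255] */
    r += 128;                                   /* rounding bias */
    return (r + (r >> 8)) >> 8;                 /* divide by 255, rounded */
}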
/* doubling (multiply by x over GF(2^128); the reduction constant 135 = 0x87 corresponds to x^128 + x^7 + x^2 + x + 1) */ __inline__ static void mul2(__m128i in, __m128i *out) { const __m128i shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15); const __m128i mask = _mm_set_epi32(135, 1, 1, 1); __m128i intmp = _mm_shuffle_epi8(in, shuf); __m128i tmp = _mm_srai_epi32(intmp, 31); tmp = _mm_and_si128(tmp, mask); tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3)); *out = _mm_slli_epi32(intmp, 1); *out = _mm_xor_si128(*out, tmp); *out = _mm_shuffle_epi8(*out, shuf); }
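The same doubling in scalar form, a sketch assuming the 16 input bytes are treated as one big-endian 128-bit integer (which is what the byte shuffle above achieves; hi holds the most significant half):

static inline void mul2_scalar(const uint64_t in[2], uint64_t out[2]) {
    uint64_t hi = in[0], lo = in[1];        /* big-endian half order */
    uint64_t carry = hi >> 63;              /* bit 127 of the input */
    out[0] = (hi << 1) | (lo >> 63);        /* shift the whole value left by one */
    out[1] = (lo << 1) ^ (carry * 0x87);    /* reduce modulo x^128 + x^7 + x^2 + x + 1 */
}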
__m256 exp_256( const __m256& x) { //! Clip the value __m256 y = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(88.3762626647949f)), _mm256_set1_ps(-88.3762626647949f)); //! Express exp(x) as exp(g + n * log(2)) __m256 fx = y * _mm256_set1_ps(1.44269504088896341f) + _mm256_set1_ps(0.5f); //! Floor const __m256 tmp = _mm256_round_ps(fx, _MM_FROUND_TO_ZERO); //! If greater, subtract 1 const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS), _mm256_set1_ps(1.f)); fx = tmp - mask; y -= fx * _mm256_set1_ps(0.693359375 - 2.12194440e-4); const __m256 z = y * y; const __m256 t = (((((_mm256_set1_ps(1.9875691500E-4) * y + _mm256_set1_ps(1.3981999507E-3)) * y + _mm256_set1_ps(8.3334519073E-3)) * y + _mm256_set1_ps(4.1665795894E-2)) * y + _mm256_set1_ps(1.6666665459E-1)) * y + _mm256_set1_ps(5.0000001201E-1)) * z + y + _mm256_set1_ps(1.f); //! Build 2^n (split into two SSE arrays, since the AVX2 integer equivalents aren't available). const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_castps256_ps128(fx)), _mm_set1_epi32(0x7f)); const __m128i emm1 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_extractf128_ps(fx, 1)), _mm_set1_epi32(0x7f)); fx = _mm256_castps128_ps256(_mm_castsi128_ps(_mm_slli_epi32(emm0, 23))); fx = _mm256_insertf128_ps(fx, _mm_castsi128_ps(_mm_slli_epi32(emm1, 23)), 1); //! Return the result return t * fx; }
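A scalar sketch of the same cephes-style scheme for one lane, using standard libm helpers (illustrative only): exp(x) = 2^n * exp(g), with n = round(x * log2(e)) and the constant ln 2 split in two parts for precision, exactly as the vector versions here do.

static inline float exp_scalar(float x) {
    x = fminf(fmaxf(x, -88.3762626647949f), 88.3762626647949f); /* clip as above */
    float n = floorf(x * 1.44269504088896341f + 0.5f);          /* n = round(x / ln 2) */
    x -= n * 0.693359375f;          /* subtract n*ln2, high part */
    x -= n * -2.12194440e-4f;       /* ...and low part */
    float z = x * x;
    float y = 1.9875691500e-4f;     /* same minimax polynomial as above */
    y = y * x + 1.3981999507e-3f;
    y = y * x + 8.3334519073e-3f;
    y = y * x + 4.1665795894e-2f;
    y = y * x + 1.6666665459e-1f;
    y = y * x + 5.0000001201e-1f;
    y = y * z + x + 1.0f;
    return ldexpf(y, (int)n);       /* multiply by 2^n via the exponent field */
}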
// Portable version overlay_byte() is in SkXfermode.cpp. static inline __m128i overlay_byte_SSE2(const __m128i& sc, const __m128i& dc, const __m128i& sa, const __m128i& da) { __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da); __m128i tmp1 = _mm_mullo_epi16(sc, ida); __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa); __m128i tmp2 = _mm_mullo_epi16(dc, isa); __m128i tmp = _mm_add_epi32(tmp1, tmp2); __m128i cmp = _mm_cmpgt_epi32(_mm_slli_epi32(dc, 1), da); __m128i rc1 = _mm_slli_epi32(sc, 1); // 2 * sc rc1 = Multiply32_SSE2(rc1, dc); // *dc __m128i rc2 = _mm_mullo_epi16(sa, da); // sa * da __m128i tmp3 = _mm_slli_epi32(_mm_sub_epi32(da, dc), 1); // 2 * (da - dc) tmp3 = Multiply32_SSE2(tmp3, _mm_sub_epi32(sa, sc)); // * (sa - sc) rc2 = _mm_sub_epi32(rc2, tmp3); __m128i rc = _mm_or_si128(_mm_andnot_si128(cmp, rc1), _mm_and_si128(cmp, rc2)); return clamp_div255round_SSE2(_mm_add_epi32(rc, tmp)); }
__m128 exp_ps(__m128 x) { typedef __m128 v4sf; typedef __m128i v4si; v4sf tmp = _mm_setzero_ps(), fx; v4si emm0; v4sf one = constants::ps_1.ps; x = _mm_min_ps(x, constants::exp_hi.ps); x = _mm_max_ps(x, constants::exp_lo.ps); /* express exp(x) as exp(g + n*log(2)) */ fx = _mm_mul_ps(x, constants::cephes_LOG2EF.ps); fx = _mm_add_ps(fx, constants::ps_0p5.ps); /* how to perform a floorf with SSE: just below */ emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); /* if greater, subtract 1 */ v4sf mask = _mm_cmpgt_ps(tmp, fx); mask = _mm_and_ps(mask, one); fx = _mm_sub_ps(tmp, mask); tmp = _mm_mul_ps(fx, constants::cephes_exp_C1.ps); v4sf z = _mm_mul_ps(fx, constants::cephes_exp_C2.ps); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x,x); v4sf y = constants::cephes_exp_p0.ps; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p1.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p2.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p3.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p4.ps); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, constants::cephes_exp_p5.ps); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, one); /* build 2^n */ emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, constants::pi32_0x7f.pi); emm0 = _mm_slli_epi32(emm0, 23); v4sf pow2n = _mm_castsi128_ps(emm0); y = _mm_mul_ps(y, pow2n); return y; }
SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws) { X86SIMDValue x86Result = { 0 }; X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value); X86SIMDValue temp, temp2; X86SIMDValue two_31_f4, two_31_i4; int mask = 0; // any lanes < 0 ? temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value); mask = _mm_movemask_ps(temp.m128_value); // negative value are out of range, caller should throw Range Error if (mask) { throws = true; return X86SIMDValue::ToSIMDValue(x86Result); } // CVTTPS2DQ does a range check over signed range [-2^31, 2^31-1], so will fail to convert values >= 2^31. // To fix this, subtract 2^31 from values >= 2^31, do CVTTPS2DQ, then add 2^31 back. _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value); // any lanes >= 2^31 ? temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value); // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value); // subtract 2^31 from lanes >= 2^31, unchanged otherwise. v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value); // CVTTPS2DQ x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value); // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before value adjustment) temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ? mask = _mm_movemask_ps(temp2.m128_value); if (mask) { throws = true; return X86SIMDValue::ToSIMDValue(x86Result); } // we pass range check // add 2^31 values back to adjusted values. // Use first bit from the 2^31 float mask (0x4f000...0 << 1) // and result with 2^31 int mask (0x8000..0) setting first bit to zero if lane hasn't been adjusted _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value); two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1); two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value); // add 2^31 back to adjusted values // Note at this point all values are in [0, 2^31-1]. Adding 2^31 is guaranteed not to overflow. x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value); return X86SIMDValue::ToSIMDValue(x86Result); }
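The per-lane logic of the function above, as a scalar sketch (float_to_uint32 is a hypothetical helper; CVTTPS2DQ only covers the signed range, hence the 2^31 bias dance):

static inline uint32_t float_to_uint32(float f, bool *throws) {
    if (f < 0.0f) { *throws = true; return 0; }            /* negative: range error */
    bool biased = (f >= 2147483648.0f);                    /* lane >= 2^31 ? */
    if (biased) f -= 2147483648.0f;                        /* subtract 2^31 first */
    if (f >= 2147483648.0f) { *throws = true; return 0; }  /* originally >= 2^32 */
    return (uint32_t)(int32_t)f + (biased ? 0x80000000u : 0u); /* add 2^31 back */
}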
static inline __m128i difference_byte_SSE2(const __m128i& sc, const __m128i& dc, const __m128i& sa, const __m128i& da) { __m128i tmp1 = _mm_mullo_epi16(sc, da); __m128i tmp2 = _mm_mullo_epi16(dc, sa); __m128i tmp = SkMin32_SSE2(tmp1, tmp2); __m128i ret1 = _mm_add_epi32(sc, dc); __m128i ret2 = _mm_slli_epi32(SkDiv255Round_SSE2(tmp), 1); __m128i ret = _mm_sub_epi32(ret1, ret2); ret = clamp_signed_byte_SSE2(ret); return ret; }
void convert_le_d24x8_to_be_d24x8(void *dst, void *src, u32 row_length_in_texels, u32 num_rows) { const u32 num_pixels = row_length_in_texels * num_rows; verify(HERE), (num_pixels & 3) == 0; const auto num_iterations = (num_pixels >> 2); __m128i* dst_ptr = (__m128i*)dst; __m128i* src_ptr = (__m128i*)src; #if defined (_MSC_VER) || defined (__SSSE3__) if (LIKELY(utils::has_ssse3())) { const __m128i swap_mask = _mm_set_epi8 ( 0xF, 0xC, 0xD, 0xE, 0xB, 0x8, 0x9, 0xA, 0x7, 0x4, 0x5, 0x6, 0x3, 0x0, 0x1, 0x2 ); for (u32 n = 0; n < num_iterations; ++n) { const __m128i src_vector = _mm_loadu_si128(src_ptr); const __m128i shuffled_vector = _mm_shuffle_epi8(src_vector, swap_mask); _mm_stream_si128(dst_ptr, shuffled_vector); ++dst_ptr; ++src_ptr; } return; } #endif const __m128i mask1 = _mm_set1_epi32(0xFF00FF00); const __m128i mask2 = _mm_set1_epi32(0x00FF0000); const __m128i mask3 = _mm_set1_epi32(0x000000FF); for (u32 n = 0; n < num_iterations; ++n) { const __m128i src_vector = _mm_loadu_si128(src_ptr); const __m128i v1 = _mm_and_si128(src_vector, mask1); const __m128i v2 = _mm_and_si128(_mm_slli_epi32(src_vector, 16), mask2); const __m128i v3 = _mm_and_si128(_mm_srli_epi32(src_vector, 16), mask3); const __m128i shuffled_vector = _mm_or_si128(_mm_or_si128(v1, v2), v3); _mm_stream_si128(dst_ptr, shuffled_vector); ++dst_ptr; ++src_ptr; } }
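Both paths above implement the same per-texel operation: reverse the byte order of the 24-bit depth value and leave the stencil byte (bits 24..31) in place. A scalar sketch:

static inline u32 swap_d24x8(u32 v) {
    return (v & 0xFF00FF00)           /* stencil byte and middle depth byte stay */
         | ((v & 0x000000FF) << 16)   /* low depth byte -> high */
         | ((v & 0x00FF0000) >> 16);  /* high depth byte -> low */
}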
INLINE __m128 shade(ColorInterpNoPerspective const&, const SWR_TRIANGLE_DESC &work, WideVector<ColorInterpNoPerspective::NUM_ATTRIBUTES, __m128> const& pAttrs, BYTE*, BYTE*, UINT*) { // convert float to unorm __m128i vBlueI, vGreenI, vRedI, vAlpha; { vBlueI = vFloatToUnorm(get<2>(pAttrs)); vGreenI = vFloatToUnorm(get<1>(pAttrs)); vRedI = vFloatToUnorm(get<0>(pAttrs)); vAlpha = _mm_set1_epi32(0xff000000); } // pack __m128i vPixel = vBlueI; vGreenI = _mm_slli_epi32(vGreenI, 8); vRedI = _mm_slli_epi32(vRedI, 16); vPixel = _mm_or_si128(vPixel, vGreenI); vPixel = _mm_or_si128(vPixel, vRedI); vPixel = _mm_or_si128(vPixel, vAlpha); return _mm_castsi128_ps(vPixel); }
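The pack step above in scalar form, assuming vFloatToUnorm (defined elsewhere) maps a [0,1] float lane to an integer in 0..255 (a sketch, not the original helper):

static inline UINT pack_bgra8(UINT r, UINT g, UINT b) {
    return b | (g << 8) | (r << 16) | 0xff000000u;  /* alpha forced to 0xff */
}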
int haraka256256(unsigned char *hash, const unsigned char *msg) { // stuff we need int i, j; __m128i s[2], tmp, rcon; __m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0); // set initial round constant rcon = _mm_set_epi32(1,1,1,1); // initialize state to msg s[0] = _mm_load_si128(&((__m128i*)msg)[0]); s[1] = _mm_load_si128(&((__m128i*)msg)[1]); //printf("= input state =\n"); //printstate256(s[0], s[1]); for (i = 0; i < ROUNDS; ++i) { // aes round(s) for (j = 0; j < AES_PER_ROUND; ++j) { s[0] = _mm_aesenc_si128(s[0], rcon); s[1] = _mm_aesenc_si128(s[1], rcon); rcon = _mm_slli_epi32(rcon, 1); } //printf("= round %d : after aes layer =\n", i); //printstate256(s[0], s[1]); // mixing tmp = _mm_unpacklo_epi32(s[0], s[1]); s[1] = _mm_unpackhi_epi32(s[0], s[1]); s[0] = tmp; //printf("= round %d : after mix layer =\n", i); //printstate256(s[0], s[1]); } //printf("= output from permutation =\n"); //printstate256(s[0], s[1]); // xor message to get DM effect s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0])); s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1])); //printf("= after feed-forward =\n"); //printstate256(s[0], s[1]); // store result _mm_storeu_si128((__m128i*)hash, s[0]); _mm_storeu_si128((__m128i*)(hash + 16), s[1]); return 0; }
void store(uint16_t *p) const{ assert(((uintptr_t)p & 7) == 0);//assert aligned //_mm_packus_epi32 (pack with unsigned saturation) is not in SSE2 (2001) for some reason, requires SSE 4.1 (2007) //_mm_storel_epi64((__m128i*)p,_mm_packus_epi32 (a,a)); //a: AAAABBBBCCCCDDDD input vector //slli: AA__BB__CC__DD__ bitshift left by 16 //srli: __________AA__BB byteshift right by 10 //_or_: AA__BB__CCAADDBB OR together //shuf: AA__BB__AABBCCDD reshuffle low half: {[2], [0], [3], [1]} : 10 00 11 01 : 0x8D (I may have gotten this wrong) //storel: AABBCCDD store low half __m128i shifted = _mm_slli_epi32(vec,16); _mm_storel_epi64((__m128i*)p,_mm_shufflelo_epi16(_mm_or_si128(shifted,_mm_srli_si128(shifted,10)),0x8D)); }
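What the shuffle trick above ultimately stores, in scalar form (a sketch assuming vec holds four uint32 lanes whose values fit in 16 bits):

void store_scalar(uint16_t *p, const uint32_t v[4]) {
    for (int i = 0; i < 4; i++)
        p[i] = (uint16_t)v[i];  /* keep the low 16 bits of each lane */
}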
static void AddGreenToBlueAndRed(uint32_t* argb_data, int num_pixels) { const __m128i mask = _mm_set1_epi32(0x0000ff00); int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); const __m128i in_00g0 = _mm_and_si128(in, mask); // 00g0|00g0|... const __m128i in_0g00 = _mm_slli_epi32(in_00g0, 8); // 0g00|0g00|... const __m128i in_000g = _mm_srli_epi32(in_00g0, 8); // 000g|000g|... const __m128i in_0g0g = _mm_or_si128(in_0g00, in_000g); const __m128i out = _mm_add_epi8(in, in_0g0g); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // fallthrough and finish off with plain-C VP8LAddGreenToBlueAndRed_C(argb_data + i, num_pixels - i); }
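The per-pixel transform above, and presumably what the C fallback does for the leftovers: add green to the red and blue channels, wrapping modulo 256. A scalar sketch:

static void AddGreenToBlueAndRed_scalar(uint32_t *argb, int n) {
    int i;
    for (i = 0; i < n; i++) {
        const uint32_t g = (argb[i] >> 8) & 0xff;
        const uint32_t r = (((argb[i] >> 16) & 0xff) + g) & 0xff;
        const uint32_t b = ((argb[i] & 0xff) + g) & 0xff;
        argb[i] = (argb[i] & 0xff00ff00u) | (r << 16) | b;
    }
}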
inline FORCE_INLINE __m128 mm_cvtph_ps(__m128i x) { __m128 magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)113 << 23)); __m128i shift_exp = _mm_set1_epi32(0x7C00UL << 13); __m128i sign_mask = _mm_set1_epi32(0x8000U); __m128i mant_mask = _mm_set1_epi32(0x7FFF); __m128i exp_adjust = _mm_set1_epi32((127UL - 15UL) << 23); __m128i exp_adjust_nan = _mm_set1_epi32((127UL - 16UL) << 23); __m128i exp_adjust_denorm = _mm_set1_epi32(1UL << 23); __m128i zero = _mm_set1_epi16(0); __m128i exp, ret, ret_nan, ret_denorm, sign, mask0, mask1; x = _mm_unpacklo_epi16(x, zero); ret = _mm_and_si128(x, mant_mask); ret = _mm_slli_epi32(ret, 13); exp = _mm_and_si128(shift_exp, ret); ret = _mm_add_epi32(ret, exp_adjust); mask0 = _mm_cmpeq_epi32(exp, shift_exp); mask1 = _mm_cmpeq_epi32(exp, zero); ret_nan = _mm_add_epi32(ret, exp_adjust_nan); ret_denorm = _mm_add_epi32(ret, exp_adjust_denorm); ret_denorm = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(ret_denorm), magic)); sign = _mm_and_si128(x, sign_mask); sign = _mm_slli_epi32(sign, 16); ret = mm_blendv_ps(ret_nan, ret, mask0); ret = mm_blendv_ps(ret_denorm, ret, mask1); ret = _mm_or_si128(ret, sign); return _mm_castsi128_ps(ret); }
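A scalar reference for the same half-to-float conversion, covering the three cases the vector code blends between (normal, Inf/NaN, zero/denormal); a sketch using memcpy for the bit casts:

static inline float half_to_float(uint16_t h) {
    const uint32_t sign = (uint32_t)(h & 0x8000) << 16;
    const uint32_t em = h & 0x7fff;          /* exponent + mantissa */
    uint32_t bits;
    float out;
    if (em >= 0x7c00) {                      /* Inf/NaN: force exponent 255 */
        bits = sign | 0x7f800000u | ((uint32_t)(em & 0x3ff) << 13);
    } else if (em >= 0x0400) {               /* normal: rebias the exponent */
        bits = sign | ((em + ((127u - 15u) << 10)) << 13);
    } else {                                 /* zero/denormal: value is em * 2^-24 */
        float f = (float)em * (1.0f / 16777216.0f);
        memcpy(&bits, &f, 4);
        bits |= sign;
    }
    memcpy(&out, &bits, 4);
    return out;
}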
/** * Computes the various filters involved in CNS computation. * First, \c dX, blurX and blurX2 are computed horizontally from \c imgL, img, imgR and stored in \c currentIV. * Then, these intermediate values, the one from the previous line (\c previousIV) and the one from the line * 2 above (passed in \c currentIV) are used to compute sobelX, sobelY, gaussI and gaussI2A/B. The latter one * is floating point and separated into two halves. * * Also \c gaussI is stored in \c currentIV.gaussI (used for downsampling). */ ALWAYSINLINE static void filters(IntermediateValues& currentIV, const IntermediateValues& previousIV, __m128i& sobelX, __m128i& sobelY, __m128i& gaussI, __m128& gaussI2A, __m128& gaussI2B, __m128i imgL, __m128i img, __m128i imgR) { __m128i dX = _mm_sub_epi16(imgR, imgL); // [+1 0 -1]*I sobelX = blur_epi16(dX, previousIV.dX, currentIV.dX); // [1 2 1]^T*[+1 0 -1]*I currentIV.dX = dX; __m128i blurX = blur_epi16(imgL, img, imgR); // [1 2 1]*I sobelY = _mm_sub_epi16(blurX, currentIV.gaussIX); // [+1 0 -1]*[1 2 1]*I gaussI = blur_epi16(blurX, previousIV.gaussIX, currentIV.gaussIX); // [1 2 1]*[1 2 1]*I currentIV.gaussIX = blurX; __m128i img2 = _mm_mullo_epi16(img, img); __m128i img2A = _mm_unpacklo_epi16(img2, _mm_setzero_si128()); __m128i img2B = _mm_unpackhi_epi16(img2, _mm_setzero_si128()); // (img2A, img2B) I^2 32bit __m128i img2L = _mm_mullo_epi16(imgL, imgL); __m128i img2LA = _mm_unpacklo_epi16(img2L, _mm_setzero_si128()); __m128i img2LB = _mm_unpackhi_epi16(img2L, _mm_setzero_si128()); // (img2LA, img2LB) I^2 32bit shifted -1 __m128i img2R = _mm_mullo_epi16(imgR, imgR); __m128i img2RA = _mm_unpacklo_epi16(img2R, _mm_setzero_si128()); __m128i img2RB = _mm_unpackhi_epi16(img2R, _mm_setzero_si128()); // (img2RA, img2RB) img^2 shifted +1 __m128i blurI2XA = blur_epi32(img2LA, img2A, img2RA); // [1 2 1]*I^2 __m128i blurI2XB = blur_epi32(img2LB, img2B, img2RB); // [1 2 1]*I^2 __m128 blurI2XAf = _mm_cvtepi32_ps(_mm_slli_epi32(blurI2XA, 4)); __m128 blurI2XBf = _mm_cvtepi32_ps(_mm_slli_epi32(blurI2XB, 4)); // (blurI2XA, blurI2XB) = 16.0*[1 2 1]*I^2 gaussI2A = blur_ps(blurI2XAf, previousIV.gaussI2XA, currentIV.gaussI2XA); gaussI2B = blur_ps(blurI2XBf, previousIV.gaussI2XB, currentIV.gaussI2XB); // (gaussI2A, gaussI2B) = 16.0*[1 2 1]^T*[1 2 1]*I^2 currentIV.gaussI2XA = blurI2XAf; currentIV.gaussI2XB = blurI2XBf; currentIV.gaussI = gaussI; }
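Presumed shape of the blur helpers used above (an assumption; blur_epi16, blur_epi32 and blur_ps are defined elsewhere in the file): a [1 2 1] kernel applied to three neighbouring values.

static inline __m128i blur_epi16_sketch(__m128i a, __m128i b, __m128i c) {
    return _mm_add_epi16(_mm_add_epi16(a, c), _mm_slli_epi16(b, 1));  /* a + 2*b + c */
}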
FORCE_INLINE int __ext_v_shift_left_int32(int32* z, int __unused_3, int32* x, int len, int shift) { const int wlen = 4;// sizeof(vi) / sizeof(int32); __m128i* Xs = (__m128i*) x; __m128i* Zs = (__m128i*) z; for (int i = 0; i < len / wlen; i++) { _mm_storeu_si128(&Zs[i], _mm_slli_epi32(_mm_loadu_si128(&Xs[i]), shift)); } for (int i = (len / wlen) * wlen; i < len; i++) { z[i] = x[i] << shift; } return 0; }
/** * This function represents the recursion formula. * @param a a 128-bit part of the internal state array * @param b a 128-bit part of the internal state array * @param c a 128-bit part of the internal state array * @param d a 128-bit part of the internal state array * @param mask 128-bit mask * @return output */ inline static __m128i mm_recursion(__m128i *a, __m128i *b, __m128i c, __m128i d, __m128i mask) { __m128i v, x, y, z; x = _mm_load_si128(a); y = _mm_srli_epi32(*b, SR1); z = _mm_srli_si128(c, SR2); v = _mm_slli_epi32(d, SL1); z = _mm_xor_si128(z, x); z = _mm_xor_si128(z, v); x = _mm_slli_si128(x, SL2); y = _mm_and_si128(y, mask); z = _mm_xor_si128(z, x); z = _mm_xor_si128(z, y); return z; }
v4sf exp_ps(v4sf x) { v4sf tmp = _mm_setzero_ps(), fx; v4si emm0; v4sf one = *(v4sf*)_ps_1; x = _mm_min_ps(x, *(v4sf*)_ps_exp_hi); x = _mm_max_ps(x, *(v4sf*)_ps_exp_lo); fx = _mm_mul_ps(x, *(v4sf*)_ps_cephes_LOG2EF); fx = _mm_add_ps(fx, *(v4sf*)_ps_0p5); emm0 = _mm_cvttps_epi32(fx); tmp = _mm_cvtepi32_ps(emm0); v4sf mask = _mm_cmpgt_ps(tmp, fx); mask = _mm_and_ps(mask, one); fx = _mm_sub_ps(tmp, mask); tmp = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C1); v4sf z = _mm_mul_ps(fx, *(v4sf*)_ps_cephes_exp_C2); x = _mm_sub_ps(x, tmp); x = _mm_sub_ps(x, z); z = _mm_mul_ps(x,x); v4sf y = *(v4sf*)_ps_cephes_exp_p0; y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p1); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p2); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p3); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p4); y = _mm_mul_ps(y, x); y = _mm_add_ps(y, *(v4sf*)_ps_cephes_exp_p5); y = _mm_mul_ps(y, z); y = _mm_add_ps(y, x); y = _mm_add_ps(y, one); emm0 = _mm_cvttps_epi32(fx); emm0 = _mm_add_epi32(emm0, *(v4si*)_pi32_0x7f); emm0 = _mm_slli_epi32(emm0, 23); v4sf pow2n = _mm_castsi128_ps(emm0); y = _mm_mul_ps(y, pow2n); return y; }
static inline __m128i xts_crank_lfsr(__m128i inp) { const __m128i alphamask = _mm_set_epi32(1, 1, 1, AES_XTS_ALPHA); __m128i xtweak, ret; /* set up xor mask */ xtweak = _mm_shuffle_epi32(inp, 0x93); xtweak = _mm_srai_epi32(xtweak, 31); xtweak &= alphamask; /* next term */ ret = _mm_slli_epi32(inp, 1); ret ^= xtweak; return ret; }
FORCE_INLINE int __ext_v_shift_left_complex32(struct complex32* z, int __unused_3, struct complex32* x, int len, int shift) { const int wlen = 2;// sizeof(vci) / sizeof(complex32); __m128i* Xs = (__m128i*) x; __m128i* Zs = (__m128i*) z; for (int i = 0; i < len / wlen; i++) { _mm_storeu_si128(&Zs[i], _mm_slli_epi32(_mm_loadu_si128(&Xs[i]), shift)); } unum32* Ps = (unum32*) x; unum32* Qs = (unum32*) z; for (int i = (len / wlen) * wlen * 2; i < len * 2; i++) { Qs[i] = Ps[i] << shift; } return 0; }
// // This was v_mul_complex16_shift but I changed the name for consistency with v_conj_mul // and the fact that the old v_mul_complex16 was never called // FORCE_INLINE int __ext_v_mul_complex16(struct complex16* out, int lenout, struct complex16* x, int len1, struct complex16* y, int len2, int shift) { const unum8 wlen = 4;// sizeof(vcs) / sizeof(complex16); const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF); const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000); const __m128i xmm4 = _mm_set1_epi32(0x00010000); __m128i* Xs = (__m128i*) x; __m128i* Ys = (__m128i*) y; __m128i* Outs = (__m128i*) out; for (int i = 0; i < len1 / wlen; i++){ __m128i mx = _mm_loadu_si128(&Xs[i]); __m128i my = _mm_loadu_si128(&Ys[i]); __m128i ms1 = _mm_xor_si128(mx, xmm5); ms1 = _mm_add_epi32(ms1, xmm4); __m128i ms2 = _mm_shufflehi_epi16(mx, _MM_SHUFFLE(2, 3, 0, 1)); ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1)); __m128i mre = _mm_srai_epi32(_mm_madd_epi16(ms1, my), shift); __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, my), shift); mre = _mm_and_si128(mre,xmm6); mim = _mm_and_si128(mim,xmm6); mim = _mm_slli_epi32(mim,0x10); _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim)); } for (int i = (len1 / wlen) * wlen; i < len1; i++){ out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift; out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift; } return 0; }
//FINL int __ext_v_shift_left_int32(int32* z, int __unused_3, int32* x, int len, int shift) { const int wlen = 4;// sizeof(vi) / sizeof(int32); for (int i = 0; i < len / wlen; i++) { /* vi *xi = (vi *)(x + wlen*i); vi output = (shift_left(*xi, shift)); memcpy((void *)(z + wlen*i), (void *)(&output), sizeof(vi));*/ __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i)); _mm_storeu_si128((__m128i *) (z + wlen*i), _mm_slli_epi32(mx, shift)); } for (int i = (len / wlen) * wlen; i < len; i++) { z[i] = x[i] << shift; } return 0; }
//FINL int __ext_v_shift_left_complex32(struct complex32* z, int __unused_3, struct complex32* x, int len, int shift) { const int wlen = 2;// sizeof(vci) / sizeof(complex32); for (int i = 0; i < len / wlen; i++) {/* vci *xi = (vci *)(x + wlen*i); vci output = (shift_left(*xi, shift)); memcpy((void *)(z + wlen*i), (void *)(&output), sizeof(vci));*/ __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i)); _mm_storeu_si128((__m128i *) (z + wlen*i), _mm_slli_epi32(mx, shift)); } for (int i = (len / wlen) * wlen; i < len; i++) { z[i].re = x[i].re << shift; z[i].im = x[i].im << shift; } return 0; }
static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m, uint32_t* argb_data, int num_pixels) { const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_); // multipliers const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_); const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_); int i; for (i = 0; i + 4 <= num_pixels; i += 4) { const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]); const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks const __m128i red_mask = _mm_set1_epi32(0x00ff0000); const __m128i green_mask = _mm_set1_epi32(0x0000ff00); const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff); const __m128i ag = _mm_and_si128(in, alpha_green_mask); // alpha, green const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16); const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8); const __m128i b = in; const __m128i r_delta = ColorTransformDelta(g_to_r, g); // red const __m128i r_new = _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask); const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16); const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g); // blue const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new); const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2); const __m128i b_new = _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask); const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new); _mm_storeu_si128((__m128i*)&argb_data[i], out); } // Fall-back to C-version for left-overs. VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i); }
/** * @brief mux all audio ports to events * @param data * @param offset * @param nevents */ void AmdtpTransmitStreamProcessor::encodeAudioPortsInt24(quadlet_t *data, unsigned int offset, unsigned int nevents) { unsigned int j; quadlet_t *target_event; int i; uint32_t *client_buffers[4]; uint32_t tmp_values[4] __attribute__ ((aligned (16))); // prepare the scratch buffer assert(m_scratch_buffer_size_bytes > nevents * 4); memset(m_scratch_buffer, 0, nevents * 4); const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000); const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); // this assumes that audio ports are sorted by position, // and that there are no gaps for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) { struct _MBLA_port_cache *p; // get the port buffers for (j=0; j<4; j++) { p = &(m_audio_ports.at(i+j)); if(likely(p->buffer && p->enabled)) { client_buffers[j] = (uint32_t *) p->buffer; client_buffers[j] += offset; } else { // if a port is disabled or has no valid // buffer, use the scratch buffer (all zero's) client_buffers[j] = (uint32_t *) m_scratch_buffer; } } // the base event for this position target_event = (quadlet_t *)(data + i); // process the events for (j=0;j < nevents; j += 1) { // read the values tmp_values[0] = *(client_buffers[0]); tmp_values[1] = *(client_buffers[1]); tmp_values[2] = *(client_buffers[2]); tmp_values[3] = *(client_buffers[3]); // now do the SSE based conversion/labeling __m128i *target = (__m128i*)target_event; __m128i v_int = *((__m128i*)tmp_values);; // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int // (target misalignment is assumed since we don't know the m_dimension) _mm_storeu_si128 (target, v_int); // increment the buffer pointers client_buffers[0]++; client_buffers[1]++; client_buffers[2]++; client_buffers[3]++; // go to next target event position target_event += m_dimension; } } // do remaining ports // NOTE: these can be time-SSE'd for (; i < ((int)m_nb_audio_ports); i++) { struct _MBLA_port_cache &p = m_audio_ports.at(i); target_event = (quadlet_t *)(data + i); #ifdef DEBUG assert(nevents + offset <= p.buffer_size ); #endif if(likely(p.buffer && p.enabled)) { uint32_t *buffer = (uint32_t *)(p.buffer); buffer += offset; for (j = 0;j < nevents; j += 4) { // read the values tmp_values[0] = *buffer; buffer++; tmp_values[1] = *buffer; buffer++; tmp_values[2] = *buffer; buffer++; tmp_values[3] = *buffer; buffer++; // now do the SSE based conversion/labeling __m128i v_int = *((__m128i*)tmp_values);; // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int _mm_store_si128 ((__m128i *)(&tmp_values), v_int); // increment the buffer pointers *target_event = tmp_values[0]; target_event += m_dimension; *target_event = tmp_values[1]; target_event += m_dimension; *target_event = tmp_values[2]; target_event += m_dimension; *target_event = tmp_values[3]; target_event += 
m_dimension; } // do the remainder of the events for(;j < nevents; j += 1) { uint32_t in = (uint32_t)(*buffer); *target_event = CondSwapToBus32((quadlet_t)((in & 0x00FFFFFF) | 0x40000000)); buffer++; target_event += m_dimension; } } else { for (j = 0;j < nevents; j += 1) { // hardcoded byte swapped *target_event = 0x00000040; target_event += m_dimension; } } } }
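One event of the SSE path above, in scalar form (a sketch): keep the low 24 bits of the sample, OR in the 0x40 MBLA label, then byte-swap the quadlet to bus (big-endian) order, which is what the two-step epi16/epi32 swap achieves.

static inline uint32_t encode_int24_event(uint32_t sample) {
    uint32_t q = (sample & 0x00FFFFFF) | 0x40000000;  /* mask + label */
    return (q >> 24) | ((q >> 8) & 0x0000FF00)        /* full 32-bit byte swap */
         | ((q << 8) & 0x00FF0000) | (q << 24);
}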
/** * @brief mux all audio ports to events * @param data * @param offset * @param nevents */ void AmdtpTransmitStreamProcessor::encodeAudioPortsFloat(quadlet_t *data, unsigned int offset, unsigned int nevents) { unsigned int j; quadlet_t *target_event; int i; float * client_buffers[4]; float tmp_values[4] __attribute__ ((aligned (16))); uint32_t tmp_values_int[4] __attribute__ ((aligned (16))); // prepare the scratch buffer assert(m_scratch_buffer_size_bytes > nevents * 4); memset(m_scratch_buffer, 0, nevents * 4); const __m128i label = _mm_set_epi32 (0x40000000, 0x40000000, 0x40000000, 0x40000000); const __m128i mask = _mm_set_epi32 (0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF, 0x00FFFFFF); const __m128 mult = _mm_set_ps(AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER, AMDTP_FLOAT_MULTIPLIER); #if AMDTP_CLIP_FLOATS const __m128 v_max = _mm_set_ps(1.0, 1.0, 1.0, 1.0); const __m128 v_min = _mm_set_ps(-1.0, -1.0, -1.0, -1.0); #endif // this assumes that audio ports are sorted by position, // and that there are no gaps for (i = 0; i < ((int)m_nb_audio_ports)-4; i += 4) { struct _MBLA_port_cache *p; // get the port buffers for (j=0; j<4; j++) { p = &(m_audio_ports.at(i+j)); if(likely(p->buffer && p->enabled)) { client_buffers[j] = (float *) p->buffer; client_buffers[j] += offset; } else { // if a port is disabled or has no valid // buffer, use the scratch buffer (all zero's) client_buffers[j] = (float *) m_scratch_buffer; } } // the base event for this position target_event = (quadlet_t *)(data + i); // process the events for (j=0;j < nevents; j += 1) { // read the values tmp_values[0] = *(client_buffers[0]); tmp_values[1] = *(client_buffers[1]); tmp_values[2] = *(client_buffers[2]); tmp_values[3] = *(client_buffers[3]); // now do the SSE based conversion/labeling __m128 v_float = *((__m128*)tmp_values); __m128i *target = (__m128i*)target_event; __m128i v_int; // clip #if AMDTP_CLIP_FLOATS // do SSE clipping v_float = _mm_max_ps(v_float, v_min); v_float = _mm_min_ps(v_float, v_max); #endif // multiply v_float = _mm_mul_ps(v_float, mult); // convert to signed integer v_int = _mm_cvttps_epi32( v_float ); // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int // (target misalignment is assumed since we don't know the m_dimension) _mm_storeu_si128 (target, v_int); // increment the buffer pointers client_buffers[0]++; client_buffers[1]++; client_buffers[2]++; client_buffers[3]++; // go to next target event position target_event += m_dimension; } } // do remaining ports // NOTE: these can be time-SSE'd for (; i < (int)m_nb_audio_ports; i++) { struct _MBLA_port_cache &p = m_audio_ports.at(i); target_event = (quadlet_t *)(data + i); #ifdef DEBUG assert(nevents + offset <= p.buffer_size ); #endif if(likely(p.buffer && p.enabled)) { float *buffer = (float *)(p.buffer); buffer += offset; for (j = 0;j < nevents; j += 4) { // read the values tmp_values[0] = *buffer; buffer++; tmp_values[1] = *buffer; buffer++; tmp_values[2] = *buffer; buffer++; tmp_values[3] = *buffer; buffer++; // now do the SSE based conversion/labeling __m128 v_float = *((__m128*)tmp_values); __m128i v_int; #if AMDTP_CLIP_FLOATS // do SSE clipping v_float = _mm_max_ps(v_float, v_min); v_float = 
_mm_min_ps(v_float, v_max); #endif // multiply v_float = _mm_mul_ps(v_float, mult); // convert to signed integer v_int = _mm_cvttps_epi32( v_float ); // mask v_int = _mm_and_si128( v_int, mask ); // label it v_int = _mm_or_si128( v_int, label ); // do endian conversion (SSE is always little endian) // do first swap v_int = _mm_or_si128( _mm_slli_epi16( v_int, 8 ), _mm_srli_epi16( v_int, 8 ) ); // do second swap v_int = _mm_or_si128( _mm_slli_epi32( v_int, 16 ), _mm_srli_epi32( v_int, 16 ) ); // store the packed int _mm_store_si128 ((__m128i *)(&tmp_values_int), v_int); // increment the buffer pointers *target_event = tmp_values_int[0]; target_event += m_dimension; *target_event = tmp_values_int[1]; target_event += m_dimension; *target_event = tmp_values_int[2]; target_event += m_dimension; *target_event = tmp_values_int[3]; target_event += m_dimension; } // do the remainder of the events for(;j < nevents; j += 1) { float *in = (float *)buffer; #if AMDTP_CLIP_FLOATS // clip directly to the value of a maxed event if(unlikely(*in > 1.0)) { *target_event = CONDSWAPTOBUS32_CONST(0x407FFFFF); } else if(unlikely(*in < -1.0)) { *target_event = CONDSWAPTOBUS32_CONST(0x40800001); } else { float v = (*in) * AMDTP_FLOAT_MULTIPLIER; unsigned int tmp = ((int) v); tmp = ( tmp & 0x00FFFFFF ) | 0x40000000; *target_event = CondSwapToBus32((quadlet_t)tmp); } #else float v = (*in) * AMDTP_FLOAT_MULTIPLIER; unsigned int tmp = ((int) v); tmp = ( tmp & 0x00FFFFFF ) | 0x40000000; *target_event = CondSwapToBus32((quadlet_t)tmp); #endif buffer++; target_event += m_dimension; } } else { for (j = 0;j < nevents; j += 1) { // hardcoded byte swapped *target_event = 0x00000040; target_event += m_dimension; } } } }
pstatus_t ssse3_YUV420ToRGB_8u_P3AC4R(const BYTE **pSrc, int *srcStep, BYTE *pDst, int dstStep, const prim_size_t *roi) { int lastRow, lastCol; BYTE *UData,*VData,*YData; int i,nWidth,nHeight,VaddDst,VaddY,VaddU,VaddV; __m128i r0,r1,r2,r3,r4,r5,r6,r7; __m128i *buffer; /* last_line: if the last (U,V doubled) line should be skipped, set to 10B * last_column: if it's the last column in a line, set to 10B (for handling line endings that are not a multiple of four) */ buffer = _aligned_malloc(4 * 16, 16); YData = (BYTE*) pSrc[0]; UData = (BYTE*) pSrc[1]; VData = (BYTE*) pSrc[2]; nWidth = roi->width; nHeight = roi->height; if ((lastCol = (nWidth & 3))) { switch (lastCol) { case 1: r7 = _mm_set_epi32(0,0,0,0xFFFFFFFF); break; case 2: r7 = _mm_set_epi32(0,0,0xFFFFFFFF,0xFFFFFFFF); break; case 3: r7 = _mm_set_epi32(0,0xFFFFFFFF,0xFFFFFFFF,0xFFFFFFFF); break; } _mm_store_si128(buffer+3,r7); lastCol = 1; } nWidth += 3; nWidth = nWidth >> 2; lastRow = nHeight & 1; nHeight++; nHeight = nHeight >> 1; VaddDst = (dstStep << 1) - (nWidth << 4); VaddY = (srcStep[0] << 1) - (nWidth << 2); VaddU = srcStep[1] - (((nWidth << 1) + 2) & 0xFFFC); VaddV = srcStep[2] - (((nWidth << 1) + 2) & 0xFFFC); while (nHeight-- > 0) { if (nHeight == 0) lastRow <<= 1; i = 0; do { if (!(i & 0x01)) { /* Y-, U- and V-data is stored in different arrays. * We start with processing U-data. * * at first we fetch four U-values from its array and shuffle them like this: * 0d0d 0c0c 0b0b 0a0a * we've done two things: converting the values to signed words and duplicating * each value, because two pixels always "share" the same U- (and V-) data */ r0 = _mm_cvtsi32_si128(*(UINT32 *)UData); r5 = _mm_set_epi32(0x80038003,0x80028002,0x80018001,0x80008000); r0 = _mm_shuffle_epi8(r0,r5); UData += 4; /* then we subtract 128 from each value, so we get D */ r3 = _mm_set_epi16(128,128,128,128,128,128,128,128); r0 = _mm_subs_epi16(r0,r3); /* we need to do two things with our D, so let's store it for later use */ r2 = r0; /* now we can multiply our D with 48 and unpack it to xmm4:xmm0 * this is what we need to get G data later on */ r4 = r0; r7 = _mm_set_epi16(48,48,48,48,48,48,48,48); r0 = _mm_mullo_epi16(r0,r7); r4 = _mm_mulhi_epi16(r4,r7); r7 = r0; r0 = _mm_unpacklo_epi16(r0,r4); r4 = _mm_unpackhi_epi16(r7,r4); /* to get B data, we need to prepare a second value, D*475 */ r1 = r2; r7 = _mm_set_epi16(475,475,475,475,475,475,475,475); r1 = _mm_mullo_epi16(r1,r7); r2 = _mm_mulhi_epi16(r2,r7); r7 = r1; r1 = _mm_unpacklo_epi16(r1,r2); r7 = _mm_unpackhi_epi16(r7,r2); /* so we got something like this: xmm7:xmm1 * this pair contains values for 16 pixels: * aabbccdd * aabbccdd, but we can only work on four pixels at once, so we need to save upper values */ _mm_store_si128(buffer+1,r7); /* Now we've prepared U-data. Preparing V-data is actually the same, just with other coefficients */ r2 = _mm_cvtsi32_si128(*(UINT32 *)VData); r2 = _mm_shuffle_epi8(r2,r5); VData += 4; r2 = _mm_subs_epi16(r2,r3); r5 = r2; /* this is also known as E*403, we need it to convert R data */ r3 = r2; r7 = _mm_set_epi16(403,403,403,403,403,403,403,403); r2 = _mm_mullo_epi16(r2,r7); r3 = _mm_mulhi_epi16(r3,r7); r7 = r2; r2 = _mm_unpacklo_epi16(r2,r3); r7 = _mm_unpackhi_epi16(r7,r3); /* and preserve upper four values for future ... 
*/ _mm_store_si128(buffer+2,r7); /* doing this step: E*120 */ r3 = r5; r7 = _mm_set_epi16(120,120,120,120,120,120,120,120); r3 = _mm_mullo_epi16(r3,r7); r5 = _mm_mulhi_epi16(r5,r7); r7 = r3; r3 = _mm_unpacklo_epi16(r3,r5); r7 = _mm_unpackhi_epi16(r7,r5); /* now we complete what we've begun above: * (48*D) + (120*E) = (48*D +120*E) */ r0 = _mm_add_epi32(r0,r3); r4 = _mm_add_epi32(r4,r7); /* and store to memory ! */ _mm_store_si128(buffer,r4); } else { /* maybe you've wondered about the conditional above ? * Well, we prepared UV data for eight pixels in each line, but can only process four * per loop. So we need to load the upper four pixels' data from memory each second loop! */ r1 = _mm_load_si128(buffer+1); r2 = _mm_load_si128(buffer+2); r0 = _mm_load_si128(buffer); } if (++i == nWidth) lastCol <<= 1; /* We didn't produce any output yet, so let's do so! * Ok, fetch four pixels from the Y-data array and shuffle them like this: * 00d0 00c0 00b0 00a0, to get signed dwords and multiply by 256 */ r4 = _mm_cvtsi32_si128(*(UINT32 *)YData); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; /* now we can perform the "real" conversion itself and produce output! */ r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); /* in the end, we only need bytes for RGB values. * So, what do we do? right! shifting left makes values bigger and that's always good. * before we had dwords of data, and by shifting left and treating the result * as packed words, we not only get signed words, but also divide by 256 * imagine, data is now ordered this way: ddx0 ccx0 bbx0 aax0, and x is the least * significant byte, that we don't need anymore, because we've done some rounding */ r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); /* one thing we still have to face is the clip() function ... * we still have signed words, and there are those min/max instructions in SSE2 ... * the max instruction always takes the bigger of the two operands and stores it in the first one, * and it operates with signs ! * if we feed it with our values and zeros, it takes the zeros if our values are smaller than * zero and otherwise our values */ r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); /* the same thing just completely different can be used to limit our values to 255, * but now using the min instruction and 255s */ r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); /* Now we've got our bytes. * the moment has come to assemble the three channels R,G and B into the xrgb dwords * on the Red channel we just have to AND each future dword with 00FF0000H */ //r7=_mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); /* on the Green channel we have to shuffle somehow, so we get something like this: * 00d0 00c0 00b0 00a0 */ r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); /* and on the Blue channel this one: * 000d 000c 000b 000a */ r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); /* and at last we OR it together and get this one: * xrgb xrgb xrgb xrgb */ r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); /* Only thing to do now is write the data to memory, but this gets a bit more * complicated if the width is not a multiple of four and it is the last column in the line. 
*/ if (lastCol & 0x02) { /* let's say, we need to only convert six pixels in width * Ok, the first 4 pixels will be converted just like every other 4 pixels, but * if it's the last loop in the line, last_column is shifted left by one (curious? have a look above), * and we land here. Through initialisation a mask was prepared. In this case it looks like * 0000FFFFH 0000FFFFH 0000FFFFH 0000FFFFH */ r6 = _mm_load_si128(buffer+3); /* we AND our output data with this mask to get only the valid pixels */ r4 = _mm_and_si128(r4,r6); /* then we fetch memory from the destination array ... */ r5 = _mm_lddqu_si128((__m128i *)pDst); /* ... and AND it with the inverse mask. We get only those pixels which should not be updated */ r6 = _mm_andnot_si128(r6,r5); /* we only have to OR the two values together and write it back to the destination array, * and only the pixels that should be updated really get changed. */ r4 = _mm_or_si128(r4,r6); } _mm_storeu_si128((__m128i *)pDst,r4); if (!(lastRow & 0x02)) { /* Because UV data is the same for two lines, we can process the second line just here, * in the same loop. Only thing we need to do is to add some offsets to the Y- and destination * pointer. These offsets are srcStep[0] and the target scanline. * But if we don't need to process the second line, like if we are in the last line when processing an odd number of lines, * we just skip all this. */ r4 = _mm_cvtsi32_si128(*(UINT32 *)(YData+srcStep[0])); r7 = _mm_set_epi32(0x80800380,0x80800280,0x80800180,0x80800080); r4 = _mm_shuffle_epi8(r4,r7); r5 = r4; r6 = r4; r4 = _mm_add_epi32(r4,r2); r5 = _mm_sub_epi32(r5,r0); r6 = _mm_add_epi32(r6,r1); r4 = _mm_slli_epi32(r4,8); r5 = _mm_slli_epi32(r5,8); r6 = _mm_slli_epi32(r6,8); r7 = _mm_set_epi32(0,0,0,0); r4 = _mm_max_epi16(r4,r7); r5 = _mm_max_epi16(r5,r7); r6 = _mm_max_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_min_epi16(r4,r7); r5 = _mm_min_epi16(r5,r7); r6 = _mm_min_epi16(r6,r7); r7 = _mm_set_epi32(0x00FF0000,0x00FF0000,0x00FF0000,0x00FF0000); r4 = _mm_and_si128(r4,r7); r7 = _mm_set_epi32(0x80800E80,0x80800A80,0x80800680,0x80800280); r5 = _mm_shuffle_epi8(r5,r7); r7 = _mm_set_epi32(0x8080800E,0x8080800A,0x80808006,0x80808002); r6 = _mm_shuffle_epi8(r6,r7); r4 = _mm_or_si128(r4,r5); r4 = _mm_or_si128(r4,r6); if (lastCol & 0x02) { r6 = _mm_load_si128(buffer+3); r4 = _mm_and_si128(r4,r6); r5 = _mm_lddqu_si128((__m128i *)(pDst+dstStep)); r6 = _mm_andnot_si128(r6,r5); r4 = _mm_or_si128(r4,r6); /* only thing is, we should shift lastCol back here, because we have processed the last column, * and this "special condition" can be released */ lastCol >>= 1; } _mm_storeu_si128((__m128i *)(pDst+dstStep),r4); } /* after all we have to increase the destination- and Y-data pointers by four pixels */ pDst += 16; YData += 4; }
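Per-pixel arithmetic implemented by the vector code above, as a scalar sketch (coefficients read off the constants 403, 48, 120 and 475; clip8 is a hypothetical helper, and an arithmetic right shift is assumed for the divisions by 256):

static inline int clip8(int v) { return v < 0 ? 0 : (v > 255 ? 255 : v); }

static inline UINT32 yuv_to_xrgb(int y, int u, int v) {
    const int d = u - 128, e = v - 128;                  /* the D and E of the comments */
    const int r = clip8(y + ((403 * e) >> 8));           /* R = Y + E*403/256 */
    const int g = clip8(y - ((48 * d + 120 * e) >> 8));  /* G = Y - (D*48 + E*120)/256 */
    const int b = clip8(y + ((475 * d) >> 8));           /* B = Y + D*475/256 */
    return (UINT32)((r << 16) | (g << 8) | b);           /* xrgb */
}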
void lp_rast_triangle_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); int x = (arg.triangle.plane_mask & 0xff) + task->x; int y = (arg.triangle.plane_mask >> 8) + task->y; unsigned i, j; struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; unsigned nr = 0; __m128i p0 = _mm_load_si128((__m128i *)&plane[0]); /* c, dcdx, dcdy, eo */ __m128i p1 = _mm_load_si128((__m128i *)&plane[1]); /* c, dcdx, dcdy, eo */ __m128i p2 = _mm_load_si128((__m128i *)&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = _mm_setzero_si128(); __m128i c; __m128i dcdx; __m128i dcdy; __m128i rej4; __m128i dcdx2; __m128i dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; transpose4_epi32(&p0, &p1, &p2, &zero, &c, &dcdx, &dcdy, &rej4); /* Adjust dcdx; */ dcdx = _mm_sub_epi32(zero, dcdx); c = _mm_add_epi32(c, mm_mullo_epi32(dcdx, _mm_set1_epi32(x))); c = _mm_add_epi32(c, mm_mullo_epi32(dcdy, _mm_set1_epi32(y))); rej4 = _mm_slli_epi32(rej4, 2); /* Adjust so we can just check the sign bit (< 0 comparison), instead of having to do a less efficient <= 0 comparison */ c = _mm_sub_epi32(c, _mm_set1_epi32(1)); rej4 = _mm_add_epi32(rej4, _mm_set1_epi32(1)); dcdx2 = _mm_add_epi32(dcdx, dcdx); dcdx3 = _mm_add_epi32(dcdx2, dcdx); transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, &span_0, &span_1, &span_2, &unused); for (i = 0; i < 4; i++) { __m128i cx = c; for (j = 0; j < 4; j++) { __m128i c4rej = _mm_add_epi32(cx, rej4); __m128i rej_masks = _mm_srai_epi32(c4rej, 31); /* if (is_zero(rej_masks)) */ if (_mm_movemask_epi8(rej_masks) == 0) { __m128i c0_0 = _mm_add_epi32(SCALAR_EPI32(cx, 0), span_0); __m128i c1_0 = _mm_add_epi32(SCALAR_EPI32(cx, 1), span_1); __m128i c2_0 = _mm_add_epi32(SCALAR_EPI32(cx, 2), span_2); __m128i c_0 = _mm_or_si128(_mm_or_si128(c0_0, c1_0), c2_0); __m128i c0_1 = _mm_add_epi32(c0_0, SCALAR_EPI32(dcdy, 0)); __m128i c1_1 = _mm_add_epi32(c1_0, SCALAR_EPI32(dcdy, 1)); __m128i c2_1 = _mm_add_epi32(c2_0, SCALAR_EPI32(dcdy, 2)); __m128i c_1 = _mm_or_si128(_mm_or_si128(c0_1, c1_1), c2_1); __m128i c_01 = _mm_packs_epi32(c_0, c_1); __m128i c0_2 = _mm_add_epi32(c0_1, SCALAR_EPI32(dcdy, 0)); __m128i c1_2 = _mm_add_epi32(c1_1, SCALAR_EPI32(dcdy, 1)); __m128i c2_2 = _mm_add_epi32(c2_1, SCALAR_EPI32(dcdy, 2)); __m128i c_2 = _mm_or_si128(_mm_or_si128(c0_2, c1_2), c2_2); __m128i c0_3 = _mm_add_epi32(c0_2, SCALAR_EPI32(dcdy, 0)); __m128i c1_3 = _mm_add_epi32(c1_2, SCALAR_EPI32(dcdy, 1)); __m128i c2_3 = _mm_add_epi32(c2_2, SCALAR_EPI32(dcdy, 2)); __m128i c_3 = _mm_or_si128(_mm_or_si128(c0_3, c1_3), c2_3); __m128i c_23 = _mm_packs_epi32(c_2, c_3); __m128i c_0123 = _mm_packs_epi16(c_01, c_23); unsigned mask = _mm_movemask_epi8(c_0123); out[nr].i = i; out[nr].j = j; out[nr].mask = mask; if (mask != 0xffff) nr++; } cx = _mm_add_epi32(cx, _mm_slli_epi32(dcdx, 2)); } c = _mm_add_epi32(c, _mm_slli_epi32(dcdy, 2)); } for (i = 0; i < nr; i++) lp_rast_shade_quads_mask(task, &tri->inputs, x + 4 * out[i].j, y + 4 * out[i].i, 0xffff & ~out[i].mask); }
/* xvm_expma: * Compute the component-wise exponential minus <a>: * r[i] <-- e^x[i] - a * * The following comments apply to the SSE2 version of this code: * * Computation is done four doubles at a time by doing the computation in parallel * on two vectors of two doubles using SSE2 intrinsics. If size is not a * multiple of 4, the remaining elements are computed using the stdlib exp(). * * The computation is done by first doing a range reduction of the argument of * the type e^x = 2^k * e^f choosing k and f so that f is in [-0.5, 0.5]. * Then 2^k can be computed exactly using bit operations to build the double * result and e^f can be efficiently computed with enough precision using a * polynomial approximation. * * The polynomial approximation is done with an 11th-order polynomial computed by * the Remez algorithm with the Sollya suite, instead of the more classical Pade * polynomial form, because it is better suited to parallel execution. In order * to achieve the same precision, a Pade form seems to require three fewer * multiplications but needs a very costly division, so it would be less * efficient. * * The maximum error is less than 1 LSB and special cases are correctly * handled: * +inf or +oor --> return +inf * -inf or -oor --> return 0.0 * qNaN or sNaN --> return qNaN * * This code is copyright 2004-2012 Thomas Lavergne and licensed under the * BSD licence like the rest of Wapiti. */ void xvm_expma(double r[], const double x[], double a, uint64_t N) { #if defined(__SSE2__) && !defined(XVM_ANSI) #define xvm_vconst(v) (_mm_castsi128_pd(_mm_set1_epi64x((v)))) assert(r != NULL && ((uintptr_t)r % 16) == 0); assert(x != NULL && ((uintptr_t)x % 16) == 0); const __m128i vl = _mm_set1_epi64x(0x3ff0000000000000ULL); const __m128d ehi = xvm_vconst(0x4086232bdd7abcd2ULL); const __m128d elo = xvm_vconst(0xc086232bdd7abcd2ULL); const __m128d l2e = xvm_vconst(0x3ff71547652b82feULL); const __m128d hal = xvm_vconst(0x3fe0000000000000ULL); const __m128d nan = xvm_vconst(0xfff8000000000000ULL); const __m128d inf = xvm_vconst(0x7ff0000000000000ULL); const __m128d c1 = xvm_vconst(0x3fe62e4000000000ULL); const __m128d c2 = xvm_vconst(0x3eb7f7d1cf79abcaULL); const __m128d p0 = xvm_vconst(0x3feffffffffffffeULL); const __m128d p1 = xvm_vconst(0x3ff000000000000bULL); const __m128d p2 = xvm_vconst(0x3fe0000000000256ULL); const __m128d p3 = xvm_vconst(0x3fc5555555553a2aULL); const __m128d p4 = xvm_vconst(0x3fa55555554e57d3ULL); const __m128d p5 = xvm_vconst(0x3f81111111362f4fULL); const __m128d p6 = xvm_vconst(0x3f56c16c25f3bae1ULL); const __m128d p7 = xvm_vconst(0x3f2a019fc9310c33ULL); const __m128d p8 = xvm_vconst(0x3efa01825f3cb28bULL); const __m128d p9 = xvm_vconst(0x3ec71e2bd880fdd8ULL); const __m128d p10 = xvm_vconst(0x3e9299068168ac8fULL); const __m128d p11 = xvm_vconst(0x3e5ac52350b60b19ULL); const __m128d va = _mm_set1_pd(a); for (uint64_t n = 0; n < N; n += 4) { __m128d mn1, mn2, mi1, mi2; __m128d t1, t2, d1, d2; __m128d v1, v2, w1, w2; __m128i k1, k2; __m128d f1, f2; // Load the next four values __m128d x1 = _mm_load_pd(x + n ); __m128d x2 = _mm_load_pd(x + n + 2); // Check for out-of-range values, infinities and NaN mn1 = _mm_cmpneq_pd(x1, x1); mn2 = _mm_cmpneq_pd(x2, x2); mi1 = _mm_cmpgt_pd(x1, ehi); mi2 = _mm_cmpgt_pd(x2, ehi); x1 = _mm_max_pd(x1, elo); x2 = _mm_max_pd(x2, elo); // Range reduction: we search k and f such that e^x = 2^k * e^f // with f in [-0.5, 0.5] t1 = _mm_mul_pd(x1, l2e); t2 = _mm_mul_pd(x2, l2e); t1 = _mm_add_pd(t1, hal); t2 = _mm_add_pd(t2, hal); k1 = _mm_cvttpd_epi32(t1); k2 = 
_mm_cvttpd_epi32(t2); d1 = _mm_cvtepi32_pd(k1); d2 = _mm_cvtepi32_pd(k2); t1 = _mm_mul_pd(d1, c1); t2 = _mm_mul_pd(d2, c1); f1 = _mm_sub_pd(x1, t1); f2 = _mm_sub_pd(x2, t2); t1 = _mm_mul_pd(d1, c2); t2 = _mm_mul_pd(d2, c2); f1 = _mm_sub_pd(f1, t1); f2 = _mm_sub_pd(f2, t2); // Evaluation of e^f using an 11th-order polynomial in Horner form v1 = _mm_mul_pd(f1, p11); v2 = _mm_mul_pd(f2, p11); v1 = _mm_add_pd(v1, p10); v2 = _mm_add_pd(v2, p10); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p9); v2 = _mm_add_pd(v2, p9); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p8); v2 = _mm_add_pd(v2, p8); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p7); v2 = _mm_add_pd(v2, p7); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p6); v2 = _mm_add_pd(v2, p6); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p5); v2 = _mm_add_pd(v2, p5); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p4); v2 = _mm_add_pd(v2, p4); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p3); v2 = _mm_add_pd(v2, p3); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p2); v2 = _mm_add_pd(v2, p2); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p1); v2 = _mm_add_pd(v2, p1); v1 = _mm_mul_pd(v1, f1); v2 = _mm_mul_pd(v2, f2); v1 = _mm_add_pd(v1, p0); v2 = _mm_add_pd(v2, p0); // Evaluation of 2^k using bitops to achieve exact computation k1 = _mm_slli_epi32(k1, 20); k2 = _mm_slli_epi32(k2, 20); k1 = _mm_shuffle_epi32(k1, 0x72); k2 = _mm_shuffle_epi32(k2, 0x72); k1 = _mm_add_epi32(k1, vl); k2 = _mm_add_epi32(k2, vl); w1 = _mm_castsi128_pd(k1); w2 = _mm_castsi128_pd(k2); // Return to full range to subtract <a> v1 = _mm_mul_pd(v1, w1); v2 = _mm_mul_pd(v2, w2); v1 = _mm_sub_pd(v1, va); v2 = _mm_sub_pd(v2, va); // Finally apply infinity and NaN where needed v1 = _mm_or_pd(_mm_and_pd(mi1, inf), _mm_andnot_pd(mi1, v1)); v2 = _mm_or_pd(_mm_and_pd(mi2, inf), _mm_andnot_pd(mi2, v2)); v1 = _mm_or_pd(_mm_and_pd(mn1, nan), _mm_andnot_pd(mn1, v1)); v2 = _mm_or_pd(_mm_and_pd(mn2, nan), _mm_andnot_pd(mn2, v2)); // Store the results _mm_store_pd(r + n, v1); _mm_store_pd(r + n + 2, v2); } #else for (uint64_t n = 0; n < N; n++) r[n] = exp(x[n]) - a; #endif }
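The 2^k construction used above, in scalar form: place k + 1023 into the exponent field of an IEEE-754 double, which is what the slli/shuffle/add-vl sequence does for two lanes at once. A sketch, valid for -1022 <= k <= 1023:

static inline double pow2k(int k) {
    union { uint64_t u; double d; } v;
    v.u = (uint64_t)(k + 1023) << 52;  /* biased exponent, zero mantissa */
    return v.d;
}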
int haraka512256(unsigned char *hash, const unsigned char *msg) { // stuff we need int i, j; __m128i s[4], tmp, rcon; __m128i MSB64 = _mm_set_epi32(0xFFFFFFFF,0xFFFFFFFF,0,0); // set initial round constant rcon = _mm_set_epi32(1,1,1,1); // initialize state to msg s[0] = _mm_load_si128(&((__m128i*)msg)[0]); s[1] = _mm_load_si128(&((__m128i*)msg)[1]); s[2] = _mm_load_si128(&((__m128i*)msg)[2]); s[3] = _mm_load_si128(&((__m128i*)msg)[3]); //printf("= input state =\n"); //printstate512(s[0], s[1], s[2], s[3]); for (i = 0; i < ROUNDS; ++i) { // aes round(s) for (j = 0; j < AES_PER_ROUND; ++j) { s[0] = _mm_aesenc_si128(s[0], rcon); s[1] = _mm_aesenc_si128(s[1], rcon); s[2] = _mm_aesenc_si128(s[2], rcon); s[3] = _mm_aesenc_si128(s[3], rcon); rcon = _mm_slli_epi32(rcon, 1); } //printf("= round %d : after aes layer =\n", i); //printstate512(s[0], s[1], s[2], s[3]); // mixing tmp = _mm_unpacklo_epi32(s[0], s[1]); s[0] = _mm_unpackhi_epi32(s[0], s[1]); s[1] = _mm_unpacklo_epi32(s[2], s[3]); s[2] = _mm_unpackhi_epi32(s[2], s[3]); s[3] = _mm_unpacklo_epi32(s[0], s[2]); s[0] = _mm_unpackhi_epi32(s[0], s[2]); s[2] = _mm_unpackhi_epi32(s[1], tmp); s[1] = _mm_unpacklo_epi32(s[1], tmp); //printf("= round %d : after mix layer =\n", i); //printstate512(s[0], s[1], s[2], s[3]); // little-endian mixing (not used) // tmp = _mm_unpackhi_epi32(s[1], s[0]); // s[0] = _mm_unpacklo_epi32(s[1], s[0]); // s[1] = _mm_unpackhi_epi32(s[3], s[2]); // s[2] = _mm_unpacklo_epi32(s[3], s[2]); // s[3] = _mm_unpackhi_epi32(s[2], s[0]); // s[0] = _mm_unpacklo_epi32(s[2], s[0]); // s[2] = _mm_unpacklo_epi32(tmp, s[1]); // s[1] = _mm_unpackhi_epi32(tmp, s[1]); } //printf("= output from permutation =\n"); //printstate512(s[0], s[1], s[2], s[3]); // xor message to get DM effect s[0] = _mm_xor_si128(s[0], _mm_load_si128(&((__m128i*)msg)[0])); s[1] = _mm_xor_si128(s[1], _mm_load_si128(&((__m128i*)msg)[1])); s[2] = _mm_xor_si128(s[2], _mm_load_si128(&((__m128i*)msg)[2])); s[3] = _mm_xor_si128(s[3], _mm_load_si128(&((__m128i*)msg)[3])); //printf("= after feed-forward =\n"); //printstate512(s[0], s[1], s[2], s[3]); // truncate and store result _mm_maskmoveu_si128(s[0], MSB64, (hash-8)); _mm_maskmoveu_si128(s[1], MSB64, (hash+0)); _mm_storel_epi64((__m128i*)(hash + 16), s[2]); _mm_storel_epi64((__m128i*)(hash + 24), s[3]); return 0; }
static void avx2_mshabal_compress(mshabal_context *sc, const unsigned char *buf0, const unsigned char *buf1, const unsigned char *buf2, const unsigned char *buf3, size_t num) { union { u32 words[64]; __m128i data[16]; } u; size_t j; __m128i A[12], B[16], C[16]; __m128i one; for (j = 0; j < 12; j++) A[j] = _mm_loadu_si128((__m128i *)sc->state + j); for (j = 0; j < 16; j++) { B[j] = _mm_loadu_si128((__m128i *)sc->state + j + 12); C[j] = _mm_loadu_si128((__m128i *)sc->state + j + 28); } one = _mm_set1_epi32(C32(0xFFFFFFFF)); #define M(i) _mm_load_si128(u.data + (i)) while (num-- > 0) { for (j = 0; j < 64; j += 4) { u.words[j + 0] = *(u32 *)(buf0 + j); u.words[j + 1] = *(u32 *)(buf1 + j); u.words[j + 2] = *(u32 *)(buf2 + j); u.words[j + 3] = *(u32 *)(buf3 + j); } for (j = 0; j < 16; j++) B[j] = _mm_add_epi32(B[j], M(j)); A[0] = _mm_xor_si128(A[0], _mm_set1_epi32(sc->Wlow)); A[1] = _mm_xor_si128(A[1], _mm_set1_epi32(sc->Whigh)); for (j = 0; j < 16; j++) B[j] = _mm_or_si128(_mm_slli_epi32(B[j], 17), _mm_srli_epi32(B[j], 15)); #define PP(xa0, xa1, xb0, xb1, xb2, xb3, xc, xm) do { \ __m128i tt; \ tt = _mm_or_si128(_mm_slli_epi32(xa1, 15), \ _mm_srli_epi32(xa1, 17)); \ tt = _mm_add_epi32(_mm_slli_epi32(tt, 2), tt); \ tt = _mm_xor_si128(_mm_xor_si128(xa0, tt), xc); \ tt = _mm_add_epi32(_mm_slli_epi32(tt, 1), tt); \ tt = _mm_xor_si128( \ _mm_xor_si128(tt, xb1), \ _mm_xor_si128(_mm_andnot_si128(xb3, xb2), xm)); \ xa0 = tt; \ tt = xb0; \ tt = _mm_or_si128(_mm_slli_epi32(tt, 1), \ _mm_srli_epi32(tt, 31)); \ xb0 = _mm_xor_si128(tt, _mm_xor_si128(xa0, one)); \ } while (0) PP(A[0x0], A[0xB], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x1], A[0x0], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0x2], A[0x1], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0x3], A[0x2], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x4], A[0x3], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x5], A[0x4], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0x6], A[0x5], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0x7], A[0x6], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x8], A[0x7], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x9], A[0x8], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0xA], A[0x9], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0xB], A[0xA], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x0], A[0xB], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x1], A[0x0], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0x2], A[0x1], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0x3], A[0x2], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); PP(A[0x4], A[0x3], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x5], A[0x4], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0x6], A[0x5], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0x7], A[0x6], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x8], A[0x7], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x9], A[0x8], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0xA], A[0x9], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0xB], A[0xA], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x0], A[0xB], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x1], A[0x0], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0x2], A[0x1], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0x3], A[0x2], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x4], A[0x3], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); 
PP(A[0x5], A[0x4], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0x6], A[0x5], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0x7], A[0x6], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); PP(A[0x8], A[0x7], B[0x0], B[0xD], B[0x9], B[0x6], C[0x8], M(0x0)); PP(A[0x9], A[0x8], B[0x1], B[0xE], B[0xA], B[0x7], C[0x7], M(0x1)); PP(A[0xA], A[0x9], B[0x2], B[0xF], B[0xB], B[0x8], C[0x6], M(0x2)); PP(A[0xB], A[0xA], B[0x3], B[0x0], B[0xC], B[0x9], C[0x5], M(0x3)); PP(A[0x0], A[0xB], B[0x4], B[0x1], B[0xD], B[0xA], C[0x4], M(0x4)); PP(A[0x1], A[0x0], B[0x5], B[0x2], B[0xE], B[0xB], C[0x3], M(0x5)); PP(A[0x2], A[0x1], B[0x6], B[0x3], B[0xF], B[0xC], C[0x2], M(0x6)); PP(A[0x3], A[0x2], B[0x7], B[0x4], B[0x0], B[0xD], C[0x1], M(0x7)); PP(A[0x4], A[0x3], B[0x8], B[0x5], B[0x1], B[0xE], C[0x0], M(0x8)); PP(A[0x5], A[0x4], B[0x9], B[0x6], B[0x2], B[0xF], C[0xF], M(0x9)); PP(A[0x6], A[0x5], B[0xA], B[0x7], B[0x3], B[0x0], C[0xE], M(0xA)); PP(A[0x7], A[0x6], B[0xB], B[0x8], B[0x4], B[0x1], C[0xD], M(0xB)); PP(A[0x8], A[0x7], B[0xC], B[0x9], B[0x5], B[0x2], C[0xC], M(0xC)); PP(A[0x9], A[0x8], B[0xD], B[0xA], B[0x6], B[0x3], C[0xB], M(0xD)); PP(A[0xA], A[0x9], B[0xE], B[0xB], B[0x7], B[0x4], C[0xA], M(0xE)); PP(A[0xB], A[0xA], B[0xF], B[0xC], B[0x8], B[0x5], C[0x9], M(0xF)); A[0xB] = _mm_add_epi32(A[0xB], C[0x6]); A[0xA] = _mm_add_epi32(A[0xA], C[0x5]); A[0x9] = _mm_add_epi32(A[0x9], C[0x4]); A[0x8] = _mm_add_epi32(A[0x8], C[0x3]); A[0x7] = _mm_add_epi32(A[0x7], C[0x2]); A[0x6] = _mm_add_epi32(A[0x6], C[0x1]); A[0x5] = _mm_add_epi32(A[0x5], C[0x0]); A[0x4] = _mm_add_epi32(A[0x4], C[0xF]); A[0x3] = _mm_add_epi32(A[0x3], C[0xE]); A[0x2] = _mm_add_epi32(A[0x2], C[0xD]); A[0x1] = _mm_add_epi32(A[0x1], C[0xC]); A[0x0] = _mm_add_epi32(A[0x0], C[0xB]); A[0xB] = _mm_add_epi32(A[0xB], C[0xA]); A[0xA] = _mm_add_epi32(A[0xA], C[0x9]); A[0x9] = _mm_add_epi32(A[0x9], C[0x8]); A[0x8] = _mm_add_epi32(A[0x8], C[0x7]); A[0x7] = _mm_add_epi32(A[0x7], C[0x6]); A[0x6] = _mm_add_epi32(A[0x6], C[0x5]); A[0x5] = _mm_add_epi32(A[0x5], C[0x4]); A[0x4] = _mm_add_epi32(A[0x4], C[0x3]); A[0x3] = _mm_add_epi32(A[0x3], C[0x2]); A[0x2] = _mm_add_epi32(A[0x2], C[0x1]); A[0x1] = _mm_add_epi32(A[0x1], C[0x0]); A[0x0] = _mm_add_epi32(A[0x0], C[0xF]); A[0xB] = _mm_add_epi32(A[0xB], C[0xE]); A[0xA] = _mm_add_epi32(A[0xA], C[0xD]); A[0x9] = _mm_add_epi32(A[0x9], C[0xC]); A[0x8] = _mm_add_epi32(A[0x8], C[0xB]); A[0x7] = _mm_add_epi32(A[0x7], C[0xA]); A[0x6] = _mm_add_epi32(A[0x6], C[0x9]); A[0x5] = _mm_add_epi32(A[0x5], C[0x8]); A[0x4] = _mm_add_epi32(A[0x4], C[0x7]); A[0x3] = _mm_add_epi32(A[0x3], C[0x6]); A[0x2] = _mm_add_epi32(A[0x2], C[0x5]); A[0x1] = _mm_add_epi32(A[0x1], C[0x4]); A[0x0] = _mm_add_epi32(A[0x0], C[0x3]); #define SWAP_AND_SUB(xb, xc, xm) do { \ __m128i tmp; \ tmp = xb; \ xb = _mm_sub_epi32(xc, xm); \ xc = tmp; \ } while (0) SWAP_AND_SUB(B[0x0], C[0x0], M(0x0)); SWAP_AND_SUB(B[0x1], C[0x1], M(0x1)); SWAP_AND_SUB(B[0x2], C[0x2], M(0x2)); SWAP_AND_SUB(B[0x3], C[0x3], M(0x3)); SWAP_AND_SUB(B[0x4], C[0x4], M(0x4)); SWAP_AND_SUB(B[0x5], C[0x5], M(0x5)); SWAP_AND_SUB(B[0x6], C[0x6], M(0x6)); SWAP_AND_SUB(B[0x7], C[0x7], M(0x7)); SWAP_AND_SUB(B[0x8], C[0x8], M(0x8)); SWAP_AND_SUB(B[0x9], C[0x9], M(0x9)); SWAP_AND_SUB(B[0xA], C[0xA], M(0xA)); SWAP_AND_SUB(B[0xB], C[0xB], M(0xB)); SWAP_AND_SUB(B[0xC], C[0xC], M(0xC)); SWAP_AND_SUB(B[0xD], C[0xD], M(0xD)); SWAP_AND_SUB(B[0xE], C[0xE], M(0xE)); SWAP_AND_SUB(B[0xF], C[0xF], M(0xF)); buf0 += 64; buf1 += 64; buf2 += 64; buf3 += 64; if (++sc->Wlow == 0) sc->Whigh++; } for (j = 
0; j < 12; j++) _mm_storeu_si128((__m128i *)sc->state + j, A[j]); for (j = 0; j < 16; j++) { _mm_storeu_si128((__m128i *)sc->state + j + 12, B[j]); _mm_storeu_si128((__m128i *)sc->state + j + 28, C[j]); } #undef M }