GF2X_STORAGE_CLASS_mul_1_n
unsigned long
gf2x_mul_1_n (unsigned long *cp, const unsigned long *bp, long sb, unsigned long a)
{
    long i;
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;
    __v2di y = (__v2di) { a, a };
    __v2di x;
    __v2di_proxy cc;

    // do two words at a time
    for (i = 0; i + 2 < sb; i += 2) {
        x = (__v2di) { bp[i], bp[i+1] };
        cc.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            cp[i] = cc.x[0];
        else
            cp[i] ^= cc.x[0];
        cp[i+1] = cc.x[1];
        cc.s = _mm_clmulepi64_si128(x, y, 1);
        cp[i+1] ^= cc.x[0];
        cp[i+2] = cc.x[1];
    }

    // the last step is different, to handle the carry out
    unsigned long cy;
    if (i == sb - 2) {
        // case sb is even
        x = (__v2di) { bp[i], bp[i+1] };
        cc.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            cp[i] = cc.x[0];
        else
            cp[i] ^= cc.x[0];
        cp[i+1] = cc.x[1];
        cc.s = _mm_clmulepi64_si128(x, y, 1);
        cp[i+1] ^= cc.x[0];
        cy = cc.x[1];
    } else {
        // case sb is odd
        x = (__v2di) { bp[i], 0 };
        cc.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            cp[i] = cc.x[0];
        else
            cp[i] ^= cc.x[0];
        cy = cc.x[1];
    }
    return cy;
}
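/*
 * A hypothetical driver for gf2x_mul_1_n (editorial sketch, not from gf2x;
 * buffer contents are arbitrary).  Multiplying an sb-word polynomial by one
 * word fills sb words of cp and returns the carry-out word, so the full
 * product occupies sb + 1 words.
 */
#include <stdio.h>

int main(void)
{
    unsigned long b[3] = { 0x3UL, 0x5UL, 0x9UL };
    unsigned long c[4];

    c[3] = gf2x_mul_1_n(c, b, 3, 0x7UL);   /* b(x) times x^2 + x + 1 */
    for (int i = 0; i < 4; i++)
        printf("c[%d] = %lx\n", i, c[i]);
    return 0;
}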
/** Performs a carryless multiplication of two 128-bit integers modulo
 *  \f$ x^{128} + x^7 + x^2 + x + 1 \f$ */
static __m128i gmul(__m128i v, __m128i h)
{
    /* multiply */
    __m128i z0, z1, z2, tmp;
    z0 = _mm_clmulepi64_si128(v, h, 0x11);
    z2 = _mm_clmulepi64_si128(v, h, 0x00);
    __m128i tmpv = _mm_srli_si128(v, 8);
    tmpv = _mm_xor_si128(tmpv, v);
    __m128i tmph = _mm_srli_si128(h, 8);
    tmph = _mm_xor_si128(tmph, h);
    z1 = _mm_clmulepi64_si128(tmpv, tmph, 0x00);
    z1 = _mm_xor_si128(z1, z0);
    z1 = _mm_xor_si128(z1, z2);
    tmp = _mm_srli_si128(z1, 8);
    __m128i pl = _mm_xor_si128(z0, tmp);
    tmp = _mm_slli_si128(z1, 8);
    __m128i ph = _mm_xor_si128(z2, tmp);
    tmp = _mm_srli_epi64(ph, 63);
    tmp = _mm_srli_si128(tmp, 8);
    pl = shl(pl, 1);
    pl = _mm_xor_si128(pl, tmp);
    ph = shl(ph, 1);
    /* reduce */
    __m128i b, c;
    b = c = _mm_slli_si128(ph, 8);
    b = _mm_slli_epi64(b, 62);
    c = _mm_slli_epi64(c, 57);
    tmp = _mm_xor_si128(b, c);
    __m128i d = _mm_xor_si128(ph, tmp);
    __m128i e = shr(d, 1);
    __m128i f = shr(d, 2);
    __m128i g = shr(d, 7);
    pl = _mm_xor_si128(pl, d);
    pl = _mm_xor_si128(pl, e);
    pl = _mm_xor_si128(pl, f);
    pl = _mm_xor_si128(pl, g);
    return pl;
}
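/*
 * gmul() above relies on shl() and shr(), which are not SSE intrinsics but
 * project helpers.  A minimal sketch of what such helpers typically look
 * like (an editorial assumption, not the original source): full 128-bit
 * shifts by n < 64 bits, carrying the bits across the 64-bit lane boundary
 * that _mm_slli_epi64 / _mm_srli_epi64 cannot cross on their own.
 */
static inline __m128i shl(__m128i v, int n)
{
    /* bits leaving the top of the low lane enter the bottom of the high lane */
    __m128i carry = _mm_slli_si128(_mm_srli_epi64(v, 64 - n), 8);
    return _mm_or_si128(_mm_slli_epi64(v, n), carry);
}

static inline __m128i shr(__m128i v, int n)
{
    /* bits leaving the bottom of the high lane enter the top of the low lane */
    __m128i carry = _mm_srli_si128(_mm_slli_epi64(v, 64 - n), 8);
    return _mm_or_si128(_mm_srli_epi64(v, n), carry);
}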
void gf2x_mul2(unsigned long * t, unsigned long const * s1, unsigned long const * s2)
{
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;
    __v2di ss1, ss2, s1s, s2s;
    __v2di_proxy t00, tk;
#ifndef BORROW
    __v2di_proxy t11;
#endif
    ss1 = _mm_loadu_si128((__v2di *)s1);
    ss2 = _mm_loadu_si128((__v2di *)s2);
    t00.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    t11.s = _mm_clmulepi64_si128(ss1, ss2, 17);   /* 17 == 0x11: high halves */
#endif
    s1s = _mm_shuffle_epi32(ss1, 78);
    ss1 ^= s1s;
    s2s = _mm_shuffle_epi32(ss2, 78);
    ss2 ^= s2s;
    tk.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    tk.s ^= t00.s ^ t11.s;
#endif
    /* store result; in the BORROW/CARRY build variants, c is the
       caller-supplied carry buffer from the alternate prototype */
    t[0] = t00.x[0];
#ifdef BORROW
    t[1] = t00.x[1] ^ tk.x[0] ^ t00.x[0] ^ c[0];
    t[2] = c[0] ^ tk.x[1] ^ t00.x[1] ^ c[1];
    t[3] = c[1];
#else
    t[1] = t00.x[1] ^ tk.x[0];
    t[2] = t11.x[0] ^ tk.x[1];
    t[3] = t11.x[1];
#endif
#ifdef CARRY
    c[0] = t11.x[0];
    c[1] = t11.x[1];
#endif
}
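/*
 * Editorial note on gf2x_mul2 above: it is one Karatsuba step over
 * GF(2)[x].  Writing the operands as a = a1*x^64 + a0 and b = b1*x^64 + b0,
 * three PCLMULs suffice because
 *
 *   a*b = a1*b1*x^128
 *       + ((a0^a1)*(b0^b1) ^ a0*b0 ^ a1*b1)*x^64
 *       + a0*b0
 *
 * where ^ is XOR (addition in GF(2)).  In the code, t00 = a0*b0,
 * t11 = a1*b1, and tk holds the middle coefficient after the line
 * tk.s ^= t00.s ^ t11.s.
 */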
GF2X_STORAGE_CLASS_addmul_1_n
unsigned long
gf2x_addmul_1_n (unsigned long *dp, const unsigned long *cp, const unsigned long *bp, long sb, unsigned long a)
{
    long i;
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;
    __v2di y = (__v2di) { a, a };
    __v2di x;
    __v2di_proxy dd;

    // do two words at a time
    for (i = 0; i + 2 < sb; i += 2) {
        x = (__v2di) { bp[i], bp[i+1] };
        dd.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            dp[i] = cp[i] ^ dd.x[0];
        else
            dp[i] ^= dd.x[0];
        dp[i+1] = cp[i+1] ^ dd.x[1];
        dd.s = _mm_clmulepi64_si128(x, y, 1);
        dp[i+1] ^= dd.x[0];
        dp[i+2] = cp[i+2] ^ dd.x[1];
    }

    unsigned long cy;
    if (i == sb - 2) {
        // case sb is even
        x = (__v2di) { bp[i], bp[i+1] };
        dd.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            dp[i] = cp[i] ^ dd.x[0];
        else
            dp[i] ^= dd.x[0];
        dp[i+1] = cp[i+1] ^ dd.x[1];
        dd.s = _mm_clmulepi64_si128(x, y, 1);
        dp[i+1] ^= dd.x[0];
        cy = dd.x[1];
    } else {
        // case sb is odd
        x = (__v2di) { bp[i], 0 };
        dd.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            dp[i] = cp[i] ^ dd.x[0];
        else
            dp[i] ^= dd.x[0];
        cy = dd.x[1];
    }
    return cy;
}
inline __m128i gcm_multiply(const __m128i& H, const __m128i& x)
{
    __m128i T0, T1, T2, T3;

    T0 = _mm_clmulepi64_si128(x, H, 0x11);
    T1 = _mm_clmulepi64_si128(x, H, 0x10);
    T2 = _mm_clmulepi64_si128(x, H, 0x01);
    T3 = _mm_clmulepi64_si128(x, H, 0x00);

    T1 = _mm_xor_si128(T1, T2);
    T0 = _mm_xor_si128(T0, _mm_srli_si128(T1, 8));
    T3 = _mm_xor_si128(T3, _mm_slli_si128(T1, 8));

    return gcm_reduce(T0, T3);
}
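/*
 * gcm_multiply() above (and gcm_multiply_x4 later in this section) assumes
 * a gcm_reduce() helper that is not shown.  The sketch below is an
 * editorial reconstruction from the reduction tail of gcm_multiply_clmul
 * at the end of this section, not necessarily the library's exact code;
 * the first argument is the high 128 bits of the 256-bit product, the
 * second the low 128 bits, matching the call sites.
 */
static inline __m128i gcm_reduce(__m128i hi, __m128i lo)
{
    __m128i T2, T4, T5;

    /* shift the 256-bit product left by one bit */
    T4 = _mm_srli_epi32(lo, 31);
    lo = _mm_slli_epi32(lo, 1);
    T5 = _mm_srli_epi32(hi, 31);
    hi = _mm_slli_epi32(hi, 1);
    T2 = _mm_srli_si128(T4, 12);
    T5 = _mm_slli_si128(T5, 4);
    T4 = _mm_slli_si128(T4, 4);
    lo = _mm_or_si128(lo, T4);
    hi = _mm_or_si128(hi, T5);
    hi = _mm_or_si128(hi, T2);

    /* reduce modulo x^128 + x^7 + x^2 + x + 1 */
    T4 = _mm_slli_epi32(lo, 31);
    T5 = _mm_slli_epi32(lo, 30);
    T2 = _mm_slli_epi32(lo, 25);
    T4 = _mm_xor_si128(T4, T5);
    T4 = _mm_xor_si128(T4, T2);
    T5 = _mm_srli_si128(T4, 4);
    hi = _mm_xor_si128(hi, T5);
    T4 = _mm_slli_si128(T4, 12);
    lo = _mm_xor_si128(lo, T4);
    hi = _mm_xor_si128(hi, lo);

    T4 = _mm_srli_epi32(lo, 1);
    T5 = _mm_srli_epi32(lo, 2);
    T2 = _mm_srli_epi32(lo, 7);
    hi = _mm_xor_si128(hi, T4);
    hi = _mm_xor_si128(hi, T5);
    hi = _mm_xor_si128(hi, T2);
    return hi;
}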
/* TODO: if somebody comes up with a neat way to improve the interface so
 * as to remove the false dependency on pclmul, that would be nice.
 */
static inline __v2di
GF2X_FUNC(mul4cl1_mul1) (unsigned long a, unsigned long b)
{
    __v2di aa = (__v2di) { a, 0 };
    __v2di bb = (__v2di) { b, 0 };
    return _mm_clmulepi64_si128(aa, bb, 0);
}
GF2X_STORAGE_CLASS_mul1
void gf2x_mul1 (unsigned long *c, unsigned long a, unsigned long b)
{
    __v2di aa = (__v2di) { a, 0 };
    __v2di bb = (__v2di) { b, 0 };
    _mm_storeu_si128((__v2di *)c, _mm_clmulepi64_si128(aa, bb, 0));
}
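/*
 * A hypothetical sanity check for gf2x_mul1 (editorial, not from gf2x):
 * in GF(2)[x], 3 * 3 = (x+1)*(x+1) = x^2 + 1 = 5 because there is no
 * carry propagation, and the high word of this small product is zero.
 */
#include <assert.h>

static void check_mul1(void)
{
    unsigned long c[2];
    gf2x_mul1(c, 3UL, 3UL);
    assert(c[0] == 5UL && c[1] == 0UL);
}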
static inline void
pclmul_mul1 (unsigned long *c, unsigned long a, unsigned long b)
{
    __m128i aa = _mm_setr_epi64(_mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0));
    __m128i bb = _mm_setr_epi64(_mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
    _mm_storeu_si128((__m128i *)c, _mm_clmulepi64_si128(aa, bb, 0));
}
void INIT_Htable(uint8_t Htbl[16*8], uint8_t *H)
{
    int i;
    __m128i T, TMP0, TMP1, TMP2, TMP3, TMP4, POLY;

    POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);
    T = _mm_loadu_si128((__m128i*)H);
    TMP0 = T;
    _mm_storeu_si128(&((__m128i*)Htbl)[0], T);
    /* Htbl[i] receives H^(i+1): each pass multiplies T = H^i by H
       (schoolbook, four PCLMULs) and then runs two reduction rounds */
    for (i = 1; i < 8; i++) {
        TMP1 = _mm_clmulepi64_si128(T, TMP0, 0x00);
        TMP4 = _mm_clmulepi64_si128(T, TMP0, 0x11);
        TMP2 = _mm_clmulepi64_si128(T, TMP0, 0x10);
        TMP3 = _mm_clmulepi64_si128(T, TMP0, 0x01);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_slli_si128(TMP2, 8);
        TMP2 = _mm_srli_si128(TMP2, 8);
        TMP1 = _mm_xor_si128(TMP3, TMP1);
        TMP4 = _mm_xor_si128(TMP4, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        T = _mm_xor_si128(TMP4, TMP1);
        _mm_storeu_si128(&((__m128i*)Htbl)[i], T);
    }
}
int pclmul() {
    __ma128i v1;
    __ma128i v2;
    for (int i = 1; i >= 0; i--) {
        v1.ui64[i] = 3;
        v2.ui64[i] = 3;
    }
    __ma128i v3;
    v3.i = _mm_clmulepi64_si128(v1.i, v2.i, 0);
    /* carryless 3 * 3 is (x+1)*(x+1) = x^2 + 1 = 5 */
    if (v3.ui64[0] != 5) {
        printf("Expected: %d result: %llu\n", 5, (unsigned long long)v3.ui64[0]);
        return -1;
    }
    return 0;
}
void
test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);                 /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);           /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);          /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);   /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);   /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4);  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);              /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);      /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
}
uint32_t
crc32_hw_aligned(uint32_t remainder, const __m128i *p, size_t vec_count)
{
    /* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
    const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95);
    const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA);
    const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191);
    const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765);
    const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
    const __m128i barrett_reduction_constants =
            _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);

    const __m128i * const end = p + vec_count;
    const __m128i * const end512 = p + (vec_count & ~3);
    __m128i x0, x1, x2, x3;

    /*
     * Account for the current 'remainder', i.e. the CRC of the part of
     * the message already processed.  Explanation: rewrite the message
     * polynomial M(x) in terms of the first part A(x), the second part
     * B(x), and the length of the second part in bits |B(x)| >= 32:
     *
     *    M(x) = A(x)*x^|B(x)| + B(x)
     *
     * Then the CRC of M(x) is:
     *
     *    CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
     *              = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
     *              = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
     *
     * Note: all arithmetic is modulo G(x), the generator polynomial; that's
     * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
     *
     * So the CRC of the full message is the CRC of the second part of the
     * message where the first 32 bits of the second part of the message
     * have been XOR'ed with the CRC of the first part of the message.
     */
    x0 = *p++;
    x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder));

    if (p > end512)  /* only 128, 256, or 384 bits of input? */
        goto _128_bits_at_a_time;

    x1 = *p++;
    x2 = *p++;
    x3 = *p++;

    /* Fold 512 bits at a time */
    for (; p != end512; p += 4) {
        __m128i y0, y1, y2, y3;

        y0 = p[0];
        y1 = p[1];
        y2 = p[2];
        y3 = p[3];

        /*
         * Note: the immediate constant for PCLMULQDQ specifies which
         * 64-bit halves of the 128-bit vectors to multiply:
         *
         * 0x00 means low halves (higher degree polynomial terms for us)
         * 0x11 means high halves (lower degree polynomial terms for us)
         */
        y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00));
        y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00));
        y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00));
        y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00));
        y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11));
        y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11));
        y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11));
        y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11));

        x0 = y0;
        x1 = y1;
        x2 = y2;
        x3 = y3;
    }

    /* Fold 512 bits => 128 bits */
    x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00));
    x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11));
    x0 = x3;

_128_bits_at_a_time:
    while (p != end) {
        /* Fold 128 bits into next 128 bits */
        x1 = *p++;
        x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00));
        x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11));
        x0 = x1;
    }

    /* Now there are just 128 bits left, stored in 'x0'. */

    /*
     * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
     * which is equivalent to multiplying by x^32.  This is needed because
     * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
     */
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
                       _mm_clmulepi64_si128(x0, multipliers_1, 0x10));

    /* Fold 96 => 64 bits */
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
                       _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
                                            final_multiplier, 0x00));

    /*
     * Finally, reduce 64 => 32 bits using Barrett reduction.
     *
     * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
     * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
     *
     *    R(x) = (A(x)*x^32 + B(x)) mod G(x)
     *         = (A(x)*x^32) mod G(x) + B(x)
     *
     * Then, by the Division Algorithm there exists a unique q(x) such that:
     *
     *    A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
     *
     * Since the left-hand side is of maximum degree 31, the right-hand side
     * must be too.  This implies that we can apply 'mod x^32' to the
     * right-hand side without changing its value:
     *
     *    (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
     *
     * Note that '+' is equivalent to '-' in polynomials over GF(2).
     *
     * We also know that:
     *
     *           / A(x)*x^32 \
     *    q(x) = floor ( --------- )
     *           \   G(x)   /
     *
     * To compute this efficiently, we can multiply the top and bottom by
     * x^32 and move the division by G(x) to the top:
     *
     *           / A(x) * floor(x^64 / G(x)) \
     *    q(x) = floor ( ------------------------- )
     *           \          x^32           /
     *
     * Note that floor(x^64 / G(x)) is a constant.
     *
     * So finally we have:
     *
     *                  / A(x) * floor(x^64 / G(x)) \
     *    R(x) = B(x) + G(x)*floor ( ------------------------- )
     *                  \          x^32           /
     */
    x1 = x0;
    x0 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
                              barrett_reduction_constants, 0x00);
    x0 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
                              barrett_reduction_constants, 0x10);
    return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4));
}
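/*
 * A hypothetical wrapper (editorial, not from the original source) showing
 * one way crc32_hw_aligned() might be driven, assuming the usual zlib-style
 * convention of pre- and post-inverting the CRC state; 'buf' must be 16-byte
 * aligned and 'nbytes' a nonzero multiple of 16, as the function requires.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t crc32_of_aligned(const void *buf, size_t nbytes, uint32_t crc)
{
    return crc32_hw_aligned(crc ^ 0xFFFFFFFF,
                            (const __m128i *)buf,
                            nbytes / 16) ^ 0xFFFFFFFF;
}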
void Polyval_Htable(unsigned char* Htbl, unsigned char* inp, int length, unsigned char* POLYVAL)
{
    int remainder = 0;
    int rem_128 = (length % 128) - length % 16;
    int has_semi = length % 16;
    unsigned char* fixed_inp = inp;
    int i;
    uint8_t B[16] = {0};
    __m128i data, TMP0, TMP1, TMP2, TMP3, TMP4, T, Xhi, POLY;

    if (length == 0)
        return;

    Xhi = _mm_setzero_si128();
    POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);
    T = _mm_loadu_si128((__m128i*)POLYVAL);

    if ((length != 0) || (rem_128 != 0)) {
        if (rem_128 != 0) {
            fixed_inp += rem_128;
            remainder = rem_128 / 16;
            data = _mm_loadu_si128((__m128i*)inp);
            data = _mm_xor_si128(T, data);
            TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x01);
            TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x00);
            TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x11);
            TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x10);
            TMP2 = _mm_xor_si128(TMP2, TMP3);
            for (i = 1; i < (rem_128/16); i++) {
                data = _mm_loadu_si128(&((__m128i*)inp)[i]);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x00);
                TMP0 = _mm_xor_si128(TMP0, TMP3);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x11);
                TMP1 = _mm_xor_si128(TMP1, TMP3);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x01);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x10);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
            }
            TMP3 = _mm_srli_si128(TMP2, 8);
            TMP2 = _mm_slli_si128(TMP2, 8);
            Xhi = _mm_xor_si128(TMP3, TMP1);
            T = _mm_xor_si128(TMP0, TMP2);
            length -= rem_128;
        }
        length /= 16;
        i = 0;
        if (length != 0) {
            if (rem_128 == 0) {
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+7]);
                TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01);
                TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00);
                TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
                /* SCHOOLBOOK_AAD(data, H) is a macro defined elsewhere; it
                   accumulates the four half products of data and H into
                   TMP0, TMP1 and TMP2, matching the expanded form above */
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+6]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[1]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+5]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[2]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+4]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[3]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+3]);
                TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[4]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+2]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[5]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+1]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[6]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i]);
                data = _mm_xor_si128(T, data);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[7]);
                TMP3 = _mm_srli_si128(TMP2, 8);
                TMP2 = _mm_slli_si128(TMP2, 8);
                Xhi = _mm_xor_si128(TMP3, TMP1);
                T = _mm_xor_si128(TMP0, TMP2);
                i = 8;
            }
            for (; i < length; i = i + 8) {
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+7]);
                TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01);
                TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00);
                TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+6]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[1]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+5]);
                TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[2]);
                T = _mm_xor_si128(T, TMP4);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+4]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[3]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+3]);
                TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[4]);
                T = _mm_xor_si128(T, TMP4);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+2]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[5]);
                T = _mm_xor_si128(T, Xhi);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+1]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[6]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i]);
                data = _mm_xor_si128(T, data);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[7]);
                TMP3 = _mm_srli_si128(TMP2, 8);
                TMP2 = _mm_slli_si128(TMP2, 8);
                Xhi = _mm_xor_si128(TMP3, TMP1);
                T = _mm_xor_si128(TMP0, TMP2);
            }
            TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
            T = _mm_alignr_epi8(T, T, 8);
            T = _mm_xor_si128(TMP3, T);
            TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
            T = _mm_alignr_epi8(T, T, 8);
            T = _mm_xor_si128(TMP3, T);
            T = _mm_xor_si128(Xhi, T);
        } else {
            // length was < 16 and there were several blocks at the start:
            // need to finalize the reduction
            if (rem_128 != 0) {
                TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                T = _mm_xor_si128(TMP3, T);
                TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                T = _mm_xor_si128(TMP3, T);
                T = _mm_xor_si128(Xhi, T);
            }
        }
    }
    if (has_semi != 0) {
        memcpy(B, (uint8_t*)(&((__m128i*)fixed_inp)[i]), has_semi);
        data = _mm_loadu_si128((__m128i*)B);
        data = _mm_xor_si128(T, data);
        TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01);
        TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00);
        TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11);
        TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_srli_si128(TMP2, 8);
        TMP2 = _mm_slli_si128(TMP2, 8);
        Xhi = _mm_xor_si128(TMP3, TMP1);
        T = _mm_xor_si128(TMP0, TMP2);
        TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
        T = _mm_alignr_epi8(T, T, 8);
        T = _mm_xor_si128(TMP3, T);
        TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
        T = _mm_alignr_epi8(T, T, 8);
        T = _mm_xor_si128(TMP3, T);
        T = _mm_xor_si128(Xhi, T);
    }
    _mm_storeu_si128((__m128i*)POLYVAL, T);
}
__m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) {
  // CHECK: @llvm.x86.pclmulqdq
  return _mm_clmulepi64_si128(a, b, 0);
}
inline __m128i gcm_multiply_x4(const __m128i& H1, const __m128i& H2,
                               const __m128i& H3, const __m128i& H4,
                               const __m128i& X1, const __m128i& X2,
                               const __m128i& X3, const __m128i& X4)
{
    /*
     * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
     * and Pierre Laurent of Intel
     */
    const __m128i H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
    const __m128i H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
    const __m128i H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
    const __m128i H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

    const __m128i lo = _mm_xor_si128(_mm_xor_si128(H1_X1_lo, H2_X2_lo),
                                     _mm_xor_si128(H3_X3_lo, H4_X4_lo));

    const __m128i H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
    const __m128i H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
    const __m128i H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
    const __m128i H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

    const __m128i hi = _mm_xor_si128(_mm_xor_si128(H1_X1_hi, H2_X2_hi),
                                     _mm_xor_si128(H3_X3_hi, H4_X4_hi));

    __m128i T0 = _mm_xor_si128(lo, hi);
    __m128i T1, T2, T3, T4;

    T1 = _mm_xor_si128(_mm_srli_si128(H1, 8), H1);
    T2 = _mm_xor_si128(_mm_srli_si128(X1, 8), X1);
    T3 = _mm_xor_si128(_mm_srli_si128(H2, 8), H2);
    T4 = _mm_xor_si128(_mm_srli_si128(X2, 8), X2);

    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));

    T1 = _mm_xor_si128(_mm_srli_si128(H3, 8), H3);
    T2 = _mm_xor_si128(_mm_srli_si128(X3, 8), X3);
    T3 = _mm_xor_si128(_mm_srli_si128(H4, 8), H4);
    T4 = _mm_xor_si128(_mm_srli_si128(X4, 8), X4);

    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));

    T1 = _mm_xor_si128(_mm_srli_si128(T0, 8), hi);
    T2 = _mm_xor_si128(_mm_slli_si128(T0, 8), lo);

    return gcm_reduce(T1, T2);
}
void Polyval_Horner(unsigned char* TAG, unsigned char* pH, unsigned char* inp, int length)
{
    __m128i TMP0, TMP1, TMP2, TMP3, TMP4, T, POLY, H;
    int i = 0;

    if (length == 0)
        return;

    int has_semi = length % 16;
    uint8_t B[16] = {0};
    length /= 16;

    H = _mm_loadu_si128((__m128i*)pH);
    T = _mm_loadu_si128((__m128i*)TAG);
    POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);

    for (i = 0; i < length; i++) {
        T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
        TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
        TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
        TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
        TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_slli_si128(TMP2, 8);
        TMP2 = _mm_srli_si128(TMP2, 8);
        TMP1 = _mm_xor_si128(TMP3, TMP1);
        TMP4 = _mm_xor_si128(TMP4, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        T = _mm_xor_si128(TMP4, TMP1);
    }

    if (has_semi != 0) {
        memcpy(B, inp + length*16, has_semi);
        T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
        TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
        TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
        TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
        TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_slli_si128(TMP2, 8);
        TMP2 = _mm_srli_si128(TMP2, 8);
        TMP1 = _mm_xor_si128(TMP3, TMP1);
        TMP4 = _mm_xor_si128(TMP4, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        T = _mm_xor_si128(TMP4, TMP1);
    }

    _mm_storeu_si128((__m128i*)TAG, T);
}
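/*
 * Editorial note: Polyval_Horner above is a Horner evaluation.  With
 * message blocks m_1, ..., m_n and initial accumulator T_0, each
 * iteration computes T_i = (T_{i-1} ^ m_i) * H (reduced modulo the
 * POLYVAL polynomial), so the final tag expands to
 *
 *   T_n = m_n*H ^ m_{n-1}*H^2 ^ ... ^ m_1*H^n ^ T_0*H^n
 */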
void gcm_multiply_clmul(byte x[16], const byte H[16])
{
    /*
     * Algorithms 1 and 5 from Intel's CLMUL guide
     */
    const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                            8, 9, 10, 11, 12, 13, 14, 15);

    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H));

    a = _mm_shuffle_epi8(a, BSWAP_MASK);
    b = _mm_shuffle_epi8(b, BSWAP_MASK);

    __m128i T0, T1, T2, T3, T4, T5;

    T0 = _mm_clmulepi64_si128(a, b, 0x00);
    T1 = _mm_clmulepi64_si128(a, b, 0x01);
    T2 = _mm_clmulepi64_si128(a, b, 0x10);
    T3 = _mm_clmulepi64_si128(a, b, 0x11);

    T1 = _mm_xor_si128(T1, T2);
    T2 = _mm_slli_si128(T1, 8);
    T1 = _mm_srli_si128(T1, 8);
    T0 = _mm_xor_si128(T0, T2);
    T3 = _mm_xor_si128(T3, T1);

    T4 = _mm_srli_epi32(T0, 31);
    T0 = _mm_slli_epi32(T0, 1);
    T5 = _mm_srli_epi32(T3, 31);
    T3 = _mm_slli_epi32(T3, 1);

    T2 = _mm_srli_si128(T4, 12);
    T5 = _mm_slli_si128(T5, 4);
    T4 = _mm_slli_si128(T4, 4);
    T0 = _mm_or_si128(T0, T4);
    T3 = _mm_or_si128(T3, T5);
    T3 = _mm_or_si128(T3, T2);

    T4 = _mm_slli_epi32(T0, 31);
    T5 = _mm_slli_epi32(T0, 30);
    T2 = _mm_slli_epi32(T0, 25);

    T4 = _mm_xor_si128(T4, T5);
    T4 = _mm_xor_si128(T4, T2);
    T5 = _mm_srli_si128(T4, 4);
    T3 = _mm_xor_si128(T3, T5);
    T4 = _mm_slli_si128(T4, 12);
    T0 = _mm_xor_si128(T0, T4);
    T3 = _mm_xor_si128(T3, T0);

    T4 = _mm_srli_epi32(T0, 1);
    T1 = _mm_srli_epi32(T0, 2);
    T2 = _mm_srli_epi32(T0, 7);
    T3 = _mm_xor_si128(T3, T1);
    T3 = _mm_xor_si128(T3, T2);
    T3 = _mm_xor_si128(T3, T4);

    T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(x), T3);
}