GF2X_STORAGE_CLASS_mul_1_n
unsigned long
gf2x_mul_1_n (unsigned long *cp, const unsigned long *bp, long sb, unsigned long a)
{
    long i;
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;
    __v2di y = (__v2di) { a, a };
    __v2di x;
    __v2di_proxy cc;

    // do two words at a time
    for (i = 0; i + 2 < sb; i += 2) {
        x = (__v2di) { bp[i], bp[i+1] };
        cc.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            cp[i] = cc.x[0];
        else
            cp[i] ^= cc.x[0];
        cp[i+1] = cc.x[1];
        cc.s = _mm_clmulepi64_si128(x, y, 1);
        cp[i+1] ^= cc.x[0];
        cp[i+2] = cc.x[1];
    }

    // the last step is different, to handle the carry out
    unsigned long cy;
    if (i == sb - 2) {
        // case sb is even
        x = (__v2di) { bp[i], bp[i+1] };
        cc.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            cp[i] = cc.x[0];
        else
            cp[i] ^= cc.x[0];
        cp[i+1] = cc.x[1];
        cc.s = _mm_clmulepi64_si128(x, y, 1);
        cp[i+1] ^= cc.x[0];
        cy = cc.x[1];
    } else {
        // case sb is odd
        x = (__v2di) { bp[i], 0 };
        cc.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            cp[i] = cc.x[0];
        else
            cp[i] ^= cc.x[0];
        cy = cc.x[1];
    }
    return cy;
}
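/*
 * A hypothetical driver for gf2x_mul_1_n (editorial sketch, not from gf2x;
 * buffer contents are arbitrary).  Multiplying an sb-word polynomial by one
 * word fills sb words of cp and returns the carry-out word, so the full
 * product occupies sb + 1 words.
 */
#include <stdio.h>

int main(void)
{
    unsigned long b[3] = { 0x3UL, 0x5UL, 0x9UL };
    unsigned long c[4];

    c[3] = gf2x_mul_1_n(c, b, 3, 0x7UL);   /* b(x) times x^2 + x + 1 */
    for (int i = 0; i < 4; i++)
        printf("c[%d] = %lx\n", i, c[i]);
    return 0;
}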
/** Performs a carryless multiplication of two 128-bit integers modulo
 *  \f$ x^{128} + x^7 + x^2 + x + 1 \f$ */
static __m128i gmul(__m128i v, __m128i h)
{
    /* multiply */
    __m128i z0, z1, z2, tmp;
    z0 = _mm_clmulepi64_si128(v, h, 0x11);
    z2 = _mm_clmulepi64_si128(v, h, 0x00);
    __m128i tmpv = _mm_srli_si128(v, 8);
    tmpv = _mm_xor_si128(tmpv, v);
    __m128i tmph = _mm_srli_si128(h, 8);
    tmph = _mm_xor_si128(tmph, h);
    z1 = _mm_clmulepi64_si128(tmpv, tmph, 0x00);
    z1 = _mm_xor_si128(z1, z0);
    z1 = _mm_xor_si128(z1, z2);
    tmp = _mm_srli_si128(z1, 8);
    __m128i pl = _mm_xor_si128(z0, tmp);
    tmp = _mm_slli_si128(z1, 8);
    __m128i ph = _mm_xor_si128(z2, tmp);
    tmp = _mm_srli_epi64(ph, 63);
    tmp = _mm_srli_si128(tmp, 8);
    pl = shl(pl, 1);
    pl = _mm_xor_si128(pl, tmp);
    ph = shl(ph, 1);
    /* reduce */
    __m128i b, c;
    b = c = _mm_slli_si128(ph, 8);
    b = _mm_slli_epi64(b, 62);
    c = _mm_slli_epi64(c, 57);
    tmp = _mm_xor_si128(b, c);
    __m128i d = _mm_xor_si128(ph, tmp);
    __m128i e = shr(d, 1);
    __m128i f = shr(d, 2);
    __m128i g = shr(d, 7);
    pl = _mm_xor_si128(pl, d);
    pl = _mm_xor_si128(pl, e);
    pl = _mm_xor_si128(pl, f);
    pl = _mm_xor_si128(pl, g);
    return pl;
}
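/*
 * gmul() above relies on shl() and shr(), which are not SSE intrinsics but
 * project helpers.  A minimal sketch of what such helpers typically look
 * like (an editorial assumption, not the original source): full 128-bit
 * shifts by n < 64 bits, carrying the bits across the 64-bit lane boundary
 * that _mm_slli_epi64 / _mm_srli_epi64 cannot cross on their own.
 */
static inline __m128i shl(__m128i v, int n)
{
    /* bits leaving the top of the low lane enter the bottom of the high lane */
    __m128i carry = _mm_slli_si128(_mm_srli_epi64(v, 64 - n), 8);
    return _mm_or_si128(_mm_slli_epi64(v, n), carry);
}

static inline __m128i shr(__m128i v, int n)
{
    /* bits leaving the bottom of the high lane enter the top of the low lane */
    __m128i carry = _mm_srli_si128(_mm_slli_epi64(v, 64 - n), 8);
    return _mm_or_si128(_mm_srli_epi64(v, n), carry);
}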
void gf2x_mul2(unsigned long * t, unsigned long const * s1, unsigned long const * s2)
{
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;
    __v2di ss1, ss2, s1s, s2s;
    __v2di_proxy t00, tk;
#ifndef BORROW
    __v2di_proxy t11;
#endif
    ss1 = _mm_loadu_si128((__v2di *)s1);
    ss2 = _mm_loadu_si128((__v2di *)s2);
    t00.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    t11.s = _mm_clmulepi64_si128(ss1, ss2, 17);   /* 17 == 0x11: high halves */
#endif
    s1s = _mm_shuffle_epi32(ss1, 78);
    ss1 ^= s1s;
    s2s = _mm_shuffle_epi32(ss2, 78);
    ss2 ^= s2s;
    tk.s = _mm_clmulepi64_si128(ss1, ss2, 0);
#ifndef BORROW
    tk.s ^= t00.s ^ t11.s;
#endif
    /* store result; in the BORROW/CARRY build variants, c is the
       caller-supplied carry buffer from the alternate prototype */
    t[0] = t00.x[0];
#ifdef BORROW
    t[1] = t00.x[1] ^ tk.x[0] ^ t00.x[0] ^ c[0];
    t[2] = c[0] ^ tk.x[1] ^ t00.x[1] ^ c[1];
    t[3] = c[1];
#else
    t[1] = t00.x[1] ^ tk.x[0];
    t[2] = t11.x[0] ^ tk.x[1];
    t[3] = t11.x[1];
#endif
#ifdef CARRY
    c[0] = t11.x[0];
    c[1] = t11.x[1];
#endif
}
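/*
 * Editorial note on gf2x_mul2 above: it is one Karatsuba step over
 * GF(2)[x].  Writing the operands as a = a1*x^64 + a0 and b = b1*x^64 + b0,
 * three PCLMULs suffice because
 *
 *   a*b = a1*b1*x^128
 *       + ((a0^a1)*(b0^b1) ^ a0*b0 ^ a1*b1)*x^64
 *       + a0*b0
 *
 * where ^ is XOR (addition in GF(2)).  In the code, t00 = a0*b0,
 * t11 = a1*b1, and tk holds the middle coefficient after the line
 * tk.s ^= t00.s ^ t11.s.
 */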
GF2X_STORAGE_CLASS_addmul_1_n
unsigned long
gf2x_addmul_1_n (unsigned long *dp, const unsigned long *cp, const unsigned long *bp, long sb, unsigned long a)
{
    long i;
    typedef union {
        __v2di s;
        unsigned long x[2];
    } __v2di_proxy;
    __v2di y = (__v2di) { a, a };
    __v2di x;
    __v2di_proxy dd;

    // do two words at a time
    for (i = 0; i + 2 < sb; i += 2) {
        x = (__v2di) { bp[i], bp[i+1] };
        dd.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            dp[i] = cp[i] ^ dd.x[0];
        else
            dp[i] ^= dd.x[0];
        dp[i+1] = cp[i+1] ^ dd.x[1];
        dd.s = _mm_clmulepi64_si128(x, y, 1);
        dp[i+1] ^= dd.x[0];
        dp[i+2] = cp[i+2] ^ dd.x[1];
    }

    unsigned long cy;
    if (i == sb - 2) {
        // case sb is even
        x = (__v2di) { bp[i], bp[i+1] };
        dd.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            dp[i] = cp[i] ^ dd.x[0];
        else
            dp[i] ^= dd.x[0];
        dp[i+1] = cp[i+1] ^ dd.x[1];
        dd.s = _mm_clmulepi64_si128(x, y, 1);
        dp[i+1] ^= dd.x[0];
        cy = dd.x[1];
    } else {
        // case sb is odd
        x = (__v2di) { bp[i], 0 };
        dd.s = _mm_clmulepi64_si128(x, y, 0);
        if (i == 0)
            dp[i] = cp[i] ^ dd.x[0];
        else
            dp[i] ^= dd.x[0];
        cy = dd.x[1];
    }
    return cy;
}
inline __m128i gcm_multiply(const __m128i& H, const __m128i& x)
{
    __m128i T0, T1, T2, T3;

    T0 = _mm_clmulepi64_si128(x, H, 0x11);
    T1 = _mm_clmulepi64_si128(x, H, 0x10);
    T2 = _mm_clmulepi64_si128(x, H, 0x01);
    T3 = _mm_clmulepi64_si128(x, H, 0x00);

    T1 = _mm_xor_si128(T1, T2);
    T0 = _mm_xor_si128(T0, _mm_srli_si128(T1, 8));
    T3 = _mm_xor_si128(T3, _mm_slli_si128(T1, 8));

    return gcm_reduce(T0, T3);
}
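/*
 * gcm_multiply() above (and gcm_multiply_x4 later in this section) assumes
 * a gcm_reduce() helper that is not shown.  The sketch below is an
 * editorial reconstruction from the reduction tail of gcm_multiply_clmul
 * at the end of this section, not necessarily the library's exact code;
 * the first argument is the high 128 bits of the 256-bit product, the
 * second the low 128 bits, matching the call sites.
 */
static inline __m128i gcm_reduce(__m128i hi, __m128i lo)
{
    __m128i T2, T4, T5;

    /* shift the 256-bit product left by one bit */
    T4 = _mm_srli_epi32(lo, 31);
    lo = _mm_slli_epi32(lo, 1);
    T5 = _mm_srli_epi32(hi, 31);
    hi = _mm_slli_epi32(hi, 1);
    T2 = _mm_srli_si128(T4, 12);
    T5 = _mm_slli_si128(T5, 4);
    T4 = _mm_slli_si128(T4, 4);
    lo = _mm_or_si128(lo, T4);
    hi = _mm_or_si128(hi, T5);
    hi = _mm_or_si128(hi, T2);

    /* reduce modulo x^128 + x^7 + x^2 + x + 1 */
    T4 = _mm_slli_epi32(lo, 31);
    T5 = _mm_slli_epi32(lo, 30);
    T2 = _mm_slli_epi32(lo, 25);
    T4 = _mm_xor_si128(T4, T5);
    T4 = _mm_xor_si128(T4, T2);
    T5 = _mm_srli_si128(T4, 4);
    hi = _mm_xor_si128(hi, T5);
    T4 = _mm_slli_si128(T4, 12);
    lo = _mm_xor_si128(lo, T4);
    hi = _mm_xor_si128(hi, lo);

    T4 = _mm_srli_epi32(lo, 1);
    T5 = _mm_srli_epi32(lo, 2);
    T2 = _mm_srli_epi32(lo, 7);
    hi = _mm_xor_si128(hi, T4);
    hi = _mm_xor_si128(hi, T5);
    hi = _mm_xor_si128(hi, T2);
    return hi;
}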
/* TODO: if somebody comes up with a neat way to improve the interface so
 * as to remove the false dependency on pclmul, that would be nice.
 */
static inline __v2di
GF2X_FUNC(mul4cl1_mul1) (unsigned long a, unsigned long b)
{
    __v2di aa = (__v2di) { a, 0 };
    __v2di bb = (__v2di) { b, 0 };
    return _mm_clmulepi64_si128(aa, bb, 0);
}
GF2X_STORAGE_CLASS_mul1
void gf2x_mul1 (unsigned long *c, unsigned long a, unsigned long b)
{
    __v2di aa = (__v2di) { a, 0 };
    __v2di bb = (__v2di) { b, 0 };
    _mm_storeu_si128((__v2di *)c, _mm_clmulepi64_si128(aa, bb, 0));
}
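/*
 * A hypothetical sanity check for gf2x_mul1 (editorial, not from gf2x):
 * in GF(2)[x], 3 * 3 = (x+1)*(x+1) = x^2 + 1 = 5 because there is no
 * carry propagation, and the high word of this small product is zero.
 */
#include <assert.h>

static void check_mul1(void)
{
    unsigned long c[2];
    gf2x_mul1(c, 3UL, 3UL);
    assert(c[0] == 5UL && c[1] == 0UL);
}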
static inline void
pclmul_mul1 (unsigned long *c, unsigned long a, unsigned long b)
{
    __m128i aa = _mm_setr_epi64(_mm_cvtsi64_m64(a), _mm_cvtsi64_m64(0));
    __m128i bb = _mm_setr_epi64(_mm_cvtsi64_m64(b), _mm_cvtsi64_m64(0));
    _mm_storeu_si128((__m128i *)c, _mm_clmulepi64_si128(aa, bb, 0));
}
void INIT_Htable(uint8_t Htbl[16*8], uint8_t *H)
{
    int i;
    __m128i T, TMP0, TMP1, TMP2, TMP3, TMP4, POLY;

    POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);
    T = _mm_loadu_si128((__m128i*)H);
    TMP0 = T;
    _mm_storeu_si128(&((__m128i*)Htbl)[0], T);
    /* Htbl[i] receives H^(i+1): each pass multiplies T = H^i by H
       (schoolbook, four PCLMULs) and then runs two reduction rounds */
    for (i = 1; i < 8; i++) {
        TMP1 = _mm_clmulepi64_si128(T, TMP0, 0x00);
        TMP4 = _mm_clmulepi64_si128(T, TMP0, 0x11);
        TMP2 = _mm_clmulepi64_si128(T, TMP0, 0x10);
        TMP3 = _mm_clmulepi64_si128(T, TMP0, 0x01);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_slli_si128(TMP2, 8);
        TMP2 = _mm_srli_si128(TMP2, 8);
        TMP1 = _mm_xor_si128(TMP3, TMP1);
        TMP4 = _mm_xor_si128(TMP4, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        T = _mm_xor_si128(TMP4, TMP1);
        _mm_storeu_si128(&((__m128i*)Htbl)[i], T);
    }
}
int pclmul() {
    __ma128i v1;
    __ma128i v2;
    for (int i = 1; i >= 0; i--) {
        v1.ui64[i] = 3;
        v2.ui64[i] = 3;
    }
    __ma128i v3;
    v3.i = _mm_clmulepi64_si128(v1.i, v2.i, 0);
    /* carryless 3 * 3 is (x+1)*(x+1) = x^2 + 1 = 5 */
    if (v3.ui64[0] != 5) {
        printf("Expected: %d result: %llu\n", 5, (unsigned long long)v3.ui64[0]);
        return -1;
    }
    return 0;
}
void
test8bit (void)
{
  i1 = _mm_cmpistrm (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistri (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistra (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrc (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistro (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrs (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  k1 = _mm_cmpistrz (i2, i3, k4);          /* { dg-error "the third argument must be an 8-bit immediate" } */
  i1 = _mm_cmpestrm (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestri (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestra (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrc (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestro (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrs (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  k1 = _mm_cmpestrz (i2, k2, i3, k3, k4);  /* { dg-error "the fifth argument must be an 8-bit immediate" } */
  b1 = _mm256_blend_ps (b2, b3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  k1 = _cvtss_sh (f1, k4);                 /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm256_cvtps_ph (b2, k4);           /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_dp_ps (b2, b3, k4);          /* { dg-error "the last argument must be an 8-bit immediate" } */
  e1 = _mm256_permute2f128_pd (e2, e3, k4);   /* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute2f128_ps (b2, b3, k4);   /* { dg-error "the last argument must be an 8-bit immediate" } */
  l1 = _mm256_permute2f128_si256 (l2, l3, k4);/* { dg-error "the last argument must be an 8-bit immediate" } */
  b1 = _mm256_permute_ps (b2, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_aeskeygenassist_si128 (i2, k4); /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_blend_epi16 (i2, i3, k4);       /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_clmulepi64_si128 (i2, i3, k4);  /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_cvtps_ph (a1, k4);              /* { dg-error "the last argument must be an 8-bit immediate" } */
  d1 = _mm_dp_pd (d2, d3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_dp_ps (a2, a3, k4);             /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_insert_ps (a2, a3, k4);         /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_mpsadbw_epu8 (i2, i3, k4);      /* { dg-error "the last argument must be an 8-bit immediate" } */
  a1 = _mm_permute_ps (a2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_slli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
  i1 = _mm_srli_si128 (i2, k4);            /* { dg-error "the last argument must be an 8-bit immediate" } */
}
uint32_t
crc32_hw_aligned(uint32_t remainder, const __m128i *p, size_t vec_count)
{
    /* Constants precomputed by gen_crc32_multipliers.c.  Do not edit! */
    const __m128i multipliers_4 = _mm_set_epi32(0, 0x1D9513D7, 0, 0x8F352D95);
    const __m128i multipliers_2 = _mm_set_epi32(0, 0x81256527, 0, 0xF1DA05AA);
    const __m128i multipliers_1 = _mm_set_epi32(0, 0xCCAA009E, 0, 0xAE689191);
    const __m128i final_multiplier = _mm_set_epi32(0, 0, 0, 0xB8BC6765);
    const __m128i mask32 = _mm_set_epi32(0, 0, 0, 0xFFFFFFFF);
    const __m128i barrett_reduction_constants =
            _mm_set_epi32(0x1, 0xDB710641, 0x1, 0xF7011641);

    const __m128i * const end = p + vec_count;
    const __m128i * const end512 = p + (vec_count & ~3);
    __m128i x0, x1, x2, x3;

    /*
     * Account for the current 'remainder', i.e. the CRC of the part of
     * the message already processed.  Explanation: rewrite the message
     * polynomial M(x) in terms of the first part A(x), the second part
     * B(x), and the length of the second part in bits |B(x)| >= 32:
     *
     *    M(x) = A(x)*x^|B(x)| + B(x)
     *
     * Then the CRC of M(x) is:
     *
     *    CRC(M(x)) = CRC(A(x)*x^|B(x)| + B(x))
     *              = CRC(A(x)*x^32*x^(|B(x)| - 32) + B(x))
     *              = CRC(CRC(A(x))*x^(|B(x)| - 32) + B(x))
     *
     * Note: all arithmetic is modulo G(x), the generator polynomial; that's
     * why A(x)*x^32 can be replaced with CRC(A(x)) = A(x)*x^32 mod G(x).
     *
     * So the CRC of the full message is the CRC of the second part of the
     * message where the first 32 bits of the second part of the message
     * have been XOR'ed with the CRC of the first part of the message.
     */
    x0 = *p++;
    x0 = _mm_xor_si128(x0, _mm_set_epi32(0, 0, 0, remainder));

    if (p > end512)  /* only 128, 256, or 384 bits of input? */
        goto _128_bits_at_a_time;

    x1 = *p++;
    x2 = *p++;
    x3 = *p++;

    /* Fold 512 bits at a time */
    for (; p != end512; p += 4) {
        __m128i y0, y1, y2, y3;

        y0 = p[0];
        y1 = p[1];
        y2 = p[2];
        y3 = p[3];

        /*
         * Note: the immediate constant for PCLMULQDQ specifies which
         * 64-bit halves of the 128-bit vectors to multiply:
         *
         * 0x00 means low halves (higher degree polynomial terms for us)
         * 0x11 means high halves (lower degree polynomial terms for us)
         */
        y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x00));
        y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x00));
        y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x00));
        y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x00));
        y0 = _mm_xor_si128(y0, _mm_clmulepi64_si128(x0, multipliers_4, 0x11));
        y1 = _mm_xor_si128(y1, _mm_clmulepi64_si128(x1, multipliers_4, 0x11));
        y2 = _mm_xor_si128(y2, _mm_clmulepi64_si128(x2, multipliers_4, 0x11));
        y3 = _mm_xor_si128(y3, _mm_clmulepi64_si128(x3, multipliers_4, 0x11));

        x0 = y0;
        x1 = y1;
        x2 = y2;
        x3 = y3;
    }

    /* Fold 512 bits => 128 bits */
    x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x00));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x00));
    x2 = _mm_xor_si128(x2, _mm_clmulepi64_si128(x0, multipliers_2, 0x11));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x1, multipliers_2, 0x11));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x00));
    x3 = _mm_xor_si128(x3, _mm_clmulepi64_si128(x2, multipliers_1, 0x11));
    x0 = x3;

_128_bits_at_a_time:
    while (p != end) {
        /* Fold 128 bits into next 128 bits */
        x1 = *p++;
        x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x00));
        x1 = _mm_xor_si128(x1, _mm_clmulepi64_si128(x0, multipliers_1, 0x11));
        x0 = x1;
    }

    /* Now there are just 128 bits left, stored in 'x0'. */

    /*
     * Fold 128 => 96 bits.  This also implicitly appends 32 zero bits,
     * which is equivalent to multiplying by x^32.  This is needed because
     * the CRC is defined as M(x)*x^32 mod G(x), not just M(x) mod G(x).
     */
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 8),
                       _mm_clmulepi64_si128(x0, multipliers_1, 0x10));

    /* Fold 96 => 64 bits */
    x0 = _mm_xor_si128(_mm_srli_si128(x0, 4),
                       _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
                                            final_multiplier, 0x00));

    /*
     * Finally, reduce 64 => 32 bits using Barrett reduction.
     *
     * Let M(x) = A(x)*x^32 + B(x) be the remaining message.  The goal is to
     * compute R(x) = M(x) mod G(x).  Since degree(B(x)) < degree(G(x)):
     *
     *    R(x) = (A(x)*x^32 + B(x)) mod G(x)
     *         = (A(x)*x^32) mod G(x) + B(x)
     *
     * Then, by the Division Algorithm there exists a unique q(x) such that:
     *
     *    A(x)*x^32 mod G(x) = A(x)*x^32 - q(x)*G(x)
     *
     * Since the left-hand side is of maximum degree 31, the right-hand side
     * must be too.  This implies that we can apply 'mod x^32' to the
     * right-hand side without changing its value:
     *
     *    (A(x)*x^32 - q(x)*G(x)) mod x^32 = q(x)*G(x) mod x^32
     *
     * Note that '+' is equivalent to '-' in polynomials over GF(2).
     *
     * We also know that:
     *
     *           / A(x)*x^32 \
     *    q(x) = floor ( --------- )
     *           \   G(x)   /
     *
     * To compute this efficiently, we can multiply the top and bottom by
     * x^32 and move the division by G(x) to the top:
     *
     *           / A(x) * floor(x^64 / G(x)) \
     *    q(x) = floor ( ------------------------- )
     *           \          x^32           /
     *
     * Note that floor(x^64 / G(x)) is a constant.
     *
     * So finally we have:
     *
     *                  / A(x) * floor(x^64 / G(x)) \
     *    R(x) = B(x) + G(x)*floor ( ------------------------- )
     *                  \          x^32           /
     */
    x1 = x0;
    x0 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
                              barrett_reduction_constants, 0x00);
    x0 = _mm_clmulepi64_si128(_mm_and_si128(x0, mask32),
                              barrett_reduction_constants, 0x10);
    return _mm_cvtsi128_si32(_mm_srli_si128(_mm_xor_si128(x0, x1), 4));
}
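/*
 * A hypothetical wrapper (editorial, not from the original source) showing
 * one way crc32_hw_aligned() might be driven, assuming the usual zlib-style
 * convention of pre- and post-inverting the CRC state; 'buf' must be 16-byte
 * aligned and 'nbytes' a nonzero multiple of 16, as the function requires.
 */
#include <stdint.h>
#include <stddef.h>

static uint32_t crc32_of_aligned(const void *buf, size_t nbytes, uint32_t crc)
{
    return crc32_hw_aligned(crc ^ 0xFFFFFFFF,
                            (const __m128i *)buf,
                            nbytes / 16) ^ 0xFFFFFFFF;
}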
void Polyval_Htable(unsigned char* Htbl, unsigned char* inp, int length, unsigned char* POLYVAL)
{
    int remainder = 0;
    int rem_128 = (length % 128) - length % 16;
    int has_semi = length % 16;
    unsigned char* fixed_inp = inp;
    int i;
    uint8_t B[16] = {0};
    __m128i data, TMP0, TMP1, TMP2, TMP3, TMP4, T, Xhi, POLY;

    if (length == 0)
        return;

    Xhi = _mm_setzero_si128();
    POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);
    T = _mm_loadu_si128((__m128i*)POLYVAL);

    if ((length != 0) || (rem_128 != 0)) {
        if (rem_128 != 0) {
            fixed_inp += rem_128;
            remainder = rem_128 / 16;
            data = _mm_loadu_si128((__m128i*)inp);
            data = _mm_xor_si128(T, data);
            TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x01);
            TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x00);
            TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x11);
            TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-1], 0x10);
            TMP2 = _mm_xor_si128(TMP2, TMP3);
            for (i = 1; i < (rem_128/16); i++) {
                data = _mm_loadu_si128(&((__m128i*)inp)[i]);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x00);
                TMP0 = _mm_xor_si128(TMP0, TMP3);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x11);
                TMP1 = _mm_xor_si128(TMP1, TMP3);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x01);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[remainder-i-1], 0x10);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
            }
            TMP3 = _mm_srli_si128(TMP2, 8);
            TMP2 = _mm_slli_si128(TMP2, 8);
            Xhi = _mm_xor_si128(TMP3, TMP1);
            T = _mm_xor_si128(TMP0, TMP2);
            length -= rem_128;
        }
        length /= 16;
        i = 0;
        if (length != 0) {
            if (rem_128 == 0) {
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+7]);
                TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01);
                TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00);
                TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
                /* SCHOOLBOOK_AAD(data, H) is a macro defined elsewhere; it
                   accumulates the four half products of data and H into
                   TMP0, TMP1 and TMP2, matching the expanded form above */
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+6]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[1]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+5]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[2]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+4]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[3]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+3]);
                TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[4]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+2]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[5]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+1]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[6]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i]);
                data = _mm_xor_si128(T, data);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[7]);
                TMP3 = _mm_srli_si128(TMP2, 8);
                TMP2 = _mm_slli_si128(TMP2, 8);
                Xhi = _mm_xor_si128(TMP3, TMP1);
                T = _mm_xor_si128(TMP0, TMP2);
                i = 8;
            }
            for (; i < length; i = i + 8) {
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+7]);
                TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01);
                TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00);
                TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11);
                TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10);
                TMP2 = _mm_xor_si128(TMP2, TMP3);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+6]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[1]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+5]);
                TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[2]);
                T = _mm_xor_si128(T, TMP4);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+4]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[3]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+3]);
                TMP4 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[4]);
                T = _mm_xor_si128(T, TMP4);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+2]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[5]);
                T = _mm_xor_si128(T, Xhi);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i+1]);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[6]);
                data = _mm_loadu_si128(&((__m128i*)fixed_inp)[i]);
                data = _mm_xor_si128(T, data);
                SCHOOLBOOK_AAD(data, ((__m128i*)Htbl)[7]);
                TMP3 = _mm_srli_si128(TMP2, 8);
                TMP2 = _mm_slli_si128(TMP2, 8);
                Xhi = _mm_xor_si128(TMP3, TMP1);
                T = _mm_xor_si128(TMP0, TMP2);
            }
            TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
            T = _mm_alignr_epi8(T, T, 8);
            T = _mm_xor_si128(TMP3, T);
            TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
            T = _mm_alignr_epi8(T, T, 8);
            T = _mm_xor_si128(TMP3, T);
            T = _mm_xor_si128(Xhi, T);
        } else {
            // length was < 16 and there were several blocks at the start:
            // need to finalize the reduction
            if (rem_128 != 0) {
                TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                T = _mm_xor_si128(TMP3, T);
                TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
                T = _mm_alignr_epi8(T, T, 8);
                T = _mm_xor_si128(TMP3, T);
                T = _mm_xor_si128(Xhi, T);
            }
        }
    }
    if (has_semi != 0) {
        memcpy(B, (uint8_t*)(&((__m128i*)fixed_inp)[i]), has_semi);
        data = _mm_loadu_si128((__m128i*)B);
        data = _mm_xor_si128(T, data);
        TMP2 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x01);
        TMP0 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x00);
        TMP1 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x11);
        TMP3 = _mm_clmulepi64_si128(data, ((__m128i*)Htbl)[0], 0x10);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_srli_si128(TMP2, 8);
        TMP2 = _mm_slli_si128(TMP2, 8);
        Xhi = _mm_xor_si128(TMP3, TMP1);
        T = _mm_xor_si128(TMP0, TMP2);
        TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
        T = _mm_alignr_epi8(T, T, 8);
        T = _mm_xor_si128(TMP3, T);
        TMP3 = _mm_clmulepi64_si128(T, POLY, 0x10);
        T = _mm_alignr_epi8(T, T, 8);
        T = _mm_xor_si128(TMP3, T);
        T = _mm_xor_si128(Xhi, T);
    }
    _mm_storeu_si128((__m128i*)POLYVAL, T);
}
__m128i test_mm_clmulepi64_si128(__m128i a, __m128i b) {
  // CHECK: @llvm.x86.pclmulqdq
  return _mm_clmulepi64_si128(a, b, 0);
}
inline __m128i gcm_multiply_x4(const __m128i& H1, const __m128i& H2,
                               const __m128i& H3, const __m128i& H4,
                               const __m128i& X1, const __m128i& X2,
                               const __m128i& X3, const __m128i& X4)
{
    /*
     * Multiply with delayed reduction, algorithm by Krzysztof Jankowski
     * and Pierre Laurent of Intel
     */
    const __m128i H1_X1_lo = _mm_clmulepi64_si128(H1, X1, 0x00);
    const __m128i H2_X2_lo = _mm_clmulepi64_si128(H2, X2, 0x00);
    const __m128i H3_X3_lo = _mm_clmulepi64_si128(H3, X3, 0x00);
    const __m128i H4_X4_lo = _mm_clmulepi64_si128(H4, X4, 0x00);

    const __m128i lo = _mm_xor_si128(_mm_xor_si128(H1_X1_lo, H2_X2_lo),
                                     _mm_xor_si128(H3_X3_lo, H4_X4_lo));

    const __m128i H1_X1_hi = _mm_clmulepi64_si128(H1, X1, 0x11);
    const __m128i H2_X2_hi = _mm_clmulepi64_si128(H2, X2, 0x11);
    const __m128i H3_X3_hi = _mm_clmulepi64_si128(H3, X3, 0x11);
    const __m128i H4_X4_hi = _mm_clmulepi64_si128(H4, X4, 0x11);

    const __m128i hi = _mm_xor_si128(_mm_xor_si128(H1_X1_hi, H2_X2_hi),
                                     _mm_xor_si128(H3_X3_hi, H4_X4_hi));

    __m128i T0 = _mm_xor_si128(lo, hi);
    __m128i T1, T2, T3, T4;

    T1 = _mm_xor_si128(_mm_srli_si128(H1, 8), H1);
    T2 = _mm_xor_si128(_mm_srli_si128(X1, 8), X1);
    T3 = _mm_xor_si128(_mm_srli_si128(H2, 8), H2);
    T4 = _mm_xor_si128(_mm_srli_si128(X2, 8), X2);

    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));

    T1 = _mm_xor_si128(_mm_srli_si128(H3, 8), H3);
    T2 = _mm_xor_si128(_mm_srli_si128(X3, 8), X3);
    T3 = _mm_xor_si128(_mm_srli_si128(H4, 8), H4);
    T4 = _mm_xor_si128(_mm_srli_si128(X4, 8), X4);

    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T1, T2, 0x00));
    T0 = _mm_xor_si128(T0, _mm_clmulepi64_si128(T3, T4, 0x00));

    T1 = _mm_xor_si128(_mm_srli_si128(T0, 8), hi);
    T2 = _mm_xor_si128(_mm_slli_si128(T0, 8), lo);

    return gcm_reduce(T1, T2);
}
void Polyval_Horner(unsigned char* TAG, unsigned char* pH, unsigned char* inp, int length)
{
    __m128i TMP0, TMP1, TMP2, TMP3, TMP4, T, POLY, H;
    int i = 0;

    if (length == 0)
        return;

    int has_semi = length % 16;
    uint8_t B[16] = {0};
    length /= 16;

    H = _mm_loadu_si128((__m128i*)pH);
    T = _mm_loadu_si128((__m128i*)TAG);
    POLY = _mm_setr_epi32(0x1, 0, 0, 0xc2000000);

    for (i = 0; i < length; i++) {
        T = _mm_xor_si128(T, _mm_loadu_si128(&((__m128i*)inp)[i]));
        TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
        TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
        TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
        TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_slli_si128(TMP2, 8);
        TMP2 = _mm_srli_si128(TMP2, 8);
        TMP1 = _mm_xor_si128(TMP3, TMP1);
        TMP4 = _mm_xor_si128(TMP4, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        T = _mm_xor_si128(TMP4, TMP1);
    }

    if (has_semi != 0) {
        memcpy(B, inp + length*16, has_semi);
        T = _mm_xor_si128(T, _mm_loadu_si128((__m128i*)B));
        TMP1 = _mm_clmulepi64_si128(T, H, 0x00);
        TMP4 = _mm_clmulepi64_si128(T, H, 0x11);
        TMP2 = _mm_clmulepi64_si128(T, H, 0x10);
        TMP3 = _mm_clmulepi64_si128(T, H, 0x01);
        TMP2 = _mm_xor_si128(TMP2, TMP3);
        TMP3 = _mm_slli_si128(TMP2, 8);
        TMP2 = _mm_srli_si128(TMP2, 8);
        TMP1 = _mm_xor_si128(TMP3, TMP1);
        TMP4 = _mm_xor_si128(TMP4, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        TMP2 = _mm_clmulepi64_si128(TMP1, POLY, 0x10);
        TMP3 = _mm_shuffle_epi32(TMP1, 78);
        TMP1 = _mm_xor_si128(TMP3, TMP2);
        T = _mm_xor_si128(TMP4, TMP1);
    }

    _mm_storeu_si128((__m128i*)TAG, T);
}
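/*
 * Editorial note: Polyval_Horner above is a Horner evaluation.  With
 * message blocks m_1, ..., m_n and initial accumulator T_0, each
 * iteration computes T_i = (T_{i-1} ^ m_i) * H (reduced modulo the
 * POLYVAL polynomial), so the final tag expands to
 *
 *   T_n = m_n*H ^ m_{n-1}*H^2 ^ ... ^ m_1*H^n ^ T_0*H^n
 */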
void gcm_multiply_clmul(byte x[16], const byte H[16])
{
    /*
     * Algorithms 1 and 5 from Intel's CLMUL guide
     */
    const __m128i BSWAP_MASK = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                            8, 9, 10, 11, 12, 13, 14, 15);

    __m128i a = _mm_loadu_si128(reinterpret_cast<const __m128i*>(x));
    __m128i b = _mm_loadu_si128(reinterpret_cast<const __m128i*>(H));

    a = _mm_shuffle_epi8(a, BSWAP_MASK);
    b = _mm_shuffle_epi8(b, BSWAP_MASK);

    __m128i T0, T1, T2, T3, T4, T5;

    T0 = _mm_clmulepi64_si128(a, b, 0x00);
    T1 = _mm_clmulepi64_si128(a, b, 0x01);
    T2 = _mm_clmulepi64_si128(a, b, 0x10);
    T3 = _mm_clmulepi64_si128(a, b, 0x11);

    T1 = _mm_xor_si128(T1, T2);
    T2 = _mm_slli_si128(T1, 8);
    T1 = _mm_srli_si128(T1, 8);
    T0 = _mm_xor_si128(T0, T2);
    T3 = _mm_xor_si128(T3, T1);

    T4 = _mm_srli_epi32(T0, 31);
    T0 = _mm_slli_epi32(T0, 1);
    T5 = _mm_srli_epi32(T3, 31);
    T3 = _mm_slli_epi32(T3, 1);

    T2 = _mm_srli_si128(T4, 12);
    T5 = _mm_slli_si128(T5, 4);
    T4 = _mm_slli_si128(T4, 4);
    T0 = _mm_or_si128(T0, T4);
    T3 = _mm_or_si128(T3, T5);
    T3 = _mm_or_si128(T3, T2);

    T4 = _mm_slli_epi32(T0, 31);
    T5 = _mm_slli_epi32(T0, 30);
    T2 = _mm_slli_epi32(T0, 25);

    T4 = _mm_xor_si128(T4, T5);
    T4 = _mm_xor_si128(T4, T2);
    T5 = _mm_srli_si128(T4, 4);
    T3 = _mm_xor_si128(T3, T5);
    T4 = _mm_slli_si128(T4, 12);
    T0 = _mm_xor_si128(T0, T4);
    T3 = _mm_xor_si128(T3, T0);

    T4 = _mm_srli_epi32(T0, 1);
    T1 = _mm_srli_epi32(T0, 2);
    T2 = _mm_srli_epi32(T0, 7);
    T3 = _mm_xor_si128(T3, T1);
    T3 = _mm_xor_si128(T3, T2);
    T3 = _mm_xor_si128(T3, T4);

    T3 = _mm_shuffle_epi8(T3, BSWAP_MASK);

    _mm_storeu_si128(reinterpret_cast<__m128i*>(x), T3);
}