예제 #1
0
/*
 * mul7: writes in ^ mul2(in) ^ mul2(mul2(in)) to *out, i.e. the sum of the
 * x1, x2 and x4 multiples of `in` as produced by mul2().
 * `in` is taken by value, so `out` may point at the caller's copy of it.
 */
__inline__ static void mul7(__m128i in, __m128i *out){
	__m128i dbl, quad;
	mul2(in, &dbl);		/* dbl  = mul2(in)        */
	mul2(dbl, &quad);	/* quad = mul2(mul2(in))  */
	*out = _mm_xor_si128(_mm_xor_si128(in, dbl), quad);
}
예제 #2
0
파일: decode.c 프로젝트: AleksMx/qfs
/*
 * Recover data blocks x and y using syndromes Q & R.
 *
 *   n         - number of data blocks (data[0] .. data[n-1]).
 *   blocksize - size of each block in bytes; processed in sizeof(v16) steps.
 *   x, y      - indices of the two blocks to rebuild; they select the
 *               coefficient row via rs_r2QR[rs_r2map[x][y]].
 *   data      - block pointers; data[n+1] and data[n+2] are read as the
 *               stored Q and R syndromes.
 *
 * data[x] and data[y] are zeroed first so that they contribute nothing to
 * the recomputed syndromes; each vector is then overwritten with the
 * recovered value once both syndromes for that position are known.
 */
static void
rs_decode2qr(int n, int blocksize, int x, int y, v16 **data)
{
    int i;
    v16 qq, rr;
    const uint8_t* const c = rs_r2QR[rs_r2map[x][y]];
#ifndef KFS_QCRS_DONT_INLINE
    int j;
#endif

    memset(data[x], 0, blocksize);
    memset(data[y], 0, blocksize);
    for (i = 0; i < blocksize/sizeof(v16); i++) {
#ifndef KFS_QCRS_DONT_INLINE
        /*
         * Fused Q/R evaluation over blocks n-1 .. 0.  Indexing by j (rather
         * than walking a v16** down past data[0]) never forms the pointer
         * `data - 1`, which is outside the array and therefore undefined.
         */
        qq = data[n-1][i];
        rr = qq;
        for (j = n-2; j >= 0; j--) {
            const v16 d = data[j][i];
            qq = mul2(qq) ^ d;
            rr = mul2(mul2(rr)) ^ d;
        }
        /* fold in the stored syndromes */
        qq ^= data[n+1][i];
        rr ^= data[n+2][i];
#else
        qq = Q(data, n, i) ^ data[n+1][i];
        rr = R(data, n, i) ^ data[n+2][i];
#endif
        data[x][i] = mulby(c[0], qq) ^ mulby(c[1], rr);
        data[y][i] = mulby(c[2], qq) ^ mulby(c[3], rr);
    }
}
예제 #3
0
파일: decode.c 프로젝트: AleksMx/qfs
/*
 * Compute R syndrome over data[?][i].
 *
 * Starts from data[n-1][i] and, walking down to data[0][i], applies mul2()
 * twice to the accumulator before XORing in the next block's vector.
 */
static v16
R(v16 **data, int n, int i)
{
    v16 acc = data[n-1][i];
    int k = n - 1;

    /* k is decremented before use: visits n-2, n-3, ..., 0 */
    while (k-- > 0)
        acc = mul2(mul2(acc)) ^ data[k][i];
    return acc;
}
예제 #4
0
파일: decode.c 프로젝트: AleksMx/qfs
/*
 * Recover data blocks x and y using syndromes P & Q.
 *
 *   n         - number of data blocks (data[0] .. data[n-1]).
 *   blocksize - size of each block in bytes; processed in sizeof(v16) steps.
 *   x, y      - indices of the two blocks to rebuild; they select the
 *               coefficient row via rs_r2PQ[rs_r2map[x][y]].
 *   data      - block pointers; data[n] and data[n+1] are read as the
 *               stored P and Q syndromes.
 *
 * data[x] and data[y] are zeroed first so that they contribute nothing to
 * the recomputed syndromes; each vector is then overwritten with the
 * recovered value once both syndromes for that position are known.
 */
static void
rs_decode2pq(int n, int blocksize, int x, int y, v16 **data)
{
    int i;
    v16 pp, qq;
    const uint8_t* const c = rs_r2PQ[rs_r2map[x][y]];
#ifndef KFS_QCRS_DONT_INLINE
    int j;
#endif

    memset(data[x], 0, blocksize);
    memset(data[y], 0, blocksize);
    for (i = 0; i < blocksize/sizeof(v16); i++) {
#ifndef KFS_QCRS_DONT_INLINE
        /*
         * Fused P/Q evaluation over blocks n-1 .. 0.  Indexing by j (rather
         * than walking a v16** down past data[0]) never forms the pointer
         * `data - 1`, which is outside the array and therefore undefined.
         */
        pp = data[n-1][i];
        qq = pp;
        for (j = n-2; j >= 0; j--) {
            const v16 d = data[j][i];
            pp ^= d;
            qq = mul2(qq) ^ d;
        }
        /* fold in the stored syndromes */
        pp ^= data[n][i];
        qq ^= data[n+1][i];
#else
        pp = P(data, n, i) ^ data[n][i];
        qq = Q(data, n, i) ^ data[n+1][i];
#endif
        data[x][i] = mulby(c[0], pp) ^ mulby(c[1], qq);
        data[y][i] = mulby(c[2], pp) ^ mulby(c[3], qq);
    }
}
예제 #5
0
/*
AFuncS : OTR Core Authentication Function (ADP=s)

Processes the header serially: every full block before the last is XORed
into a running chain value which is then encrypted with AES_encrypt().
The final (possibly partial) block is padded with ozp(), XORed into the
chain together with a mask derived from the file-scope value Q
(mul2 for a partial last block, mul4 for a full one), and encrypted once more.

  header : header bytes; read through a __m128i pointer
           (NOTE(review): presumably must be 16-byte aligned - confirm).
  h_len  : header length in bytes.

Returns the resulting block (TA).

A zero h_len is treated as "no full blocks plus an empty last block".
Without that guard `h_len - last` wraps around as an unsigned value and the
block loop would run far past the end of the header.
NOTE(review): callers presumably skip this function for empty headers;
confirm the expected TA for that case against the specification.
*/
__m128i AFuncS(
	const uint8 *header,
	uint32 h_len)
{
	uint32 i;
	uint32 m = 0, last = 0;
	block chain = _mm_setzero_si128(), tmp, mask;
	const __m128i *hdp = (const __m128i*)header;

	if (h_len != 0){
		last = h_len % BLOCK;
		if (last == 0) last = BLOCK;
		m = (h_len - last) / BLOCK; //header = m blocks + last bytes
	}

	/* chain the m full blocks that precede the last one */
	for (i = 0; i < m; i++){
		chain = _mm_xor_si128(chain, hdp[i]);
		AES_encrypt(chain, &chain, encrypt_key);
	}
	hdp += m;
	/* last block */
	ozp(last, (uint8*)&hdp[0], &tmp);
	chain = _mm_xor_si128(tmp, chain);
	/* the mask distinguishes a padded last block from a full one */
	if (last != BLOCK){
		mul2(Q, &mask);
	}
	else{
		mul4(Q, &mask);
	}
	chain = _mm_xor_si128(chain, mask);
	AES_encrypt(chain, &chain, encrypt_key);
	return chain; //TA
}
예제 #6
0
파일: Aes.cpp 프로젝트: frankencode/fluxkit
/**
 * Mixes each Nb-byte column of the state in place.
 *
 * For a column (a0, a1, a2, a3) the new bytes are
 *   b0 = 2*a0 ^ 3*a1 ^   a2 ^   a3
 *   b1 =   a0 ^ 2*a1 ^ 3*a2 ^   a3
 *   b2 =   a0 ^   a1 ^ 2*a2 ^ 3*a3
 *   b3 = 3*a0 ^   a1 ^   a2 ^ 2*a3
 * where 2* and 3* are mul2() and mul3().
 */
inline void mixColumns(ByteArray *s_)
{
    uint8_t *state = s_->bytes();

    for (int col = 0; col < Nb; ++col) {
        uint8_t *w = &state[col * Nb];
        // read the whole column before any byte of it is overwritten
        const uint8_t a0 = w[0], a1 = w[1], a2 = w[2], a3 = w[3];

        w[0] = mul2(a0) ^ mul3(a1) ^ a2 ^ a3;
        w[1] = a0 ^ mul2(a1) ^ mul3(a2) ^ a3;
        w[2] = a0 ^ a1 ^ mul2(a2) ^ mul3(a3);
        w[3] = mul3(a0) ^ a1 ^ a2 ^ mul2(a3);
    }
}
예제 #7
0
파일: sum_digits.c 프로젝트: b-trayer/15202
/*
 * Calls mul2() on the digit buffer 1000 times, then prints sum() of it.
 * The buffer starts as {1, -1, 0, ...}: the value 1 followed by -1, which
 * the original author describes as the stopper; 303 of the 1000 slots are
 * said to be needed (302 + 1).
 */
int main()
{
    int digits[1000] = {1, -1};
    int remaining = 1000;

    while (remaining-- > 0)
        mul2(digits);
    printf("%d\n", sum(digits));
    return 0;
}
예제 #8
0
파일: Aes.cpp 프로젝트: frankencode/fluxkit
/**
 * Multiplies a by b using shift-and-XOR: for every set bit of b the
 * matching mul2()-multiple of a is XORed into the result.
 * The loop ends once the probed bit exceeds b, so higher zero bits cost nothing.
 */
inline uint8_t mul(uint8_t a, uint8_t b)
{
    int product = 0;
    int term = a;   // a times the current power of two, maintained via mul2()

    for (int bit = 1; bit <= b; bit <<= 1) {
        if (b & bit) product ^= term;
        term = mul2(term);
    }
    return product;
}
예제 #9
0
파일: decode.c 프로젝트: AleksMx/qfs
/*
 * Compute Q syndrome over data[?][i].
 *
 * Starts from data[n-1][i] and, walking down to data[0][i], applies mul2()
 * once to the accumulator before XORing in the next block's vector.
 */
static v16
Q(v16 **data, int n, int i)
{
    v16 acc = data[n-1][i];
    int k = n - 1;

    /* k is decremented before use: visits n-2, n-3, ..., 0 */
    while (k-- > 0)
        acc = mul2(acc) ^ data[k][i];
    return acc;
}
예제 #10
0
/*
 * MixBytes mixes the bytes within each of the first `columns` columns of x.
 * Output row r is the XOR of the column bytes at rows r, r+1, ..., r+7
 * (mod ROWS) scaled by mul2, mul2, mul3, mul4, mul5, mul3, mul5, mul7
 * respectively.
 */
void MixBytes(u8 x[ROWS][COLS1024], int columns) {
  int col, row;
  u8 in[ROWS];

  for (col = 0; col < columns; col++) {
    /* snapshot the column so every output row sees the unmodified input */
    for (row = 0; row < ROWS; row++) {
      in[row] = x[row][col];
    }
    for (row = 0; row < ROWS; row++) {
      x[row][col] =
	mul2(in[(row+0)%ROWS])^
	mul2(in[(row+1)%ROWS])^
	mul3(in[(row+2)%ROWS])^
	mul4(in[(row+3)%ROWS])^
	mul5(in[(row+4)%ROWS])^
	mul3(in[(row+5)%ROWS])^
	mul5(in[(row+6)%ROWS])^
	mul7(in[(row+7)%ROWS]);
    }
  }
}
예제 #11
0
파일: encode.c 프로젝트: Abioy/qfs
/*
 * Reed-Solomon n+3 encoder.
 * nblocks is `n' data blocks plus 3 syndrome blocks.  blocksize _must_
 * be a multiple of 16.  data contains pointers to blocks.  The first
 * n are input data blocks.  The last 3 are the P, Q, and R syndromes.
 */
void
rs_encode(int nblocks, int blocksize, void **idata)
{
    int i, j, n;
    v16 *p, *q, *r, **data = (v16**)idata;

    assert(nblocks > 3);
    assert(blocksize % 16 == 0);
    n = nblocks - 3;  // # data blocks
    p = data[n];
    q = data[n+1];
    r = data[n+2];
    for (i = 0; i < blocksize/sizeof(v16); i++) {
        p[i] = q[i] = r[i] = data[n-1][i];
        for (j = n-2; j >= 0; j--) {
            p[i] ^= data[j][i];
            q[i] = mul2(q[i]) ^ data[j][i];
            r[i] = mul2(mul2(r[i])) ^ data[j][i];
        }
    }
}
예제 #12
0
파일: decode.c 프로젝트: AleksMx/qfs
/*
 * Recover data blocks x, y, & z using syndromes P, Q & R.
 *
 *   n         - number of data blocks (data[0] .. data[n-1]).
 *   blocksize - size of each block in bytes; processed in sizeof(v16) steps.
 *   x, y, z   - indices of the three blocks to rebuild; they select the
 *               3x3 coefficient row via rs_r3[rs_r3map[x][y][z]].
 *   data      - block pointers; data[n], data[n+1] and data[n+2] are read
 *               as the stored P, Q and R syndromes.
 *
 * The three target blocks are zeroed first so that they contribute nothing
 * to the recomputed syndromes; each vector is then overwritten with the
 * recovered value once all syndromes for that position are known.
 */
static void
rs_decode3pqr(int n, int blocksize, int x, int y, int z, v16 **data)
{
    int i;
    v16 pp, qq, rr;
    const uint8_t* const c = rs_r3[rs_r3map[x][y][z]];
#ifndef KFS_QCRS_DONT_INLINE
    int j;
#endif

    memset(data[x], 0, blocksize);
    memset(data[y], 0, blocksize);
    memset(data[z], 0, blocksize);
    for (i = 0; i < blocksize/sizeof(v16); i++) {
#ifndef KFS_QCRS_DONT_INLINE
        /*
         * Fused P/Q/R evaluation over blocks n-1 .. 0.  Indexing by j
         * (rather than walking a v16** down past data[0]) never forms the
         * pointer `data - 1`, which is outside the array and therefore
         * undefined.
         */
        pp = data[n-1][i];
        qq = pp;
        rr = pp;
        for (j = n-2; j >= 0; j--) {
            const v16 d = data[j][i];
            pp ^= d;
            qq = mul2(qq) ^ d;
            rr = mul2(mul2(rr)) ^ d;
        }
        /* fold in the stored syndromes */
        pp ^= data[n][i];
        qq ^= data[n+1][i];
        rr ^= data[n+2][i];
#else
        pp = P(data, n, i) ^ data[n][i];
        qq = Q(data, n, i) ^ data[n+1][i];
        rr = R(data, n, i) ^ data[n+2][i];
#endif
        data[x][i] = mulby(c[0], pp) ^ mulby(c[1], qq) ^ mulby(c[2], rr);
        data[y][i] = mulby(c[3], pp) ^ mulby(c[4], qq) ^ mulby(c[5], rr);
        data[z][i] = mulby(c[6], pp) ^ mulby(c[7], qq) ^ mulby(c[8], rr);
    }
}
예제 #13
0
/*
AFunc : OTR Core Authentication Function (ADP=p)

Each header block before the last is XORed with a per-block mask, encrypted,
and the ciphertexts are XORed together into ASum.  While more than
BLOCK*PIPE bytes remain, PIPE blocks are handled per iteration through
mul2_PIPE() / AES_ecb_encrypt_PIPE(); the remainder is handled one block at
a time, updating the mask with mul2().  The last (possibly partial) block is
padded with ozp() and XORed into ASum unencrypted, followed by the final
mask (mul3 for a partial block, mul3twice for a full one) and one last
AES_encrypt().

  header : header bytes; read through a __m128i pointer
           (NOTE(review): presumably must be 16-byte aligned - confirm).
  h_len  : header length in bytes.

Returns the resulting block (TA).

A zero h_len is treated as "no full blocks plus an empty last block".
Without that guard `rest_len - last` wraps around as an unsigned value and
the block loop would run far past the end of the header.
NOTE(review): callers presumably skip this function for empty headers;
confirm the expected TA for that case against the specification.
*/
__m128i AFunc(
	const uint8 *header,
	uint32 h_len)
{
	uint32 i;
	uint32 m = 0, last = 0;
	block tmp[PIPE], mask[PIPE + 1], ASum = _mm_setzero_si128();
	uint32 rest_len = h_len;
	const __m128i *hdp = (const __m128i*)header;

	mask[0] = _mm_load_si128(&Q);
	/* bulk phase: strictly more than PIPE blocks left, so the last block is never consumed here */
	while (rest_len > (BLOCK*PIPE)){
		mul2_PIPE(mask);
		for (i = 0; i < PIPE; i++){
			tmp[i] = _mm_xor_si128(mask[i], hdp[i]);
		}
		AES_ecb_encrypt_PIPE(tmp, encrypt_key);
		for (i = 0; i < PIPE; i++){
			ASum = _mm_xor_si128(ASum, tmp[i]);
		}
		rest_len -= (BLOCK*PIPE);
		hdp += PIPE;
		/* carry the last mask over as the start of the next batch */
		mask[0] = _mm_load_si128(&mask[PIPE]);
	}

	if (rest_len != 0){
		last = rest_len % BLOCK;
		if (last == 0) last = BLOCK;
		m = (rest_len - last) / BLOCK; //header = m blocks + last bytes
	}

	for (i = 0; i < m; i++){
		tmp[0] = _mm_xor_si128(mask[0], hdp[i]);
		AES_encrypt(tmp[0], &tmp[0], encrypt_key);
		ASum = _mm_xor_si128(ASum, tmp[0]);
		mul2(mask[0], &mask[0]);
	}
	hdp += m;
	/* last block */
	ozp(last, (uint8*)&hdp[0], &tmp[0]);
	ASum = _mm_xor_si128(ASum, tmp[0]);
	/* the final mask distinguishes a padded last block from a full one */
	if (last != BLOCK){
		mul3(mask[0], &mask[0]);
	}
	else{
		mul3twice(mask[0], &mask[0]);
	}
	ASum = _mm_xor_si128(ASum, mask[0]);
	AES_encrypt(ASum, &ASum, encrypt_key);
	return ASum; //TA
}
예제 #14
0
파일: Test_all.cpp 프로젝트: eriser/CSL
void test_vector_ifft() {
	logMsg("playing IFFT crossfade...");
	IFFT vox1, vox2;

	vox1.set_bin_mag_phase(2, 0.25, 0);
	vox1.set_bin_mag_phase(4, 0.25, 0);
	vox2.set_bin_mag_phase(6, 0.25, 0);
	vox2.set_bin_mag_phase(8, 0.25, 0);
	LineSegment env1(3, 1, 0);	// fade out
	LineSegment env2(3, 0, 1);	// fade in
	MulOp mul1(vox1, env1);	
	MulOp mul2(vox2, env2);
	AddOp add3(mul1, mul2);	
	run_test(add3);
	logMsg("IFFT crossfade done.");
}
예제 #15
0
파일: Test_all.cpp 프로젝트: eriser/CSL
void test_mixer_with_sines() {
	Sine vox1(431);				// create 4 scaled sine waves
	MulOp mul1(vox1, 0.3);
	Sine vox2(540);
	MulOp mul2(vox2, 0.1);
	Sine vox3(890);
	MulOp mul3(vox3, 0.3);
	Sine vox4(1280);
	MulOp mul4(vox4, 0.01);
	Mixer mix(2);				// create a stereo mixer
	mix.add_input(mul1);			// add them to the mixer
	mix.add_input(mul2);
	mix.add_input(mul3);
	mix.add_input(mul4);
	logMsg("playing mix of 4 sines...");
	run_test(mix);
	logMsg("mix done.");
}
예제 #16
0
파일: decode.c 프로젝트: AleksMx/qfs
/*
 * Multiply the vector v by the scalar x and return the product.
 *
 * Three build-time variants:
 *   LIBRS_USE_NEON  - splits every byte into its low and high nibble and
 *                     looks each up in rs_nibmul[x].lo / rs_nibmul[x].hi
 *                     with vtbl2_u8, then XORs the two halves.
 *   LIBRS_USE_SSSE3 - same nibble-table scheme using pshufb.
 *   otherwise       - shift-and-XOR loop over the bits of x, using mul2()
 *                     to step v to its next multiple.
 */
static v16
mulby(uint8_t x, v16 v)
{
#ifdef LIBRS_USE_NEON

/* view a 16-byte table as the pair of 8-byte halves that vtbl2_u8 expects */
#define uint8x16_to_8x8x2(v) ((uint8x8x2_t) { vget_low_u8(v), vget_high_u8(v) })

    v16 lo, hi;

    lo = v & VEC16(0x0f);       /* low nibble of every byte */
    hi = vshrq_n_u8(v, 4);      /* high nibble of every byte */
    /* vtbl2_u8 handles 8 lanes at a time, so each lookup is done per half */
    lo = vcombine_u8(
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_low_u8(lo)),
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_high_u8(lo)));
    hi = vcombine_u8(
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_low_u8(hi)),
            vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_high_u8(hi)));
    return lo ^ hi;

#elif defined(LIBRS_USE_SSSE3)

    v16 lo, hi;

    lo = v & VEC16(0x0f);
    /*
     * psraw shifts 16-bit words, so bits from the neighbouring byte (and
     * sign bits) leak into each byte's upper nibble; the mask on the next
     * line removes them.
     */
    hi = __builtin_ia32_psrawi128(v, 4);
    hi &= VEC16(0x0f);
    lo = __builtin_ia32_pshufb128(rs_nibmul[x].lo, lo);
    hi = __builtin_ia32_pshufb128(rs_nibmul[x].hi, hi);
    return lo ^ hi;

#else

    v16 vv = VEC16(0);

    /* for every set bit of x, XOR in the matching mul2()-multiple of v */
    while (x != 0) {
        if (x & 1)
            vv ^= v;
        x >>= 1;
        v = mul2(v);
    }
    return vv;

#endif
}
예제 #17
0
void CVisVector::CrossProd( const CVisVector & v1, const CVisVector & v2 )
{
    CVisEqualFixpoint mul1( MTRX_FRACTBITS );
    CVisEqualFixpoint mul2( MTRX_FRACTBITS );

    mul1.Mult( v1.m_fparyStore[1], v2.m_fparyStore[2] );
    mul2.Mult( v1.m_fparyStore[2], v2.m_fparyStore[1] );
    m_fparyStore[0].Sub(  mul1, mul2 );

    mul1.Mult( v1.m_fparyStore[2], v2.m_fparyStore[0] );
    mul2.Mult( v1.m_fparyStore[0], v2.m_fparyStore[2] );
    m_fparyStore[1].Sub(  mul1, mul2 );

    mul1.Mult( v1.m_fparyStore[0], v2.m_fparyStore[1] );
    mul2.Mult( v1.m_fparyStore[1], v2.m_fparyStore[0] );
    m_fparyStore[2].Sub(  mul1, mul2 );

    m_fparyStore[3] = 0;
}
예제 #18
0
파일: Test_all.cpp 프로젝트: eriser/CSL
void test_scaled_sin() {
	Sine vox(220);
	vox.set_scale(0.1);			// simplest: scale the sine directly
	logMsg("playing quiet sin...");
	run_test(vox);
	logMsg("quiet sin done.");

	Sine vox2(220);
	MulOp mul(vox2, 0.1);		// using a MulOp with a constant
	logMsg("playing quiet sin...");
	run_test(mul);
	logMsg("quiet sin done.");
	
	Sine vox3(220);
	StaticVariable var(0.1);		// using a MulOp with a StaticVariable
	MulOp mul2(vox3, var);
	logMsg("playing quiet sin...");
	run_test(mul2);
	logMsg("quiet sin done.");
}
예제 #19
0
 /*
  * Runs `tsteps` time steps over the n x n grids u, v, p and q.
  * Every step consists of two mirrored phases:
  *   1. boundary setup, a forward pass filling p/q from u, and a backward
  *      pass (j = n - j_) writing v;
  *   2. boundary setup, a forward pass filling p/q from v, and a backward
  *      pass writing u.
  * The structure appears to match the PolyBench `adi` kernel - confirm.
  *
  * All literals in the update formulas are cast to T so that both phases
  * evaluate in the same precision; a bare `1.0 + 2.0 * d` would promote the
  * first phase to double when T is narrower.
  */
 virtual void exec()
 {
   USE(READ, n, tsteps);
   T DX(static_cast<T>(1.0) / n);
   T DY(static_cast<T>(1.0) / n);
   T DT(static_cast<T>(1.0) / tsteps);
   T B1(static_cast<T>(2.0));
   T B2(static_cast<T>(1.0));
   T mul1(B1 * DT / (DX * DX));
   T mul2(B2 * DT / (DY * DY));
   // coefficients used by the two phases: (a, b, c) and (d, e, f)
   T a(-mul1 / static_cast<T>(2.0));
   T b(static_cast<T>(1.0) + mul1);
   T c(a);
   T d(-mul2 / static_cast<T>(2.0));
   T e(static_cast<T>(1.0) + mul2);
   T f(d);
   USE(READWRITE, v, u, p, q);
   using exec_pol = NestedPolicy<ExecList<omp_parallel_for_exec, simd_exec>,
                                 Tile<TileList<tile_fixed<16>, tile_none>>>;
   for (int t = 0; t < tsteps; ++t) {
     // phase 1: boundaries of v, p, q
     forall<omp_parallel_for_exec>(1, n - 1, [=](int i) {
       v->at(0, i) = static_cast<T>(1.0);
       p->at(i, 0) = static_cast<T>(0.0);
       q->at(i, 0) = v->at(0, i);
       v->at(n - 1, i) = static_cast<T>(1.0);
     });
     // phase 1: forward pass, each (i, j) reads the (i, j - 1) entries of p and q
     forallN<exec_pol>(
       RangeSegment{1, n - 1},
       RangeSegment{1, n - 1},
       [=](int i, int j) {
         p->at(i, j) = -c / (a * p->at(i, j - 1) + b);
         q->at(i, j) = (-d * u->at(j, i - 1)
                        + (static_cast<T>(1.0) + static_cast<T>(2.0) * d)
                            * u->at(j, i)
                        - f * u->at(j, i + 1)
                        - a * q->at(i, j - 1))
                       / (a * p->at(i, j - 1) + b);
       });
     // phase 1: backward pass, j runs from n - 2 down to 1
     forallN<exec_pol>(
       RangeSegment{1, n - 1},
       RangeSegment{2, n},
       [=](int i, int j_) {
         int j = n - j_;
         v->at(j, i) = p->at(i, j) * v->at(j + 1, i) + q->at(i, j);
       });
     // phase 2: boundaries of u, p, q
     forall<omp_parallel_for_exec>(1, n - 1, [=](int i) {
       u->at(i, 0) = static_cast<T>(1.0);
       p->at(i, 0) = static_cast<T>(0.0);
       q->at(i, 0) = u->at(i, 0);
       u->at(i, n - 1) = static_cast<T>(1.0);
     });
     // phase 2: forward pass, mirror of phase 1 with (d, e, f) and v as input
     forallN<exec_pol>(
       RangeSegment{1, n - 1},
       RangeSegment{1, n - 1},
       [=](int i, int j) {
         p->at(i, j) = -f / (d * p->at(i, j - 1) + e);
         q->at(i, j) =
           (-a * v->at(i - 1, j)
            + (static_cast<T>(1.0) + static_cast<T>(2.0) * a) * v->at(i, j)
            - c * v->at(i + 1, j)
            - d * q->at(i, j - 1))
           / (d * p->at(i, j - 1) + e);
       });
     // phase 2: backward pass writing u
     forallN<exec_pol>(
       RangeSegment{1, n - 1},
       RangeSegment{2, n},
       [=](int i, int j_) {
         int j = n - j_;
         u->at(i, j) = p->at(i, j) * u->at(i, j + 1) + q->at(i, j);
       });
   }
 }
예제 #20
0
/*
DFunc : OTR Core Decryption Function, with nonce encryption

Decrypts ci_len bytes of ciphertext into plaintext and returns the value TE
computed from the recovered plaintext checksum.

  nonce, nonce_len : nonce bytes, copied right-aligned into a zeroed
                     BLOCK-byte buffer; a 1 bit is set in the byte just
                     before them.
                     NOTE(review): nothing here checks nonce_len < BLOCK;
                     a larger value would index outside tmp - confirm the
                     caller enforces it.
  TA               : (only when ADP==Seri) XORed into the encrypted nonce
                     before the masks are derived.
  ciphertext       : input, read through a __m128i pointer.
  ci_len           : ciphertext length in bytes.
  t_len            : (t_len % BLOCK) << 4 is stored in the first byte of
                     the nonce block.
  plaintext        : output, written through a __m128i pointer.
                     NOTE(review): both buffers are accessed as __m128i, so
                     they presumably need 16-byte alignment - confirm.

Processing order: PIPE double-blocks at a time while more than DBLOCK*PIPE
bytes remain, then the remaining full double-blocks one by one, then the
last (possibly partial) chunk, then the final mask and encryption.
*/
__m128i DFunc(
	const uint8 *nonce,
	uint32 nonce_len,
#if(ADP==Seri)
	const __m128i TA,
#endif
	const uint8 *ciphertext,
	uint32 ci_len,
	uint32 t_len,
	uint8 *plaintext)
{
	uint32 i;
	uint32 ell = 0; //number of 2BLOCK-byte chunks, excl. last one
	uint32 last = 0; //number of bytes in the last chunks

	block Sum = _mm_setzero_si128();
	block txt[PIPE], Ln[PIPE + 1];
	uint32 rest_len = ci_len;
	__m128i *ptp = (__m128i*)plaintext;
	const __m128i *ctp = (__m128i*)ciphertext;
	ALIGN(16)uint8 tmp[BLOCK] = { 0 };
	block *La;

	/* Encryption of nonce */
	memcpy(&tmp[BLOCK - nonce_len], nonce, nonce_len);
	tmp[0] = (uint8)((t_len%BLOCK) << 4);
	tmp[BLOCK - nonce_len - 1] |= 0x01;
	Ln[0] = _mm_load_si128((__m128i*)tmp);
	AES_encrypt(Ln[0], &Ln[0], encrypt_key);

#if (ADP==Seri)
	Ln[0] = _mm_xor_si128(Ln[0], TA);
	mul2(Ln[0], &Ln[0]);
#endif
	/* Bulk phase: PIPE double-blocks per iteration, manually unrolled for PIPE = 4..8.
	   The strict '>' leaves at least one byte for the tail handling below. */
	while (rest_len > (DBLOCK*PIPE)){
		/* first round*/
		mul2_PIPE(Ln);
		/* txt[k] = Ln[k] ^ Ln[k+1] ^ (even ciphertext block of double-block k) */
		txt[0] = _mm_xor_si128(Ln[0], ctp[0]);
		txt[0] = _mm_xor_si128(Ln[1], txt[0]); 
		txt[1] = _mm_xor_si128(Ln[1], ctp[2]);
		txt[1] = _mm_xor_si128(Ln[2], txt[1]); 
		txt[2] = _mm_xor_si128(Ln[2], ctp[4]);
		txt[2] = _mm_xor_si128(Ln[3], txt[2]); 
		txt[3] = _mm_xor_si128(Ln[3], ctp[6]);
		txt[3] = _mm_xor_si128(Ln[4], txt[3]); 
#if (PIPE>=5)
		txt[4] = _mm_xor_si128(Ln[4], ctp[8]);
		txt[4] = _mm_xor_si128(Ln[5], txt[4]); 
#endif
#if (PIPE>=6)
		txt[5] = _mm_xor_si128(Ln[5], ctp[10]);
		txt[5] = _mm_xor_si128(Ln[6], txt[5]); 
#endif
#if (PIPE>=7)
		txt[6] = _mm_xor_si128(Ln[6], ctp[12]);
		txt[6] = _mm_xor_si128(Ln[7], txt[6]); 
#endif
#if (PIPE==8)
		txt[7] = _mm_xor_si128(Ln[7], ctp[14]);
		txt[7] = _mm_xor_si128(Ln[8], txt[7]); 
#endif
		AES_ecb_encrypt_PIPE(txt, encrypt_key);
		/* second round*/
		/* even plaintext block = E(...) ^ odd ciphertext block; then mask it for the next encryption */
		ptp[0] = _mm_xor_si128(txt[0], ctp[1]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[0]);
		ptp[2] = _mm_xor_si128(txt[1], ctp[3]);
		txt[1] = _mm_xor_si128(Ln[1], ptp[2]);
		ptp[4] = _mm_xor_si128(txt[2], ctp[5]);
		txt[2] = _mm_xor_si128(Ln[2], ptp[4]);
		ptp[6] = _mm_xor_si128(txt[3], ctp[7]);
		txt[3] = _mm_xor_si128(Ln[3], ptp[6]);
#if (PIPE>=5)
		ptp[8] = _mm_xor_si128(txt[4], ctp[9]);
		txt[4] = _mm_xor_si128(Ln[4], ptp[8]);
#endif
#if (PIPE>=6)
		ptp[10] = _mm_xor_si128(txt[5], ctp[11]);
		txt[5] = _mm_xor_si128(Ln[5], ptp[10]);
#endif
#if (PIPE>=7)
		ptp[12] = _mm_xor_si128(txt[6], ctp[13]);
		txt[6] = _mm_xor_si128(Ln[6], ptp[12]);
#endif
#if (PIPE==8)
		ptp[14] = _mm_xor_si128(txt[7], ctp[15]);
		txt[7] = _mm_xor_si128(Ln[7], ptp[14]);
#endif
		AES_ecb_encrypt_PIPE(txt, encrypt_key);
		/* odd plaintext block = E(...) ^ even ciphertext block; only odd blocks enter Sum */
		ptp[1] = _mm_xor_si128(txt[0], ctp[0]);
		Sum = _mm_xor_si128(Sum, ptp[1]);
		ptp[3] = _mm_xor_si128(txt[1], ctp[2]);
		Sum = _mm_xor_si128(Sum, ptp[3]);
		ptp[5] = _mm_xor_si128(txt[2], ctp[4]);
		Sum = _mm_xor_si128(Sum, ptp[5]);
		ptp[7] = _mm_xor_si128(txt[3], ctp[6]);
		Sum = _mm_xor_si128(Sum, ptp[7]);
#if (PIPE>=5)
		ptp[9] = _mm_xor_si128(txt[4], ctp[8]);
		Sum = _mm_xor_si128(Sum, ptp[9]);
#endif
#if (PIPE>=6)
		ptp[11] = _mm_xor_si128(txt[5], ctp[10]);
		Sum = _mm_xor_si128(Sum, ptp[11]);
#endif
#if (PIPE>=7)
		ptp[13] = _mm_xor_si128(txt[6], ctp[12]);
		Sum = _mm_xor_si128(Sum, ptp[13]);
#endif
#if (PIPE==8)
		ptp[15] = _mm_xor_si128(txt[7], ctp[14]);
		Sum = _mm_xor_si128(Sum, ptp[15]);
#endif
		/* carry the last mask over as the start of the next batch */
		Ln[0] = _mm_load_si128(&Ln[PIPE]);
		ptp += (2 * PIPE);
		ctp += (2 * PIPE);
		rest_len -= (DBLOCK*PIPE);
	}

	/* With rest_len == 0 (empty ciphertext) both ell and last stay 0, which
	   avoids the unsigned wrap-around of rest_len - last. */
	if (rest_len != 0){
		last = rest_len % DBLOCK;
		if (last == 0) last = DBLOCK;
		ell = (rest_len - last) / DBLOCK; // plaintext length = 2BLOCK*ell + last (non-zero)
	}

	/* 2-round Feistel for the full chunks */
	mul3(Ln[0], &Ln[1]);
	for (i = 0; i < (2 * ell); i += 2){
		txt[0] = _mm_xor_si128(Ln[1], ctp[i]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[i] = _mm_xor_si128(txt[0], ctp[i + 1]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[i]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[i + 1] = _mm_xor_si128(txt[0], ctp[i]);
		Sum = _mm_xor_si128(Sum, ptp[i + 1]);
		/* advance both masks for the next chunk */
		Ln[0] = _mm_xor_si128(Ln[0], Ln[1]);
		mul2(Ln[1], &Ln[1]);
	}
	ptp += (2 * ell);
	ctp += (2 * ell);
	/* Last chunk */
	if (last <= BLOCK){ 	//odd block, including the case pl_len = 0 (no plaintext)
		AES_encrypt(Ln[0], &txt[0], encrypt_key); //txt[0] is Z
		xorp(last, &txt[0], (uint8*)&ctp[0], (uint8*)&ptp[0]);
		ozp(last, (uint8*)&ptp[0], &txt[0]);
		Sum = _mm_xor_si128(txt[0], Sum);
		La = &Ln[0];
	}
	else{//even blocks, last > BLOCK always holds. 2-round Feistel with last swap
		ozp(last - BLOCK, (uint8*)&ctp[1], &txt[0]);
		Sum = _mm_xor_si128(Sum, txt[0]);
		txt[0] = _mm_xor_si128(Ln[1], txt[0]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[0] = _mm_xor_si128(txt[0], ctp[0]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[0]);
		AES_encrypt(txt[0], &txt[1], encrypt_key); //txt[1] is Z
		xorp(last - BLOCK, &txt[1], (uint8*)&ctp[1], (uint8*)&ptp[1]);
		Sum = _mm_xor_si128(Sum, txt[1]);
		La = &Ln[1];
	}
	/* TE generation */
	/* La points at the mask selected above; it is updated in place */
	if (last == BLOCK || last == DBLOCK){//last = 16 or 32
		mul7(*La, La);
	}
	else{
		mul3twice(*La, La);
	}
	Sum = _mm_xor_si128(Sum, *La);	//Sum = (3^2 or 7)L* xor Sum
	AES_encrypt(Sum, &Sum, encrypt_key);
	return Sum;//TE
}//end of DFunc
예제 #21
0
파일: Aes.cpp 프로젝트: frankencode/fluxkit
/**
 * Returns a XOR mul2(a), i.e. the x3 multiple of a built from its x2 multiple.
 */
inline uint8_t mul3(uint8_t a)
{
    const uint8_t doubled = mul2(a);
    return a ^ doubled;
}