Esempio n. 1
0
// Decrypts a single RC2 block. Four 16-bit words are loaded from inBlock,
// the 16 rounds are undone from round 15 down to round 0, and the result is
// handed to Block::Put together with xorBlock and outBlock.
void RC2::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
	word16 R0, R1, R2, R3;
	Block::Get(inBlock)(R0)(R1)(R2)(R3);

	int round = 16;
	while (round-- > 0)
	{
		// Ahead of rounds 10 and 4 every word first has a key word subtracted;
		// that key word is selected by the low six bits of a neighbouring word.
		const bool subtractIndexedKey = (round == 10) || (round == 4);
		if (subtractIndexedKey)
		{
			R3 = word16(R3 - K[R2 & 0x3f]);
			R2 = word16(R2 - K[R1 & 0x3f]);
			R1 = word16(R1 - K[R0 & 0x3f]);
			R0 = word16(R0 - K[R3 & 0x3f]);
		}

		// The four key words of this round start at index 4*round.
		const int keyBase = 4 * round;

		// Each word is rotated right, then has its round key word plus a
		// bitwise selection of two other words subtracted (mod 2^16).
		// The words are processed R3 -> R0 because each step reads the
		// already-updated value of the word handled just before it.
		R3 = rotrFixed(R3, 5);
		R3 = word16(R3 - (K[keyBase + 3] + (R1 & R2) + (R0 & ~R2)));

		R2 = rotrFixed(R2, 3);
		R2 = word16(R2 - (K[keyBase + 2] + (R0 & R1) + (R3 & ~R1)));

		R1 = rotrFixed(R1, 2);
		R1 = word16(R1 - (K[keyBase + 1] + (R3 & R0) + (R2 & ~R0)));

		R0 = rotrFixed(R0, 1);
		R0 = word16(R0 - (K[keyBase] + (R2 & R3) + (R1 & ~R3)));
	}

	Block::Put(xorBlock, outBlock)(R0)(R1)(R2)(R3);
}
Esempio n. 2
0
/* Permutes the two 32-bit halves pointed to by left and right, in place.
 * The work is a fixed sequence of rotations interleaved with masked bit
 * exchanges: XOR-ing both halves with (left ^ right) & mask swaps exactly
 * the bits selected by the mask between the two halves.
 * NOTE(review): by its name this is presumably the DES final permutation -
 * confirm against the caller. */
static INLINE void FPERM(word32* left, word32* right)
{
    word32 diff;   /* masked bit difference between the two halves */

    *right = rotrFixed(*right, 1U);
    diff = (*right ^ *left) & 0xaaaaaaaa;
    *right ^= diff;
    *left ^= diff;

    *left = rotrFixed(*left, 9U);
    diff = (*right ^ *left) & 0x00ff00ff;
    *right ^= diff;
    *left ^= diff;

    *left = rotlFixed(*left, 6U);
    diff = (*right ^ *left) & 0x33333333;
    *right ^= diff;
    *left ^= diff;

    *left = rotlFixed(*left, 18U);
    diff = (*right ^ *left) & 0xffff0000;
    *right ^= diff;
    *left ^= diff;

    *left = rotlFixed(*left, 20U);
    diff = (*right ^ *left) & 0xf0f0f0f0;
    *right ^= diff;
    *left ^= diff;

    *left = rotrFixed(*left, 4U);
}
Esempio n. 3
0
/* Runs the eight-iteration S-box loop over the two 32-bit halves *lIn / *rIn
 * and stores the results back through the same pointers.
 * Every iteration consumes four consecutive words from kptr (32 words in
 * total): the first two update the left half from the right half, the last
 * two update the right half from the freshly updated left half. */
void DesRawProcessBlock(word32* lIn, word32* rIn, const word32* kptr)
{
    word32 left = *lIn;
    word32 right = *rIn;
    const word32* rk = kptr;   /* key words of the current iteration */
    word32 pass;

    for (pass = 0; pass < 8; pass++, rk += 4)
    {
        /* Even-numbered S-boxes are indexed from the rotated half,
           odd-numbered ones from the unrotated half. */
        word32 rotated = rotrFixed(right, 4U) ^ rk[0];
        word32 direct  = right ^ rk[1];

        left ^= Spbox[0][(rotated >> 24) & 0x3f]
             ^  Spbox[1][(direct  >> 24) & 0x3f]
             ^  Spbox[2][(rotated >> 16) & 0x3f]
             ^  Spbox[3][(direct  >> 16) & 0x3f]
             ^  Spbox[4][(rotated >> 8) & 0x3f]
             ^  Spbox[5][(direct  >> 8) & 0x3f]
             ^  Spbox[6][rotated & 0x3f]
             ^  Spbox[7][direct & 0x3f];

        rotated = rotrFixed(left, 4U) ^ rk[2];
        direct  = left ^ rk[3];

        right ^= Spbox[0][(rotated >> 24) & 0x3f]
              ^  Spbox[1][(direct  >> 24) & 0x3f]
              ^  Spbox[2][(rotated >> 16) & 0x3f]
              ^  Spbox[3][(direct  >> 16) & 0x3f]
              ^  Spbox[4][(rotated >> 8) & 0x3f]
              ^  Spbox[5][(direct  >> 8) & 0x3f]
              ^  Spbox[6][rotated & 0x3f]
              ^  Spbox[7][direct & 0x3f];
    }

    *lIn = left;
    *rIn = right;
}
Esempio n. 4
0
// Decrypts the block held in aBlock in place. The block is read as four
// 16-bit words through GetBlockLittleEndian, the 16 rounds are undone from
// round 15 down to round 0, and the words are written back through
// PutBlockLittleEndian. aBlock must be exactly KRC2BlockBytes long (asserted).
void CRC2Decryptor::Transform(TDes8& aBlock)
	{
	assert(aBlock.Size() == KRC2BlockBytes);

	TUint8* const data = (TUint8*)&aBlock[0];

	TUint16 R0, R1, R2, R3;
	GetBlockLittleEndian(data, R0, R1, R2, R3);

	for (TInt round = 15; round >= 0; --round)
		{
		// Ahead of rounds 10 and 4 every word first has a key word subtracted;
		// that key word is selected by the low six bits of a neighbouring word.
		const TBool subtractIndexedKey = (round == 10) || (round == 4);
		if (subtractIndexedKey)
			{
			R3 -= iK[R2 & 0x3f];
			R2 -= iK[R1 & 0x3f];
			R1 -= iK[R0 & 0x3f];
			R0 -= iK[R3 & 0x3f];
			}

		// The four key words of this round start at index 4*round.
		const TInt keyBase = 4 * round;

		// Rotate right, then subtract the round key word plus a bitwise
		// selection of two other words. Order R3 -> R0 matters: each step
		// reads the word updated just before it.
		R3 = rotrFixed(R3, 5);
		R3 -= iK[keyBase + 3] + (R1 & R2) + (R0 & ~R2);

		R2 = rotrFixed(R2, 3);
		R2 -= iK[keyBase + 2] + (R0 & R1) + (R3 & ~R1);

		R1 = rotrFixed(R1, 2);
		R1 -= iK[keyBase + 1] + (R3 & R0) + (R2 & ~R0);

		R0 = rotrFixed(R0, 1);
		R0 -= iK[keyBase] + (R2 & R3) + (R1 & ~R3);
		}

	PutBlockLittleEndian(data, R0, R1, R2, R3);
	}
// Permutes the two 32-bit halves left and right in place.
// The work is a fixed sequence of rotations interleaved with masked bit
// exchanges: XOR-ing both halves with (left ^ right) & mask swaps exactly
// the bits selected by the mask between the two halves.
// NOTE(review): by its name this is presumably the DES initial permutation -
// confirm against the caller.
static inline void IPERM(word32& left, word32& right)
{
    word32 diff;   // masked bit difference between the two halves

    right = rotlFixed(right, 4U);
    diff = (right ^ left) & 0xf0f0f0f0;
    left ^= diff;
    right ^= diff;

    right = rotrFixed(right, 20U);
    diff = (right ^ left) & 0xffff0000;
    left ^= diff;
    right ^= diff;

    right = rotrFixed(right, 18U);
    diff = (right ^ left) & 0x33333333;
    left ^= diff;
    right ^= diff;

    right = rotrFixed(right, 6U);
    diff = (right ^ left) & 0x00ff00ff;
    left ^= diff;
    right ^= diff;

    right = rotlFixed(right, 9U);
    diff = (right ^ left) & 0xaaaaaaaa;
    left ^= diff;
    left = rotlFixed(left, 1U);
    right ^= diff;
}
// Runs the eight-iteration S-box loop over the halves lIn / rIn using the key
// words in k_, and writes the results back to lIn / rIn.
// Every iteration consumes four consecutive key words (32 in total): the
// first two update the left half from the right half, the last two update
// the right half from the freshly updated left half.
void BasicDES::RawProcessBlock(word32& lIn, word32& rIn) const
{
    word32 left = lIn;
    word32 right = rIn;
    const word32* roundKeys = k_;

    for (unsigned pass = 0; pass < 8; ++pass, roundKeys += 4)
    {
        // Even-numbered S-boxes are indexed from the rotated half,
        // odd-numbered ones from the unrotated half.
        const word32 rotR = rotrFixed(right, 4U) ^ roundKeys[0];
        const word32 rawR = right ^ roundKeys[1];
        left ^= Spbox[0][(rotR >> 24) & 0x3f] ^ Spbox[1][(rawR >> 24) & 0x3f]
             ^  Spbox[2][(rotR >> 16) & 0x3f] ^ Spbox[3][(rawR >> 16) & 0x3f]
             ^  Spbox[4][(rotR >> 8) & 0x3f]  ^ Spbox[5][(rawR >> 8) & 0x3f]
             ^  Spbox[6][rotR & 0x3f]         ^ Spbox[7][rawR & 0x3f];

        const word32 rotL = rotrFixed(left, 4U) ^ roundKeys[2];
        const word32 rawL = left ^ roundKeys[3];
        right ^= Spbox[0][(rotL >> 24) & 0x3f] ^ Spbox[1][(rawL >> 24) & 0x3f]
              ^  Spbox[2][(rotL >> 16) & 0x3f] ^ Spbox[3][(rawL >> 16) & 0x3f]
              ^  Spbox[4][(rotL >> 8) & 0x3f]  ^ Spbox[5][(rawL >> 8) & 0x3f]
              ^  Spbox[6][rotL & 0x3f]         ^ Spbox[7][rawL & 0x3f];
    }

    lIn = left;
    rIn = right;
}
Esempio n. 7
0
// Runs the eight-iteration S-box loop over the halves l_ / r_ using the key
// words in k, and writes the results back to l_ / r_.
// Every iteration consumes four consecutive key words (32 in total): the
// first two update the left half from the right half, the last two update
// the right half from the freshly updated left half.
void DES::RawProcessBlock(word32 &l_, word32 &r_) const
{
	word32 left = l_;
	word32 right = r_;
	const word32 *roundKeys = k;

	unsigned pass = 0;
	while (pass < 8)
	{
		// Even-numbered S-boxes are indexed from the rotated half,
		// odd-numbered ones from the unrotated half.
		const word32 rotR = rotrFixed(right, 4U) ^ roundKeys[0];
		const word32 rawR = right ^ roundKeys[1];
		left ^= Spbox[0][(rotR >> 24) & 0x3f] ^ Spbox[1][(rawR >> 24) & 0x3f]
		     ^  Spbox[2][(rotR >> 16) & 0x3f] ^ Spbox[3][(rawR >> 16) & 0x3f]
		     ^  Spbox[4][(rotR >> 8) & 0x3f]  ^ Spbox[5][(rawR >> 8) & 0x3f]
		     ^  Spbox[6][rotR & 0x3f]         ^ Spbox[7][rawR & 0x3f];

		const word32 rotL = rotrFixed(left, 4U) ^ roundKeys[2];
		const word32 rawL = left ^ roundKeys[3];
		right ^= Spbox[0][(rotL >> 24) & 0x3f] ^ Spbox[1][(rawL >> 24) & 0x3f]
		      ^  Spbox[2][(rotL >> 16) & 0x3f] ^ Spbox[3][(rawL >> 16) & 0x3f]
		      ^  Spbox[4][(rotL >> 8) & 0x3f]  ^ Spbox[5][(rawL >> 8) & 0x3f]
		      ^  Spbox[6][rotL & 0x3f]         ^ Spbox[7][rawL & 0x3f];

		roundKeys += 4;
		++pass;
	}

	l_ = left;
	r_ = right;
}
Esempio n. 8
0
// Builds the round-key array `key` from the user-supplied key.
//
// userKey : key bytes; loaded into the first k_len words of `key` by
//           GetUserKeyLittleEndian.
// keylen  : key length in bytes; asserted to equal KeyLength(keylen). Only
//           k_len (= keylen/4) values of 4, 6 and 8 are expanded; for any
//           other value the switch leaves `key` holding just the loaded key.
//
// `key` is sized k_len*5 + 24 words (44, 54 or 64 words), which is exactly
// the range the expansion loops below write.
// NOTE(review): the initializer of `key` reads k_len, so this relies on k_len
// being declared before key in the class - confirm in the header.
Rijndael::Rijndael(const byte *userKey, unsigned int keylen)
	: k_len(keylen/4), key(k_len*5 + 24)
{
	assert(keylen == KeyLength(keylen));

	word32 t;	// running word: the most recently produced key word
	int i;

	GetUserKeyLittleEndian(key.ptr, k_len, userKey, keylen);

	// Every loop iteration produces k_len new words. The first is derived
	// from the previous word rotated by 8 bits, passed through ls_box and
	// XORed with the round constant rco_tab[i]; each new word is the XOR of
	// the running word with the word k_len positions earlier.
	switch(k_len)
	{
		case 4: t = key[3];
				for(i = 0; i < 10; ++i)
				{
					t = rotrFixed(t, 8);
					t = ls_box(t) ^ rco_tab[i];
					key[4 * i + 4] = t ^= key[4 * i];
					key[4 * i + 5] = t ^= key[4 * i + 1];
					key[4 * i + 6] = t ^= key[4 * i + 2];
					key[4 * i + 7] = t ^= key[4 * i + 3];
				}
				break;

		case 6: t = key[5];
				for(i = 0; i < 8; ++i)
				{
					t = rotrFixed(t, 8);
					t = ls_box(t) ^ rco_tab[i];
					key[6 * i + 6] = t ^= key[6 * i];
					key[6 * i + 7] = t ^= key[6 * i + 1];
					key[6 * i + 8] = t ^= key[6 * i + 2];
					key[6 * i + 9] = t ^= key[6 * i + 3];
					key[6 * i + 10] = t ^= key[6 * i + 4];
					key[6 * i + 11] = t ^= key[6 * i + 5];
				}
				break;

		case 8: t = key[7];
				for(i = 0; i < 7; ++i)
				{
					t = rotrFixed(t, 8);
					t = ls_box(t) ^ rco_tab[i];
					key[8 * i + 8] = t ^= key[8 * i];
					key[8 * i + 9] = t ^= key[8 * i + 1];
					key[8 * i + 10] = t ^= key[8 * i + 2];
					key[8 * i + 11] = t ^= key[8 * i + 3];
					// Halfway through each 8-word group the running word goes
					// through ls_box once more (no rotation, no round constant).
					key[8 * i + 12] = t = key[8 * i + 4] ^ ls_box(t);
					key[8 * i + 13] = t ^= key[8 * i + 5];
					key[8 * i + 14] = t ^= key[8 * i + 6];
					key[8 * i + 15] = t ^= key[8 * i + 7];
				}
				break;
	}
}
Esempio n. 9
0
// Encrypt or decrypt a block of data in ECB mode
void DES::ProcessBlock(const byte *inBlock, byte * outBlock) const
{
	word32 l,r;
	GetBlockBigEndian(inBlock, l, r);
	IPERM(l,r);

	const word32 *kptr=k;

	for (unsigned i=0; i<8; i++)
	{
		word32 work = rotrFixed(r, 4U) ^ kptr[4*i+0];
		l ^= Spbox[6][(work) & 0x3f]
		  ^  Spbox[4][(work >> 8) & 0x3f]
		  ^  Spbox[2][(work >> 16) & 0x3f]
		  ^  Spbox[0][(work >> 24) & 0x3f];
		work = r ^ kptr[4*i+1];
		l ^= Spbox[7][(work) & 0x3f]
		  ^  Spbox[5][(work >> 8) & 0x3f]
		  ^  Spbox[3][(work >> 16) & 0x3f]
		  ^  Spbox[1][(work >> 24) & 0x3f];

		work = rotrFixed(l, 4U) ^ kptr[4*i+2];
		r ^= Spbox[6][(work) & 0x3f]
		  ^  Spbox[4][(work >> 8) & 0x3f]
		  ^  Spbox[2][(work >> 16) & 0x3f]
		  ^  Spbox[0][(work >> 24) & 0x3f];
		work = l ^ kptr[4*i+3];
		r ^= Spbox[7][(work) & 0x3f]
		  ^  Spbox[5][(work >> 8) & 0x3f]
		  ^  Spbox[3][(work >> 16) & 0x3f]
		  ^  Spbox[1][(work >> 24) & 0x3f];
	}

	FPERM(l,r);
	PutBlockBigEndian(outBlock, r, l);
}
Esempio n. 10
0
// Mixes the four 32-bit words *a, *b, *c, *d in place using right rotations,
// left shifts and XORs. The statement order is significant: later steps read
// words that earlier steps have already updated.
// NOTE(review): the name and shape suggest Serpent's inverse linear
// transformation - confirm against the caller.
static void ILTf (uint32 *a, uint32 *b, uint32 *c, uint32 *d)
{
	*c = rotrFixed(*c, 22);
	*a = rotrFixed(*a, 5);
	*c = *c ^ (*b << 7) ^ *d;
	*a = *a ^ *d ^ *b;
	*b = rotrFixed(*b, 1);
	*d = (*a << 3) ^ *c ^ rotrFixed(*d, 7);
	*b = *b ^ *c ^ *a;
	*c = rotrFixed(*c, 3);
	*a = rotrFixed(*a, 13);
}
Esempio n. 11
0
// Produces iterationCount chunks of keystream and hands each 16-byte group to
// CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH together with operation/output/input.
//
// Per outer iteration: the working words a, b, c, d are derived from
// m_outsideCounter and four words of m_R selected by m_insideCounter, mixed
// through table lookups in m_T, and then 64 inner rounds each emit four
// 32-bit words (64 * 16 bytes = 8192 bits, as the comment below states).
// Afterwards m_insideCounter is advanced; when it reaches
// m_iterationsPerCount it wraps to 0 and m_outsideCounter is incremented.
//
// NOTE(review): no `template <class B>` header is visible in front of this
// definition in this excerpt - confirm it exists in the real file.
void SEAL_Policy<B>::OperateKeystream(KeystreamOperation operation, byte *output, const byte *input, size_t iterationCount)
{
    word32 a, b, c, d, n1, n2, n3, n4;
    unsigned int p, q;

    for (size_t iteration = 0; iteration < iterationCount; ++iteration)
    {
// Ttab(x) reads a word32 from m_T at BYTE offset x. Every offset used below
// is masked with 0x7fc, i.e. it is a multiple of 4 in the range 0..2044.
#define Ttab(x) *(word32 *)((byte *)m_T.begin()+x)

        // Seed the four working words from the outer counter (rotated by
        // 0/8/16/24 bits) and the m_R words for the current inside counter.
        a = m_outsideCounter ^ m_R[4*m_insideCounter];
        b = rotrFixed(m_outsideCounter, 8U) ^ m_R[4*m_insideCounter+1];
        c = rotrFixed(m_outsideCounter, 16U) ^ m_R[4*m_insideCounter+2];
        d = rotrFixed(m_outsideCounter, 24U) ^ m_R[4*m_insideCounter+3];

        // Two passes of the a->b->c->d->a table-mixing chain.
        for (unsigned int j=0; j<2; j++)
        {
            p = a & 0x7fc;
            b += Ttab(p);
            a = rotrFixed(a, 9U);

            p = b & 0x7fc;
            c += Ttab(p);
            b = rotrFixed(b, 9U);

            p = c & 0x7fc;
            d += Ttab(p);
            c = rotrFixed(c, 9U);

            p = d & 0x7fc;
            a += Ttab(p);
            d = rotrFixed(d, 9U);
        }

        // Snapshot of the state; these values are re-injected at the end of
        // every inner round (n1/n2 on even i, n3/n4 on odd i).
        n1 = d, n2 = b, n3 = a, n4 = c;

        // One further pass of the same mixing chain.
        p = a & 0x7fc;
        b += Ttab(p);
        a = rotrFixed(a, 9U);

        p = b & 0x7fc;
        c += Ttab(p);
        b = rotrFixed(b, 9U);

        p = c & 0x7fc;
        d += Ttab(p);
        c = rotrFixed(c, 9U);

        p = d & 0x7fc;
        a += Ttab(p);
        d = rotrFixed(d, 9U);

        // generate 8192 bits
        for (unsigned int i=0; i<64; i++)
        {
            // Eight table-driven steps; p and q alternate as running table
            // offsets, each folded with the word just updated.
            p = a & 0x7fc;
            a = rotrFixed(a, 9U);
            b += Ttab(p);
            b ^= a;

            q = b & 0x7fc;
            b = rotrFixed(b, 9U);
            c ^= Ttab(q);
            c += b;

            p = (p+c) & 0x7fc;
            c = rotrFixed(c, 9U);
            d += Ttab(p);
            d ^= c;

            q = (q+d) & 0x7fc;
            d = rotrFixed(d, 9U);
            a ^= Ttab(q);
            a += d;

            p = (p+a) & 0x7fc;
            b ^= Ttab(p);
            a = rotrFixed(a, 9U);

            q = (q+b) & 0x7fc;
            c += Ttab(q);
            b = rotrFixed(b, 9U);

            p = (p+c) & 0x7fc;
            d ^= Ttab(p);
            c = rotrFixed(c, 9U);

            q = (q+d) & 0x7fc;
            d = rotrFixed(d, 9U);
            a += Ttab(q);

// SEAL_OUTPUT emits the four output words of this round: b, c, d, a, each
// combined (+ or ^) with the matching word of m_S for round i.
#define SEAL_OUTPUT(x)	\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 0, b + m_S[4*i+0]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 1, c ^ m_S[4*i+1]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 2, d + m_S[4*i+2]);\
	CRYPTOPP_KEYSTREAM_OUTPUT_WORD(x, B::ToEnum(), 3, a ^ m_S[4*i+3]);

            CRYPTOPP_KEYSTREAM_OUTPUT_SWITCH(SEAL_OUTPUT, 4*4);

            // Re-inject the snapshot taken before the loop: odd rounds use
            // n3/n4, even rounds use n1/n2.
            if (i & 1)
            {
                a += n3;
                b += n4;
                c ^= n3;
                d ^= n4;
            }
            else
            {
                a += n1;
                b += n2;
                c ^= n1;
                d ^= n2;
            }
        }

        // Advance the (inside, outside) counter pair.
        if (++m_insideCounter == m_iterationsPerCount)
        {
            ++m_outsideCounter;
            m_insideCounter = 0;
        }
    }

    // Clear the local working state before returning (presumably so that no
    // keystream material is left behind in locals - the compiler may elide it).
    a = b = c = d = n1 = n2 = n3 = n4 = 0;
    p = q = 0;
}
Esempio n. 12
0
// Generates keystream for the 32-bit input `in` and writes it to `out`.
//
// The outer loop runs L/8192 times; each pass derives a, b, c, d from `in`
// and four words of R, mixes them through table lookups in T, and then runs
// 64 inner rounds that each store four 32-bit words (64 * 16 bytes =
// 8192 bits per pass, as the comment below states).
//
// NOTE(review): `out` is written through a word32 pointer, so this assumes
// `out` is suitably aligned for word32 and large enough for all passes -
// confirm against callers.
void SEAL::Generate(word32 in, byte *out) const
{
	word32 a, b, c, d, n1, n2, n3, n4;
	unsigned int p, q;
	word32 *wout = (word32 *)out;

	for (unsigned int l=0; l<L/8192; l++)
	{
		// Seed the four working words from `in` (rotated by 0/8/16/24 bits)
		// and the R words belonging to pass l.
		a = in ^ R[4*l];
		b = rotrFixed(in, 8U) ^ R[4*l+1];
		c = rotrFixed(in, 16U) ^ R[4*l+2];
		d = rotrFixed(in, 24U) ^ R[4*l+3];

// Ttab(x) reads a word32 from T at BYTE offset x. Every offset used below is
// masked with 0x7fc, i.e. it is a multiple of 4 in the range 0..2044.
#define Ttab(x) *(word32 *)((byte *)T.ptr+x)
	
		// Two passes of the a->b->c->d->a table-mixing chain.
		for (unsigned int j=0; j<2; j++)
		{
			p = a & 0x7fc;
			b += Ttab(p);
			a = rotrFixed(a, 9U);
	
			p = b & 0x7fc;
			c += Ttab(p);
			b = rotrFixed(b, 9U);
	
			p = c & 0x7fc;
			d += Ttab(p);
			c = rotrFixed(c, 9U);
	
			p = d & 0x7fc;
			a += Ttab(p);
			d = rotrFixed(d, 9U);
		}

		// Snapshot of the state; these values are re-injected at the end of
		// every inner round (n1/n2 on even i, n3/n4 on odd i).
		n1 = d; n2 = b; n3 = a; n4 = c;
	
		// One further pass of the same mixing chain.
		p = a & 0x7fc;
		b += Ttab(p);
		a = rotrFixed(a, 9U);
	
		p = b & 0x7fc;
		c += Ttab(p);
		b = rotrFixed(b, 9U);
	
		p = c & 0x7fc;
		d += Ttab(p);
		c = rotrFixed(c, 9U);
	
		p = d & 0x7fc;
		a += Ttab(p);
		d = rotrFixed(d, 9U);
		
		// generate 8192 bits
		for (unsigned int i=0; i<64; i++)
		{
			// Eight table-driven steps; p and q alternate as running table
			// offsets, each folded with the word just updated.
			p = a & 0x7fc;
			a = rotrFixed(a, 9U);
			b += Ttab(p);
			b ^= a;
	
			q = b & 0x7fc;
			b = rotrFixed(b, 9U);
			c ^= Ttab(q);
			c += b;
	
			p = (p+c) & 0x7fc;
			c = rotrFixed(c, 9U);
			d += Ttab(p);
			d ^= c;
	
			q = (q+d) & 0x7fc;
			d = rotrFixed(d, 9U);
			a ^= Ttab(q);
			a += d;
	
			p = (p+a) & 0x7fc;
			b ^= Ttab(p);
			a = rotrFixed(a, 9U);
	
			q = (q+b) & 0x7fc;
			c += Ttab(q);
			b = rotrFixed(b, 9U);
	
			p = (p+c) & 0x7fc;
			d ^= Ttab(p);
			c = rotrFixed(c, 9U);
	
			q = (q+d) & 0x7fc;
			d = rotrFixed(d, 9U);
			a += Ttab(q);

			// Store the four output words b, c, d, a, each combined (+ or ^)
			// with the matching word of S. When IS_LITTLE_ENDIAN is defined
			// the words are byte-reversed first (presumably so the byte
			// stream is the same on both kinds of host - confirm).
#ifdef IS_LITTLE_ENDIAN
			wout[0] = byteReverse(b + S[4*i+0]);
			wout[1] = byteReverse(c ^ S[4*i+1]);
			wout[2] = byteReverse(d + S[4*i+2]);
			wout[3] = byteReverse(a ^ S[4*i+3]);
#else
			wout[0] = b + S[4*i+0];
			wout[1] = c ^ S[4*i+1];
			wout[2] = d + S[4*i+2];
			wout[3] = a ^ S[4*i+3];
#endif
			wout += 4;
	
			// Re-inject the snapshot taken before the loop: odd rounds use
			// n3/n4, even rounds use n1/n2.
			if (i & 1)
			{
				a += n3;
				b += n4;
				c ^= n3;
				d ^= n4;
			}
			else
			{
				a += n1;
				b += n2;        
				c ^= n1;
				d ^= n2;
			}
		}
	}

	// Clear the local working state before returning (presumably so that no
	// keystream material is left behind in locals - the compiler may elide it).
	a = b = c = d = n1 = n2 = n3 = n4 = 0;
	p = q = 0;
}
Esempio n. 13
0
// tailor the last output
void HAVAL::Tailor(unsigned int FPTLEN)
{
	word32 temp;

	switch (FPTLEN)
	{
	case 128:
		temp = (digest[7] & 0x000000FF) | 
			   (digest[6] & 0xFF000000) | 
			   (digest[5] & 0x00FF0000) | 
			   (digest[4] & 0x0000FF00);
		digest[0] += rotrFixed(temp,  8U);

		temp = (digest[7] & 0x0000FF00) | 
			   (digest[6] & 0x000000FF) | 
			   (digest[5] & 0xFF000000) | 
			   (digest[4] & 0x00FF0000);
		digest[1] += rotrFixed(temp, 16U);

		temp  = (digest[7] & 0x00FF0000) | 
				(digest[6] & 0x0000FF00) | 
				(digest[5] & 0x000000FF) | 
				(digest[4] & 0xFF000000);
		digest[2] += rotrFixed(temp, 24U);

		temp = (digest[7] & 0xFF000000) | 
			   (digest[6] & 0x00FF0000) | 
			   (digest[5] & 0x0000FF00) | 
			   (digest[4] & 0x000000FF);
		digest[3] += temp;
		break;

	case 160:
		temp = (digest[7] &  (word32)0x3F) | 
			   (digest[6] & ((word32)0x7F << 25)) |  
			   (digest[5] & ((word32)0x3F << 19));
		digest[0] += rotrFixed(temp, 19U);

		temp = (digest[7] & ((word32)0x3F <<  6)) | 
			   (digest[6] &  (word32)0x3F) |  
			   (digest[5] & ((word32)0x7F << 25));
		digest[1] += rotrFixed(temp, 25U);

		temp = (digest[7] & ((word32)0x7F << 12)) | 
			   (digest[6] & ((word32)0x3F <<  6)) |  
			   (digest[5] &  (word32)0x3F);
		digest[2] += temp;

		temp = (digest[7] & ((word32)0x3F << 19)) | 
			   (digest[6] & ((word32)0x7F << 12)) |  
			   (digest[5] & ((word32)0x3F <<  6));
		digest[3] += temp >> 6; 

		temp = (digest[7] & ((word32)0x7F << 25)) | 
			   (digest[6] & ((word32)0x3F << 19)) |  
			   (digest[5] & ((word32)0x7F << 12));
		digest[4] += temp >> 12;
		break;

	case 192:
		temp = (digest[7] &  (word32)0x1F) | 
			   (digest[6] & ((word32)0x3F << 26));
		digest[0] += rotrFixed(temp, 26U);

		temp = (digest[7] & ((word32)0x1F <<  5)) | 
			   (digest[6] &  (word32)0x1F);
		digest[1] += temp;

		temp = (digest[7] & ((word32)0x3F << 10)) | 
			   (digest[6] & ((word32)0x1F <<  5));
		digest[2] += temp >> 5;

		temp = (digest[7] & ((word32)0x1F << 16)) | 
			   (digest[6] & ((word32)0x3F << 10));
		digest[3] += temp >> 10;

		temp = (digest[7] & ((word32)0x1F << 21)) | 
			   (digest[6] & ((word32)0x1F << 16));
		digest[4] += temp >> 16;

		temp = (digest[7] & ((word32)0x3F << 26)) | 
			   (digest[6] & ((word32)0x1F << 21));
		digest[5] += temp >> 21;
		break;

	case 224:
		digest[0] += (digest[7] >> 27) & 0x1F;
		digest[1] += (digest[7] >> 22) & 0x1F;
		digest[2] += (digest[7] >> 18) & 0x0F;
		digest[3] += (digest[7] >> 13) & 0x1F;
		digest[4] += (digest[7] >>  9) & 0x0F;
		digest[5] += (digest[7] >>  4) & 0x1F;
		digest[6] +=  digest[7]        & 0x0F;
		break;

	case 256:
		break;

	default:
		assert(false);
	}
}
Esempio n. 14
0
// Decrypts one 16-byte block from inBlock using the round keys in m_key and
// writes the result to outBlock; when xorBlock is non-null its 16 bytes are
// XORed into the output as well.
//
// Structure: initial round-key XOR, a first round using only table Td0 (with
// rotations), m_rounds/2 - 1 loop iterations of two table-driven rounds each
// (Td0..Td3), and a last round using the byte table Sd.
//
// NOTE(review): inBlock, xorBlock and outBlock are accessed through word32
// pointers, so this assumes they are suitably aligned for word32 - confirm
// against callers.
void Rijndael::Dec::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
	word32 s0, s1, s2, s3, t0, t1, t2, t3;
    const word32 *rk = m_key;

	// Load the block as four native words and XOR in the first four key
	// words; t0..t3 start out as the next four key words.
	s0 = ((const word32 *)inBlock)[0] ^ rk[0];
	s1 = ((const word32 *)inBlock)[1] ^ rk[1];
	s2 = ((const word32 *)inBlock)[2] ^ rk[2];
	s3 = ((const word32 *)inBlock)[3] ^ rk[3];
	t0 = rk[4];
	t1 = rk[5];
	t2 = rk[6];
	t3 = rk[7];
	rk += 8;

	// timing attack countermeasure. see comments at top for more details
	// u starts at 0 and is only ever AND-ed, so it stays 0 and the ORs below
	// leave s0..s3 unchanged; the loop's visible effect is just to read
	// entries of Td0 spaced s_lineSizeDiv4 words apart.
	unsigned int i;
	word32 u = 0;
	for (i=0; i<sizeof(Td0)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
		u &= (Td0[i+0*s_lineSizeDiv4] & Td0[i+2*s_lineSizeDiv4]) & (Td0[i+1*s_lineSizeDiv4] & Td0[i+3*s_lineSizeDiv4]);
	s0 |= u; s1 |= u; s2 |= u; s3 |= u;

	// first round
	// Uses Td0 only; the other three lookups are Td0 entries rotated right
	// by 8/16/24 bits. The byte selectors s_i0..s_i3 are defined elsewhere.
    t0 ^=
        Td0[GETBYTE(s0, s_i3)] ^
        rotrFixed(Td0[GETBYTE(s3, s_i2)], 8) ^
        rotrFixed(Td0[GETBYTE(s2, s_i1)], 16) ^
        rotrFixed(Td0[GETBYTE(s1, s_i0)], 24);
    t1 ^=
        Td0[GETBYTE(s1, s_i3)] ^
        rotrFixed(Td0[GETBYTE(s0, s_i2)], 8) ^
        rotrFixed(Td0[GETBYTE(s3, s_i1)], 16) ^
        rotrFixed(Td0[GETBYTE(s2, s_i0)], 24);
    t2 ^=
        Td0[GETBYTE(s2, s_i3)] ^
        rotrFixed(Td0[GETBYTE(s1, s_i2)], 8) ^
        rotrFixed(Td0[GETBYTE(s0, s_i1)], 16) ^
        rotrFixed(Td0[GETBYTE(s3, s_i0)], 24);
    t3 ^=
        Td0[GETBYTE(s3, s_i3)] ^
        rotrFixed(Td0[GETBYTE(s2, s_i2)], 8) ^
        rotrFixed(Td0[GETBYTE(s1, s_i1)], 16) ^
        rotrFixed(Td0[GETBYTE(s0, s_i0)], 24);

	// Nr - 2 full rounds:
	// Each loop iteration performs two rounds (t -> s, then s -> t) and
	// consumes eight key words.
	// NOTE(review): the do/while with --r assumes m_rounds/2 - 1 is at
	// least 1; a value of 0 would wrap the unsigned counter.
    unsigned int r = m_rounds/2 - 1;
    do
	{
        s0 =
            Td0[GETBYTE(t0, 3)] ^
            Td1[GETBYTE(t3, 2)] ^
            Td2[GETBYTE(t2, 1)] ^
            Td3[GETBYTE(t1, 0)] ^
            rk[0];
        s1 =
            Td0[GETBYTE(t1, 3)] ^
            Td1[GETBYTE(t0, 2)] ^
            Td2[GETBYTE(t3, 1)] ^
            Td3[GETBYTE(t2, 0)] ^
            rk[1];
        s2 =
            Td0[GETBYTE(t2, 3)] ^
            Td1[GETBYTE(t1, 2)] ^
            Td2[GETBYTE(t0, 1)] ^
            Td3[GETBYTE(t3, 0)] ^
            rk[2];
        s3 =
            Td0[GETBYTE(t3, 3)] ^
            Td1[GETBYTE(t2, 2)] ^
            Td2[GETBYTE(t1, 1)] ^
            Td3[GETBYTE(t0, 0)] ^
            rk[3];

        t0 =
            Td0[GETBYTE(s0, 3)] ^
            Td1[GETBYTE(s3, 2)] ^
            Td2[GETBYTE(s2, 1)] ^
            Td3[GETBYTE(s1, 0)] ^
            rk[4];
        t1 =
            Td0[GETBYTE(s1, 3)] ^
            Td1[GETBYTE(s0, 2)] ^
            Td2[GETBYTE(s3, 1)] ^
            Td3[GETBYTE(s2, 0)] ^
            rk[5];
        t2 =
            Td0[GETBYTE(s2, 3)] ^
            Td1[GETBYTE(s1, 2)] ^
            Td2[GETBYTE(s0, 1)] ^
            Td3[GETBYTE(s3, 0)] ^
            rk[6];
        t3 =
            Td0[GETBYTE(s3, 3)] ^
            Td1[GETBYTE(s2, 2)] ^
            Td2[GETBYTE(s1, 1)] ^
            Td3[GETBYTE(s0, 0)] ^
            rk[7];

        rk += 8;
    } while (--r);

	// timing attack countermeasure. see comments at top for more details
	// Same pattern as above, this time reading through the Sd table; u stays
	// 0, so t0..t3 are left unchanged.
	u = 0;
	for (i=0; i<sizeof(Sd)/4; i+=CRYPTOPP_L1_CACHE_LINE_SIZE)
		u &= (((word32*)Sd)[i+0*s_lineSizeDiv4] & ((word32*)Sd)[i+2*s_lineSizeDiv4]) & (((word32*)Sd)[i+1*s_lineSizeDiv4] & ((word32*)Sd)[i+3*s_lineSizeDiv4]);
	t0 |= u; t1 |= u; t2 |= u; t3 |= u;

	// tbw is a 16-byte scratch block; tempBlock views it as bytes so the
	// last round can be assembled bytewise and then combined wordwise.
	word32 tbw[4];
	byte *const tempBlock = (byte *)tbw;
	word32 *const obw = (word32 *)outBlock;
	const word32 *const xbw = (const word32 *)xorBlock;

	// last round
	// Byte-wise Sd lookups, with the source word for each byte position
	// rotating through t0..t3.
	tempBlock[0] = Sd[GETBYTE(t0, 3)];
	tempBlock[1] = Sd[GETBYTE(t3, 2)];
	tempBlock[2] = Sd[GETBYTE(t2, 1)];
	tempBlock[3] = Sd[GETBYTE(t1, 0)];
	tempBlock[4] = Sd[GETBYTE(t1, 3)];
	tempBlock[5] = Sd[GETBYTE(t0, 2)];
	tempBlock[6] = Sd[GETBYTE(t3, 1)];
	tempBlock[7] = Sd[GETBYTE(t2, 0)];
	tempBlock[8] = Sd[GETBYTE(t2, 3)];
	tempBlock[9] = Sd[GETBYTE(t1, 2)];
	tempBlock[10] = Sd[GETBYTE(t0, 1)];
	tempBlock[11] = Sd[GETBYTE(t3, 0)];
	tempBlock[12] = Sd[GETBYTE(t3, 3)];
	tempBlock[13] = Sd[GETBYTE(t2, 2)];
	tempBlock[14] = Sd[GETBYTE(t1, 1)];
	tempBlock[15] = Sd[GETBYTE(t0, 0)];

	// Final key addition; xorBlock, when supplied, is folded in here too.
	if (xbw)
	{
		obw[0] = tbw[0] ^ xbw[0] ^ rk[0];
		obw[1] = tbw[1] ^ xbw[1] ^ rk[1];
		obw[2] = tbw[2] ^ xbw[2] ^ rk[2];
		obw[3] = tbw[3] ^ xbw[3] ^ rk[3];
	}
	else
	{
		obw[0] = tbw[0] ^ rk[0];
		obw[1] = tbw[1] ^ rk[1];
		obw[2] = tbw[2] ^ rk[2];
		obw[3] = tbw[3] ^ rk[3];
	}
}