/*
 * Encrypt a stream of 16-byte blocks, eight blocks per step, using groups of
 * eight key schedules.
 *
 * PT         input; read sequentially, 128 bytes (eight 16-byte blocks) per step.
 * CT         output; written sequentially, 128 bytes per step.
 * n_aesiters number of 128-byte steps performed for each group of eight keys.
 * nkeys      number of key schedules; walked in steps of 8, so the last group
 *            is processed in full even when nkeys is not a multiple of 8.
 * ks         array of key schedules; the group pointer advances by 8 entries
 *            after every n_aesiters steps.
 *
 * PT and CT are never rewound: step j of key group g uses the 128 bytes at
 * offset (g * n_aesiters + j) * 128 of both buffers. Within a step, block k
 * is XORed with the k-th of keyA..keyH before the round macros run.
 *
 * READ_KEYS, ENC_round and ENC_round_last are macros defined outside this
 * function. READ_KEYS(0) is expected to fill keyA..keyH (nothing else here
 * assigns them); the ENC_round(1..9) / ENC_round_last(10) sequence presumably
 * performs the ten AES-128 rounds on block1..block8 using keyptr -- confirm
 * against the macro definitions.
 */
void intrin_sequential_enc8(const unsigned char* PT, unsigned char* CT, int n_aesiters, int nkeys, ROUND_KEYS* ks){

	/* First key schedule of the group currently being processed. */
	ROUND_KEYS *keyptr=(ROUND_KEYS *)ks;
	/* keyA..keyH receive the values loaded by READ_KEYS. The remaining
	 * temporaries are not referenced directly in this function; they are kept
	 * because the macros may refer to them by name -- TODO confirm and prune. */
	register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
	int i, j;

	for (i=0;i<nkeys;i+=8){

		for(j=0;j<n_aesiters; j++) {
			/* Unaligned loads: PT carries no alignment requirement here. */
			register __m128i block1 = _mm_loadu_si128((__m128i const*)(0*16+PT));
			register __m128i block2 = _mm_loadu_si128((__m128i const*)(1*16+PT));
			register __m128i block3 = _mm_loadu_si128((__m128i const*)(2*16+PT));
			register __m128i block4 = _mm_loadu_si128((__m128i const*)(3*16+PT));
			register __m128i block5 = _mm_loadu_si128((__m128i const*)(4*16+PT));
			register __m128i block6 = _mm_loadu_si128((__m128i const*)(5*16+PT));
			register __m128i block7 = _mm_loadu_si128((__m128i const*)(6*16+PT));
			register __m128i block8 = _mm_loadu_si128((__m128i const*)(7*16+PT));

			READ_KEYS(0)

			/* Initial XOR of each block with its own lane's key value. */
			block1 = _mm_xor_si128(keyA, block1);
			block2 = _mm_xor_si128(keyB, block2);
			block3 = _mm_xor_si128(keyC, block3);
			block4 = _mm_xor_si128(keyD, block4);
			block5 = _mm_xor_si128(keyE, block5);
			block6 = _mm_xor_si128(keyF, block6);
			block7 = _mm_xor_si128(keyG, block7);
			block8 = _mm_xor_si128(keyH, block8);

			ENC_round(1)
			ENC_round(2)
			ENC_round(3)
			ENC_round(4)
			ENC_round(5)
			ENC_round(6)
			ENC_round(7)
			ENC_round(8)
			ENC_round(9)
			ENC_round_last(10)

			_mm_storeu_si128((__m128i *)(CT+0*16), block1);
			_mm_storeu_si128((__m128i *)(CT+1*16), block2);
			_mm_storeu_si128((__m128i *)(CT+2*16), block3);
			_mm_storeu_si128((__m128i *)(CT+3*16), block4);
			_mm_storeu_si128((__m128i *)(CT+4*16), block5);
			_mm_storeu_si128((__m128i *)(CT+5*16), block6);
			_mm_storeu_si128((__m128i *)(CT+6*16), block7);
			_mm_storeu_si128((__m128i *)(CT+7*16), block8);

			/* Move on to the next eight input/output blocks. */
			PT+=128;
			CT+=128;

		}
		/* Next group of eight key schedules. */
		keyptr+=8;
	}
}
/*
 * Generate output blocks from a running counter, processing eight key
 * schedules at a time on the same counter block.
 *
 * ctr_buf    16-byte scratch block. Its first sizeof(unsigned long long)
 *            bytes are overwritten with the counter; the remaining bytes are
 *            never written here and are used as supplied by the caller.
 *            The buffer is left modified on return.
 * ctr        starting counter value. The counter is incremented BEFORE each
 *            block is processed, so step j (0-based) uses ctr + j + 1. It is
 *            reset to ctr for every group of eight keys.
 * CT         output. The key with index k owns the contiguous region
 *            CT[k * n_aesiters * 16 .. (k + 1) * n_aesiters * 16), and step j
 *            writes the 16 bytes at offset j * 16 inside that region.
 * n_aesiters number of counter steps (16-byte outputs) per key.
 * nkeys      number of key schedules; walked in steps of 8, so the last group
 *            is processed in full even when nkeys is not a multiple of 8.
 * ks         array of key schedules; the group pointer advances by 8 entries
 *            per outer iteration.
 *
 * READ_KEYS, ENC_round and ENC_round_last are macros defined outside this
 * function. READ_KEYS(0) is expected to fill keyA..keyH (nothing else here
 * assigns them); the round macros presumably transform block1..block8 using
 * keyptr -- confirm against the macro definitions.
 */
void intrin_sequential_gen_rnd8(unsigned char* ctr_buf, const unsigned long long ctr, unsigned char* CT,
		int n_aesiters, int nkeys, ROUND_KEYS* ks){

	/* First key schedule of the group currently being processed. */
	ROUND_KEYS *keyptr=(ROUND_KEYS *)ks;
    /* con, mask, x2, *_aux and globAux are not referenced directly below;
     * presumably they exist for the macros -- TODO confirm. */
    register __m128i keyA, keyB, keyC, keyD, keyE, keyF, keyG, keyH, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux;
    unsigned char *ctptr;
	int i, j, ctoffset;
	/* NOTE(review): this views the start of ctr_buf as an unsigned long long.
	 * It assumes ctr_buf is suitably aligned and that the access is acceptable
	 * under the aliasing rules in force -- confirm, or copy the counter with
	 * memcpy instead. */
	unsigned long long* tmpctr = (unsigned long long*) ctr_buf;

	/* Size in bytes of one key's output region. */
	/* NOTE(review): computed in int; presumably n_aesiters * 16 and
	 * i * ctoffset stay within int range for all callers -- verify. */
	ctoffset = n_aesiters * 16;

	register __m128i inblock, block1, block2, block3, block4, block5, block6, block7, block8;

	for (i=0;i<nkeys;i+=8){
		/* Start of the output region belonging to key i. */
		ctptr=CT + i*ctoffset;
		/* Every key group restarts from the same counter value. */
		(*tmpctr) = ctr;
		for(j=0;j<n_aesiters; j++) {
			/* Pre-increment: the first block processed carries ctr + 1. */
			(*tmpctr)++;
			inblock = _mm_loadu_si128((__m128i const*)(ctr_buf));

			READ_KEYS(0)

			/* The same counter block is combined with each lane's key value. */
			block1 = _mm_xor_si128(keyA, inblock);
			block2 = _mm_xor_si128(keyB, inblock);
			block3 = _mm_xor_si128(keyC, inblock);
			block4 = _mm_xor_si128(keyD, inblock);
			block5 = _mm_xor_si128(keyE, inblock);
			block6 = _mm_xor_si128(keyF, inblock);
			block7 = _mm_xor_si128(keyG, inblock);
			block8 = _mm_xor_si128(keyH, inblock);

			ENC_round(1)
			ENC_round(2)
			ENC_round(3)
			ENC_round(4)
			ENC_round(5)
			ENC_round(6)
			ENC_round(7)
			ENC_round(8)
			ENC_round(9)
			ENC_round_last(10)

			/* Lane k's result goes into key (i + k)'s region, ctoffset bytes apart. */
			_mm_storeu_si128((__m128i *)(ctptr+0*ctoffset), block1);
			_mm_storeu_si128((__m128i *)(ctptr+1*ctoffset), block2);
			_mm_storeu_si128((__m128i *)(ctptr+2*ctoffset), block3);
			_mm_storeu_si128((__m128i *)(ctptr+3*ctoffset), block4);
			_mm_storeu_si128((__m128i *)(ctptr+4*ctoffset), block5);
			_mm_storeu_si128((__m128i *)(ctptr+5*ctoffset), block6);
			_mm_storeu_si128((__m128i *)(ctptr+6*ctoffset), block7);
			_mm_storeu_si128((__m128i *)(ctptr+7*ctoffset), block8);

			/* Next 16-byte slot inside every region. */
			ctptr+=16;
		}
		/* Next group of eight key schedules. */
		keyptr+=8;
	}
}
/*
 * Two-phase routine working on pairs of keys: first build a key schedule for
 * every key, then process one 16-byte block per key with its schedule.
 *
 * PT          input blocks, 16 bytes per key, read sequentially.
 * CT          output blocks, 16 bytes per key, written sequentially.
 * test_length number of keys/blocks. Both loops step by 2, so an odd value
 *             still processes a full pair: every buffer must hold an even
 *             number of entries.
 * KEYS        storage for the key schedules, accessed as a ROUND_KEYS array.
 * first_key   raw 16-byte keys, consumed 32 bytes (two keys) per step.
 * TEMP_BUF    not used by this function.
 *
 * Phase 1 stores each raw key at the start of its schedule's KEY field, sets
 * nr to 10, and runs KS_round(1..9) and KS_round_last(10). Phase 2 rewinds to
 * the first schedule and, per pair, XORs each block with the value
 * READ_KEYS(0) places in keyA/keyB, then runs ENC_round(1..9) and
 * ENC_round_last(10).
 *
 * The KS_ and ENC_ macros and READ_KEYS are defined outside this function;
 * they presumably operate on keys, keyA/keyB, block1/block2, con, mask, con3
 * and the auxiliary registers by name -- confirm against their definitions.
 */
void intrin_sequential_ks2_enc2(const unsigned char* PT, unsigned char* CT, int test_length, unsigned char* KEYS, unsigned char* first_key, unsigned char* TEMP_BUF){

	ROUND_KEYS *keys=(ROUND_KEYS *)KEYS;
	register __m128i keyA, keyB, con, mask, x2, keyA_aux, keyB_aux, globAux;
	int i;
	/* Constant tables loaded into 128-bit registers below. They are unsigned
	 * because 0xffffffff does not fit in int: the conversion is
	 * implementation-defined in C and a narrowing error in C++11 brace
	 * initialisation. The bit patterns loaded are unchanged. */
	static const unsigned int _con1[4]={1,1,1,1};
	static const unsigned int _con2[4]={0x1b,0x1b,0x1b,0x1b};
	static const unsigned int _mask[4]={0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d};
	static const unsigned int _con3[4]={0xffffffffu, 0xffffffffu, 0x07060504, 0x07060504};
	/* Not referenced directly below; presumably consumed by the KS macros. */
	__m128i con3=_mm_loadu_si128((__m128i const*)_con3);

	/* Kept only for interface compatibility. */
	(void)TEMP_BUF;

	/* Phase 1: key schedules, two keys per iteration. */
	for (i=0;i<test_length;i+=2){
		keys[0].nr=10;
		keys[1].nr=10;

		keyA = _mm_loadu_si128((__m128i const*)(first_key));
		keyB = _mm_loadu_si128((__m128i const*)(first_key+16));

		/* The raw key is the first entry of each schedule. */
		_mm_storeu_si128((__m128i *)keys[0].KEY, keyA);
		_mm_storeu_si128((__m128i *)keys[1].KEY, keyB);

		/* con and mask are reloaded for every pair; presumably KS_round
		 * modifies con between rounds -- confirm against the macro. */
		con = _mm_loadu_si128((__m128i const*)_con1);
		mask = _mm_loadu_si128((__m128i const*)_mask);

		KS_round(1)
		KS_round(2)
		KS_round(3)
		KS_round(4)
		KS_round(5)
		KS_round(6)
		KS_round(7)
		KS_round(8)

		/* The last two rounds start from a different constant. */
		con = _mm_loadu_si128((__m128i const*)_con2);

		KS_round(9)
		KS_round_last(10)

		keys+=2;
		first_key+=32;
	}

	/* Phase 2 starts again from the first schedule. */
	keys=(ROUND_KEYS *)KEYS;

	for (i=0;i<test_length;i+=2){
		register __m128i block1 = _mm_loadu_si128((__m128i const*)(0*16+PT));
		register __m128i block2 = _mm_loadu_si128((__m128i const*)(1*16+PT));

		READ_KEYS(0)

		/* Initial XOR of each block with its own key value. */
		block1 = _mm_xor_si128(keyA, block1);
		block2 = _mm_xor_si128(keyB, block2);

		ENC_round(1)
		ENC_round(2)
		ENC_round(3)
		ENC_round(4)
		ENC_round(5)
		ENC_round(6)
		ENC_round(7)
		ENC_round(8)
		ENC_round(9)
		ENC_round_last(10)

		_mm_storeu_si128((__m128i *)(CT+0*16), block1);
		_mm_storeu_si128((__m128i *)(CT+1*16), block2);

		PT+=32;
		CT+=32;

		keys+=2;

	}
}