//generates nkeys round keys from the bytes stored in key_bytes void intrin_sequential_ks4(ROUND_KEYS* ks, unsigned char* key_bytes, int nkeys) { ROUND_KEYS *keyptr=(ROUND_KEYS *)ks; register __m128i keyA, keyB, keyC, keyD, con, mask, x2, keyA_aux, keyB_aux, keyC_aux, keyD_aux, globAux; int i; int _con1[4]={1,1,1,1}; int _con2[4]={0x1b,0x1b,0x1b,0x1b}; int _mask[4]={0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d}; int _con3[4]={0x0ffffffff, 0x0ffffffff, 0x07060504, 0x07060504}; __m128i con3=_mm_loadu_si128((__m128i const*)_con3); for (i=0;i<nkeys;i+=4){ keyptr[0].nr=10; keyptr[1].nr=10; keyptr[2].nr=10; keyptr[3].nr=10; keyA = _mm_loadu_si128((__m128i const*)(key_bytes)); keyB = _mm_loadu_si128((__m128i const*)(key_bytes+16)); keyC = _mm_loadu_si128((__m128i const*)(key_bytes+32)); keyD = _mm_loadu_si128((__m128i const*)(key_bytes+48)); _mm_storeu_si128((__m128i *)keyptr[0].KEY, keyA); _mm_storeu_si128((__m128i *)keyptr[1].KEY, keyB); _mm_storeu_si128((__m128i *)keyptr[2].KEY, keyC); _mm_storeu_si128((__m128i *)keyptr[3].KEY, keyD); con = _mm_loadu_si128((__m128i const*)_con1); mask = _mm_loadu_si128((__m128i const*)_mask); KS_round(1) KS_round(2) KS_round(3) KS_round(4) KS_round(5) KS_round(6) KS_round(7) KS_round(8) con = _mm_loadu_si128((__m128i const*)_con2); KS_round(9) KS_round_last(10) keyptr+=4; key_bytes+=64; } }
void intrin_sequential_ks2_enc2(const unsigned char* PT, unsigned char* CT, int test_length, unsigned char* KEYS, unsigned char* first_key, unsigned char* TEMP_BUF){ ROUND_KEYS *keys=(ROUND_KEYS *)KEYS; register __m128i keyA, keyB, con, mask, x2, keyA_aux, keyB_aux, globAux; int i; int _con1[4]={1,1,1,1}; int _con2[4]={0x1b,0x1b,0x1b,0x1b}; int _mask[4]={0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d,0x0c0f0e0d}; int _con3[4]={0x0ffffffff, 0x0ffffffff, 0x07060504, 0x07060504}; __m128i con3=_mm_loadu_si128((__m128i const*)_con3); for (i=0;i<test_length;i+=2){ keys[0].nr=10; keys[1].nr=10; keyA = _mm_loadu_si128((__m128i const*)(first_key)); keyB = _mm_loadu_si128((__m128i const*)(first_key+16)); _mm_storeu_si128((__m128i *)keys[0].KEY, keyA); _mm_storeu_si128((__m128i *)keys[1].KEY, keyB); con = _mm_loadu_si128((__m128i const*)_con1); mask = _mm_loadu_si128((__m128i const*)_mask); KS_round(1) KS_round(2) KS_round(3) KS_round(4) KS_round(5) KS_round(6) KS_round(7) KS_round(8) con = _mm_loadu_si128((__m128i const*)_con2); KS_round(9) KS_round_last(10) keys+=2; first_key+=32; } keys=(ROUND_KEYS *)KEYS; for (i=0;i<test_length;i+=2){ register __m128i block1 = _mm_loadu_si128((__m128i const*)(0*16+PT)); register __m128i block2 = _mm_loadu_si128((__m128i const*)(1*16+PT)); READ_KEYS(0) block1 = _mm_xor_si128(keyA, block1); block2 = _mm_xor_si128(keyB, block2); ENC_round(1) ENC_round(2) ENC_round(3) ENC_round(4) ENC_round(5) ENC_round(6) ENC_round(7) ENC_round(8) ENC_round(9) ENC_round_last(10) _mm_storeu_si128((__m128i *)(CT+0*16), block1); _mm_storeu_si128((__m128i *)(CT+1*16), block2); PT+=32; CT+=32; keys+=2; } }