static uint8_t gfpow(uint8_t a, int b) { uint8_t v = 1; b %= 255; if ( b < 0 ) b += 255; while ( b ) { if ( b & 1 ) v = gfmul(v,a); a = gfmul(a,a); b >>= 1; } return v; }
void cMul(FIELD* vec,ulong size,FIELD coeff) { ulong i; for(i=0;i<size;i++) vec[i]=gfmul(coeff,vec[i]); return; }
void cMulvAdd(FIELD* vec1,FIELD* vec2,ulong size,FIELD coeff) { /*vec1=coeff*vec2+vec1*/ ulong i; for(i=0;i<size;i++) vec1[i]=gfadd(vec1[i],gfmul(coeff,vec2[i])); return; }
void make_tables(void) { int i, j; uint8_t v; uint32_t b, log; /* Compute multiplication table */ for (i = 0; i < 256; i++) for (j = 0; j < 256; j++) raid6_gfmul[i][j] = gfmul(i, j); /* Compute power-of-2 table (exponent) */ v = 1; for (i = 0; i < 256; i++) { raid6_gfexp[i] = v; v = gfmul(v, 2); if (v == 1) v = 0; /* For entry 255, not a real entry */ } /* Compute inverse table x^-1 == x^254 */ for (i = 0; i < 256; i++) raid6_gfinv[i] = gfpow(i, 254); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ for (i = 0; i < 256; i ++) raid6_gfexi[i] = raid6_gfinv[raid6_gfexp[i] ^ 1]; /* Compute log and inverse log */ /* Modified code from: * http://web.eecs.utk.edu/~plank/plank/papers/CS-96-332.html */ b = 1; raid6_gflog[0] = 0; raid6_gfilog[255] = 0; for (log = 0; log < 255; log++) { raid6_gflog[b] = (uint8_t) log; raid6_gfilog[log] = (uint8_t) b; b = b << 1; if (b & 256) b = b ^ 0435; } tables_ready = 1; }
int main(int argc, char *argv[]) { int i, j, k; uint8_t v; uint8_t exptbl[256], invtbl[256]; printf("#include <linux/raid/pq.h>\n"); printf("#include <linux/export.h>\n"); printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfmul[256][256] =\n" "{\n"); for (i = 0; i < 256; i++) { printf("\t{\n"); for (j = 0; j < 256; j += 8) { printf("\t\t"); for (k = 0; k < 8; k++) printf("0x%02x,%c", gfmul(i, j + k), (k == 7) ? '\n' : ' '); } printf("\t},\n"); } printf("};\n"); printf("#ifdef __KERNEL__\n"); printf("EXPORT_SYMBOL(raid6_gfmul);\n"); printf("#endif\n"); v = 1; printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfexp[256] =\n" "{\n"); for (i = 0; i < 256; i += 8) { printf("\t"); for (j = 0; j < 8; j++) { exptbl[i + j] = v; printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); v = gfmul(v, 2); if (v == 1) v = 0; } } printf("};\n"); printf("#ifdef __KERNEL__\n"); printf("EXPORT_SYMBOL(raid6_gfexp);\n"); printf("#endif\n"); printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfinv[256] =\n" "{\n"); for (i = 0; i < 256; i += 8) { printf("\t"); for (j = 0; j < 8; j++) { invtbl[i + j] = v = gfpow(i + j, 254); printf("0x%02x,%c", v, (j == 7) ? '\n' : ' '); } } printf("};\n"); printf("#ifdef __KERNEL__\n"); printf("EXPORT_SYMBOL(raid6_gfinv);\n"); printf("#endif\n"); printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfexi[256] =\n" "{\n"); for (i = 0; i < 256; i += 8) { printf("\t"); for (j = 0; j < 8; j++) printf("0x%02x,%c", invtbl[exptbl[i + j] ^ 1], (j == 7) ? '\n' : ' '); } printf("};\n"); printf("#ifdef __KERNEL__\n"); printf("EXPORT_SYMBOL(raid6_gfexi);\n"); printf("#endif\n"); return 0; }
int AES_GCM_decrypt (const unsigned char *in, unsigned char *out, const unsigned char* addt, const unsigned char* ivec, unsigned char *tag, int nbytes, int abytes, int ibytes, const unsigned char* key, int nr) { int i, j ,k; __m128i hlp1, hlp2, hlp3, hlp4; __m128i tmp1, tmp2, tmp3, tmp4; __m128i H, Y, T; __m128i *KEY = (__m128i*)key; __m128i ctr1, ctr2, ctr3, ctr4; __m128i last_block = _mm_setzero_si128(); __m128i ONE = _mm_set_epi32(0, 1, 0, 0); __m128i FOUR = _mm_set_epi32(0, 4, 0, 0); __m128i BSWAP_EPI64 = _mm_set_epi8(8,9,10,11,12,13,14,15,0,1,2,3,4,5,6,7); __m128i BSWAP_MASK = _mm_set_epi8(0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15); __m128i X = _mm_setzero_si128(); if(ibytes == 96/8){ Y = _mm_loadu_si128((__m128i*)ivec); Y = _mm_insert_epi32(Y, 0x1000000, 3); /*(Compute E[ZERO, KS] and E[Y0, KS] together*/ tmp1 = _mm_xor_si128(X, KEY[0]); tmp2 = _mm_xor_si128(Y, KEY[0]); for(j=1; j < nr-1; j+=2) { tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); }; tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); H = _mm_aesenclast_si128(tmp1, KEY[nr]); T = _mm_aesenclast_si128(tmp2, KEY[nr]); H = _mm_shuffle_epi8(H, BSWAP_MASK); } else{ tmp1 = _mm_xor_si128(X, KEY[0]); for(j=1; j <nr; j++) tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); H = _mm_aesenclast_si128(tmp1, KEY[nr]); H = _mm_shuffle_epi8(H, BSWAP_MASK); Y = _mm_xor_si128(Y, Y); for(i=0; i < ibytes/16; i++){ tmp1 = _mm_loadu_si128(&((__m128i*)ivec)[i]); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); Y = _mm_xor_si128(Y, tmp1); gfmul(Y, H, &Y); } if(ibytes%16){ for(j=0; j < ibytes%16; j++) ((unsigned char*)&last_block)[j] = ivec[i*16+j]; tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); Y = _mm_xor_si128(Y, tmp1); gfmul(Y, H, &Y); } tmp1 = _mm_insert_epi64(tmp1, ibytes*8, 0); tmp1 = _mm_insert_epi64(tmp1, 0, 1); Y = _mm_xor_si128(Y, tmp1); gfmul(Y, H, &Y); Y = _mm_shuffle_epi8(Y, BSWAP_MASK); /*Compute E(K, Y0)*/ tmp1 = _mm_xor_si128(Y, KEY[0]); for(j=1; j < nr; j++) tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); T = _mm_aesenclast_si128(tmp1, KEY[nr]); } for(i=0; i<abytes/16; i++){ tmp1 = _mm_loadu_si128(&((__m128i*)addt)[i]); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X = _mm_xor_si128(X, tmp1); gfmul(X, H, &X); } if(abytes%16){ last_block = _mm_setzero_si128(); for(j=0;j<abytes%16;j++) ((unsigned char*)&last_block)[j] = addt[i*16+j]; tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X =_mm_xor_si128(X, tmp1); gfmul(X, H, &X); } for(i=0; i<nbytes/16; i++){ tmp1 = _mm_loadu_si128(&((__m128i*)in)[i]); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X = _mm_xor_si128(X, tmp1); gfmul(X, H, &X); } if(nbytes%16){ last_block = _mm_setzero_si128(); for(j=0; j<nbytes%16; j++) ((unsigned char*)&last_block)[j] = in[i*16+j]; tmp1 = last_block; tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); X = _mm_xor_si128(X, tmp1); gfmul(X, H, &X); } tmp1 =_mm_insert_epi64(tmp1, nbytes*8, 0); tmp1 =_mm_insert_epi64(tmp1, abytes*8, 1); X = _mm_xor_si128(X, tmp1); gfmul(X, H, &X); X = _mm_shuffle_epi8(X, BSWAP_MASK); T = _mm_xor_si128(X, T); if(0xffff!=_mm_movemask_epi8(_mm_cmpeq_epi8(T, _mm_loadu_si128((__m128i*)tag)))) return 0; //in case the authentication failed ctr1 = _mm_shuffle_epi8(Y, BSWAP_EPI64); ctr1 = _mm_add_epi32(ctr1, ONE); ctr2 = _mm_add_epi32(ctr1, ONE); ctr3 = _mm_add_epi32(ctr2, ONE); ctr4 = _mm_add_epi32(ctr3, ONE); for(i=0; i < nbytes/16/4; i++){ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); tmp2 = _mm_shuffle_epi8(ctr2, BSWAP_EPI64); tmp3 = _mm_shuffle_epi8(ctr3, BSWAP_EPI64); tmp4 = _mm_shuffle_epi8(ctr4, BSWAP_EPI64); ctr1 = _mm_add_epi32(ctr1, FOUR); ctr2 = _mm_add_epi32(ctr2, FOUR); ctr3 = _mm_add_epi32(ctr3, FOUR); ctr4 = _mm_add_epi32(ctr4, FOUR); tmp1 =_mm_xor_si128(tmp1, KEY[0]); tmp2 =_mm_xor_si128(tmp2, KEY[0]); tmp3 =_mm_xor_si128(tmp3, KEY[0]); tmp4 =_mm_xor_si128(tmp4, KEY[0]); for(j=1; j < nr-1; j+=2){ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); tmp2 = _mm_aesenc_si128(tmp2, KEY[j]); tmp3 = _mm_aesenc_si128(tmp3, KEY[j]); tmp4 = _mm_aesenc_si128(tmp4, KEY[j]); tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); tmp2 = _mm_aesenc_si128(tmp2, KEY[j+1]); tmp3 = _mm_aesenc_si128(tmp3, KEY[j+1]); tmp4 = _mm_aesenc_si128(tmp4, KEY[j+1]); } tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); tmp2 = _mm_aesenc_si128(tmp2, KEY[nr-1]); tmp3 = _mm_aesenc_si128(tmp3, KEY[nr-1]); tmp4 = _mm_aesenc_si128(tmp4, KEY[nr-1]); tmp1 =_mm_aesenclast_si128(tmp1, KEY[nr]); tmp2 =_mm_aesenclast_si128(tmp2, KEY[nr]); tmp3 =_mm_aesenclast_si128(tmp3, KEY[nr]); tmp4 =_mm_aesenclast_si128(tmp4, KEY[nr]); tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[i*4+0])); tmp2 = _mm_xor_si128(tmp2, _mm_loadu_si128(&((__m128i*)in)[i*4+1])); tmp3 = _mm_xor_si128(tmp3, _mm_loadu_si128(&((__m128i*)in)[i*4+2])); tmp4 = _mm_xor_si128(tmp4, _mm_loadu_si128(&((__m128i*)in)[i*4+3])); _mm_storeu_si128(&((__m128i*)out)[i*4+0], tmp1); _mm_storeu_si128(&((__m128i*)out)[i*4+1], tmp2); _mm_storeu_si128(&((__m128i*)out)[i*4+2], tmp3); _mm_storeu_si128(&((__m128i*)out)[i*4+3], tmp4); tmp1 = _mm_shuffle_epi8(tmp1, BSWAP_MASK); tmp2 = _mm_shuffle_epi8(tmp2, BSWAP_MASK); tmp3 = _mm_shuffle_epi8(tmp3, BSWAP_MASK); tmp4 = _mm_shuffle_epi8(tmp4, BSWAP_MASK); } for(k = i*4; k < nbytes/16; k++){ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); ctr1 = _mm_add_epi32(ctr1, ONE); tmp1 = _mm_xor_si128(tmp1, KEY[0]); for(j=1; j<nr-1; j+=2){ tmp1 = _mm_aesenc_si128(tmp1, KEY[j]); tmp1 = _mm_aesenc_si128(tmp1, KEY[j+1]); } tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); _mm_storeu_si128(&((__m128i*)out)[k], tmp1); } //If one partial block remains if(nbytes%16){ tmp1 = _mm_shuffle_epi8(ctr1, BSWAP_EPI64); tmp1 = _mm_xor_si128(tmp1, KEY[0]); for(j=1; j<nr-1; j+=2){ tmp1 =_mm_aesenc_si128(tmp1, KEY[j]); tmp1 =_mm_aesenc_si128(tmp1, KEY[j+1]); } tmp1 = _mm_aesenc_si128(tmp1, KEY[nr-1]); tmp1 = _mm_aesenclast_si128(tmp1, KEY[nr]); tmp1 = _mm_xor_si128(tmp1, _mm_loadu_si128(&((__m128i*)in)[k])); last_block = tmp1; for(j=0; j<nbytes%16; j++) out[k*16+j]=((unsigned char*)&last_block)[j]; } return 1; //when sucessfull returns 1 }
int main(int argc, char *argv[]) { int i, j, k; uint8_t v; uint8_t exptbl[256], invtbl[256]; printf("#include \"raid6.h\"\n"); /* Compute multiplication table */ printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfmul[256][256] =\n" "{\n"); for ( i = 0 ; i < 256 ; i++ ) { printf("\t{\n"); for ( j = 0 ; j < 256 ; j += 8 ) { printf("\t\t"); for ( k = 0 ; k < 8 ; k++ ) { printf("0x%02x, ", gfmul(i,j+k)); } printf("\n"); } printf("\t},\n"); } printf("};\n"); /* Compute power-of-2 table (exponent) */ v = 1; printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfexp[256] =\n" "{\n"); for ( i = 0 ; i < 256 ; i += 8 ) { printf("\t"); for ( j = 0 ; j < 8 ; j++ ) { exptbl[i+j] = v; printf("0x%02x, ", v); v = gfmul(v,2); if ( v == 1 ) v = 0; /* For entry 255, not a real entry */ } printf("\n"); } printf("};\n"); /* Compute inverse table x^-1 == x^254 */ printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfinv[256] =\n" "{\n"); for ( i = 0 ; i < 256 ; i += 8 ) { printf("\t"); for ( j = 0 ; j < 8 ; j++ ) { invtbl[i+j] = v = gfpow(i+j,254); printf("0x%02x, ", v); } printf("\n"); } printf("};\n"); /* Compute inv(2^x + 1) (exponent-xor-inverse) table */ printf("\nconst u8 __attribute__((aligned(256)))\n" "raid6_gfexi[256] =\n" "{\n"); for ( i = 0 ; i < 256 ; i += 8 ) { printf("\t"); for ( j = 0 ; j < 8 ; j++ ) { printf("0x%02x, ", invtbl[exptbl[i+j]^1]); } printf("\n"); } printf("};\n\n"); return 0; }