// AES MixColumns step (FIPS-197 5.1.3): each 4-byte state column is
// multiplied by the fixed circulant matrix circ({02},{03},{01},{01})
// over GF(2^8).  mul2/mul3 perform the xtime-based field multiplies.
inline void mixColumns(ByteArray *s_)
{
    uint8_t *state = s_->bytes();
    for (int col = 0; col < Nb; ++col) {
        uint8_t *p = state + col * Nb;
        const uint8_t a0 = p[0];
        const uint8_t a1 = p[1];
        const uint8_t a2 = p[2];
        const uint8_t a3 = p[3];
        p[0] = mul2(a0) ^ mul3(a1) ^ a2 ^ a3;
        p[1] = a0 ^ mul2(a1) ^ mul3(a2) ^ a3;
        p[2] = a0 ^ a1 ^ mul2(a2) ^ mul3(a3);
        p[3] = mul3(a0) ^ a1 ^ a2 ^ mul2(a3);
    }
}
/* MixBytes reversibly mixes the bytes within a column.
 * Each column of the state is multiplied by the circulant MDS matrix
 * circ(2,2,3,4,5,3,5,7) over GF(2^8); mulN are the field multiplies.
 * The matrix is invertible, so the transform is reversible. */
void MixBytes(u8 x[ROWS][COLS1024], int columns)
{
    u8 mixed[ROWS];
    int col, row, r;

    for (col = 0; col < columns; col++) {
        /* dot product of each matrix row with the current column */
        for (row = 0; row < ROWS; row++) {
            u8 acc;
            acc  = mul2(x[(row + 0) % ROWS][col]);
            acc ^= mul2(x[(row + 1) % ROWS][col]);
            acc ^= mul3(x[(row + 2) % ROWS][col]);
            acc ^= mul4(x[(row + 3) % ROWS][col]);
            acc ^= mul5(x[(row + 4) % ROWS][col]);
            acc ^= mul3(x[(row + 5) % ROWS][col]);
            acc ^= mul5(x[(row + 6) % ROWS][col]);
            acc ^= mul7(x[(row + 7) % ROWS][col]);
            mixed[row] = acc;
        }
        /* write the mixed column back into the state */
        for (r = 0; r < ROWS; r++) {
            x[r][col] = mixed[r];
        }
    }
}
/* AFunc : OTR Core Authentication Function (ADP=p)
 *
 * Computes the header (associated-data) authentication value TA:
 * each header block is masked, AES-encrypted, and XOR-summed into ASum;
 * the final sum is masked once more and encrypted to give TA.
 *
 * header : associated data
 * h_len  : its length in bytes
 * returns: TA, the 128-bit authentication contribution of the header
 *
 * NOTE(review): if h_len == 0, then rest_len == 0 and `last` is forced
 * to BLOCK, so `m = (rest_len - last) / BLOCK` underflows on uint32 and
 * the single-block loop runs wild.  The caller presumably guarantees
 * h_len > 0 (or handles the empty-header case elsewhere) -- confirm.
 */
__m128i AFunc( const uint8 *header, uint32 h_len)
{
	uint32 i;
	uint32 m, last;
	block tmp[PIPE], mask[PIPE + 1], ASum = _mm_setzero_si128();
	uint32 rest_len = h_len;
	const __m128i *hdp = (__m128i*)header;

	/* initial mask from the global constant Q (set up elsewhere) */
	mask[0] = _mm_load_si128(&Q);

	/* Bulk phase: PIPE blocks per iteration.  Strict '>' keeps at least
	 * one block for the tail handling below. */
	while (rest_len > (BLOCK*PIPE)){
		/* derive mask[1..PIPE] from mask[0] (chained doubling --
		 * presumably GF(2^128) mul by 2; see mul2_PIPE) */
		mul2_PIPE(mask);
		for (i = 0; i < PIPE; i++){
			tmp[i] = _mm_xor_si128(mask[i], hdp[i]);
		}
		/* pipelined ECB encryption of all PIPE masked blocks in place */
		AES_ecb_encrypt_PIPE(tmp, encrypt_key);
		for (i = 0; i < PIPE; i++){
			ASum = _mm_xor_si128(ASum, tmp[i]);
		}
		rest_len -= (BLOCK*PIPE);
		hdp += PIPE;
		/* carry the last derived mask into the next iteration */
		mask[0] = _mm_load_si128(&mask[PIPE]);
	}

	/* Tail: header = m full blocks + `last` bytes (1..BLOCK). */
	last = rest_len % BLOCK;
	if (last == 0) last = BLOCK;
	m = (rest_len - last) / BLOCK; //header = m blocks + last bytes
	for (i = 0; i < m; i++){
		tmp[0] = _mm_xor_si128(mask[0], hdp[i]);
		AES_encrypt(tmp[0], &tmp[0], encrypt_key);
		ASum = _mm_xor_si128(ASum, tmp[0]);
		/* advance the mask by one doubling per block */
		mul2(mask[0], &mask[0]);
	}
	hdp += m;

	/* last block: one-zero padded (ozp) before being summed */
	ozp(last, (uint8*)&hdp[0], &tmp[0]);
	ASum = _mm_xor_si128(ASum, tmp[0]);
	/* final mask tweak distinguishes partial vs. full last block */
	if (last != BLOCK){
		mul3(mask[0], &mask[0]);
	}
	else{
		mul3twice(mask[0], &mask[0]);
	}
	ASum = _mm_xor_si128(ASum, mask[0]);
	AES_encrypt(ASum, &ASum, encrypt_key);
	return ASum; //TA
}
void test_mixer_with_sines() { Sine vox1(431); // create 4 scaled sine waves MulOp mul1(vox1, 0.3); Sine vox2(540); MulOp mul2(vox2, 0.1); Sine vox3(890); MulOp mul3(vox3, 0.3); Sine vox4(1280); MulOp mul4(vox4, 0.01); Mixer mix(2); // create a stereo mixer mix.add_input(mul1); // add them to the mixer mix.add_input(mul2); mix.add_input(mul3); mix.add_input(mul4); logMsg("playing mix of 4 sines..."); run_test(mix); logMsg("mix done."); }
// Builds a per-pixel "comb" (interlacing-artifact) mask from a 5-row
// vertical window (rows a,b,c,d,e = y-2..y+2), SIMD-vectorized.
// A pixel is flagged when (1) the centre row differs from BOTH vertical
// neighbours in the same direction by more than cthresh, and (2) the
// second-derivative check |(a + e + 4c) - 3(b + d)| exceeds 6*cthresh.
// V is the vector register type (e.g. __m128i/__m256i -- the helpers
// load_half/sub_i16/etc. are defined elsewhere for each V; assumed to
// widen 8-bit pixels to 16-bit lanes -- confirm against their defs).
static void __stdcall comb_mask_0_simd(uint8_t* dstp, const uint8_t* srcp, const int dpitch, const int spitch, const int cthresh, const int width, const int height) noexcept
{
    const uint8_t* sc = srcp;          // centre row (y)
    const uint8_t* sb = sc + spitch;   // y-1, mirrored to y+1 at the top edge
    const uint8_t* sa = sb + spitch;   // y-2, mirrored to y+2 at the top edge
    const uint8_t* sd = sc + spitch;   // y+1 (same as sb on purpose: border mirror)
    const uint8_t* se = sd + spitch;   // y+2
    int16_t cth16 = static_cast<int16_t>(cthresh);
    const V cthp = set1_i16<V>(cth16);       // +cthresh
    const V cthn = set1_i16<V>(-cth16);      // -cthresh
    const V cth6 = set1_i16<V>(cth16 * 6);   // threshold for 2nd-derivative test
    constexpr int step = sizeof(V) / 2;      // pixels per vector (16-bit lanes)
    for (int y = 0; y < height; ++y) {
        for (int x = 0; x < width; x += step) {
            V xc = load_half<V>(sc + x);
            V xb = load_half<V>(sb + x);
            V xd = load_half<V>(sd + x);
            V d1 = sub_i16(xc, xb);   // c - b
            V d2 = sub_i16(xc, xd);   // c - d
            // both diffs above +cthresh, or both below -cthresh
            V mask0 = or_reg(
                and_reg(cmpgt_i16(d1, cthp), cmpgt_i16(d2, cthp)),
                and_reg(cmpgt_i16(cthn, d1), cmpgt_i16(cthn, d2)));
            d2 = mul3(add_i16(xb, xd));                              // 3*(b + d)
            d1 = add_i16(load_half<V>(sa + x), load_half<V>(se + x)); // a + e
            d1 = add_i16(d1, lshift_i16(xc, 2));                     // + 4*c
            mask0 = and_reg(mask0, cmpgt_i16(absdiff_i16(d1, d2), cth6));
            store_half(dstp + x, mask0);
        }
        // slide the 5-row window down one line; the last term mirrors
        // row y+2 back inside the frame at the bottom border
        sa = sb; sb = sc; sc = sd; sd = se;
        se += (y < height - 3) ? spitch : -spitch;
        dstp += dpitch;
    }
}
/* brute-force calculation of the 3-D DFT (reference implementation,
 * O((n0*n1*n2)^2) -- intended for checking FFT results, not speed).
 * Sign convention matches FFTW's forward transform:
 *   out[k] = sum_j in[j] * e^{-2*pi*i*(j0*k0/n0 + j1*k1/n1 + j2*k2/n2)}
 * http://www.fftw.org/doc/The-1d-Discrete-Fourier-Transform-_0028DFT_0029.html
 *
 * in/out are row-major n0 x n1 x n2 arrays; out is overwritten.
 * Fix vs. original: the three calloc results are now checked before use
 * (previously a failed allocation was dereferenced).  On allocation
 * failure the function frees what it got and returns with out untouched.
 */
void ft3d(int n0, int n1, int n2, fftw_complex *in, fftw_complex *out)
{
	int j0, j1, j2, j, k0, k1, k2, k;
	double (*cs0)[2], (*cs1)[2], (*cs2)[2], cs[2];

	/* per-dimension twiddle tables: csN[j] = e^{+2*pi*i*j/nN} */
	cs0 = calloc(n0, sizeof(*cs0));
	cs1 = calloc(n1, sizeof(*cs1));
	cs2 = calloc(n2, sizeof(*cs2));
	if (cs0 == NULL || cs1 == NULL || cs2 == NULL)
		goto out;	/* allocation failed; free(NULL) below is a no-op */
	for (j = 0; j < n0; j++) {
		cs0[j][0] = cos(2*M_PI*j/n0);
		cs0[j][1] = sin(2*M_PI*j/n0);
	}
	for (j = 0; j < n1; j++) {
		cs1[j][0] = cos(2*M_PI*j/n1);
		cs1[j][1] = sin(2*M_PI*j/n1);
	}
	for (j = 0; j < n2; j++) {
		cs2[j][0] = cos(2*M_PI*j/n2);
		cs2[j][1] = sin(2*M_PI*j/n2);
	}
	for (k0 = 0; k0 < n0; k0++)
	for (k1 = 0; k1 < n1; k1++)
	for (k2 = 0; k2 < n2; k2++) {
		k = k0 * n1 * n2 + k1 * n2 + k2;
		out[k][0] = out[k][1] = 0;
		for (j0 = 0; j0 < n0; j0++)
		for (j1 = 0; j1 < n1; j1++)
		for (j2 = 0; j2 < n2; j2++) {
			j = j0 * n1 * n2 + j1 * n2 + j2;
			/* mul3 (defined elsewhere) is assumed to set
			 * cs = cs0*cs1*cs2 as a complex product -- it yields
			 * e^{+i theta} for the combined phase theta */
			mul3(cs, cs0[j0 * k0 % n0], cs1[j1 * k1 % n1], cs2[j2 * k2 % n2]);
			/* accumulate in[j] * conj(cs) = in[j] * e^{-i theta} */
			out[k][0] += in[j][0]*cs[0] + in[j][1]*cs[1];
			out[k][1] += in[j][1]*cs[0] - in[j][0]*cs[1];
		}
	}
out:
	free(cs0);
	free(cs1);
	free(cs2);
}
/* DFunc : OTR Core Decryption Function, with nonce encryption
 *
 * Decrypts `ciphertext` (ci_len bytes) into `plaintext` and returns TE,
 * the 128-bit checksum-based contribution to the authentication tag.
 * The message is processed as 2-block (DBLOCK-byte) chunks, each run
 * through a 2-round AES Feistel network keyed by the chained masks Ln[].
 * The bulk loop handles PIPE chunks per iteration with pipelined ECB
 * encryption; a serial loop and a last-chunk special case finish up.
 *
 * nonce/nonce_len : nonce, right-aligned into one block before encryption
 * TA (ADP==Seri)  : header authentication value folded into the mask chain
 * t_len           : tag length, encoded into the formatted nonce block
 *
 * Mask notes: mul2/mul3/mul7/mul3twice are presumably doublings /
 * small-constant multiplies in GF(2^128) (defined elsewhere) -- the exact
 * tweak schedule follows the OTR specification; confirm against it.
 * Assumes nonce_len < BLOCK so the 0x01 separator bit fits -- confirm.
 */
__m128i DFunc( const uint8 *nonce, uint32 nonce_len,
#if(ADP==Seri)
	const __m128i TA,
#endif
	const uint8 *ciphertext, uint32 ci_len, uint32 t_len, uint8 *plaintext)
{
	uint32 i;
	uint32 ell = 0; //number of 2BLOCK-byte chunks, excl. last one
	uint32 last = 0; //number of bytes in the last chunks
	block Sum = _mm_setzero_si128();  /* running plaintext checksum */
	block txt[PIPE], Ln[PIPE + 1];    /* work blocks and mask chain */
	uint32 rest_len = ci_len;
	__m128i *ptp = (__m128i*)plaintext;
	const __m128i *ctp = (__m128i*)ciphertext;
	ALIGN(16)uint8 tmp[BLOCK] = { 0 };
	block *La;

	/* Encryption of nonce: format block = tag-length nibble, zero pad,
	 * 0x01 separator, then the nonce right-aligned; encrypt to get L. */
	memcpy(&tmp[BLOCK - nonce_len], nonce, nonce_len);
	tmp[0] = (uint8)((t_len%BLOCK) << 4);
	tmp[BLOCK - nonce_len - 1] |= 0x01;
	Ln[0] = _mm_load_si128((__m128i*)tmp);
	AES_encrypt(Ln[0], &Ln[0], encrypt_key);
#if (ADP==Seri)
	/* serial AD processing: fold TA into the initial mask */
	Ln[0] = _mm_xor_si128(Ln[0], TA);
	mul2(Ln[0], &Ln[0]);
#endif

	/* Bulk phase: PIPE Feistel chunks (2*PIPE blocks) per iteration.
	 * ctp[2m]/ctp[2m+1] are the two halves of chunk m. */
	while (rest_len > (DBLOCK*PIPE)){
		/* first round: encrypt (Ln[m] ^ Ln[m+1] ^ C[2m]) for each chunk */
		mul2_PIPE(Ln);  /* derive Ln[1..PIPE] from Ln[0] by chained doubling */
		txt[0] = _mm_xor_si128(Ln[0], ctp[0]);
		txt[0] = _mm_xor_si128(Ln[1], txt[0]);
		txt[1] = _mm_xor_si128(Ln[1], ctp[2]);
		txt[1] = _mm_xor_si128(Ln[2], txt[1]);
		txt[2] = _mm_xor_si128(Ln[2], ctp[4]);
		txt[2] = _mm_xor_si128(Ln[3], txt[2]);
		txt[3] = _mm_xor_si128(Ln[3], ctp[6]);
		txt[3] = _mm_xor_si128(Ln[4], txt[3]);
#if (PIPE>=5)
		txt[4] = _mm_xor_si128(Ln[4], ctp[8]);
		txt[4] = _mm_xor_si128(Ln[5], txt[4]);
#endif
#if (PIPE>=6)
		txt[5] = _mm_xor_si128(Ln[5], ctp[10]);
		txt[5] = _mm_xor_si128(Ln[6], txt[5]);
#endif
#if (PIPE>=7)
		txt[6] = _mm_xor_si128(Ln[6], ctp[12]);
		txt[6] = _mm_xor_si128(Ln[7], txt[6]);
#endif
#if (PIPE==8)
		txt[7] = _mm_xor_si128(Ln[7], ctp[14]);
		txt[7] = _mm_xor_si128(Ln[8], txt[7]);
#endif
		AES_ecb_encrypt_PIPE(txt, encrypt_key);
		/* second round: even plaintext halves P[2m] = E(...) ^ C[2m+1],
		 * then prepare the round-2 inputs Ln[m] ^ P[2m] */
		ptp[0] = _mm_xor_si128(txt[0], ctp[1]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[0]);
		ptp[2] = _mm_xor_si128(txt[1], ctp[3]);
		txt[1] = _mm_xor_si128(Ln[1], ptp[2]);
		ptp[4] = _mm_xor_si128(txt[2], ctp[5]);
		txt[2] = _mm_xor_si128(Ln[2], ptp[4]);
		ptp[6] = _mm_xor_si128(txt[3], ctp[7]);
		txt[3] = _mm_xor_si128(Ln[3], ptp[6]);
#if (PIPE>=5)
		ptp[8] = _mm_xor_si128(txt[4], ctp[9]);
		txt[4] = _mm_xor_si128(Ln[4], ptp[8]);
#endif
#if (PIPE>=6)
		ptp[10] = _mm_xor_si128(txt[5], ctp[11]);
		txt[5] = _mm_xor_si128(Ln[5], ptp[10]);
#endif
#if (PIPE>=7)
		ptp[12] = _mm_xor_si128(txt[6], ctp[13]);
		txt[6] = _mm_xor_si128(Ln[6], ptp[12]);
#endif
#if (PIPE==8)
		ptp[14] = _mm_xor_si128(txt[7], ctp[15]);
		txt[7] = _mm_xor_si128(Ln[7], ptp[14]);
#endif
		AES_ecb_encrypt_PIPE(txt, encrypt_key);
		/* odd plaintext halves P[2m+1] = E(...) ^ C[2m]; only the odd
		 * halves are folded into the checksum Sum */
		ptp[1] = _mm_xor_si128(txt[0], ctp[0]);
		Sum = _mm_xor_si128(Sum, ptp[1]);
		ptp[3] = _mm_xor_si128(txt[1], ctp[2]);
		Sum = _mm_xor_si128(Sum, ptp[3]);
		ptp[5] = _mm_xor_si128(txt[2], ctp[4]);
		Sum = _mm_xor_si128(Sum, ptp[5]);
		ptp[7] = _mm_xor_si128(txt[3], ctp[6]);
		Sum = _mm_xor_si128(Sum, ptp[7]);
#if (PIPE>=5)
		ptp[9] = _mm_xor_si128(txt[4], ctp[8]);
		Sum = _mm_xor_si128(Sum, ptp[9]);
#endif
#if (PIPE>=6)
		ptp[11] = _mm_xor_si128(txt[5], ctp[10]);
		Sum = _mm_xor_si128(Sum, ptp[11]);
#endif
#if (PIPE>=7)
		ptp[13] = _mm_xor_si128(txt[6], ctp[12]);
		Sum = _mm_xor_si128(Sum, ptp[13]);
#endif
#if (PIPE==8)
		ptp[15] = _mm_xor_si128(txt[7], ctp[14]);
		Sum = _mm_xor_si128(Sum, ptp[15]);
#endif
		/* carry the last derived mask into the next iteration */
		Ln[0] = _mm_load_si128(&Ln[PIPE]);
		ptp += (2 * PIPE);
		ctp += (2 * PIPE);
		rest_len -= (DBLOCK*PIPE);
	}

	/* Tail layout: ell full chunks plus a final chunk of `last` bytes
	 * (1..DBLOCK).  rest_len == 0 only when ci_len == 0. */
	if (rest_len != 0){
		last = rest_len % DBLOCK;
		if (last == 0) last = DBLOCK;
		ell = (rest_len - last) / DBLOCK; // plaintext length = 2BLOCK*ell + last (non-zero)
	}

	/* 2-round Feistel for the full chunks (serial, one chunk at a time) */
	mul3(Ln[0], &Ln[1]);
	for (i = 0; i < (2 * ell); i += 2){
		txt[0] = _mm_xor_si128(Ln[1], ctp[i]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[i] = _mm_xor_si128(txt[0], ctp[i + 1]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[i]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[i + 1] = _mm_xor_si128(txt[0], ctp[i]);
		Sum = _mm_xor_si128(Sum, ptp[i + 1]);
		/* advance the mask pair for the next chunk */
		Ln[0] = _mm_xor_si128(Ln[0], Ln[1]);
		mul2(Ln[1], &Ln[1]);
	}
	ptp += (2 * ell);
	ctp += (2 * ell);

	/* Last chunk */
	if (last <= BLOCK){ //odd block, including the case pl_len = 0 (no plaintext)
		AES_encrypt(Ln[0], &txt[0], encrypt_key); //txt[0] is Z
		/* P* = C* ^ msb(Z); checksum gets the one-zero padded P* */
		xorp(last, &txt[0], (uint8*)&ctp[0], (uint8*)&ptp[0]);
		ozp(last, (uint8*)&ptp[0], &txt[0]);
		Sum = _mm_xor_si128(txt[0], Sum);
		La = &Ln[0];
	}
	else{//even blocks, last > BLOCK always holds. 2-round Feistel with last swap
		/* pad the partial second ciphertext block before summing */
		ozp(last - BLOCK, (uint8*)&ctp[1], &txt[0]);
		Sum = _mm_xor_si128(Sum, txt[0]);
		txt[0] = _mm_xor_si128(Ln[1], txt[0]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[0] = _mm_xor_si128(txt[0], ctp[0]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[0]);
		AES_encrypt(txt[0], &txt[1], encrypt_key); //txt[1] is Z
		xorp(last - BLOCK, &txt[1], (uint8*)&ctp[1], (uint8*)&ptp[1]);
		Sum = _mm_xor_si128(Sum, txt[1]);
		La = &Ln[1];
	}

	/* TE generation: tweak the final mask by 7 (full last block) or
	 * 3^2 (partial last block), fold into Sum, and encrypt. */
	if (last == BLOCK || last == DBLOCK){//last = 16 or 32
		mul7(*La, La);
	}
	else{
		mul3twice(*La, La);
	}
	Sum = _mm_xor_si128(Sum, *La); //Sum = (3^2 or 7)L* xor Sum
	AES_encrypt(Sum, &Sum, encrypt_key);
	return Sum;//TE
}//end of DFunc