/*
 * mul7: stores in ^ mul2(in) ^ mul2(mul2(in)) into *out, i.e. the input
 * combined with its once- and twice-doubled values (1 + 2 + 4 = 7).
 *
 * in   value to multiply (passed by value, so the caller may pass *out).
 * out  receives the result.
 */
__inline__ static void mul7(__m128i in, __m128i *out)
{
	__m128i doubled;
	__m128i acc;

	mul2(in, &doubled);                 /* doubled = mul2(in) */
	acc = _mm_xor_si128(in, doubled);   /* in ^ mul2(in) */
	mul2(doubled, &doubled);            /* doubled = mul2(mul2(in)) */
	*out = _mm_xor_si128(acc, doubled);
}
/*
 * Recover data blocks x and y using syndromes Q & R.
 *
 *   n          number of data blocks; data[0..n-1] are the data blocks,
 *              data[n+1] and data[n+2] are read as the stored Q and R blocks.
 *   blocksize  size of every block in bytes.
 *   x, y       indices of the two data blocks to rebuild.  Both blocks are
 *              zeroed first and then overwritten with the recovered data.
 *   data       array of block pointers.
 *
 * For every vector position the Q and R syndromes are recomputed over the
 * data blocks (with x and y contributing zero), combined with the stored
 * syndromes, and multiplied by the four coefficients selected through
 * rs_r2map[x][y] in rs_r2QR.
 */
static void
rs_decode2qr(int n, int blocksize, int x, int y, v16 **data)
{
    int i;
    v16 qq, rr;
    const uint8_t* const c = rs_r2QR[rs_r2map[x][y]];
#ifndef KFS_QCRS_DONT_INLINE
    int j;
#endif

    memset(data[x], 0, blocksize);
    memset(data[y], 0, blocksize);
    for (i = 0; i < blocksize/sizeof(v16); i++) {
#ifndef KFS_QCRS_DONT_INLINE
        /*
         * Single pass over the data blocks computing Q and R together.
         * The blocks are indexed instead of walking a pointer down past
         * data[0]: forming the address data - 1 is undefined behaviour.
         */
        qq = data[n-1][i];
        rr = qq;
        for (j = n - 2; j >= 0; j--) {
            const v16 d = data[j][i];
            qq = mul2(qq) ^ d;
            rr = mul2(mul2(rr)) ^ d;
        }
        /* Fold in the stored syndromes. */
        rr ^= data[n+2][i];
        qq ^= data[n+1][i];
#else
        qq = Q(data, n, i) ^ data[n+1][i];
        rr = R(data, n, i) ^ data[n+2][i];
#endif
        data[x][i] = mulby(c[0], qq) ^ mulby(c[1], rr);
        data[y][i] = mulby(c[2], qq) ^ mulby(c[3], rr);
    }
}
/*
 * Compute R syndrome over data[?][i].
 *
 * Starts from data[n-1][i] and, walking down to block 0, applies mul2 twice
 * to the running value before XOR-ing in the next block's vector.
 */
static v16
R(v16 **data, int n, int i)
{
    int blk = n - 1;
    v16 acc = data[blk][i];

    while (--blk >= 0) {
        acc = mul2(mul2(acc)) ^ data[blk][i];
    }
    return acc;
}
/*
 * Recover data blocks x and y using syndromes P & Q.
 *
 *   n          number of data blocks; data[0..n-1] are the data blocks,
 *              data[n] and data[n+1] are read as the stored P and Q blocks.
 *   blocksize  size of every block in bytes.
 *   x, y       indices of the two data blocks to rebuild.  Both blocks are
 *              zeroed first and then overwritten with the recovered data.
 *   data       array of block pointers.
 *
 * For every vector position the P and Q syndromes are recomputed over the
 * data blocks (with x and y contributing zero), combined with the stored
 * syndromes, and multiplied by the four coefficients selected through
 * rs_r2map[x][y] in rs_r2PQ.
 */
static void
rs_decode2pq(int n, int blocksize, int x, int y, v16 **data)
{
    int i;
    v16 pp, qq;
    const uint8_t* const c = rs_r2PQ[rs_r2map[x][y]];
#ifndef KFS_QCRS_DONT_INLINE
    int j;
#endif

    memset(data[x], 0, blocksize);
    memset(data[y], 0, blocksize);
    for (i = 0; i < blocksize/sizeof(v16); i++) {
#ifndef KFS_QCRS_DONT_INLINE
        /*
         * Single pass over the data blocks computing P and Q together.
         * The blocks are indexed instead of walking a pointer down past
         * data[0]: forming the address data - 1 is undefined behaviour.
         */
        pp = data[n-1][i];
        qq = pp;
        for (j = n - 2; j >= 0; j--) {
            const v16 d = data[j][i];
            pp ^= d;
            qq = mul2(qq) ^ d;
        }
        /* Fold in the stored syndromes. */
        qq ^= data[n+1][i];
        pp ^= data[n][i];
#else
        pp = P(data, n, i) ^ data[n][i];
        qq = Q(data, n, i) ^ data[n+1][i];
#endif
        data[x][i] = mulby(c[0], pp) ^ mulby(c[1], qq);
        data[y][i] = mulby(c[2], pp) ^ mulby(c[3], qq);
    }
}
/*
 * AFuncS : OTR Core Authentication Function (ADP=s)
 *
 * Chains the header through AES: each full block is XOR-ed into the running
 * value, which is then encrypted.  The final block (1..BLOCK bytes) goes
 * through ozp(), is XOR-ed in together with a mask derived from Q
 * (mul2(Q) for a partial final block, mul4(Q) for a full one), and the
 * result is encrypted once more.
 *
 *   header  header bytes; read through an __m128i pointer, so it presumably
 *           has to be 16-byte aligned -- confirm against callers.
 *   h_len   header length in bytes.
 *
 * Returns the authentication value TA.
 *
 * NOTE(review): for h_len == 0 the final-block length becomes BLOCK and
 * (h_len - BLOCK) wraps if uint32 is unsigned; callers presumably never pass
 * an empty header -- confirm.
 */
__m128i AFuncS(
	const uint8 *header,
	uint32 h_len)
{
	const __m128i *blk = (__m128i*)header;
	block acc = _mm_setzero_si128();
	block padded, offset;
	uint32 tail_len;
	uint32 full_blocks;
	uint32 k;

	/* header = full_blocks blocks + tail_len bytes, with 1 <= tail_len <= BLOCK */
	tail_len = h_len % BLOCK;
	if (tail_len == 0) {
		tail_len = BLOCK;
	}
	full_blocks = (h_len - tail_len) / BLOCK;

	for (k = 0; k < full_blocks; k++, blk++) {
		acc = _mm_xor_si128(acc, *blk);
		AES_encrypt(acc, &acc, encrypt_key);
	}

	/* last block */
	ozp(tail_len, (uint8*)&blk[0], &padded);
	acc = _mm_xor_si128(padded, acc);
	if (tail_len == BLOCK) {
		mul4(Q, &offset);
	}
	else {
		mul2(Q, &offset);
	}
	acc = _mm_xor_si128(acc, offset);
	AES_encrypt(acc, &acc, encrypt_key);
	return acc; /* TA */
}
// Mixes each 4-byte column of the state in place.  Every output byte is
// mul2 of one input byte, mul3 of the next, XOR-ed with the remaining two
// (a rotating 2-3-1-1 pattern).
//
// s_  state buffer; its bytes() storage is modified directly.
inline void mixColumns(ByteArray *s_) {
    uint8_t *state = s_->bytes();

    for (int col = 0; col < Nb; ++col) {
        uint8_t *cell = &state[col * Nb];

        // Snapshot the column first: every output depends on all four inputs.
        const uint8_t a0 = cell[0];
        const uint8_t a1 = cell[1];
        const uint8_t a2 = cell[2];
        const uint8_t a3 = cell[3];

        cell[0] = mul2(a0) ^ mul3(a1) ^ a2 ^ a3;
        cell[1] = mul2(a1) ^ mul3(a2) ^ a3 ^ a0;
        cell[2] = mul2(a2) ^ mul3(a3) ^ a0 ^ a1;
        cell[3] = mul2(a3) ^ mul3(a0) ^ a1 ^ a2;
    }
}
/*
 * Applies mul2() to the array 1000 times and prints sum() of the result.
 * The array starts as {1, -1, 0, ...}; per the original note 302 + 1 slots
 * are needed and the -1 entry acts as the stopper.  Presumably mul2 doubles
 * a number stored as a -1-terminated digit array and sum adds its digits --
 * confirm against their definitions.
 */
int main()
{
	int arr[1000] = {1, -1};
	int remaining = 1000;

	while (remaining-- > 0) {
		mul2(arr);
	}
	printf("%d\n", sum(arr));
	return 0;
}
// Shift-and-XOR multiply built on mul2(): for every set bit k of b, XORs in
// a after k applications of mul2.  Bits are consumed from least to most
// significant and the loop stops after the highest set bit of b.
// Returns the accumulated value truncated to 8 bits.
inline uint8_t mul(uint8_t a, uint8_t b) {
    int product = 0;
    int term = a;

    for (int bits = b; bits != 0; bits >>= 1) {
        if (bits & 1) {
            product ^= term;
        }
        term = mul2(term);
    }
    return product;
}
/*
 * Compute Q syndrome over data[?][i].
 *
 * Starts from data[n-1][i] and, walking down to block 0, applies mul2 to the
 * running value before XOR-ing in the next block's vector.
 */
static v16
Q(v16 **data, int n, int i)
{
    int blk = n - 1;
    v16 acc = data[blk][i];

    while (--blk >= 0) {
        acc = mul2(acc) ^ data[blk][i];
    }
    return acc;
}
/*
 * MixBytes reversibly mixes the bytes within a column.
 *
 * For each of the first `columns` columns, output row j is the XOR of the
 * eight input bytes at rows j, j+1, ..., j+7 (mod ROWS) scaled by
 * mul2, mul2, mul3, mul4, mul5, mul3, mul5 and mul7 respectively.
 * The state x is updated in place.
 */
void MixBytes(u8 x[ROWS][COLS1024], int columns) {
  int col, row;
  u8 in[ROWS];

  for (col = 0; col < columns; col++) {
    /* Snapshot the column: every output row depends on all input rows. */
    for (row = 0; row < ROWS; row++) {
      in[row] = x[row][col];
    }
    for (row = 0; row < ROWS; row++) {
      x[row][col] =
        mul2(in[(row+0)%ROWS])^
        mul2(in[(row+1)%ROWS])^
        mul3(in[(row+2)%ROWS])^
        mul4(in[(row+3)%ROWS])^
        mul5(in[(row+4)%ROWS])^
        mul3(in[(row+5)%ROWS])^
        mul5(in[(row+6)%ROWS])^
        mul7(in[(row+7)%ROWS]);
    }
  }
}
/* * Reed-Solomon n+3 encoder. * nblocks is `n' data blocks plus 3 syndrome blocks. blocksize _must_ * be a multiple of 16. data contains pointers to blocks. The first * n are input data blocks. The last 3 are the P, Q, and R syndromes. */ void rs_encode(int nblocks, int blocksize, void **idata) { int i, j, n; v16 *p, *q, *r, **data = (v16**)idata; assert(nblocks > 3); assert(blocksize % 16 == 0); n = nblocks - 3; // # data blocks p = data[n]; q = data[n+1]; r = data[n+2]; for (i = 0; i < blocksize/sizeof(v16); i++) { p[i] = q[i] = r[i] = data[n-1][i]; for (j = n-2; j >= 0; j--) { p[i] ^= data[j][i]; q[i] = mul2(q[i]) ^ data[j][i]; r[i] = mul2(mul2(r[i])) ^ data[j][i]; } } }
/*
 * Recover data blocks x, y, & z using syndromes P, Q & R.
 *
 *   n          number of data blocks; data[0..n-1] are the data blocks,
 *              data[n], data[n+1] and data[n+2] are read as the stored
 *              P, Q and R blocks.
 *   blocksize  size of every block in bytes.
 *   x, y, z    indices of the three data blocks to rebuild.  All three are
 *              zeroed first and then overwritten with the recovered data.
 *   data       array of block pointers.
 *
 * For every vector position the three syndromes are recomputed over the
 * data blocks (with x, y and z contributing zero), combined with the stored
 * syndromes, and multiplied by the nine coefficients selected through
 * rs_r3map[x][y][z] in rs_r3.
 */
static void
rs_decode3pqr(int n, int blocksize, int x, int y, int z, v16 **data)
{
    int i;
    v16 pp, qq, rr;
    const uint8_t* const c = rs_r3[rs_r3map[x][y][z]];
#ifndef KFS_QCRS_DONT_INLINE
    int j;
#endif

    memset(data[x], 0, blocksize);
    memset(data[y], 0, blocksize);
    memset(data[z], 0, blocksize);
    for (i = 0; i < blocksize/sizeof(v16); i++) {
#ifndef KFS_QCRS_DONT_INLINE
        /*
         * Single pass over the data blocks computing P, Q and R together.
         * The blocks are indexed instead of walking a pointer down past
         * data[0]: forming the address data - 1 is undefined behaviour.
         */
        pp = data[n-1][i];
        qq = pp;
        rr = pp;
        for (j = n - 2; j >= 0; j--) {
            const v16 d = data[j][i];
            pp ^= d;
            qq = mul2(qq) ^ d;
            rr = mul2(mul2(rr)) ^ d;
        }
        /* Fold in the stored syndromes. */
        rr ^= data[n+2][i];
        qq ^= data[n+1][i];
        pp ^= data[n][i];
#else
        pp = P(data, n, i) ^ data[n][i];
        qq = Q(data, n, i) ^ data[n+1][i];
        rr = R(data, n, i) ^ data[n+2][i];
#endif
        data[x][i] = mulby(c[0], pp) ^ mulby(c[1], qq) ^ mulby(c[2], rr);
        data[y][i] = mulby(c[3], pp) ^ mulby(c[4], qq) ^ mulby(c[5], rr);
        data[z][i] = mulby(c[6], pp) ^ mulby(c[7], qq) ^ mulby(c[8], rr);
    }
}
/*
 * AFunc : OTR Core Authentication Function (ADP=p)
 *
 * Every full header block is XOR-ed with a per-block mask, encrypted, and
 * XOR-ed into a running sum.  The masks start at Q and are advanced with
 * mul2; while more than BLOCK*PIPE bytes remain, PIPE blocks are handled
 * per iteration through mul2_PIPE / AES_ecb_encrypt_PIPE.  The final block
 * (1..BLOCK bytes) goes through ozp() and is XOR-ed into the sum together
 * with the current mask after mul3 (partial final block) or mul3twice (full
 * final block); the sum is then encrypted once more.
 *
 *   header  header bytes; read through an __m128i pointer, so it presumably
 *           has to be 16-byte aligned -- confirm against callers.
 *   h_len   header length in bytes.
 *
 * Returns the authentication value TA.
 *
 * NOTE(review): for h_len == 0 the final-block length becomes BLOCK and
 * (rest - BLOCK) wraps if uint32 is unsigned; callers presumably never pass
 * an empty header -- confirm.
 */
__m128i AFunc(
	const uint8 *header,
	uint32 h_len)
{
	const __m128i *src = (__m128i*)header;
	block buf[PIPE], offs[PIPE + 1];
	block acc = _mm_setzero_si128();
	uint32 remaining = h_len;
	uint32 full_blocks, tail_len;
	uint32 i;

	offs[0] = _mm_load_si128(&Q);

	/* Bulk part: PIPE blocks per iteration. */
	while (remaining > (BLOCK*PIPE)) {
		mul2_PIPE(offs);
		for (i = 0; i < PIPE; i++) {
			buf[i] = _mm_xor_si128(offs[i], src[i]);
		}
		AES_ecb_encrypt_PIPE(buf, encrypt_key);
		for (i = 0; i < PIPE; i++) {
			acc = _mm_xor_si128(acc, buf[i]);
		}
		remaining -= (BLOCK*PIPE);
		src += PIPE;
		/* Carry the last mask over as the first mask of the next batch. */
		offs[0] = _mm_load_si128(&offs[PIPE]);
	}

	/* rest = full_blocks blocks + tail_len bytes, with 1 <= tail_len <= BLOCK */
	tail_len = remaining % BLOCK;
	if (tail_len == 0) {
		tail_len = BLOCK;
	}
	full_blocks = (remaining - tail_len) / BLOCK;

	for (i = 0; i < full_blocks; i++, src++) {
		buf[0] = _mm_xor_si128(offs[0], *src);
		AES_encrypt(buf[0], &buf[0], encrypt_key);
		acc = _mm_xor_si128(acc, buf[0]);
		mul2(offs[0], &offs[0]);
	}

	/* last block */
	ozp(tail_len, (uint8*)&src[0], &buf[0]);
	acc = _mm_xor_si128(acc, buf[0]);
	if (tail_len == BLOCK) {
		mul3twice(offs[0], &offs[0]);
	}
	else {
		mul3(offs[0], &offs[0]);
	}
	acc = _mm_xor_si128(acc, offs[0]);
	AES_encrypt(acc, &acc, encrypt_key);
	return acc; /* TA */
}
void test_vector_ifft() { logMsg("playing IFFT crossfade..."); IFFT vox1, vox2; vox1.set_bin_mag_phase(2, 0.25, 0); vox1.set_bin_mag_phase(4, 0.25, 0); vox2.set_bin_mag_phase(6, 0.25, 0); vox2.set_bin_mag_phase(8, 0.25, 0); LineSegment env1(3, 1, 0); // fade out LineSegment env2(3, 0, 1); // fade in MulOp mul1(vox1, env1); MulOp mul2(vox2, env2); AddOp add3(mul1, mul2); run_test(add3); logMsg("IFFT crossfade done."); }
void test_mixer_with_sines() { Sine vox1(431); // create 4 scaled sine waves MulOp mul1(vox1, 0.3); Sine vox2(540); MulOp mul2(vox2, 0.1); Sine vox3(890); MulOp mul3(vox3, 0.3); Sine vox4(1280); MulOp mul4(vox4, 0.01); Mixer mix(2); // create a stereo mixer mix.add_input(mul1); // add them to the mixer mix.add_input(mul2); mix.add_input(mul3); mix.add_input(mul4); logMsg("playing mix of 4 sines..."); run_test(mix); logMsg("mix done."); }
/*
 * Multiply every byte lane of v by the constant x.
 *
 * NEON / SSSE3 builds: each byte is split into its low and high nibble,
 * each nibble indexes the matching 16-entry table rs_nibmul[x].lo / .hi,
 * and the two lookups are XOR-ed.
 * Generic build: shift-and-XOR over the bits of x, using mul2() to advance
 * v for each bit position.
 */
static v16
mulby(uint8_t x, v16 v)
{
#ifdef LIBRS_USE_NEON
#define uint8x16_to_8x8x2(v) ((uint8x8x2_t) { vget_low_u8(v), vget_high_u8(v) })
    const v16 lo_nib = v & VEC16(0x0f);
    const v16 hi_nib = vshrq_n_u8(v, 4);

    /* vtbl2 handles 8 lanes at a time, so each half is looked up separately. */
    const v16 lo_prod = vcombine_u8(
        vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_low_u8(lo_nib)),
        vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].lo), vget_high_u8(lo_nib)));
    const v16 hi_prod = vcombine_u8(
        vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_low_u8(hi_nib)),
        vtbl2_u8(uint8x16_to_8x8x2(rs_nibmul[x].hi), vget_high_u8(hi_nib)));

    return lo_prod ^ hi_prod;
#elif defined(LIBRS_USE_SSSE3)
    v16 lo_nib, hi_nib;

    lo_nib = v & VEC16(0x0f);
    /* The shift works on 16-bit words; the mask drops the bits that leak in. */
    hi_nib = __builtin_ia32_psrawi128(v, 4);
    hi_nib &= VEC16(0x0f);

    lo_nib = __builtin_ia32_pshufb128(rs_nibmul[x].lo, lo_nib);
    hi_nib = __builtin_ia32_pshufb128(rs_nibmul[x].hi, hi_nib);
    return lo_nib ^ hi_nib;
#else
    v16 acc = VEC16(0);

    for (; x != 0; x >>= 1) {
        if (x & 1) {
            acc ^= v;
        }
        v = mul2(v);
    }
    return acc;
#endif
}
void CVisVector::CrossProd( const CVisVector & v1, const CVisVector & v2 ) { CVisEqualFixpoint mul1( MTRX_FRACTBITS ); CVisEqualFixpoint mul2( MTRX_FRACTBITS ); mul1.Mult( v1.m_fparyStore[1], v2.m_fparyStore[2] ); mul2.Mult( v1.m_fparyStore[2], v2.m_fparyStore[1] ); m_fparyStore[0].Sub( mul1, mul2 ); mul1.Mult( v1.m_fparyStore[2], v2.m_fparyStore[0] ); mul2.Mult( v1.m_fparyStore[0], v2.m_fparyStore[2] ); m_fparyStore[1].Sub( mul1, mul2 ); mul1.Mult( v1.m_fparyStore[0], v2.m_fparyStore[1] ); mul2.Mult( v1.m_fparyStore[1], v2.m_fparyStore[0] ); m_fparyStore[2].Sub( mul1, mul2 ); m_fparyStore[3] = 0; }
void test_scaled_sin() { Sine vox(220); vox.set_scale(0.1); // simplest: scale the sine directly logMsg("playing quiet sin..."); run_test(vox); logMsg("quiet sin done."); Sine vox2(220); MulOp mul(vox2, 0.1); // using a MulOp with a constant logMsg("playing quiet sin..."); run_test(mul); logMsg("quiet sin done."); Sine vox3(220); StaticVariable var(0.1); // using a MulOp with a StaticVariable MulOp mul2(vox3, var); logMsg("playing quiet sin..."); run_test(mul2); logMsg("quiet sin done."); }
// Runs `tsteps` time steps over the grids u and v, using p and q as scratch
// coefficient grids.  Each time step has two halves:
//   1. a sweep that reads u and writes v (boundary setup, forward pass
//      filling p/q, backward pass filling v), then
//   2. the mirrored sweep that reads v and writes u.
// The scalar coefficients a..f are derived once from n and tsteps.
//
// USE(...) is a project macro; presumably it brings the named members into
// scope with the given access mode -- confirm against its definition.
virtual void exec() {
  USE(READ, n, tsteps);

  // Grid spacing and time step.
  T DX(static_cast<T>(1.0) / n);
  T DY(static_cast<T>(1.0) / n);
  T DT(static_cast<T>(1.0) / tsteps);
  T B1(static_cast<T>(2.0));
  T B2(static_cast<T>(1.0));
  T mul1(B1 * DT / (DX * DX));
  T mul2(B2 * DT / (DY * DY));

  // a, b, c are built from mul1; d, e, f from mul2.
  T a(-mul1 / static_cast<T>(2.0));
  T b(static_cast<T>(1.0) + mul1);
  T c(a);
  T d(-mul2 / static_cast<T>(2.0));
  T e(static_cast<T>(1.0) + mul2);
  T f(d);

  USE(READWRITE, v, u, p, q);

  // Policy used for all two-index loops below.
  using exec_pol = NestedPolicy<ExecList<omp_parallel_for_exec, simd_exec>,
                                Tile<TileList<tile_fixed<16>, tile_none>>>;

  for (int t = 0; t < tsteps; ++t) {
    // --- First half: compute v from u ---

    // Boundary values for index i in [1, n-1): first and last row of v are
    // set to 1, and the forward pass is seeded at j = 0.
    forall<omp_parallel_for_exec>(1, n - 1, [=](int i) {
      v->at(0, i) = static_cast<T>(1.0);
      p->at(i, 0) = static_cast<T>(0.0);
      q->at(i, 0) = v->at(0, i);
      v->at(n - 1, i) = static_cast<T>(1.0);
    });

    // Forward pass: p(i, j) and q(i, j) are computed from p(i, j-1) and
    // q(i, j-1) plus three neighbouring u values.
    // NOTE(review): each j reads the j-1 entry written by the previous j,
    // so this relies on j being visited in increasing order -- confirm the
    // policy guarantees that.
    forallN<exec_pol>(
        RangeSegment{1, n - 1},
        RangeSegment{1, n - 1},
        [=](int i, int j) {
          p->at(i, j) = -c / (a * p->at(i, j - 1) + b);
          q->at(i, j) = (-d * u->at(j, i - 1) + (1.0 + 2.0 * d) * u->at(j, i)
                         - f * u->at(j, i + 1) - a * q->at(i, j - 1))
                        / (a * p->at(i, j - 1) + b);
        });

    // Backward pass: j_ in [2, n) maps to j = n - j_, i.e. j runs from n-2
    // down to 1; v(j, i) is computed from v(j+1, i).
    forallN<exec_pol>(
        RangeSegment{1, n - 1},
        RangeSegment{2, n},
        [=](int i, int j_) {
          int j = n - j_;
          v->at(j, i) = p->at(i, j) * v->at(j + 1, i) + q->at(i, j);
        });

    // --- Second half: compute u from v ---

    // Boundary values: first and last column of u are set to 1, and the
    // forward pass is seeded at j = 0.
    forall<omp_parallel_for_exec>(1, n - 1, [=](int i) {
      u->at(i, 0) = static_cast<T>(1.0);
      p->at(i, 0) = static_cast<T>(0.0);
      q->at(i, 0) = u->at(i, 0);
      u->at(i, n - 1) = static_cast<T>(1.0);
    });

    // Forward pass, same shape as above with the roles of (a, b, c) and
    // (d, e, f) exchanged and v read instead of u.
    forallN<exec_pol>(
        RangeSegment{1, n - 1},
        RangeSegment{1, n - 1},
        [=](int i, int j) {
          p->at(i, j) = -f / (d * p->at(i, j - 1) + e);
          q->at(i, j) = (-a * v->at(i - 1, j)
                         + (static_cast<T>(1.0) + static_cast<T>(2.0) * a)
                             * v->at(i, j)
                         - c * v->at(i + 1, j) - d * q->at(i, j - 1))
                        / (d * p->at(i, j - 1) + e);
        });

    // Backward pass: j runs from n-2 down to 1; u(i, j) is computed from
    // u(i, j+1).
    forallN<exec_pol>(
        RangeSegment{1, n - 1},
        RangeSegment{2, n},
        [=](int i, int j_) {
          int j = n - j_;
          u->at(i, j) = p->at(i, j) * u->at(i, j + 1) + q->at(i, j);
        });
  }
}
/*
 * DFunc : OTR Core Decryption Function, with nonce encryption
 *
 * Decrypts `ciphertext` into `plaintext` and returns the value TE computed
 * from the recovered plaintext (the caller presumably compares it with the
 * received tag -- confirm).
 *
 *   nonce, nonce_len   nonce bytes and their count.  The nonce is copied to
 *                      the end of a BLOCK-byte buffer and a 0x01 byte is set
 *                      at index BLOCK - nonce_len - 1, so nonce_len
 *                      presumably has to be smaller than BLOCK -- confirm;
 *                      it is not checked here.
 *   TA                 (only when ADP==Seri) value XOR-ed into the encrypted
 *                      nonce before the first doubling.
 *   ciphertext, ci_len ciphertext bytes and their count.
 *   t_len              tag length; (t_len % BLOCK) << 4 goes into the first
 *                      byte of the nonce block.
 *   plaintext          output buffer, ci_len bytes.
 *
 * Both buffers are accessed through __m128i pointers, so they presumably
 * have to be 16-byte aligned -- confirm against callers.
 *
 * NOTE(review): in every two-block chunk the first ciphertext block is read
 * again after the first plaintext block has been stored, so plaintext does
 * not appear to be allowed to alias ciphertext.
 *
 * NOTE(review): the pipelined loop writes txt[0..3] unconditionally, so it
 * appears to assume PIPE >= 4.
 */
__m128i DFunc(
	const uint8 *nonce,
	uint32 nonce_len,
#if(ADP==Seri)
	const __m128i TA,
#endif
	const uint8 *ciphertext,
	uint32 ci_len,
	uint32 t_len,
	uint8 *plaintext)
{
	uint32 i;
	uint32 ell = 0; //number of 2BLOCK-byte chunks, excl. last one
	uint32 last = 0; //number of bytes in the last chunk
	block Sum = _mm_setzero_si128();
	block txt[PIPE], Ln[PIPE + 1];
	uint32 rest_len = ci_len;
	__m128i *ptp = (__m128i*)plaintext;
	const __m128i *ctp = (__m128i*)ciphertext;
	ALIGN(16)uint8 tmp[BLOCK] = { 0 };
	block *La;

	/* Encryption of nonce */
	memcpy(&tmp[BLOCK - nonce_len], nonce, nonce_len);
	tmp[0] = (uint8)((t_len%BLOCK) << 4);
	tmp[BLOCK - nonce_len - 1] |= 0x01;
	Ln[0] = _mm_load_si128((__m128i*)tmp);
	AES_encrypt(Ln[0], &Ln[0], encrypt_key);
#if (ADP==Seri)
	Ln[0] = _mm_xor_si128(Ln[0], TA);
	mul2(Ln[0], &Ln[0]);
#endif

	/* Bulk part: PIPE two-block chunks per iteration. */
	while (rest_len > (DBLOCK*PIPE)){
		/* first round: mask for chunk k is Ln[k] ^ Ln[k+1], input is the
		   even ciphertext block */
		mul2_PIPE(Ln);
		txt[0] = _mm_xor_si128(Ln[0], ctp[0]);
		txt[0] = _mm_xor_si128(Ln[1], txt[0]);
		txt[1] = _mm_xor_si128(Ln[1], ctp[2]);
		txt[1] = _mm_xor_si128(Ln[2], txt[1]);
		txt[2] = _mm_xor_si128(Ln[2], ctp[4]);
		txt[2] = _mm_xor_si128(Ln[3], txt[2]);
		txt[3] = _mm_xor_si128(Ln[3], ctp[6]);
		txt[3] = _mm_xor_si128(Ln[4], txt[3]);
#if (PIPE>=5)
		txt[4] = _mm_xor_si128(Ln[4], ctp[8]);
		txt[4] = _mm_xor_si128(Ln[5], txt[4]);
#endif
#if (PIPE>=6)
		txt[5] = _mm_xor_si128(Ln[5], ctp[10]);
		txt[5] = _mm_xor_si128(Ln[6], txt[5]);
#endif
#if (PIPE>=7)
		txt[6] = _mm_xor_si128(Ln[6], ctp[12]);
		txt[6] = _mm_xor_si128(Ln[7], txt[6]);
#endif
#if (PIPE==8)
		txt[7] = _mm_xor_si128(Ln[7], ctp[14]);
		txt[7] = _mm_xor_si128(Ln[8], txt[7]);
#endif
		AES_ecb_encrypt_PIPE(txt, encrypt_key);

		/* second round: the even plaintext block is the first-round output
		   XOR the odd ciphertext block; it is then masked with Ln[k] and
		   encrypted again */
		ptp[0] = _mm_xor_si128(txt[0], ctp[1]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[0]);
		ptp[2] = _mm_xor_si128(txt[1], ctp[3]);
		txt[1] = _mm_xor_si128(Ln[1], ptp[2]);
		ptp[4] = _mm_xor_si128(txt[2], ctp[5]);
		txt[2] = _mm_xor_si128(Ln[2], ptp[4]);
		ptp[6] = _mm_xor_si128(txt[3], ctp[7]);
		txt[3] = _mm_xor_si128(Ln[3], ptp[6]);
#if (PIPE>=5)
		ptp[8] = _mm_xor_si128(txt[4], ctp[9]);
		txt[4] = _mm_xor_si128(Ln[4], ptp[8]);
#endif
#if (PIPE>=6)
		ptp[10] = _mm_xor_si128(txt[5], ctp[11]);
		txt[5] = _mm_xor_si128(Ln[5], ptp[10]);
#endif
#if (PIPE>=7)
		ptp[12] = _mm_xor_si128(txt[6], ctp[13]);
		txt[6] = _mm_xor_si128(Ln[6], ptp[12]);
#endif
#if (PIPE==8)
		ptp[14] = _mm_xor_si128(txt[7], ctp[15]);
		txt[7] = _mm_xor_si128(Ln[7], ptp[14]);
#endif
		AES_ecb_encrypt_PIPE(txt, encrypt_key);

		/* odd plaintext block = second-round output XOR even ciphertext
		   block; only the odd plaintext blocks are added to Sum */
		ptp[1] = _mm_xor_si128(txt[0], ctp[0]);
		Sum = _mm_xor_si128(Sum, ptp[1]);
		ptp[3] = _mm_xor_si128(txt[1], ctp[2]);
		Sum = _mm_xor_si128(Sum, ptp[3]);
		ptp[5] = _mm_xor_si128(txt[2], ctp[4]);
		Sum = _mm_xor_si128(Sum, ptp[5]);
		ptp[7] = _mm_xor_si128(txt[3], ctp[6]);
		Sum = _mm_xor_si128(Sum, ptp[7]);
#if (PIPE>=5)
		ptp[9] = _mm_xor_si128(txt[4], ctp[8]);
		Sum = _mm_xor_si128(Sum, ptp[9]);
#endif
#if (PIPE>=6)
		ptp[11] = _mm_xor_si128(txt[5], ctp[10]);
		Sum = _mm_xor_si128(Sum, ptp[11]);
#endif
#if (PIPE>=7)
		ptp[13] = _mm_xor_si128(txt[6], ctp[12]);
		Sum = _mm_xor_si128(Sum, ptp[13]);
#endif
#if (PIPE==8)
		ptp[15] = _mm_xor_si128(txt[7], ctp[14]);
		Sum = _mm_xor_si128(Sum, ptp[15]);
#endif
		/* carry the last mask over as the first mask of the next batch */
		Ln[0] = _mm_load_si128(&Ln[PIPE]);
		ptp += (2 * PIPE);
		ctp += (2 * PIPE);
		rest_len -= (DBLOCK*PIPE);
	}

	/* Split what is left into ell full chunks plus a last chunk of
	   1..DBLOCK bytes; with nothing left, ell and last both stay 0. */
	if (rest_len != 0){
		last = rest_len % DBLOCK;
		if (last == 0) last = DBLOCK;
		ell = (rest_len - last) / DBLOCK; // plaintext length = 2BLOCK*ell + last (non-zero)
	}

	/* 2-round Feistel for the full chunks */
	/* Ln[1] (= mul3 of Ln[0]) masks the first round, Ln[0] the second. */
	mul3(Ln[0], &Ln[1]);
	for (i = 0; i < (2 * ell); i += 2){
		txt[0] = _mm_xor_si128(Ln[1], ctp[i]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[i] = _mm_xor_si128(txt[0], ctp[i + 1]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[i]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[i + 1] = _mm_xor_si128(txt[0], ctp[i]);
		Sum = _mm_xor_si128(Sum, ptp[i + 1]);
		/* advance both masks for the next chunk */
		Ln[0] = _mm_xor_si128(Ln[0], Ln[1]);
		mul2(Ln[1], &Ln[1]);
	}
	ptp += (2 * ell);
	ctp += (2 * ell);

	/* Last chunk */
	if (last <= BLOCK){ //odd block, including the case pl_len = 0 (no plaintext)
		AES_encrypt(Ln[0], &txt[0], encrypt_key); //txt[0] is Z
		xorp(last, &txt[0], (uint8*)&ctp[0], (uint8*)&ptp[0]);
		ozp(last, (uint8*)&ptp[0], &txt[0]);
		Sum = _mm_xor_si128(txt[0], Sum);
		La = &Ln[0];
	}
	else{//even blocks, last > BLOCK always holds. 2-round Feistel with last swap
		ozp(last - BLOCK, (uint8*)&ctp[1], &txt[0]);
		Sum = _mm_xor_si128(Sum, txt[0]);
		txt[0] = _mm_xor_si128(Ln[1], txt[0]);
		AES_encrypt(txt[0], &txt[0], encrypt_key);
		ptp[0] = _mm_xor_si128(txt[0], ctp[0]);
		txt[0] = _mm_xor_si128(Ln[0], ptp[0]);
		AES_encrypt(txt[0], &txt[1], encrypt_key); //txt[1] is Z
		xorp(last - BLOCK, &txt[1], (uint8*)&ctp[1], (uint8*)&ptp[1]);
		Sum = _mm_xor_si128(Sum, txt[1]);
		La = &Ln[1];
	}

	/* TE generation */
	/* La points at the mask used for the last chunk; it is scaled by mul7
	   when the last chunk ends on a block boundary, by mul3twice otherwise. */
	if (last == BLOCK || last == DBLOCK){//last = 16 or 32
		mul7(*La, La);
	}
	else{
		mul3twice(*La, La);
	}
	Sum = _mm_xor_si128(Sum, *La); //Sum = (3^2 or 7)L* xor Sum
	AES_encrypt(Sum, &Sum, encrypt_key);
	return Sum;//TE
}//end of DFunc
// Returns mul2(a) XOR a, i.e. a multiplied by 3 in whatever arithmetic
// mul2() implements.
inline uint8_t mul3(uint8_t a) {
    const uint8_t doubled = mul2(a);
    return doubled ^ a;
}