int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key)
{
    const __m256i K = LOADU(key);
    __m256i A, B, C, D;

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(A, B, C, D, nonce, K);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    DECRYPT_DATA(A, B, C, D, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D, K);

    /* Verify tag */
    D = _mm256_cmpeq_epi8(D, LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm256_movemask_epi8(D) & 0xFFFFFFFFULL) + 1) >> 32) - 1;
}
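The return expression above converts the 32-bit byte-compare mask into 0 (tags match) or -1 (tags differ) without a data-dependent branch. A minimal stand-alone sketch of the same idiom, assuming the usual two's-complement conversion to int that the original also relies on (mask_to_verdict is a hypothetical helper, not part of the source):

#include <stdint.h>
#include <stdio.h>

/* maps 0xFFFFFFFF (all 32 byte compares equal) to 0, anything else to -1 */
static int mask_to_verdict(uint32_t mask)
{
    return (int)((((uint64_t)mask + 1) >> 32) - 1);
}

int main(void)
{
    printf("%d\n", mask_to_verdict(0xFFFFFFFFu)); /*  0: tag accepted */
    printf("%d\n", mask_to_verdict(0x7FFFFFFFu)); /* -1: tag rejected */
    return 0;
}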
/* inlen <= 40 */
static void block_copy(unsigned char *out, const unsigned char *in, const size_t inlen)
{
    if (inlen & 32) {
        STOREU(out +  0, LOADU(in +  0));
        STOREU(out + 16, LOADU(in + 16));
        in += 32; out += 32;
    }
    if (inlen & 16) {
        STOREU(out + 0, LOADU(in + 0));
        in += 16; out += 16;
    }
    if (inlen & 8) { memcpy(out, in, 8); in += 8; out += 8; }
    if (inlen & 4) { memcpy(out, in, 4); in += 4; out += 4; }
    if (inlen & 2) { memcpy(out, in, 2); in += 2; out += 2; }
    if (inlen & 1) { memcpy(out, in, 1); in += 1; out += 1; }
}
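The chunk sizes 32, 16, 8, 4, 2, 1 cover every length up to 40 because each bit of inlen selects exactly one chunk (40 = 32 + 8). A self-contained scalar check of that decomposition, with memcpy standing in for the vector loads and stores (block_copy_scalar is a hypothetical reference, not part of the source):

#include <assert.h>
#include <string.h>

static void block_copy_scalar(unsigned char *out, const unsigned char *in, size_t inlen)
{
    size_t n;
    /* same bit decomposition as block_copy, one memcpy per set bit */
    for (n = 32; n >= 1; n >>= 1) {
        if (inlen & n) {
            memcpy(out, in, n);
            in  += n;
            out += n;
        }
    }
}

int main(void)
{
    unsigned char src[40], dst[40], ref[40];
    size_t len, i;

    for (i = 0; i < sizeof src; ++i)
        src[i] = (unsigned char)i;

    for (len = 0; len <= sizeof src; ++len) {
        memset(dst, 0xAA, sizeof dst);
        memset(ref, 0xAA, sizeof ref);
        block_copy_scalar(dst, src, len);
        memcpy(ref, src, len);
        assert(memcmp(dst, ref, sizeof dst) == 0); /* matches memcpy, padding untouched */
    }
    return 0;
}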
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key)
{
    const __m128i K = LOADU(key);
    __m128i S[4];

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, K);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K);

    /* Verify tag */
    S[3] = _mm_cmpeq_epi8(S[3], LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm_movemask_epi8(S[3]) & 0xFFFFU) + 1) >> 16) - 1;
}
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key)
{
    const uint32x4_t K = LOADU(key);
    uint32x4_t A, B, C, D;

    if (clen < BYTES(NORX_T)) {
        return -1;
    }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(A, B, C, D, nonce, K);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    DECRYPT_DATA(A, B, C, D, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D, K);

    /* Verify tag */
    D = vceqq_u32(D, LOADU(c + clen - BYTES(NORX_T)));
    return 0xFFFFFFFF == (vgetq_lane_u32(D, 0) & vgetq_lane_u32(D, 1) &
                          vgetq_lane_u32(D, 2) & vgetq_lane_u32(D, 3)) ? 0 : -1;
}
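Unlike the SSE and AVX2 variants, the NEON verdict above goes through a conditional expression. A hedged sketch of a branch-free alternative, folding the four compare lanes with an AND and reusing the same add/shift mapping as the x86 versions (norx_tag_verdict_neon is illustrative, not from the source):

#include <arm_neon.h>
#include <stdint.h>

static int norx_tag_verdict_neon(uint32x4_t d)
{
    /* all four lanes are 0xFFFFFFFF iff the tags matched byte-for-byte */
    uint32x2_t t = vand_u32(vget_low_u32(d), vget_high_u32(d));
    uint32_t   m = vget_lane_u32(t, 0) & vget_lane_u32(t, 1);
    /* 0xFFFFFFFF -> 0, anything else -> -1, without a branch */
    return (int)((((uint64_t)m + 1) >> 32) - 1);
}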
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
    __m128i row1, row2, row3, row4;
    __m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
    __m128i t0, t1;
#if !defined(HAVE_XOP)
    __m128i t2;
#endif
#endif
    __m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
    const __m128i r8  = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
    const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
    const __m128i m0 = LOADU( block +  0 );
    const __m128i m1 = LOADU( block + 16 );
    const __m128i m2 = LOADU( block + 32 );
    const __m128i m3 = LOADU( block + 48 );
#else
    const uint32_t m0  = ((uint32_t *)block)[ 0];
    const uint32_t m1  = ((uint32_t *)block)[ 1];
    const uint32_t m2  = ((uint32_t *)block)[ 2];
    const uint32_t m3  = ((uint32_t *)block)[ 3];
    const uint32_t m4  = ((uint32_t *)block)[ 4];
    const uint32_t m5  = ((uint32_t *)block)[ 5];
    const uint32_t m6  = ((uint32_t *)block)[ 6];
    const uint32_t m7  = ((uint32_t *)block)[ 7];
    const uint32_t m8  = ((uint32_t *)block)[ 8];
    const uint32_t m9  = ((uint32_t *)block)[ 9];
    const uint32_t m10 = ((uint32_t *)block)[10];
    const uint32_t m11 = ((uint32_t *)block)[11];
    const uint32_t m12 = ((uint32_t *)block)[12];
    const uint32_t m13 = ((uint32_t *)block)[13];
    const uint32_t m14 = ((uint32_t *)block)[14];
    const uint32_t m15 = ((uint32_t *)block)[15];
#endif
    row1 = ff0 = LOAD( &S->h[0] );
    row2 = ff1 = LOAD( &S->h[4] );
    row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
    row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ),
                          LOAD( &S->t[0] ) );
    ROUND( 0 );
    ROUND( 1 );
    ROUND( 2 );
    ROUND( 3 );
    ROUND( 4 );
    ROUND( 5 );
    ROUND( 6 );
    ROUND( 7 );
    ROUND( 8 );
    ROUND( 9 );
    STORE( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
    STORE( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
    return 0;
}
void norx_aead_encrypt(
    unsigned char *c, size_t *clen,
    const unsigned char *a, size_t alen,
    const unsigned char *m, size_t mlen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key)
{
    const __m128i K0 = LOADU(key +  0);
    const __m128i K1 = LOADU(key + 16);
    __m128i S[8];

    *clen = mlen + BYTES(NORX_T);

    INITIALISE(S, nonce, K0, K1);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    ENCRYPT_DATA(S, c, m, mlen);
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K0, K1);

    STOREU(c + mlen, S[6]);
    STOREU(c + mlen + BYTES(NORX_T)/2, S[7]);
}
static PyObject *update(PyObject *self, PyObject *args)
{
    PyArrayObject *C, *F, *G;
    int nx, ny, i, j, idx;
    float *c, *f, *g;

    if (!PyArg_ParseTuple(args, "OOO", &C, &F, &G))
        return NULL;

    /* PyArray_DIM/PyArray_DATA instead of the deprecated direct member
       access (C->dimensions, C->data) used originally */
    nx = (int)PyArray_DIM(C, 0);
    ny = (int)PyArray_DIM(C, 1);
    c = (float *)PyArray_DATA(C);
    f = (float *)PyArray_DATA(F);
    g = (float *)PyArray_DATA(G);

    __m128 vc, vf, vg, vg1, vg2, vg3, vg4, tmp;
    const __m128 c2 = _mm_set1_ps(2.0f);   /* portable form of {2,2,2,2} */
    const __m128 c4 = _mm_set1_ps(4.0f);

    /* Interior rows only. Assumes ny is a multiple of 4 and each row is
       16-byte aligned, since LOAD/STORE are aligned accesses; the j == 0
       and j == ny-1 columns read across row boundaries, as in the original. */
    #pragma omp parallel for \
        shared(ny, c, f, g, c2, c4) \
        private(vc, vf, vg, vg1, vg2, vg3, vg4, tmp, i, j, idx) \
        schedule(guided)
    for (i = 1; i < nx - 1; i++) {
        for (j = 0; j < ny; j += 4) {
            idx = i*ny + j;
            vc  = LOAD(c + idx);
            vf  = LOAD(f + idx);
            vg  = LOAD(g + idx);
            vg1 = LOAD(g + idx + ny);   /* row below */
            vg2 = LOAD(g + idx - ny);   /* row above */
            vg3 = LOADU(g + idx + 1);   /* right neighbours (unaligned) */
            vg4 = LOADU(g + idx - 1);   /* left neighbours (unaligned)  */
            tmp = ADD(ADD(ADD(vg1, vg2), vg3), vg4);
            tmp = MUL(vc, SUB(tmp, MUL(c4, vg)));
            tmp = SUB(ADD(tmp, MUL(c2, vg)), vf);
            STORE(f + idx, tmp);
        }
    }
    Py_INCREF(Py_None);
    return Py_None;
}
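For reference, the vector body computes a damped five-point-stencil (leapfrog-style) update. A scalar equivalent of one grid point, assuming LOAD/STORE and ADD/SUB/MUL map to the corresponding _mm_*_ps intrinsics (update_point is a hypothetical helper, not part of the source):

#include <stdio.h>

static float update_point(float c, float f_old, float g_c,
                          float g_n, float g_s, float g_e, float g_w)
{
    /* tmp = c * (neighbour sum - 4*g); f_new = tmp + 2*g - f_old */
    return c * (g_n + g_s + g_e + g_w - 4.0f * g_c) + 2.0f * g_c - f_old;
}

int main(void)
{
    /* flat unit field: c*(4 - 4) + 2*1 - 1 = 1, i.e. the field is static */
    printf("%f\n", update_point(0.25f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f, 1.0f));
    return 0;
}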
void norx_aead_encrypt(
    unsigned char *c, size_t *clen,
    const unsigned char *a, size_t alen,
    const unsigned char *m, size_t mlen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key)
{
    const __m256i K = LOADU(key);
    __m256i A, B, C, D;

    *clen = mlen + BYTES(NORX_T);

    INITIALISE(A, B, C, D, nonce, K);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    ENCRYPT_DATA(A, B, C, D, c, m, mlen);
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D, K);

    STOREU(c + mlen, D);
}
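A hypothetical round trip exercising the AVX2 pair above, assuming a 32-byte key, a 16-byte nonce, BYTES(NORX_T) == 32, and that a zero-length trailer may be passed as NULL; the real sizes depend on the NORX parameter set this file is built for:

#include <stddef.h>
#include <string.h>

static int norx_roundtrip(void)
{
    unsigned char key[32]  = {0}, nonce[16] = {0};
    unsigned char ad[12]   = {0};                 /* header */
    unsigned char m[64]    = {1}, d[64];
    unsigned char c[64 + 32];                     /* ciphertext + tag */
    size_t clen, dlen;

    norx_aead_encrypt(c, &clen, ad, sizeof ad, m, sizeof m,
                      NULL, 0, nonce, key);
    if (norx_aead_decrypt(d, &dlen, ad, sizeof ad, c, clen,
                          NULL, 0, nonce, key) != 0)
        return -1;                                /* tag rejected */
    return memcmp(m, d, dlen) == 0 ? 0 : -1;
}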
static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
    const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
    const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
    const __m128i m0 = LOADU( block +   0 );
    const __m128i m1 = LOADU( block +  16 );
    const __m128i m2 = LOADU( block +  32 );
    const __m128i m3 = LOADU( block +  48 );
    const __m128i m4 = LOADU( block +  64 );
    const __m128i m5 = LOADU( block +  80 );
    const __m128i m6 = LOADU( block +  96 );
    const __m128i m7 = LOADU( block + 112 );
#else
    const uint64_t m0  = ((uint64_t *)block)[ 0];
    const uint64_t m1  = ((uint64_t *)block)[ 1];
    const uint64_t m2  = ((uint64_t *)block)[ 2];
    const uint64_t m3  = ((uint64_t *)block)[ 3];
    const uint64_t m4  = ((uint64_t *)block)[ 4];
    const uint64_t m5  = ((uint64_t *)block)[ 5];
    const uint64_t m6  = ((uint64_t *)block)[ 6];
    const uint64_t m7  = ((uint64_t *)block)[ 7];
    const uint64_t m8  = ((uint64_t *)block)[ 8];
    const uint64_t m9  = ((uint64_t *)block)[ 9];
    const uint64_t m10 = ((uint64_t *)block)[10];
    const uint64_t m11 = ((uint64_t *)block)[11];
    const uint64_t m12 = ((uint64_t *)block)[12];
    const uint64_t m13 = ((uint64_t *)block)[13];
    const uint64_t m14 = ((uint64_t *)block)[14];
    const uint64_t m15 = ((uint64_t *)block)[15];
#endif
    row1l = LOADU( &S->h[0] );
    row1h = LOADU( &S->h[2] );
    row2l = LOADU( &S->h[4] );
    row2h = LOADU( &S->h[6] );
    row3l = LOADU( &blake2b_IV[0] );
    row3h = LOADU( &blake2b_IV[2] );
    row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
    row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
    /* BLAKE2b is defined with twelve rounds */
    ROUND( 0 );
    ROUND( 1 );
    ROUND( 2 );
    ROUND( 3 );
    ROUND( 4 );
    ROUND( 5 );
    ROUND( 6 );
    ROUND( 7 );
    ROUND( 8 );
    ROUND( 9 );
    ROUND( 10 );
    ROUND( 11 );
    row1l = _mm_xor_si128( row3l, row1l );
    row1h = _mm_xor_si128( row3h, row1h );
    STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
    STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
    row2l = _mm_xor_si128( row4l, row2l );
    row2h = _mm_xor_si128( row4h, row2h );
    STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
    STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
    return 0;
}
int blake2b_compress_sse41( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;
    const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
    const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
    const __m128i m0 = LOADU( block +   0 );
    const __m128i m1 = LOADU( block +  16 );
    const __m128i m2 = LOADU( block +  32 );
    const __m128i m3 = LOADU( block +  48 );
    const __m128i m4 = LOADU( block +  64 );
    const __m128i m5 = LOADU( block +  80 );
    const __m128i m6 = LOADU( block +  96 );
    const __m128i m7 = LOADU( block + 112 );

    row1l = LOADU( &S->h[0] );
    row1h = LOADU( &S->h[2] );
    row2l = LOADU( &S->h[4] );
    row2h = LOADU( &S->h[6] );
    row3l = LOADU( &blake2b_IV[0] );
    row3h = LOADU( &blake2b_IV[2] );
    row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
    row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
    ROUND( 0 );
    ROUND( 1 );
    ROUND( 2 );
    ROUND( 3 );
    ROUND( 4 );
    ROUND( 5 );
    ROUND( 6 );
    ROUND( 7 );
    ROUND( 8 );
    ROUND( 9 );
    ROUND( 10 );
    ROUND( 11 );
    row1l = _mm_xor_si128( row3l, row1l );
    row1h = _mm_xor_si128( row3h, row1h );
    STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
    STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
    row2l = _mm_xor_si128( row4l, row2l );
    row2h = _mm_xor_si128( row4h, row2h );
    STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
    STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
    return 0;
}
void blake256_sse2_compress( state256 * state, const uint8_t * datablock )
{
    __m128i row1, row2, row3, row4;
    __m128i buf1, buf2;

    /* push/ignore/pop ordered properly; r and t are kept but unused */
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wunused-variable"
    union {
        uint32_t u32[16];
        __m128i  u128[4];
    } m;
    int r;
    uint64_t t;

    static const int sig[][16] = {
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 },
        {  9,  0,  5,  7,  2,  4, 10, 15, 14,  1, 11, 12,  6,  8,  3, 13 },
        {  2, 12,  6, 10,  0, 11,  8,  3,  4, 13,  7,  5, 15, 14,  1,  9 },
        { 12,  5,  1, 15, 14, 13,  4, 10,  0,  7,  6,  3,  9,  2,  8, 11 },
        { 13, 11,  7, 14, 12,  1,  3,  9,  5,  0, 15,  4,  8,  6,  2, 10 },
        {  6, 15, 14,  9, 11,  3,  0,  8, 12,  2, 13,  7,  1,  4, 10,  5 },
        { 10,  2,  8,  4,  7,  6,  1,  5, 15, 11,  9, 14,  3, 12, 13,  0 },
        {  0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15 },
        { 14, 10,  4,  8,  9, 15, 13,  6,  1, 12,  0,  2, 11,  7,  5,  3 },
        { 11,  8, 12,  0,  5,  2, 15, 13, 10, 14,  3,  6,  7,  1,  9,  4 },
        {  7,  9,  3,  1, 13, 12, 11, 14,  2,  6,  5, 10,  4,  0, 15,  8 }
    };

    static const uint32_t z[16] = {
        0x243F6A88, 0x85A308D3, 0x13198A2E, 0x03707344,
        0xA4093822, 0x299F31D0, 0x082EFA98, 0xEC4E6C89,
        0x452821E6, 0x38D01377, 0xBE5466CF, 0x34E90C6C,
        0xC0AC29B7, 0xC97C50DD, 0x3F84D5B5, 0xB5470917
    };
#pragma GCC diagnostic pop

    /* get message */
    m.u128[0] = LOADU(datablock +  0);
    m.u128[1] = LOADU(datablock + 16);
    m.u128[2] = LOADU(datablock + 32);
    m.u128[3] = LOADU(datablock + 48);
    BSWAP32(m.u128[0]);
    BSWAP32(m.u128[1]);
    BSWAP32(m.u128[2]);
    BSWAP32(m.u128[3]);

    row1 = _mm_set_epi32(state->h[3], state->h[2], state->h[1], state->h[0]);
    row2 = _mm_set_epi32(state->h[7], state->h[6], state->h[5], state->h[4]);
    row3 = _mm_set_epi32(0x03707344, 0x13198A2E, 0x85A308D3, 0x243F6A88);
    if (state->nullt)
        row4 = _mm_set_epi32(0xEC4E6C89, 0x082EFA98, 0x299F31D0, 0xA4093822);
    else
        row4 = _mm_set_epi32(0xEC4E6C89 ^ state->t[1], 0x082EFA98 ^ state->t[1],
                             0x299F31D0 ^ state->t[0], 0xA4093822 ^ state->t[0]);

#define round(r) \
    /* column step */ \
    buf1 = _mm_set_epi32(m.u32[sig[r][ 6]], m.u32[sig[r][ 4]], \
                         m.u32[sig[r][ 2]], m.u32[sig[r][ 0]]); \
    buf2 = _mm_set_epi32(z[sig[r][ 7]], z[sig[r][ 5]], \
                         z[sig[r][ 3]], z[sig[r][ 1]]); \
    buf1 = _mm_xor_si128(buf1, buf2); \
    row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); \
    buf1 = _mm_set_epi32(z[sig[r][ 6]], z[sig[r][ 4]], \
                         z[sig[r][ 2]], z[sig[r][ 0]]); \
    buf2 = _mm_set_epi32(m.u32[sig[r][ 7]], m.u32[sig[r][ 5]], \
                         m.u32[sig[r][ 3]], m.u32[sig[r][ 1]]); \
    row4 = _mm_xor_si128(row4, row1); \
    row4 = _mm_xor_si128(_mm_srli_epi32(row4, 16), _mm_slli_epi32(row4, 16)); \
    row3 = _mm_add_epi32(row3, row4); \
    row2 = _mm_xor_si128(row2, row3); \
    buf1 = _mm_xor_si128(buf1, buf2); \
    row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12), _mm_slli_epi32(row2, 20)); \
    row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); \
    row4 = _mm_xor_si128(row4, row1); \
    row4 = _mm_xor_si128(_mm_srli_epi32(row4, 8), _mm_slli_epi32(row4, 24)); \
    row3 = _mm_add_epi32(row3, row4); \
    row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(2,1,0,3)); \
    row2 = _mm_xor_si128(row2, row3); \
    row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7), _mm_slli_epi32(row2, 25)); \
    \
    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(0,3,2,1)); \
    \
    /* diagonal step */ \
    buf1 = _mm_set_epi32(m.u32[sig[r][14]], m.u32[sig[r][12]], \
                         m.u32[sig[r][10]], m.u32[sig[r][ 8]]); \
    buf2 = _mm_set_epi32(z[sig[r][15]], z[sig[r][13]], \
                         z[sig[r][11]], z[sig[r][ 9]]); \
    buf1 = _mm_xor_si128(buf1, buf2); \
    row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); \
    buf1 = _mm_set_epi32(z[sig[r][14]], z[sig[r][12]], \
                         z[sig[r][10]], z[sig[r][ 8]]); \
    buf2 = _mm_set_epi32(m.u32[sig[r][15]], m.u32[sig[r][13]], \
                         m.u32[sig[r][11]], m.u32[sig[r][ 9]]); \
    row4 = _mm_xor_si128(row4, row1); \
    buf1 = _mm_xor_si128(buf1, buf2); \
    row4 = _mm_xor_si128(_mm_srli_epi32(row4, 16), _mm_slli_epi32(row4, 16)); \
    row3 = _mm_add_epi32(row3, row4); \
    row2 = _mm_xor_si128(row2, row3); \
    row2 = _mm_xor_si128(_mm_srli_epi32(row2, 12), _mm_slli_epi32(row2, 20)); \
    row1 = _mm_add_epi32(_mm_add_epi32(row1, buf1), row2); \
    row4 = _mm_xor_si128(row4, row1); \
    row4 = _mm_xor_si128(_mm_srli_epi32(row4, 8), _mm_slli_epi32(row4, 24)); \
    row3 = _mm_add_epi32(row3, row4); \
    row4 = _mm_shuffle_epi32(row4, _MM_SHUFFLE(0,3,2,1)); \
    row2 = _mm_xor_si128(row2, row3); \
    row2 = _mm_xor_si128(_mm_srli_epi32(row2, 7), _mm_slli_epi32(row2, 25)); \
    \
    row3 = _mm_shuffle_epi32(row3, _MM_SHUFFLE(1,0,3,2)); \
    row2 = _mm_shuffle_epi32(row2, _MM_SHUFFLE(2,1,0,3));

    round( 0); round( 1); round( 2); round( 3); round( 4);
    round( 5); round( 6); round( 7); round( 8); round( 9);
    round(10); round(11); round(12); round(13);
#undef round

    _mm_store_si128((__m128i *)m.u32, _mm_xor_si128(row1, row3));
    state->h[0] ^= m.u32[0];
    state->h[1] ^= m.u32[1];
    state->h[2] ^= m.u32[2];
    state->h[3] ^= m.u32[3];
    _mm_store_si128((__m128i *)m.u32, _mm_xor_si128(row2, row4));
    state->h[4] ^= m.u32[0];
    state->h[5] ^= m.u32[1];
    state->h[6] ^= m.u32[2];
    state->h[7] ^= m.u32[3];
}
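The 14 round calls above reuse BLAKE-256's ten sigma permutations cyclically, which is why rows 10..13 of sig[][] duplicate rows 0..3. An equivalent ten-row table could be indexed as below (sigma_row is illustrative, not from the source):

/* given a ten-row sigma table, round r uses row r mod 10 */
static inline const int *sigma_row(const int sig10[][16], int r)
{
    return sig10[r % 10];
}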
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k)
{
    ALIGN(64) unsigned char lastblock[40];
    __m128i A, B, C, D;
    uint64_t N;
    const __m128i K = LOADU(k + 0);

    /* memcpy avoids the unaligned, strict-aliasing-violating
       *(const uint64_t *)npub read, as the NEON variant below does */
    memcpy(&N, npub, sizeof N);

    *clen = mlen + NORX_A/8;

    /* Initialization */
    INITIALIZE(A, B, C, D, N, K);

    /* Process header, if exists */
    if (adlen > 0) {
        while (adlen >= 40) {
            ABSORB_BLOCK(A, B, C, D, ad);
            ad += 40;
            adlen -= 40;
        }
        PAD(lastblock, sizeof lastblock, ad, adlen);
        /* assumed continuation: the padded final header block is absorbed */
        ABSORB_BLOCK(A, B, C, D, lastblock);
    }
    /* ... (payload encryption, trailer processing, finalization and the
       tag store are not shown in the source) ... */
/* Blake2b compression function modified to do only one single round */
static inline void blake2round(blake2b_state* S, const uint8_t block[BLAKE2B_BLOCKBYTES], unsigned ridx)
{
    __m128i row1l, row1h;
    __m128i row2l, row2h;
    __m128i row3l, row3h;
    __m128i row4l, row4h;
    __m128i b0, b1;
    __m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
    const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
    const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
    const __m128i m0 = LOADU( block +   0 );
    const __m128i m1 = LOADU( block +  16 );
    const __m128i m2 = LOADU( block +  32 );
    const __m128i m3 = LOADU( block +  48 );
    const __m128i m4 = LOADU( block +  64 );
    const __m128i m5 = LOADU( block +  80 );
    const __m128i m6 = LOADU( block +  96 );
    const __m128i m7 = LOADU( block + 112 );
#else
    const uint64_t m0  = ((uint64_t *)block)[ 0];
    const uint64_t m1  = ((uint64_t *)block)[ 1];
    const uint64_t m2  = ((uint64_t *)block)[ 2];
    const uint64_t m3  = ((uint64_t *)block)[ 3];
    const uint64_t m4  = ((uint64_t *)block)[ 4];
    const uint64_t m5  = ((uint64_t *)block)[ 5];
    const uint64_t m6  = ((uint64_t *)block)[ 6];
    const uint64_t m7  = ((uint64_t *)block)[ 7];
    const uint64_t m8  = ((uint64_t *)block)[ 8];
    const uint64_t m9  = ((uint64_t *)block)[ 9];
    const uint64_t m10 = ((uint64_t *)block)[10];
    const uint64_t m11 = ((uint64_t *)block)[11];
    const uint64_t m12 = ((uint64_t *)block)[12];
    const uint64_t m13 = ((uint64_t *)block)[13];
    const uint64_t m14 = ((uint64_t *)block)[14];
    const uint64_t m15 = ((uint64_t *)block)[15];
#endif
    row1l = LOAD( &S->h[0] );
    row1h = LOAD( &S->h[2] );
    row2l = LOAD( &S->h[4] );
    row2h = LOAD( &S->h[6] );
    row3l = LOAD( &blake2b_IV[0] );
    row3h = LOAD( &blake2b_IV[2] );
    row4l = _mm_xor_si128( LOAD( &blake2b_IV[4] ), LOAD( &S->t[0] ) );
    row4h = _mm_xor_si128( LOAD( &blake2b_IV[6] ), LOAD( &S->f[0] ) );

    /* dispatch on the round index, since ROUND(r) needs a literal argument */
    switch (ridx) {
    case  0: ROUND(  0 ); break;
    case  1: ROUND(  1 ); break;
    case  2: ROUND(  2 ); break;
    case  3: ROUND(  3 ); break;
    case  4: ROUND(  4 ); break;
    case  5: ROUND(  5 ); break;
    case  6: ROUND(  6 ); break;
    case  7: ROUND(  7 ); break;
    case  8: ROUND(  8 ); break;
    case  9: ROUND(  9 ); break;
    case 10: ROUND( 10 ); break;
    case 11: ROUND( 11 ); break;
    }

    row1l = _mm_xor_si128( row3l, row1l );
    row1h = _mm_xor_si128( row3h, row1h );
    STORE( &S->h[0], _mm_xor_si128( LOAD( &S->h[0] ), row1l ) );
    STORE( &S->h[2], _mm_xor_si128( LOAD( &S->h[2] ), row1h ) );
    row2l = _mm_xor_si128( row4l, row2l );
    row2h = _mm_xor_si128( row4h, row2h );
    STORE( &S->h[4], _mm_xor_si128( LOAD( &S->h[4] ), row2l ) );
    STORE( &S->h[6], _mm_xor_si128( LOAD( &S->h[6] ), row2h ) );
}
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
    __m128i row1, row2, row3, row4;
    __m128i buf1, buf2, buf3, buf4;
    __m128i ff0, ff1;
    __m128i t0, t1, t2;
    const __m128i r8  = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
    const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
    const __m128i m0 = LOADU( block +  0 );
    const __m128i m1 = LOADU( block + 16 );
    const __m128i m2 = LOADU( block + 32 );
    const __m128i m3 = LOADU( block + 48 );

    row1 = ff0 = LOAD( &S->h[0] );
    row2 = ff1 = LOAD( &S->h[4] );
    row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
    row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ),
                          LOAD( &S->t[0] ) );

#undef ROUND
#if defined(DEBUG)
    printf128("m0", m0);
    printf128("m1", m1);
    printf128("m2", m2);
    printf128("m3", m3);
#define ROUND(r) \
    printf("R%u\n", r); \
    printf128("row1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4); \
    LOAD_MSG_ ##r ##_1(buf1); \
    printf128("MSG_1", buf1); \
    G1(row1, row2, row3, row4, buf1); \
    printf128("G1\nrow1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4); \
    LOAD_MSG_ ##r ##_2(buf2); \
    printf128("MSG_2", buf2); \
    G2(row1, row2, row3, row4, buf2); \
    printf128("G2\nrow1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4); \
    DIAGONALIZE(row1, row2, row3, row4); \
    printf128("DIAG\nrow1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4); \
    LOAD_MSG_ ##r ##_3(buf3); \
    printf128("MSG_3", buf3); \
    G1(row1, row2, row3, row4, buf3); \
    printf128("G1\nrow1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4); \
    LOAD_MSG_ ##r ##_4(buf4); \
    printf128("MSG_4", buf4); \
    G2(row1, row2, row3, row4, buf4); \
    printf128("G2\nrow1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4); \
    UNDIAGONALIZE(row1, row2, row3, row4); \
    printf128("UNDIAG\nrow1", row1); printf128("row2", row2); printf128("row3", row3); printf128("row4", row4);
#else
#define ROUND(r) \
    LOAD_MSG_ ##r ##_1(buf1); \
    G1(row1, row2, row3, row4, buf1); \
    LOAD_MSG_ ##r ##_2(buf2); \
    G2(row1, row2, row3, row4, buf2); \
    DIAGONALIZE(row1, row2, row3, row4); \
    LOAD_MSG_ ##r ##_3(buf3); \
    G1(row1, row2, row3, row4, buf3); \
    LOAD_MSG_ ##r ##_4(buf4); \
    G2(row1, row2, row3, row4, buf4); \
    UNDIAGONALIZE(row1, row2, row3, row4);
#endif

    ROUND( 0 );
    ROUND( 1 );
    ROUND( 2 );
    ROUND( 3 );
    ROUND( 4 );
    ROUND( 5 );
    ROUND( 6 );
    ROUND( 7 );
    ROUND( 8 );
    ROUND( 9 );

    STORE( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
    STORE( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
    return 0;
}
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen,
    const unsigned char *ad, unsigned long long adlen,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k)
{
    ALIGN(64) unsigned char lastblock[80];
    __m256i A, B, C, D;
    const __m128i N = LOADU128(npub);
    const __m256i K = LOADU(k + 0);

    *clen = mlen + NORX_A/8;

    /* Initialization */
    INITIALIZE(A, B, C, D, N, K);

    /* Process header, if exists */
    if (adlen > 0) {
        while (adlen >= 80) {
            ABSORB_BLOCK(A, B, C, D, ad);
            ad += 80;
            adlen -= 80;
        }
        /* assumed continuation, mirroring the SSE variant above */
        PAD(lastblock, sizeof lastblock, ad, adlen);
        ABSORB_BLOCK(A, B, C, D, lastblock);
    }
    /* ... (payload encryption, trailer processing, finalization and the
       tag store are not shown in the source) ... */
int crypto_aead_encrypt(
    unsigned char *c, unsigned long long *clen,
    const unsigned char *m, unsigned long long mlen_,
    const unsigned char *ad, unsigned long long adlen_,
    const unsigned char *nsec,
    const unsigned char *npub,
    const unsigned char *k)
{
    ALIGN(32) unsigned char lastblock[40];
    uint32x4_t A, B, C, D;
    uint64_t N; /* = *(const uint64_t *)npub; replaced by the memcpy below */
    const uint32x4_t K = LOADU(k + 0);
    size_t mlen = mlen_;
    size_t adlen = adlen_;

    memcpy(&N, npub, sizeof N);

    *clen = mlen + NORX_A/8;

    /* Initialization */
    INITIALIZE(A, B, C, D, N, K);

    /* Process header, if exists */
    if (adlen > 0) {
        while (adlen >= 40) {
            ABSORB_BLOCK(A, B, C, D, ad);
            /* assumed continuation, mirroring the SSE variant above */
            ad += 40;
            adlen -= 40;
        }
        PAD(lastblock, sizeof lastblock, ad, adlen);
        ABSORB_BLOCK(A, B, C, D, lastblock);
    }
    /* ... (payload encryption, trailer processing, finalization and the
       tag store are not shown in the source) ... */