/*
 * Compress one 128-byte message block into the BLAKE2b state S.
 *
 * NOTE(review): only rounds 0-2 are applied here, whereas standard BLAKE2b
 * uses 12 rounds (cf. blake2b_compress_sse41 in this file). Confirm this
 * reduced-round variant is intentional before changing it.
 *
 * Always returns 0 (no failure mode).
 */
static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  __m128i b0, b1;
  __m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  /* PSHUFB masks: rotate each 64-bit lane right by 16 and 24 bits. */
  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
  const __m128i m0 = LOADU( block +  0 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
  const __m128i m4 = LOADU( block + 64 );
  const __m128i m5 = LOADU( block + 80 );
  const __m128i m6 = LOADU( block + 96 );
  const __m128i m7 = LOADU( block + 112 );
#else
  /* Stage through memcpy: casting the byte pointer to uint64_t * violates
     strict aliasing and may be misaligned (both UB). */
  uint64_t mw[16];
  memcpy( mw, block, sizeof mw );
  const uint64_t m0  = mw[ 0];
  const uint64_t m1  = mw[ 1];
  const uint64_t m2  = mw[ 2];
  const uint64_t m3  = mw[ 3];
  const uint64_t m4  = mw[ 4];
  const uint64_t m5  = mw[ 5];
  const uint64_t m6  = mw[ 6];
  const uint64_t m7  = mw[ 7];
  const uint64_t m8  = mw[ 8];
  const uint64_t m9  = mw[ 9];
  const uint64_t m10 = mw[10];
  const uint64_t m11 = mw[11];
  const uint64_t m12 = mw[12];
  const uint64_t m13 = mw[13];
  const uint64_t m14 = mw[14];
  const uint64_t m15 = mw[15];
#endif
  /* Rows 1-2: chaining value h; rows 3-4: IV, with row 4 xored with the
     byte counter t and finalization flags f. */
  row1l = LOADU( &S->h[0] );
  row1h = LOADU( &S->h[2] );
  row2l = LOADU( &S->h[4] );
  row2h = LOADU( &S->h[6] );
  row3l = LOADU( &blake2b_IV[0] );
  row3h = LOADU( &blake2b_IV[2] );
  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  /* Feed-forward: h ^= (row1,row2) ^ (row3,row4). */
  row1l = _mm_xor_si128( row3l, row1l );
  row1h = _mm_xor_si128( row3h, row1h );
  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
  row2l = _mm_xor_si128( row4l, row2l );
  row2h = _mm_xor_si128( row4h, row2h );
  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
  return 0;
}
/* Copy inlen bytes from in to out. Precondition: inlen <= 40, so at most
   two whole 16-byte chunks plus a tail of < 16 bytes. Whole chunks use
   unaligned vector loads/stores; the tail uses small fixed-size memcpy. */
static void block_copy(unsigned char *out, const unsigned char *in, const size_t inlen)
{
  size_t chunks = inlen >> 4; /* number of complete 16-byte chunks */
  while( chunks-- )
  {
    STOREU(out, LOADU(in));
    in  += 16;
    out += 16;
  }
  if( inlen & 8 ) { memcpy(out, in, 8); in += 8; out += 8; }
  if( inlen & 4 ) { memcpy(out, in, 4); in += 4; out += 4; }
  if( inlen & 2 ) { memcpy(out, in, 2); in += 2; out += 2; }
  if( inlen & 1 ) { memcpy(out, in, 1); }
}
/*
 * Compress one 64-byte message block into the BLAKE2s state S.
 * Applies the full 10 BLAKE2s rounds, then feeds the result forward
 * into the chaining value h. Always returns 0 (no failure mode).
 */
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
  __m128i row1, row2, row3, row4;
  __m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
  __m128i t0, t1;
#if !defined(HAVE_XOP)
  __m128i t2;
#endif
#endif
  __m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  /* PSHUFB masks: rotate each 32-bit lane right by 8 and 16 bits. */
  const __m128i r8  = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
  const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
  const __m128i m0 = LOADU( block +  0 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
#else
  /* Stage through memcpy: casting the byte pointer to uint32_t * violates
     strict aliasing and may be misaligned (both UB). */
  uint32_t mw[16];
  memcpy( mw, block, sizeof mw );
  const uint32_t m0  = mw[ 0];
  const uint32_t m1  = mw[ 1];
  const uint32_t m2  = mw[ 2];
  const uint32_t m3  = mw[ 3];
  const uint32_t m4  = mw[ 4];
  const uint32_t m5  = mw[ 5];
  const uint32_t m6  = mw[ 6];
  const uint32_t m7  = mw[ 7];
  const uint32_t m8  = mw[ 8];
  const uint32_t m9  = mw[ 9];
  const uint32_t m10 = mw[10];
  const uint32_t m11 = mw[11];
  const uint32_t m12 = mw[12];
  const uint32_t m13 = mw[13];
  const uint32_t m14 = mw[14];
  const uint32_t m15 = mw[15];
#endif
  /* Rows 1-2: chaining value h (kept in ff0/ff1 for the feed-forward);
     row 3: BLAKE2s IV[0..3]; row 4: IV[4..7] xored with the 16 bytes at
     S->t[0] (counter words t and, presumably, flags f adjacent in the
     state -- confirm layout in blake2s_state). */
  row1 = ff0 = LOADU( &S->h[0] );
  row2 = ff1 = LOADU( &S->h[4] );
  row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  /* Feed-forward: h = h ^ (row1 ^ row3, row2 ^ row4). */
  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
  return 0;
}
/* Compress one 128-byte message block into the BLAKE2b state S using the
   SSE4.1 code path. Runs all 12 standard BLAKE2b rounds, then feeds the
   working rows forward into the chaining value h. Always returns 0.
   (Row variable names are fixed: the ROUND macro references them.) */
int blake2b_compress_sse41(blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES])
{
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  __m128i b0, b1;
  __m128i t0, t1;

  /* PSHUFB masks: rotate each 64-bit lane right by 16 and 24 bits. */
  const __m128i r16 = _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
  const __m128i r24 = _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);

  /* Message words, two 64-bit words per vector. */
  const __m128i m0 = LOADU(block + 0);
  const __m128i m1 = LOADU(block + 16);
  const __m128i m2 = LOADU(block + 32);
  const __m128i m3 = LOADU(block + 48);
  const __m128i m4 = LOADU(block + 64);
  const __m128i m5 = LOADU(block + 80);
  const __m128i m6 = LOADU(block + 96);
  const __m128i m7 = LOADU(block + 112);

  /* Rows 1-2: chaining value h; rows 3-4: IV, with row 4 xored with the
     byte counter t and finalization flags f. */
  row1l = LOADU(&S->h[0]);
  row1h = LOADU(&S->h[2]);
  row2l = LOADU(&S->h[4]);
  row2h = LOADU(&S->h[6]);
  row3l = LOADU(&blake2b_IV[0]);
  row3h = LOADU(&blake2b_IV[2]);
  row4l = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
  row4h = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));

  ROUND(0);
  ROUND(1);
  ROUND(2);
  ROUND(3);
  ROUND(4);
  ROUND(5);
  ROUND(6);
  ROUND(7);
  ROUND(8);
  ROUND(9);
  ROUND(10);
  ROUND(11);

  /* Feed-forward: h ^= (row1,row2) ^ (row3,row4). */
  row1l = _mm_xor_si128(row3l, row1l);
  row1h = _mm_xor_si128(row3h, row1h);
  STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
  STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
  row2l = _mm_xor_si128(row4l, row2l);
  row2h = _mm_xor_si128(row4h, row2h);
  STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
  STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
  return 0;
}
/* NORX AEAD encryption (NEON state). Writes ciphertext followed by the
   authentication tag to c, and mlen + BYTES(NORX_T) to *clen.
   a/alen: header (associated) data; z/zlen: trailer data. */
void norx_aead_encrypt(
  unsigned char *c, size_t *clen,
  const unsigned char *a, size_t alen,
  const unsigned char *m, size_t mlen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key )
{
  uint64x2_t state[8];

  *clen = mlen + BYTES(NORX_T);

  INITIALISE(state, nonce, key);
  ABSORB_DATA(state, a, alen, HEADER_TAG);
  ENCRYPT_DATA(state, c, m, mlen);
  ABSORB_DATA(state, z, zlen, TRAILER_TAG);
  FINALISE(state);

  /* Append the tag (first two state vectors) after the ciphertext. */
  STOREU(c + mlen, state[0]);
  STOREU(c + mlen + BYTES(NORX_T)/2, state[1]);
}
void norx_aead_encrypt( unsigned char *c, size_t *clen, const unsigned char *a, size_t alen, const unsigned char *m, size_t mlen, const unsigned char *z, size_t zlen, const unsigned char *nonce, const unsigned char *key ) { const __m128i K0 = LOADU(key + 0); const __m128i K1 = LOADU(key + 16); __m128i S[8]; *clen = mlen + BYTES(NORX_T); INITIALISE(S, nonce, K0, K1); ABSORB_DATA(S, a, alen, HEADER_TAG); ENCRYPT_DATA(S, c, m, mlen); ABSORB_DATA(S, z, zlen, TRAILER_TAG); FINALISE(S, K0, K1); STOREU(c + mlen, S[6]); STOREU(c + mlen + BYTES(NORX_T)/2, S[7]); }
/* NORX AEAD encryption (AVX2 state: four 256-bit registers). Writes
   ciphertext followed by the authentication tag to c, and
   mlen + BYTES(NORX_T) to *clen. a/alen: header data; z/zlen: trailer. */
void norx_aead_encrypt(
  unsigned char *c, size_t *clen,
  const unsigned char *a, size_t alen,
  const unsigned char *m, size_t mlen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key )
{
  __m256i s0, s1, s2, s3;

  *clen = mlen + BYTES(NORX_T);

  INITIALISE(s0, s1, s2, s3, nonce, key);
  ABSORB_DATA(s0, s1, s2, s3, a, alen, HEADER_TAG);
  ENCRYPT_DATA(s0, s1, s2, s3, c, m, mlen);
  ABSORB_DATA(s0, s1, s2, s3, z, zlen, TRAILER_TAG);
  FINALISE(s0, s1, s2, s3);

  /* Append the tag (first state register) after the ciphertext. */
  STOREU(c + mlen, s0);
}
/* Debug helper: print an __m128i as one 128-bit hex number (high lane
   first). "%08x" pads every 32-bit lane to 8 digits; the original "%x"
   dropped leading zeros, making e.g. 0x00000012 indistinguishable from
   0x12000000 concatenated with its neighbours. */
static inline void printf128(const char *name, __m128i v)
{
  uint32_t words[4]; /* exactly 16 bytes: one vector */
  STOREU(words, v);
  printf( "%s: %08x%08x%08x%08x\n", name, words[3], words[2], words[1], words[0] );
}