예제 #1
0
static inline int blake2b_compress( blake2b_state *S, const uint8_t block[BLAKE2B_BLOCKBYTES] )
{
  __m128i row1l, row1h;
  __m128i row2l, row2h;
  __m128i row3l, row3h;
  __m128i row4l, row4h;
  __m128i b0, b1;
  __m128i t0, t1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  const __m128i r16 = _mm_setr_epi8( 2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9 );
  const __m128i r24 = _mm_setr_epi8( 3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10 );
#endif
#if defined(HAVE_SSE41)
  const __m128i m0 = LOADU( block + 00 );
  const __m128i m1 = LOADU( block + 16 );
  const __m128i m2 = LOADU( block + 32 );
  const __m128i m3 = LOADU( block + 48 );
  const __m128i m4 = LOADU( block + 64 );
  const __m128i m5 = LOADU( block + 80 );
  const __m128i m6 = LOADU( block + 96 );
  const __m128i m7 = LOADU( block + 112 );
#else
  const uint64_t  m0 = ( ( uint64_t * )block )[ 0];
  const uint64_t  m1 = ( ( uint64_t * )block )[ 1];
  const uint64_t  m2 = ( ( uint64_t * )block )[ 2];
  const uint64_t  m3 = ( ( uint64_t * )block )[ 3];
  const uint64_t  m4 = ( ( uint64_t * )block )[ 4];
  const uint64_t  m5 = ( ( uint64_t * )block )[ 5];
  const uint64_t  m6 = ( ( uint64_t * )block )[ 6];
  const uint64_t  m7 = ( ( uint64_t * )block )[ 7];
  const uint64_t  m8 = ( ( uint64_t * )block )[ 8];
  const uint64_t  m9 = ( ( uint64_t * )block )[ 9];
  const uint64_t m10 = ( ( uint64_t * )block )[10];
  const uint64_t m11 = ( ( uint64_t * )block )[11];
  const uint64_t m12 = ( ( uint64_t * )block )[12];
  const uint64_t m13 = ( ( uint64_t * )block )[13];
  const uint64_t m14 = ( ( uint64_t * )block )[14];
  const uint64_t m15 = ( ( uint64_t * )block )[15];
#endif
  row1l = LOADU( &S->h[0] );
  row1h = LOADU( &S->h[2] );
  row2l = LOADU( &S->h[4] );
  row2h = LOADU( &S->h[6] );
  row3l = LOADU( &blake2b_IV[0] );
  row3h = LOADU( &blake2b_IV[2] );
  row4l = _mm_xor_si128( LOADU( &blake2b_IV[4] ), LOADU( &S->t[0] ) );
  row4h = _mm_xor_si128( LOADU( &blake2b_IV[6] ), LOADU( &S->f[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  row1l = _mm_xor_si128( row3l, row1l );
  row1h = _mm_xor_si128( row3h, row1h );
  STOREU( &S->h[0], _mm_xor_si128( LOADU( &S->h[0] ), row1l ) );
  STOREU( &S->h[2], _mm_xor_si128( LOADU( &S->h[2] ), row1h ) );
  row2l = _mm_xor_si128( row4l, row2l );
  row2h = _mm_xor_si128( row4h, row2h );
  STOREU( &S->h[4], _mm_xor_si128( LOADU( &S->h[4] ), row2l ) );
  STOREU( &S->h[6], _mm_xor_si128( LOADU( &S->h[6] ), row2h ) );
  return 0;
}
예제 #2
0
/* inlen <= 40 */
static void block_copy(unsigned char *out, const unsigned char *in, const size_t inlen)
{
    if( inlen & 32 )
    {
        STOREU(out +  0, LOADU(in +  0));
        STOREU(out + 16, LOADU(in + 16));
        in += 32; out += 32;
    }
    if( inlen & 16 )
    {
        STOREU(out +  0, LOADU(in +  0));
        in += 16; out += 16;
    }
    if( inlen & 8 )
    {
        memcpy(out, in, 8);
        in += 8; out += 8;
    }
    if( inlen & 4 )
    {
        memcpy(out, in, 4);
        in += 4; out += 4;
    }
    if( inlen & 2 )
    {
        memcpy(out, in, 2);
        in += 2; out += 2;
    }
    if( inlen & 1 )
    {
        memcpy(out, in, 1);
        in += 1; out += 1;
    }
}
예제 #3
0
파일: blake2s.c 프로젝트: altoplano/PHC
static inline int blake2s_compress( blake2s_state *S, const uint8_t block[BLAKE2S_BLOCKBYTES] )
{
  __m128i row1, row2, row3, row4;
  __m128i buf1, buf2, buf3, buf4;
#if defined(HAVE_SSE41)
  __m128i t0, t1;
#if !defined(HAVE_XOP)
  __m128i t2;
#endif
#endif
  __m128i ff0, ff1;
#if defined(HAVE_SSSE3) && !defined(HAVE_XOP)
  const __m128i r8 = _mm_set_epi8( 12, 15, 14, 13, 8, 11, 10, 9, 4, 7, 6, 5, 0, 3, 2, 1 );
  const __m128i r16 = _mm_set_epi8( 13, 12, 15, 14, 9, 8, 11, 10, 5, 4, 7, 6, 1, 0, 3, 2 );
#endif
#if defined(HAVE_SSE41)
  const __m128i m0 = LOADU( block +  00 );
  const __m128i m1 = LOADU( block +  16 );
  const __m128i m2 = LOADU( block +  32 );
  const __m128i m3 = LOADU( block +  48 );
#else
  const uint32_t  m0 = ( ( uint32_t * )block )[ 0];
  const uint32_t  m1 = ( ( uint32_t * )block )[ 1];
  const uint32_t  m2 = ( ( uint32_t * )block )[ 2];
  const uint32_t  m3 = ( ( uint32_t * )block )[ 3];
  const uint32_t  m4 = ( ( uint32_t * )block )[ 4];
  const uint32_t  m5 = ( ( uint32_t * )block )[ 5];
  const uint32_t  m6 = ( ( uint32_t * )block )[ 6];
  const uint32_t  m7 = ( ( uint32_t * )block )[ 7];
  const uint32_t  m8 = ( ( uint32_t * )block )[ 8];
  const uint32_t  m9 = ( ( uint32_t * )block )[ 9];
  const uint32_t m10 = ( ( uint32_t * )block )[10];
  const uint32_t m11 = ( ( uint32_t * )block )[11];
  const uint32_t m12 = ( ( uint32_t * )block )[12];
  const uint32_t m13 = ( ( uint32_t * )block )[13];
  const uint32_t m14 = ( ( uint32_t * )block )[14];
  const uint32_t m15 = ( ( uint32_t * )block )[15];
#endif
  row1 = ff0 = LOADU( &S->h[0] );
  row2 = ff1 = LOADU( &S->h[4] );
  row3 = _mm_setr_epi32( 0x6A09E667, 0xBB67AE85, 0x3C6EF372, 0xA54FF53A );
  row4 = _mm_xor_si128( _mm_setr_epi32( 0x510E527F, 0x9B05688C, 0x1F83D9AB, 0x5BE0CD19 ), LOADU( &S->t[0] ) );
  ROUND( 0 );
  ROUND( 1 );
  ROUND( 2 );
  ROUND( 3 );
  ROUND( 4 );
  ROUND( 5 );
  ROUND( 6 );
  ROUND( 7 );
  ROUND( 8 );
  ROUND( 9 );
  STOREU( &S->h[0], _mm_xor_si128( ff0, _mm_xor_si128( row1, row3 ) ) );
  STOREU( &S->h[4], _mm_xor_si128( ff1, _mm_xor_si128( row2, row4 ) ) );
  return 0;
}
예제 #4
0
int
blake2b_compress_sse41(blake2b_state *S,
                       const uint8_t  block[BLAKE2B_BLOCKBYTES])
{
    __m128i       row1l, row1h;
    __m128i       row2l, row2h;
    __m128i       row3l, row3h;
    __m128i       row4l, row4h;
    __m128i       b0, b1;
    __m128i       t0, t1;
    const __m128i r16 =
        _mm_setr_epi8(2, 3, 4, 5, 6, 7, 0, 1, 10, 11, 12, 13, 14, 15, 8, 9);
    const __m128i r24 =
        _mm_setr_epi8(3, 4, 5, 6, 7, 0, 1, 2, 11, 12, 13, 14, 15, 8, 9, 10);
    const __m128i m0 = LOADU(block + 00);
    const __m128i m1 = LOADU(block + 16);
    const __m128i m2 = LOADU(block + 32);
    const __m128i m3 = LOADU(block + 48);
    const __m128i m4 = LOADU(block + 64);
    const __m128i m5 = LOADU(block + 80);
    const __m128i m6 = LOADU(block + 96);
    const __m128i m7 = LOADU(block + 112);
    row1l            = LOADU(&S->h[0]);
    row1h            = LOADU(&S->h[2]);
    row2l            = LOADU(&S->h[4]);
    row2h            = LOADU(&S->h[6]);
    row3l            = LOADU(&blake2b_IV[0]);
    row3h            = LOADU(&blake2b_IV[2]);
    row4l            = _mm_xor_si128(LOADU(&blake2b_IV[4]), LOADU(&S->t[0]));
    row4h            = _mm_xor_si128(LOADU(&blake2b_IV[6]), LOADU(&S->f[0]));
    ROUND(0);
    ROUND(1);
    ROUND(2);
    ROUND(3);
    ROUND(4);
    ROUND(5);
    ROUND(6);
    ROUND(7);
    ROUND(8);
    ROUND(9);
    ROUND(10);
    ROUND(11);
    row1l = _mm_xor_si128(row3l, row1l);
    row1h = _mm_xor_si128(row3h, row1h);
    STOREU(&S->h[0], _mm_xor_si128(LOADU(&S->h[0]), row1l));
    STOREU(&S->h[2], _mm_xor_si128(LOADU(&S->h[2]), row1h));
    row2l = _mm_xor_si128(row4l, row2l);
    row2h = _mm_xor_si128(row4h, row2h);
    STOREU(&S->h[4], _mm_xor_si128(LOADU(&S->h[4]), row2l));
    STOREU(&S->h[6], _mm_xor_si128(LOADU(&S->h[6]), row2h));
    return 0;
}
예제 #5
0
void norx_aead_encrypt(
  unsigned char *c, size_t *clen,
  const unsigned char *a, size_t alen,
  const unsigned char *m, size_t mlen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{

    uint64x2_t S[8];

    *clen = mlen + BYTES(NORX_T);
    INITIALISE(S, nonce, key);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    ENCRYPT_DATA(S, c, m, mlen);
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S);
    STOREU(c + mlen,                   S[0]);
    STOREU(c + mlen + BYTES(NORX_T)/2, S[1]);
}
예제 #6
0
파일: norx.c 프로젝트: 0x64616E69656C/norx
void norx_aead_encrypt(
  unsigned char *c, size_t *clen,
  const unsigned char *a, size_t alen,
  const unsigned char *m, size_t mlen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    const __m128i K0 = LOADU(key +  0);
    const __m128i K1 = LOADU(key + 16);
    __m128i S[8];

    *clen = mlen + BYTES(NORX_T);
    INITIALISE(S, nonce, K0, K1);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    ENCRYPT_DATA(S, c, m, mlen);
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K0, K1);
    STOREU(c + mlen,                   S[6]);
    STOREU(c + mlen + BYTES(NORX_T)/2, S[7]);
}
예제 #7
0
void norx_aead_encrypt(
    unsigned char *c, size_t *clen,
    const unsigned char *a, size_t alen,
    const unsigned char *m, size_t mlen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key
    )
{
    __m256i A, B, C, D;

    *clen = mlen + BYTES(NORX_T);
    INITIALISE(A, B, C, D, nonce, key);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    ENCRYPT_DATA(A, B, C, D, c, m, mlen);
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D);
    STOREU(c + mlen, A);
}
예제 #8
0
static inline void printf128(char* name, __m128i v){
    uint32_t v_a[16];
    STOREU(v_a, v);
    printf( "%s: %x%x%x%x\n", name,v_a[3],v_a[2],v_a[1],v_a[0] );
    
}