Example #1 (0)
File: serpent.c — Project: esptl/LibOtr
/* Bulk authentication of complete blocks in OCB mode.  Consumes as many
 * whole blocks as the fastest available SIMD implementation (AVX2 in
 * 16-block chunks, SSE2/NEON in 8-block chunks) can handle and returns
 * the number of blocks left over, which the caller processes with the
 * generic per-block code.  */
size_t
_gcry_serpent_ocb_auth (gcry_cipher_hd_t c, const void *abuf_arg,
			size_t nblocks)
{
#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
  serpent_context_t *ctx = (void *)&c->context.c;
  const unsigned char *abuf = abuf_arg;
  /* Scratch block for ocb_get_l when it must derive a fresh L value.  */
  unsigned char l_tmp[sizeof(serpent_block_t)];
  int burn_stack_depth = 2 * sizeof(serpent_block_t);
  /* Absolute AAD block counter; OCB offsets depend on the global block
   * index, so it is carried across calls via the cipher handle.  */
  u64 blkn = c->u_mode.ocb.aad_nblocks;
#else
  (void)c;
  (void)abuf_arg;
#endif

#ifdef USE_AVX2
  if (ctx->use_avx2)
    {
      int did_use_avx2 = 0;
      u64 Ls[16];
      /* Rotation so that table slot (blkn % 16) corresponds to the first
       * block of the next 16-block chunk.  */
      unsigned int n = 16 - (blkn % 16);
      u64 *l;
      int i;

      if (nblocks >= 16)
	{
	  for (i = 0; i < 16; i += 8)
	    {
	      /* Use u64 to store pointers for x32 support (assembly function
	       * assumes 64-bit pointers).  The L[0],L[1],L[0],L[2],...
	       * pattern follows the number of trailing zeros (ntz) of the
	       * block index within the chunk, per the OCB offset rule.  */
	      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
	      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	    }

	  /* Chunk offset 7 has ntz == 3, hence L[3].  */
	  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
	  /* Slot for the chunk's final block; refreshed each iteration
	   * below since its L value depends on the absolute index.  */
	  l = &Ls[(15 + n) % 16];

	  /* Process data in 16 block chunks. */
	  while (nblocks >= 16)
	    {
	      /* l_tmp will be used only every 65536-th block. */
	      blkn += 16;
	      *l = (uintptr_t)(void *)ocb_get_l(c, l_tmp, blkn - blkn % 16);

	      _gcry_serpent_avx2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
					  c->u_mode.ocb.aad_sum, Ls);

	      nblocks -= 16;
	      abuf += 16 * sizeof(serpent_block_t);
	      did_use_avx2 = 1;
	    }
	}

      if (did_use_avx2)
	{
	  /* serpent-avx2 assembly code does not use stack */
	  if (nblocks == 0)
	    burn_stack_depth = 0;
	}

      /* Use generic code to handle smaller chunks... */
    }
#endif

#ifdef USE_SSE2
  {
    int did_use_sse2 = 0;
    u64 Ls[8];
    /* Rotation so that table slot (blkn % 8) is the chunk's first block.  */
    unsigned int n = 8 - (blkn % 8);
    u64 *l;

    if (nblocks >= 8)
      {
	/* Use u64 to store pointers for x32 support (assembly function
	* assumes 64-bit pointers).  Pattern follows ntz of the in-chunk
	* block index (see AVX2 path above).  */
	Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
	Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	l = &Ls[(7 + n) % 8];

	/* Process data in 8 block chunks. */
	while (nblocks >= 8)
	  {
	    /* l_tmp will be used only every 65536-th block. */
	    blkn += 8;
	    *l = (uintptr_t)(void *)ocb_get_l(c, l_tmp, blkn - blkn % 8);

	    _gcry_serpent_sse2_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
					c->u_mode.ocb.aad_sum, Ls);

	    nblocks -= 8;
	    abuf += 8 * sizeof(serpent_block_t);
	    did_use_sse2 = 1;
	  }
      }

    if (did_use_sse2)
      {
	/* serpent-sse2 assembly code does not use stack */
	if (nblocks == 0)
	  burn_stack_depth = 0;
      }

    /* Use generic code to handle smaller chunks... */
  }
#endif

#ifdef USE_NEON
  if (ctx->use_neon)
    {
      int did_use_neon = 0;
      /* NEON assembly takes native pointers, so no u64 boxing needed.  */
      const void *Ls[8];
      unsigned int n = 8 - (blkn % 8);
      const void **l;

      if (nblocks >= 8)
	{
	  /* Same ntz-based L pattern as the x86 paths above.  */
	  Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
	  Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
	  Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
	  Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
	  Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
	  Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
	  Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
	  l = &Ls[(7 + n) % 8];

	  /* Process data in 8 block chunks. */
	  while (nblocks >= 8)
	    {
	      /* l_tmp will be used only every 65536-th block. */
	      blkn += 8;
	      *l = ocb_get_l(c, l_tmp, blkn - blkn % 8);

	      _gcry_serpent_neon_ocb_auth(ctx, abuf, c->u_mode.ocb.aad_offset,
					  c->u_mode.ocb.aad_sum, Ls);

	      nblocks -= 8;
	      abuf += 8 * sizeof(serpent_block_t);
	      did_use_neon = 1;
	    }
	}

      if (did_use_neon)
	{
	  /* serpent-neon assembly code does not use stack */
	  if (nblocks == 0)
	    burn_stack_depth = 0;
	}

      /* Use generic code to handle smaller chunks... */
    }
#endif

#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
  /* Persist the advanced block counter for the next call.  */
  c->u_mode.ocb.aad_nblocks = blkn;

  /* l_tmp may hold derived key material; scrub it.  */
  wipememory(&l_tmp, sizeof(l_tmp));

  if (burn_stack_depth)
    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
#endif

  /* Remaining blocks for the caller's generic fallback.  */
  return nblocks;
}
Example #2 (0)
/* Bulk encryption/decryption of complete blocks in OCB mode.  Consumes
 * as many whole blocks as the fastest available SIMD implementation
 * (AVX2 in 16-block chunks, SSE2/NEON in 8-block chunks) can handle and
 * returns the number of blocks left over for the caller's generic
 * per-block code.  `encrypt' selects between the *_ocb_enc and
 * *_ocb_dec assembly entry points.  */
size_t
_gcry_serpent_ocb_crypt (gcry_cipher_hd_t c, void *outbuf_arg,
			const void *inbuf_arg, size_t nblocks, int encrypt)
{
#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
  serpent_context_t *ctx = (void *)&c->context.c;
  unsigned char *outbuf = outbuf_arg;
  const unsigned char *inbuf = inbuf_arg;
  int burn_stack_depth = 2 * sizeof (serpent_block_t);
  /* Absolute data block counter; OCB offsets depend on the global block
   * index, so it is carried across calls via the cipher handle.  */
  u64 blkn = c->u_mode.ocb.data_nblocks;
#else
  (void)c;
  (void)outbuf_arg;
  (void)inbuf_arg;
  (void)encrypt;
#endif

#ifdef USE_AVX2
  if (ctx->use_avx2)
    {
      int did_use_avx2 = 0;
      u64 Ls[16];
      /* Rotation so that table slot (blkn % 16) corresponds to the first
       * block of the next 16-block chunk.  */
      unsigned int n = 16 - (blkn % 16);
      u64 *l;
      int i;

      if (nblocks >= 16)
	{
	  for (i = 0; i < 16; i += 8)
	    {
	      /* Use u64 to store pointers for x32 support (assembly function
	       * assumes 64-bit pointers).  The L[0],L[1],L[0],L[2],...
	       * pattern follows the number of trailing zeros (ntz) of the
	       * block index within the chunk, per the OCB offset rule.  */
	      Ls[(i + 0 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	      Ls[(i + 1 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	      Ls[(i + 2 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	      Ls[(i + 3 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
	      Ls[(i + 4 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	      Ls[(i + 5 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	      Ls[(i + 6 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	    }

	  /* Chunk offset 7 has ntz == 3, hence L[3].  */
	  Ls[(7 + n) % 16] = (uintptr_t)(void *)c->u_mode.ocb.L[3];
	  /* Slot for the chunk's final block; refreshed each iteration
	   * below since its L value depends on the absolute index.  */
	  l = &Ls[(15 + n) % 16];

	  /* Process data in 16 block chunks. */
	  while (nblocks >= 16)
	    {
	      blkn += 16;
	      *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 16);

	      if (encrypt)
		_gcry_serpent_avx2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
					  c->u_ctr.ctr, Ls);
	      else
		_gcry_serpent_avx2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
					  c->u_ctr.ctr, Ls);

	      nblocks -= 16;
	      outbuf += 16 * sizeof(serpent_block_t);
	      inbuf  += 16 * sizeof(serpent_block_t);
	      did_use_avx2 = 1;
	    }
	}

      if (did_use_avx2)
	{
	  /* serpent-avx2 assembly code does not use stack */
	  if (nblocks == 0)
	    burn_stack_depth = 0;
	}

      /* Use generic code to handle smaller chunks... */
    }
#endif

#ifdef USE_SSE2
  {
    int did_use_sse2 = 0;
    u64 Ls[8];
    /* Rotation so that table slot (blkn % 8) is the chunk's first block.  */
    unsigned int n = 8 - (blkn % 8);
    u64 *l;

    if (nblocks >= 8)
      {
	/* Use u64 to store pointers for x32 support (assembly function
	  * assumes 64-bit pointers).  Pattern follows ntz of the in-chunk
	  * block index (see AVX2 path above).  */
	Ls[(0 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(1 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	Ls[(2 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(3 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[2];
	Ls[(4 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	Ls[(5 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[1];
	Ls[(6 + n) % 8] = (uintptr_t)(void *)c->u_mode.ocb.L[0];
	l = &Ls[(7 + n) % 8];

	/* Process data in 8 block chunks. */
	while (nblocks >= 8)
	  {
	    blkn += 8;
	    *l = (uintptr_t)(void *)ocb_get_l(c, blkn - blkn % 8);

	    if (encrypt)
	      _gcry_serpent_sse2_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
					  c->u_ctr.ctr, Ls);
	    else
	      _gcry_serpent_sse2_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
					  c->u_ctr.ctr, Ls);

	    nblocks -= 8;
	    outbuf += 8 * sizeof(serpent_block_t);
	    inbuf  += 8 * sizeof(serpent_block_t);
	    did_use_sse2 = 1;
	  }
      }

    if (did_use_sse2)
      {
	/* serpent-sse2 assembly code does not use stack */
	if (nblocks == 0)
	  burn_stack_depth = 0;
      }

    /* Use generic code to handle smaller chunks... */
  }
#endif

#ifdef USE_NEON
  if (ctx->use_neon)
    {
      int did_use_neon = 0;
      /* NEON assembly takes native pointers, so no u64 boxing needed.  */
      const void *Ls[8];
      unsigned int n = 8 - (blkn % 8);
      const void **l;

      if (nblocks >= 8)
	{
	  /* Same ntz-based L pattern as the x86 paths above.  */
	  Ls[(0 + n) % 8] = c->u_mode.ocb.L[0];
	  Ls[(1 + n) % 8] = c->u_mode.ocb.L[1];
	  Ls[(2 + n) % 8] = c->u_mode.ocb.L[0];
	  Ls[(3 + n) % 8] = c->u_mode.ocb.L[2];
	  Ls[(4 + n) % 8] = c->u_mode.ocb.L[0];
	  Ls[(5 + n) % 8] = c->u_mode.ocb.L[1];
	  Ls[(6 + n) % 8] = c->u_mode.ocb.L[0];
	  l = &Ls[(7 + n) % 8];

	  /* Process data in 8 block chunks. */
	  while (nblocks >= 8)
	    {
	      blkn += 8;
	      *l = ocb_get_l(c,  blkn - blkn % 8);

	      if (encrypt)
		_gcry_serpent_neon_ocb_enc(ctx, outbuf, inbuf, c->u_iv.iv,
					  c->u_ctr.ctr, Ls);
	      else
		_gcry_serpent_neon_ocb_dec(ctx, outbuf, inbuf, c->u_iv.iv,
					  c->u_ctr.ctr, Ls);

	      nblocks -= 8;
	      outbuf += 8 * sizeof(serpent_block_t);
	      inbuf  += 8 * sizeof(serpent_block_t);
	      did_use_neon = 1;
	    }
	}

      if (did_use_neon)
	{
	  /* serpent-neon assembly code does not use stack */
	  if (nblocks == 0)
	    burn_stack_depth = 0;
	}

      /* Use generic code to handle smaller chunks... */
    }
#endif

#if defined(USE_AVX2) || defined(USE_SSE2) || defined(USE_NEON)
  /* Persist the advanced block counter for the next call.  */
  c->u_mode.ocb.data_nblocks = blkn;

  if (burn_stack_depth)
    _gcry_burn_stack (burn_stack_depth + 4 * sizeof(void *));
#endif

  /* Remaining blocks for the caller's generic fallback.  */
  return nblocks;
}