Ejemplo n.º 1
0
void Rijndael::blockDecryptSSE(const byte *input, size_t numBlocks, byte *outBuffer)
{
  __m128i initVector = _mm_loadu_si128((__m128i*)m_initVector);
  __m128i *src=(__m128i*)input;
  __m128i *dest=(__m128i*)outBuffer;
  __m128i *rkey=(__m128i*)m_expandedKey;
  while (numBlocks > 0)
  {
    __m128i rl = _mm_loadu_si128(rkey + m_uRounds);
    __m128i d = _mm_loadu_si128(src++);
    __m128i v = _mm_xor_si128(rl, d);

    for (int i=m_uRounds-1; i>0; i--)
    {
      __m128i ri = _mm_loadu_si128(rkey + i);
      v = _mm_aesdec_si128(v, ri);
    }

    __m128i r0 = _mm_loadu_si128(rkey);
    v = _mm_aesdeclast_si128(v, r0);

    if (CBCMode)
      v = _mm_xor_si128(v, initVector);
    initVector = d;
    _mm_storeu_si128(dest++,v);
    numBlocks--;
  }
  _mm_storeu_si128((__m128i*)m_initVector,initVector);
}
Ejemplo n.º 2
0
/*
 * Decrypt one 16-byte block from `in` into `out` using the decryption round
 * keys in self->dk.
 *
 * dk[0] is XORed into the block, dk[1..N] are applied with AESDEC and
 * dk[self->rounds] with AESDECLAST, where N is 9 when rounds == 10,
 * 13 when rounds == 14, and 11 for any other value.
 */
static void block_decrypt(block_state* self, const u8* in, u8* out)
{
    __m128i state = _mm_loadu_si128((const __m128i*) in);
    int middle_rounds = 9;
    int r;

    /* AES-192/256 need two, respectively four, extra middle rounds. */
    if (self->rounds != 10)
        middle_rounds = (self->rounds == 14) ? 13 : 11;

    state = _mm_xor_si128(state, self->dk[0]);
    for (r = 1; r <= middle_rounds; r++)
        state = _mm_aesdec_si128(state, self->dk[r]);
    state = _mm_aesdeclast_si128(state, self->dk[self->rounds]);

    _mm_storeu_si128((__m128i*) out, state);
}
Ejemplo n.º 3
0
    void Cryptor::cbcDecrypt(const string &ciphertext, const Key &key,
                             string *plaintext,
                             unsigned char *schedule) {
      plaintext->resize(ciphertext.size());

      int blocks = ciphertext.size() / 16;
      if (ciphertext.size() % 16) {
        blocks++;
      }

      __m128i tmp, tmp2, tmp3;
      __m128i *input = (__m128i*) ciphertext.data();
      __m128i *output = (__m128i*) plaintext->data();      
      __m128i *keySchedule = (__m128i*) schedule;
      int rounds = getRounds(key.size);

      // Load the IV.
      tmp2 = _mm_loadu_si128((__m128i*) key.iv);

      // Swap byte-order => big-endian.
      if (!bigEndian) {        
        reverse_m128i(tmp2); 
      }            
      
      for (int block = 0; block < blocks; block++) {
        // Get next 128-bit block.
        tmp = _mm_loadu_si128(&input[block]);

        // Swap byte-order => big-endian.
        if (!bigEndian) {                
          reverse_m128i(tmp); 
        }

        // Whitening step.
        tmp3 = _mm_xor_si128(tmp, keySchedule[0]);

        // Apply the AES rounds.
        int round = 1;
        for (; round < rounds; round++) {
          tmp3 = _mm_aesdec_si128(tmp3, keySchedule[round]);
        }

        // And the last.
        tmp3 = _mm_aesdeclast_si128(tmp3, keySchedule[round]);

        // XOR IV or last ciphertext with the ciphertext.
        tmp3 = _mm_xor_si128(tmp3, tmp2);

        // Swap byte-order => little-endian.
        if (!bigEndian) {                        
          reverse_m128i(tmp3);
        }
        
        // Save the decrypted block.
        _mm_storeu_si128(&output[block], tmp3);

        // Save the last ciphertext.
        tmp2 = tmp;                
      }            
    }
Ejemplo n.º 4
0
Archivo: aesni.c Proyecto: behemot/pm
/*
 * Decrypt one 16-byte block from `in` into `out`.
 *
 * Uses ctx->dec_keys[0] for the initial XOR, dec_keys[1..9] for the AESDEC
 * rounds and dec_keys[10] for AESDECLAST, i.e. always exactly 10 rounds.
 */
void aesni_decrypt(aesni_ctx *ctx, const byte *in, byte *out)
{
	__m128i state = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), ctx->dec_keys[0]);
	int round = 1;

	while (round < 10) {
		state = _mm_aesdec_si128(state, ctx->dec_keys[round]);
		round++;
	}
	state = _mm_aesdeclast_si128(state, ctx->dec_keys[10]);

	_mm_storeu_si128(((__m128i*)out), state);
}
Ejemplo n.º 5
0
/*
 * Decrypt `nblks` blocks of `blks` in place (ECB: each block independently).
 *
 * The work is done round by round across all blocks rather than block by
 * block: first every block is XORed with round key 0, then each middle round
 * key is applied to every block, and finally the last round key is applied
 * with AESDECLAST. The round count comes from ROUNDS(key) and the round keys
 * from key->rd_key.
 */
inline void AES_ecb_decrypt_blks(block *blks, unsigned nblks, AES_KEY *key) {
	const unsigned total_rounds = ROUNDS(key);
	const __m128i *rk = ((__m128i *) (key->rd_key));
	unsigned round = 1;
	unsigned b;

	b = 0;
	while (b < nblks) {
		blks[b] = _mm_xor_si128(blks[b], rk[0]);
		b++;
	}

	while (round < total_rounds) {
		for (b = 0; b < nblks; ++b)
			blks[b] = _mm_aesdec_si128(blks[b], rk[round]);
		++round;
	}

	/* `round` now indexes the final round key. */
	for (b = 0; b < nblks; ++b)
		blks[b] = _mm_aesdeclast_si128(blks[b], rk[round]);
}
Ejemplo n.º 6
0
/*
 * Decrypt one 16-byte block from `in` into `out`.
 *
 * in:  16 bytes of ciphertext; no particular alignment is required.
 * out: receives 16 bytes of plaintext; no particular alignment is required.
 * key: decryption key; ROUNDS(key) gives the round count and key->rd_key
 *      holds the round keys, of which entries 0..ROUNDS(key) are used.
 *
 * The block is read and written with the unaligned load/store intrinsics:
 * the parameters are plain byte pointers, and the aligned forms fault when
 * handed an address that is not a multiple of 16.
 */
inline void AES_decrypt(const unsigned char *in, unsigned char *out,
		const AES_KEY *key) {
	int j, rnds = ROUNDS(key);
	const __m128i *sched = ((__m128i *) (key->rd_key));
	__m128i tmp = _mm_loadu_si128((const __m128i *) in);
	tmp = _mm_xor_si128(tmp, sched[0]);
	for (j = 1; j < rnds; j++)
		tmp = _mm_aesdec_si128(tmp, sched[j]);
	/* j is left at the index of the final round key. */
	tmp = _mm_aesdeclast_si128(tmp, sched[j]);
	_mm_storeu_si128((__m128i *) out, tmp);
}
Ejemplo n.º 7
0
static __m128i AES_decrypt(__m128i in,  const __m128i* expkey)
{
	int j;

	__m128i tmp = byte_swap(in) ^ expkey[0];
	for (j=1; j <10; j++){
		tmp = _mm_aesdec_si128 (tmp,expkey[j]);
	}
	tmp = _mm_aesdeclast_si128 (tmp,expkey[10]);

	return byte_swap(tmp);
}
Ejemplo n.º 8
0
/*
 * Decrypt one block with 10 rounds and return the result.
 *
 * decryption_keys->keys[0] is XORed into the block, keys[1..9] are applied
 * with AESDEC and keys[10] with AESDECLAST.
 */
AES_AES_Block __fastcall aes_AES128_decrypt_block_(
    AES_AES_Block ciphertext,
    const AES_AES128_RoundKeys* decryption_keys)
{
    AES_AES_Block state = _mm_xor_si128(ciphertext, decryption_keys->keys[0]);
    int round;

    for (round = 1; round <= 9; ++round)
        state = _mm_aesdec_si128(state, decryption_keys->keys[round]);

    return _mm_aesdeclast_si128(state, decryption_keys->keys[10]);
}
Ejemplo n.º 9
0
/*
 * Decrypt one 16-byte block from `in` into `out`.
 *
 * dec_key[0] is XORed into the block, dec_key[1..9] are applied with AESDEC
 * and dec_key[10] with AESDECLAST, i.e. always exactly 10 rounds. Both
 * buffers are accessed with unaligned load/store.
 */
void AESNI_decrypt(const uint8_t *in, uint8_t *out, const AES_KEY dec_key)
{
    __m128i state = _mm_xor_si128(_mm_loadu_si128((__m128i*)in), dec_key[0]);
    int round;

    for (round = 1; round <= 9; round++)
        state = _mm_aesdec_si128(state, dec_key[round]);
    state = _mm_aesdeclast_si128(state, dec_key[10]);

    _mm_storeu_si128((__m128i*)out, state);
}
Ejemplo n.º 10
0
static void AESNI_CBC_decrypt(const unsigned char *in,unsigned char *out,unsigned char ivec[16],unsigned long length,unsigned char *key,int number_of_rounds)
{
    __m128i data,feedback,last_in;
    int i,j;
    if (length%16)
        length = length/16+1;
    else length /=16;
    feedback=_mm_loadu_si128 ((__m128i*)ivec);
    for(i=0; i < length; i++)
    {
        last_in=_mm_loadu_si128 (&((__m128i*)in)[i]);
        data = _mm_xor_si128 (last_in,((__m128i*)key)[0]);
        for(j=1; j <number_of_rounds; j++)
        {
            data = _mm_aesdec_si128 (data,((__m128i*)key)[j]);
	}
        data = _mm_aesdeclast_si128 (data,((__m128i*)key)[j]);
        data = _mm_xor_si128 (data,feedback);
        _mm_storeu_si128 (&((__m128i*)out)[i],data);
        feedback=last_in;
    }
}
Ejemplo n.º 11
0
/*
 * Encrypt or decrypt a single 16-byte block with the AES-NI instructions.
 *
 * ctx:    supplies the round keys (ctx->rk) and round count (ctx->nr).
 * mode:   AES_ENCRYPT selects encryption; any other value selects decryption.
 * input:  16 bytes to process.
 * output: receives the 16 processed bytes.
 *
 * Always returns 0.
 */
int aesni_xcryptecb( aes_context *ctx,
                     int mode,
                     const unsigned char input[16],
                     unsigned char output[16] )
{
    const __m128i *rk = (__m128i *) ctx->rk;
    const int nr = ctx->nr;
    __m128i state;
    int r = 1;

    /* This could be faster if more data was provided at once. */

    state = _mm_xor_si128( _mm_loadu_si128( (__m128i *) input ), rk[0] );

    if( mode == AES_ENCRYPT )
    {
        /* Middle rounds are taken two at a time. */
        while( r < nr - 1 )
        {
            state = _mm_aesenc_si128( state, rk[r] );
            state = _mm_aesenc_si128( state, rk[r + 1] );
            r += 2;
        }

        state = _mm_aesenc_si128( state, rk[nr - 1] );
        state = _mm_aesenclast_si128( state, rk[nr] );
    }
    else
    {
        /* Middle rounds are taken two at a time. */
        while( r < nr - 1 )
        {
            state = _mm_aesdec_si128( state, rk[r] );
            state = _mm_aesdec_si128( state, rk[r + 1] );
            r += 2;
        }

        state = _mm_aesdec_si128( state, rk[nr - 1] );
        state = _mm_aesdeclast_si128( state, rk[nr] );
    }

    _mm_storeu_si128( (__m128i *) output, state );

    return( 0 );
}
Ejemplo n.º 12
0
/*
 * Decrypt one block in place: the block is read from buff and the result is
 * written back to buff.
 *
 * a    - cipher state; a->rkey holds the round keys as words (four words are
 *        consumed per round) and a->Nr is the number of rounds.
 * buff - block to decrypt, overwritten with the result.
 *
 * Two implementations are selected at compile time:
 *   - AES_NI_SUPPORT defined: uses the AES-NI intrinsics on a 16-byte block.
 *   - otherwise: a table-driven implementation working on NB words
 *     (presumably NB == 4 - confirm against its definition).
 */
void aes_ecb_decrypt(aes *a,MR_BYTE *buff)
{
    /* NOTE: in the AES_NI_SUPPORT build j, p, q, x, y and t are unused. */
    int i,j,k;
    MR_WORD p[4],q[4],*x,*y,*t;

#ifdef AES_NI_SUPPORT
	/* Initial round: XOR the block with the first round key. */
	__m128i ky,m = _mm_loadu_si128((__m128i *) buff);
	ky = _mm_loadu_si128((__m128i *) &a->rkey[0]);
    m = _mm_xor_si128       (m, ky); 
	/* k is the word index of the next round key in a->rkey. */
	k=NB;
	for (i=1;i<a->Nr;i++)
	{
		ky=_mm_loadu_si128((__m128i *) &a->rkey[k]);
		m =_mm_aesdec_si128    (m, ky); 
		k+=4;
	}
	/* Final round uses AESDECLAST with the last round key. */
	ky=_mm_loadu_si128((__m128i *) &a->rkey[k]);
    m=_mm_aesdeclast_si128(m, ky);

    _mm_storeu_si128((__m128i *)buff, m);
#else

    /* Load the block as NB words and XOR in the first NB round-key words. */
    for (i=j=0;i<NB;i++,j+=4)
    {
        p[i]=pack((MR_BYTE *)&buff[j]);
        p[i]^=a->rkey[i];
    }

    /* k is the word index of the next round key in a->rkey. */
    k=NB;
    x=p; y=q;

/* State alternates between x and y */
    for (i=1;i<a->Nr;i++)
    { /* Nr is number of rounds. May be odd. */
#ifndef MR_SMALL_AES
        /* Four separate lookup tables, one per byte position. */
        y[0]=a->rkey[k]^rtable[MR_TOBYTE(x[0])]^
             rtable1[MR_TOBYTE(x[3]>>8)]^
             rtable2[MR_TOBYTE(x[2]>>16)]^
             rtable3[x[1]>>24];
        y[1]=a->rkey[k+1]^rtable[MR_TOBYTE(x[1])]^
             rtable1[MR_TOBYTE(x[0]>>8)]^
             rtable2[MR_TOBYTE(x[3]>>16)]^
             rtable3[x[2]>>24];
        y[2]=a->rkey[k+2]^rtable[MR_TOBYTE(x[2])]^
             rtable1[MR_TOBYTE(x[1]>>8)]^
             rtable2[MR_TOBYTE(x[0]>>16)]^
             rtable3[x[3]>>24];
        y[3]=a->rkey[k+3]^rtable[MR_TOBYTE(x[3])]^
             rtable1[MR_TOBYTE(x[2]>>8)]^
             rtable2[MR_TOBYTE(x[1]>>16)]^
             rtable3[x[0]>>24];
#else
        /* Small build: a single table, rotated for the other byte positions. */
        y[0]=a->rkey[k]^rtable[MR_TOBYTE(x[0])]^
             ROTL8(rtable[MR_TOBYTE(x[3]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[2]>>16)])^
             ROTL24(rtable[x[1]>>24]);
        y[1]=a->rkey[k+1]^rtable[MR_TOBYTE(x[1])]^
             ROTL8(rtable[MR_TOBYTE(x[0]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[3]>>16)])^
             ROTL24(rtable[x[2]>>24]);
        y[2]=a->rkey[k+2]^rtable[MR_TOBYTE(x[2])]^
             ROTL8(rtable[MR_TOBYTE(x[1]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[0]>>16)])^
             ROTL24(rtable[x[3]>>24]);
        y[3]=a->rkey[k+3]^rtable[MR_TOBYTE(x[3])]^
             ROTL8(rtable[MR_TOBYTE(x[2]>>8)])^
             ROTL16(rtable[MR_TOBYTE(x[1]>>16)])^
             ROTL24(rtable[x[0]>>24]);
#endif
        k+=4;
        t=x; x=y; y=t;      /* swap pointers */
    }

/* Last Round */ 
    /* Uses the byte table rbsub (presumably the inverse S-box - confirm)
       instead of the word tables used by the middle rounds. */
    y[0]=a->rkey[k]^(MR_WORD)rbsub[MR_TOBYTE(x[0])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[3]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[2]>>16)])^
         ROTL24((MR_WORD)rbsub[x[1]>>24]);
    y[1]=a->rkey[k+1]^(MR_WORD)rbsub[MR_TOBYTE(x[1])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[0]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[3]>>16)])^
         ROTL24((MR_WORD)rbsub[x[2]>>24]);
    y[2]=a->rkey[k+2]^(MR_WORD)rbsub[MR_TOBYTE(x[2])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[1]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[0]>>16)])^
         ROTL24((MR_WORD)rbsub[x[3]>>24]);
    y[3]=a->rkey[k+3]^(MR_WORD)rbsub[MR_TOBYTE(x[3])]^
         ROTL8((MR_WORD)rbsub[MR_TOBYTE(x[2]>>8)])^
         ROTL16((MR_WORD)rbsub[MR_TOBYTE(x[1]>>16)])^
         ROTL24((MR_WORD)rbsub[x[0]>>24]);

    /* Write the result back to buff and zero both state arrays. */
    for (i=j=0;i<NB;i++,j+=4)
    {
        unpack(y[i],(MR_BYTE *)&buff[j]);
        x[i]=y[i]=0;   /* clean up stack */
    }
#endif
}
Ejemplo n.º 13
0
/*
 * CBC-encrypt or CBC-decrypt a buffer with the AES-NI instructions.
 *
 * ctx:    supplies the round keys (ctx->rk) and round count (ctx->nr).
 * mode:   AES_ENCRYPT selects encryption; any other value selects decryption.
 * length: number of input bytes; only length / 16 whole blocks are
 *         processed, any trailing bytes are ignored.
 * iv:     16-byte chaining value; read on entry and overwritten on return
 *         with the last ciphertext block processed (unchanged if no block
 *         was processed).
 * input:  source buffer.
 * output: destination buffer.
 *
 * Always returns 0.
 */
int aesni_xcryptcbc( aes_context *ctx,
                     int mode,
                     size_t length,
                     unsigned char iv[16],
                     const unsigned char *input,
                     unsigned char *output )
{
    const __m128i *rk = (__m128i *) ctx->rk;
    const int nr = ctx->nr;
    const size_t nblocks = length / 16;
    const __m128i *src = (__m128i *) input;
    __m128i *dst = (__m128i *) output;
    __m128i chain = _mm_loadu_si128( (__m128i *) iv );
    size_t n;
    int r;

    if( mode == AES_ENCRYPT )
    {
        /* Encryption is inherently serial: each block depends on the last. */
        for( n = 0; n < nblocks; n++ )
        {
            chain = _mm_xor_si128( _mm_loadu_si128( src + n ), chain );
            chain = _mm_xor_si128( chain, rk[0] );

            for( r = 1; r < nr - 1; r += 2 )
            {
                chain = _mm_aesenc_si128( chain, rk[r] );
                chain = _mm_aesenc_si128( chain, rk[r + 1] );
            }

            chain = _mm_aesenc_si128( chain, rk[nr - 1] );
            chain = _mm_aesenclast_si128( chain, rk[nr] );

            _mm_storeu_si128( dst + n, chain );
        }
    }
    else
    {
        const size_t quads = nblocks / 4;
        size_t q;

        /* Decrypt four blocks per iteration so the AES units stay busy. */
        for( q = 0; q < quads; q++ )
        {
            const __m128i *in4 = src + q * 4;
            __m128i *out4 = dst + q * 4;
            /* Raw ciphertext blocks, kept for the CBC XOR afterwards. */
            const __m128i c0 = _mm_loadu_si128( in4 );
            const __m128i c1 = _mm_loadu_si128( in4 + 1 );
            const __m128i c2 = _mm_loadu_si128( in4 + 2 );
            const __m128i c3 = _mm_loadu_si128( in4 + 3 );
            __m128i s0, s1, s2, s3;
            __m128i k;

            k = rk[0];
            s0 = _mm_xor_si128( c0, k );
            s1 = _mm_xor_si128( c1, k );
            s2 = _mm_xor_si128( c2, k );
            s3 = _mm_xor_si128( c3, k );

            for( r = 1; r < nr; r++ )
            {
                k = rk[r];
                s0 = _mm_aesdec_si128( s0, k );
                s1 = _mm_aesdec_si128( s1, k );
                s2 = _mm_aesdec_si128( s2, k );
                s3 = _mm_aesdec_si128( s3, k );
            }

            k = rk[nr];
            s0 = _mm_aesdeclast_si128( s0, k );
            s1 = _mm_aesdeclast_si128( s1, k );
            s2 = _mm_aesdeclast_si128( s2, k );
            s3 = _mm_aesdeclast_si128( s3, k );

            /* Each block is XORed with the ciphertext block before it. */
            s0 = _mm_xor_si128( s0, chain );
            s1 = _mm_xor_si128( s1, c0 );
            s2 = _mm_xor_si128( s2, c1 );
            s3 = _mm_xor_si128( s3, c2 );

            _mm_storeu_si128( out4, s0 );
            _mm_storeu_si128( out4 + 1, s1 );
            _mm_storeu_si128( out4 + 2, s2 );
            _mm_storeu_si128( out4 + 3, s3 );

            chain = c3;
        }

        /* Remaining zero to three blocks, one at a time. */
        for( n = quads * 4; n < nblocks; n++ )
        {
            const __m128i c = _mm_loadu_si128( src + n );
            __m128i s = _mm_xor_si128( c, rk[0] );

            for( r = 1; r < nr - 1; r += 2 )
            {
                s = _mm_aesdec_si128( s, rk[r] );
                s = _mm_aesdec_si128( s, rk[r + 1] );
            }

            s = _mm_aesdec_si128( s, rk[nr - 1] );
            s = _mm_aesdeclast_si128( s, rk[nr] );

            s = _mm_xor_si128( s, chain );

            _mm_storeu_si128( dst + n, s );

            chain = c;
        }
    }

    _mm_storeu_si128( (__m128i *) iv, chain );

    return( 0 );
}
Ejemplo n.º 14
0
/*
* AES-256 Decryption
*
* Decrypts `blocks` consecutive 16-byte blocks from in[] to out[], each block
* independently, using the 15 round keys stored in DK. Blocks are processed
* four at a time while at least four remain, then one at a time.
*/
void AES_256_NI::decrypt_n(const byte in[], byte out[], size_t blocks) const
   {
   const __m128i* src = (const __m128i*)in;
   __m128i* dst = (__m128i*)out;

   const __m128i* rk = (const __m128i*)&DK[0];

   // Load the whole decryption schedule once, ahead of both loops.
   __m128i K0  = _mm_loadu_si128(rk + 0);
   __m128i K1  = _mm_loadu_si128(rk + 1);
   __m128i K2  = _mm_loadu_si128(rk + 2);
   __m128i K3  = _mm_loadu_si128(rk + 3);
   __m128i K4  = _mm_loadu_si128(rk + 4);
   __m128i K5  = _mm_loadu_si128(rk + 5);
   __m128i K6  = _mm_loadu_si128(rk + 6);
   __m128i K7  = _mm_loadu_si128(rk + 7);
   __m128i K8  = _mm_loadu_si128(rk + 8);
   __m128i K9  = _mm_loadu_si128(rk + 9);
   __m128i K10 = _mm_loadu_si128(rk + 10);
   __m128i K11 = _mm_loadu_si128(rk + 11);
   __m128i K12 = _mm_loadu_si128(rk + 12);
   __m128i K13 = _mm_loadu_si128(rk + 13);
   __m128i K14 = _mm_loadu_si128(rk + 14);

   // Four blocks per iteration.
   for(; blocks >= 4; blocks -= 4, src += 4, dst += 4)
      {
      // The round macros below operate on the locals B0..B3.
      __m128i B0 = _mm_xor_si128(_mm_loadu_si128(src + 0), K0);
      __m128i B1 = _mm_xor_si128(_mm_loadu_si128(src + 1), K0);
      __m128i B2 = _mm_xor_si128(_mm_loadu_si128(src + 2), K0);
      __m128i B3 = _mm_xor_si128(_mm_loadu_si128(src + 3), K0);

      AES_DEC_4_ROUNDS(K1);
      AES_DEC_4_ROUNDS(K2);
      AES_DEC_4_ROUNDS(K3);
      AES_DEC_4_ROUNDS(K4);
      AES_DEC_4_ROUNDS(K5);
      AES_DEC_4_ROUNDS(K6);
      AES_DEC_4_ROUNDS(K7);
      AES_DEC_4_ROUNDS(K8);
      AES_DEC_4_ROUNDS(K9);
      AES_DEC_4_ROUNDS(K10);
      AES_DEC_4_ROUNDS(K11);
      AES_DEC_4_ROUNDS(K12);
      AES_DEC_4_ROUNDS(K13);
      AES_DEC_4_LAST_ROUNDS(K14);

      _mm_storeu_si128(dst + 0, B0);
      _mm_storeu_si128(dst + 1, B1);
      _mm_storeu_si128(dst + 2, B2);
      _mm_storeu_si128(dst + 3, B3);
      }

   // Remaining zero to three blocks, one at a time.
   for(; blocks != 0; --blocks, ++src, ++dst)
      {
      __m128i B = _mm_xor_si128(_mm_loadu_si128(src), K0);

      B = _mm_aesdec_si128(B, K1);
      B = _mm_aesdec_si128(B, K2);
      B = _mm_aesdec_si128(B, K3);
      B = _mm_aesdec_si128(B, K4);
      B = _mm_aesdec_si128(B, K5);
      B = _mm_aesdec_si128(B, K6);
      B = _mm_aesdec_si128(B, K7);
      B = _mm_aesdec_si128(B, K8);
      B = _mm_aesdec_si128(B, K9);
      B = _mm_aesdec_si128(B, K10);
      B = _mm_aesdec_si128(B, K11);
      B = _mm_aesdec_si128(B, K12);
      B = _mm_aesdec_si128(B, K13);
      B = _mm_aesdeclast_si128(B, K14);

      _mm_storeu_si128(dst, B);
      }
   }