static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                                         unsigned char *out,
                                         const unsigned char *inp,
                                         size_t inp_len, int n4x)
{                               /* n4x is 1 or 2 */
    HASH_DESC hash_d[8], edges[8];
    CIPH_DESC ciph_d[8];
    unsigned char storage[sizeof(SHA256_MB_CTX) + 32];
    union {
        u64 q[16];
        u32 d[32];
        u8 c[128];
    } blocks[8];
    SHA256_MB_CTX *ctx;
    unsigned int frag, last, packlen, i;
    unsigned int x4 = 4 * n4x, minblocks, processed = 0;
    size_t ret = 0;
    u8 *IVs;
# if defined(BSWAP8)
    u64 seqnum;
# endif

    /* ask for IVs in bulk */
    if (RAND_bytes((IVs = blocks[0].c), 16 * x4) <= 0)
        return 0;

    /* align */
    ctx = (SHA256_MB_CTX *) (storage + 32 - ((size_t)storage % 32));

    frag = (unsigned int)inp_len >> (1 + n4x);
    last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
    if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
        frag++;
        last -= x4 - 1;
    }

    packlen = 5 + 16 + ((frag + 32 + 16) & -16);

    /* populate descriptors with pointers and IVs */
    hash_d[0].ptr = inp;
    ciph_d[0].inp = inp;
    /* 5+16 is place for header and explicit IV */
    ciph_d[0].out = out + 5 + 16;
    memcpy(ciph_d[0].out - 16, IVs, 16);
    memcpy(ciph_d[0].iv, IVs, 16);
    IVs += 16;

    for (i = 1; i < x4; i++) {
        ciph_d[i].inp = hash_d[i].ptr = hash_d[i - 1].ptr + frag;
        ciph_d[i].out = ciph_d[i - 1].out + packlen;
        memcpy(ciph_d[i].out - 16, IVs, 16);
        memcpy(ciph_d[i].iv, IVs, 16);
        IVs += 16;
    }

# if defined(BSWAP8)
    memcpy(blocks[0].c, key->md.data, 8);
    seqnum = BSWAP8(blocks[0].q[0]);
# endif
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag);
# if !defined(BSWAP8)
        unsigned int carry, j;
# endif

        ctx->A[i] = key->md.h[0];
        ctx->B[i] = key->md.h[1];
        ctx->C[i] = key->md.h[2];
        ctx->D[i] = key->md.h[3];
        ctx->E[i] = key->md.h[4];
        ctx->F[i] = key->md.h[5];
        ctx->G[i] = key->md.h[6];
        ctx->H[i] = key->md.h[7];

        /* fix seqnum */
# if defined(BSWAP8)
        blocks[i].q[0] = BSWAP8(seqnum + i);
# else
        for (carry = i, j = 8; j--;) {
            blocks[i].c[j] = ((u8 *)key->md.data)[j] + carry;
            carry = (blocks[i].c[j] - carry) >> (sizeof(carry) * 8 - 1);
        }
# endif
        blocks[i].c[8] = ((u8 *)key->md.data)[8];
        blocks[i].c[9] = ((u8 *)key->md.data)[9];
        blocks[i].c[10] = ((u8 *)key->md.data)[10];
        /* fix length */
        blocks[i].c[11] = (u8)(len >> 8);
        blocks[i].c[12] = (u8)(len);

        memcpy(blocks[i].c + 13, hash_d[i].ptr, 64 - 13);
        hash_d[i].ptr += 64 - 13;
        hash_d[i].blocks = (len - (64 - 13)) / 64;

        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* hash 13-byte headers and first 64-13 bytes of inputs */
    sha256_multi_block(ctx, edges, n4x);
    /* hash bulk inputs */
# define MAXCHUNKSIZE    2048
# if     MAXCHUNKSIZE%64
#  error "MAXCHUNKSIZE is not divisible by 64"
# elif   MAXCHUNKSIZE
    /*
     * goal is to minimize pressure on L1 cache by moving in shorter steps,
     * so that hashed data is still in the cache by the time we encrypt it
     */
    minblocks = ((frag <= last ? frag : last) - (64 - 13)) / 64;
    if (minblocks > MAXCHUNKSIZE / 64) {
        for (i = 0; i < x4; i++) {
            edges[i].ptr = hash_d[i].ptr;
            edges[i].blocks = MAXCHUNKSIZE / 64;
            ciph_d[i].blocks = MAXCHUNKSIZE / 16;
        }
        do {
            sha256_multi_block(ctx, edges, n4x);
            aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

            for (i = 0; i < x4; i++) {
                edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
                hash_d[i].blocks -= MAXCHUNKSIZE / 64;
                edges[i].blocks = MAXCHUNKSIZE / 64;

                ciph_d[i].inp += MAXCHUNKSIZE;
                ciph_d[i].out += MAXCHUNKSIZE;
                ciph_d[i].blocks = MAXCHUNKSIZE / 16;
                memcpy(ciph_d[i].iv, ciph_d[i].out - 16, 16);
            }
            processed += MAXCHUNKSIZE;
            minblocks -= MAXCHUNKSIZE / 64;
        } while (minblocks > MAXCHUNKSIZE / 64);
    }
# endif
# undef MAXCHUNKSIZE
    sha256_multi_block(ctx, hash_d, n4x);

    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag),
            off = hash_d[i].blocks * 64;
        const unsigned char *ptr = hash_d[i].ptr + off;

        off = (len - processed) - (64 - 13) - off; /* remainder actually */
        memcpy(blocks[i].c, ptr, off);
        blocks[i].c[off] = 0x80;
        len += 64 + 13;         /* 64 is HMAC header */
        len *= 8;               /* convert to bits */
        if (off < (64 - 8)) {
# ifdef BSWAP4
            blocks[i].d[15] = BSWAP4(len);
# else
            PUTU32(blocks[i].c + 60, len);
# endif
            edges[i].blocks = 1;
        } else {
# ifdef BSWAP4
            blocks[i].d[31] = BSWAP4(len);
# else
            PUTU32(blocks[i].c + 124, len);
# endif
            edges[i].blocks = 2;
        }
        edges[i].ptr = blocks[i].c;
    }

    /* hash input tails and finalize */
    sha256_multi_block(ctx, edges, n4x);

    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
# ifdef BSWAP4
        blocks[i].d[0] = BSWAP4(ctx->A[i]);
        ctx->A[i] = key->tail.h[0];
        blocks[i].d[1] = BSWAP4(ctx->B[i]);
        ctx->B[i] = key->tail.h[1];
        blocks[i].d[2] = BSWAP4(ctx->C[i]);
        ctx->C[i] = key->tail.h[2];
        blocks[i].d[3] = BSWAP4(ctx->D[i]);
        ctx->D[i] = key->tail.h[3];
        blocks[i].d[4] = BSWAP4(ctx->E[i]);
        ctx->E[i] = key->tail.h[4];
        blocks[i].d[5] = BSWAP4(ctx->F[i]);
        ctx->F[i] = key->tail.h[5];
        blocks[i].d[6] = BSWAP4(ctx->G[i]);
        ctx->G[i] = key->tail.h[6];
        blocks[i].d[7] = BSWAP4(ctx->H[i]);
        ctx->H[i] = key->tail.h[7];
        blocks[i].c[32] = 0x80;
        blocks[i].d[15] = BSWAP4((64 + 32) * 8);
# else
        PUTU32(blocks[i].c + 0, ctx->A[i]);
        ctx->A[i] = key->tail.h[0];
        PUTU32(blocks[i].c + 4, ctx->B[i]);
        ctx->B[i] = key->tail.h[1];
        PUTU32(blocks[i].c + 8, ctx->C[i]);
        ctx->C[i] = key->tail.h[2];
        PUTU32(blocks[i].c + 12, ctx->D[i]);
        ctx->D[i] = key->tail.h[3];
        PUTU32(blocks[i].c + 16, ctx->E[i]);
        ctx->E[i] = key->tail.h[4];
        PUTU32(blocks[i].c + 20, ctx->F[i]);
        ctx->F[i] = key->tail.h[5];
        PUTU32(blocks[i].c + 24, ctx->G[i]);
        ctx->G[i] = key->tail.h[6];
        PUTU32(blocks[i].c + 28, ctx->H[i]);
        ctx->H[i] = key->tail.h[7];
        blocks[i].c[32] = 0x80;
        PUTU32(blocks[i].c + 60, (64 + 32) * 8);
# endif
        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* finalize MACs */
    sha256_multi_block(ctx, edges, n4x);

    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag), pad, j;
        unsigned char *out0 = out;

        memcpy(ciph_d[i].out, ciph_d[i].inp, len - processed);
        ciph_d[i].inp = ciph_d[i].out;

        out += 5 + 16 + len;

        /* write MAC */
        PUTU32(out + 0, ctx->A[i]);
        PUTU32(out + 4, ctx->B[i]);
        PUTU32(out + 8, ctx->C[i]);
        PUTU32(out + 12, ctx->D[i]);
        PUTU32(out + 16, ctx->E[i]);
        PUTU32(out + 20, ctx->F[i]);
        PUTU32(out + 24, ctx->G[i]);
        PUTU32(out + 28, ctx->H[i]);
        out += 32;
        len += 32;

        /* pad */
        pad = 15 - len % 16;
        for (j = 0; j <= pad; j++)
            *(out++) = pad;
        len += pad + 1;

        ciph_d[i].blocks = (len - processed) / 16;
        len += 16;              /* account for explicit iv */

        /* arrange header */
        out0[0] = ((u8 *)key->md.data)[8];
        out0[1] = ((u8 *)key->md.data)[9];
        out0[2] = ((u8 *)key->md.data)[10];
        out0[3] = (u8)(len >> 8);
        out0[4] = (u8)(len);

        ret += len + 5;
        inp += frag;
    }

    aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

    OPENSSL_cleanse(blocks, sizeof(blocks));
    OPENSSL_cleanse(ctx, sizeof(*ctx));

    return ret;
}
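
/*
 * Illustrative sketch (not part of the function above; all names below are
 * invented for illustration): a standalone, hypothetical demo of the
 * fragmentation arithmetic.  The payload is cut into x4 = 4*n4x records,
 * the first x4-1 carrying `frag` bytes and the final one `last` bytes, and
 * `packlen` bounds each record: 5-byte TLS header + 16-byte explicit IV +
 * payload + 32-byte SHA-256 MAC + CBC padding rounded up to the 16-byte
 * AES block.  Guarded out so it never interferes with this translation
 * unit; it compiles as a separate program if extracted.
 */
# if 0
#  include <stdio.h>
#  include <stddef.h>

static void tls1_1_mb_layout_demo(size_t inp_len, int n4x)
{
    unsigned int x4 = 4 * n4x;
    unsigned int frag = (unsigned int)inp_len >> (1 + n4x);
    unsigned int last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
    unsigned int packlen;

    /* same boundary tweak as above: move one byte from the last record to
     * each of the first x4-1 when that trims the last record's hash tail */
    if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
        frag++;
        last -= x4 - 1;
    }

    packlen = 5 + 16 + ((frag + 32 + 16) & -16);

    printf("inp_len=%zu n4x=%d: %u records of %u + one of %u, packlen=%u\n",
           inp_len, n4x, x4 - 1, frag, last, packlen);
}

int main(void)
{
    tls1_1_mb_layout_demo(8192, 1);     /* 3 x 2048 + 2048, packlen 2117 */
    tls1_1_mb_layout_demo(8192, 2);     /* 7 x 1024 + 1024, packlen 1093 */
    return 0;
}
# endif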
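
/*
 * Second illustrative sketch (also hypothetical, invented names): the
 * portable !BSWAP8 branch in the function above adds the record index to
 * the 64-bit big-endian TLS sequence number one byte at a time, deriving
 * the carry from the sign bit of an unsigned subtraction.  A more readable
 * equivalent, again guarded out and compilable standalone:
 */
# if 0
#  include <stdio.h>

/* add a small increment to a 64-bit big-endian integer, byte by byte */
static void be64_add_demo(unsigned char dst[8], const unsigned char src[8],
                          unsigned int add)
{
    unsigned int carry = add, j;

    for (j = 8; j--;) {
        unsigned int sum = src[j] + carry;

        dst[j] = (unsigned char)sum;    /* keep the low byte */
        carry = sum >> 8;               /* propagate the overflow */
    }
}

int main(void)
{
    const unsigned char seq[8] = { 0, 0, 0, 0, 0, 0, 0x01, 0xff };
    unsigned char out[8];
    unsigned int j;

    be64_add_demo(out, seq, 3);         /* 0x01ff + 3 = 0x0202 */
    for (j = 0; j < 8; j++)
        printf("%02x", out[j]);
    printf("\n");                       /* expected: 0000000000000202 */
    return 0;
}
# endif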