/*
 * Multi-block record encryption for AES-CBC with HMAC-SHA1.
 *
 * The inp_len bytes at inp are split into x4 = 4 * n4x fragments (the last
 * one absorbs the remainder).  For every fragment a packet is laid out in
 * out as
 *
 *     5-byte header | 16-byte explicit IV | payload | 20-byte MAC | padding
 *
 * The MACs of all fragments are computed in parallel with
 * sha1_multi_block(), and everything after the explicit IV is then
 * encrypted in place with aesni_multi_cbc_encrypt().
 *
 * key      supplies the hash states (key->md, key->tail), the cipher key
 *          schedule (key->ks) and, in key->md.data, the 8-byte sequence
 *          number followed by the 3 header bytes copied into each packet.
 * out      destination buffer; must hold x4 packets of packlen bytes.
 *          Packets are produced last-to-first and the payload is moved
 *          with memmove().
 *          NOTE(review): presumably this is what lets out overlap inp -
 *          confirm against the caller.
 * inp      plaintext.
 * inp_len  plaintext length.  No minimum is enforced here although every
 *          fragment is assumed to hold at least 64-13 bytes.
 *          NOTE(review): the caller is expected to guarantee that.
 * n4x      1 or 2 (all per-lane arrays hold 8 entries).
 *
 * Returns the total number of bytes produced (headers included), or 0 if
 * the random explicit IVs could not be generated; in that case nothing has
 * been written to out.
 */
static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
                                         unsigned char *out,
                                         const unsigned char *inp,
                                         size_t inp_len, int n4x)
{                               /* n4x is 1 or 2 */
    HASH_DESC hash_d[8], edges[8];
    CIPH_DESC ciph_d[8];
    unsigned char storage[sizeof(SHA1_MB_CTX) + 32];
    union {
        u64 q[16];
        u32 d[32];
        u8 c[128];
    } blocks[8];
    SHA1_MB_CTX *ctx;
    unsigned int frag, last, packlen, i, x4 = 4 * n4x;
    size_t ret = 0;
    u8 *IVs;
#if defined(BSWAP8)
    u64 seqnum;
#endif

    /* align the multi-block context to 32 bytes inside storage[] */
    ctx = (SHA1_MB_CTX *)(storage + 32 - ((size_t)storage % 32));

    /*
     * frag is the size of the first x4-1 fragments, last the size of the
     * final one (it takes the remainder of the division).
     */
    frag = (unsigned int)inp_len >> (1 + n4x);
    last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
    if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
        /* move one byte per leading fragment out of the last fragment */
        frag++;
        last -= x4 - 1;
    }

    hash_d[0].ptr = inp;
    for (i = 1; i < x4; i++)
        hash_d[i].ptr = hash_d[i - 1].ptr + frag;

#if defined(BSWAP8)
    /*
     * Fetch the big-endian sequence number with memcpy(): key->md.data is
     * not a u64 object, so it must not be read through a u64 pointer.
     */
    memcpy(blocks[0].c, key->md.data, 8);
    seqnum = BSWAP8(blocks[0].q[0]);
#endif

    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag);
#if !defined(BSWAP8)
        unsigned int carry, j;
#endif

        ctx->A[i] = key->md.h0;
        ctx->B[i] = key->md.h1;
        ctx->C[i] = key->md.h2;
        ctx->D[i] = key->md.h3;
        ctx->E[i] = key->md.h4;

        /* fix seqnum: lane i uses the base sequence number plus i */
#if defined(BSWAP8)
        blocks[i].q[0] = BSWAP8(seqnum + i);
#else
        /*
         * blocks[] is still uninitialised here, so all 8 bytes are written
         * explicitly while the carry ripples from the least significant
         * byte (index 7) upwards.
         */
        for (carry = i, j = 8; j--;) {
            blocks[i].c[j] = ((u8 *)key->md.data)[j] + carry;
            carry = (blocks[i].c[j] - carry) >> (sizeof(carry) * 8 - 1);
        }
#endif
        blocks[i].c[8] = ((u8 *)key->md.data)[8];
        blocks[i].c[9] = ((u8 *)key->md.data)[9];
        blocks[i].c[10] = ((u8 *)key->md.data)[10];
        /* fix length */
        blocks[i].c[11] = (u8)(len >> 8);
        blocks[i].c[12] = (u8)(len);

        /* complete the first 64-byte block with the start of the payload */
        memcpy(blocks[i].c + 13, hash_d[i].ptr, 64 - 13);
        hash_d[i].ptr += 64 - 13;
        hash_d[i].blocks = (len - (64 - 13)) / 64;

        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* hash 13-byte headers and first 64-13 bytes of inputs */
    sha1_multi_block(ctx, edges, n4x);
    /* hash bulk inputs */
    sha1_multi_block(ctx, hash_d, n4x);

    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag),
            off = hash_d[i].blocks * 64;
        const unsigned char *ptr = hash_d[i].ptr + off;

        off = len - (64 - 13) - off;    /* remainder actually */
        memcpy(blocks[i].c, ptr, off);
        blocks[i].c[off] = 0x80;
        len += 64 + 13;         /* 64 is HMAC header */
        len *= 8;               /* convert to bits */
        if (off < (64 - 8)) {
            /* padding and bit length fit into a single block */
            blocks[i].d[15] = BSWAP4(len);
            edges[i].blocks = 1;
        } else {
            /* bit length spills into a second block */
            blocks[i].d[31] = BSWAP4(len);
            edges[i].blocks = 2;
        }
        edges[i].ptr = blocks[i].c;
    }

    /* hash input tails and finalize */
    sha1_multi_block(ctx, edges, n4x);

    /*
     * Second hash pass: the digest just computed becomes the message and
     * the lanes restart from the key->tail state.
     */
    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
        blocks[i].d[0] = BSWAP4(ctx->A[i]);
        ctx->A[i] = key->tail.h0;
        blocks[i].d[1] = BSWAP4(ctx->B[i]);
        ctx->B[i] = key->tail.h1;
        blocks[i].d[2] = BSWAP4(ctx->C[i]);
        ctx->C[i] = key->tail.h2;
        blocks[i].d[3] = BSWAP4(ctx->D[i]);
        ctx->D[i] = key->tail.h3;
        blocks[i].d[4] = BSWAP4(ctx->E[i]);
        ctx->E[i] = key->tail.h4;
        blocks[i].c[20] = 0x80;
        blocks[i].d[15] = BSWAP4((64 + 20) * 8);
        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* finalize MACs */
    sha1_multi_block(ctx, edges, n4x);

    /* header + explicit IV + (payload + MAC + padding) rounded to 16 */
    packlen = 5 + 16 + ((frag + 20 + 16) & -16);

    /* start with the last packet / last fragment and walk backwards */
    out += (packlen << (1 + n4x)) - packlen;
    inp += (frag << (1 + n4x)) - frag;

    /*
     * Ask for IVs in bulk.  blocks[] is free for reuse at this point.  A
     * failing RNG must not result in predictable IVs, so give up before
     * anything is written to out and wipe the intermediate hash material.
     */
    if (RAND_bytes((IVs = blocks[0].c), 16 * x4) <= 0) {
        OPENSSL_cleanse(blocks, sizeof(blocks));
        OPENSSL_cleanse(ctx, sizeof(*ctx));
        return 0;
    }

    for (i = x4 - 1;; i--) {
        unsigned int len = (i == (x4 - 1) ? last : frag), pad, j;
        unsigned char *out0 = out;
        u32 mac[5];

        out += 5 + 16;          /* place for header and explicit IV */
        ciph_d[i].inp = out;
        ciph_d[i].out = out;

        memmove(out, inp, len);
        out += len;

        /*
         * write MAC, most significant byte first; byte-wise stores are
         * used because out has no particular alignment here
         */
        mac[0] = ctx->A[i];
        mac[1] = ctx->B[i];
        mac[2] = ctx->C[i];
        mac[3] = ctx->D[i];
        mac[4] = ctx->E[i];
        for (j = 0; j < 5; j++) {
            out[0] = (u8)(mac[j] >> 24);
            out[1] = (u8)(mac[j] >> 16);
            out[2] = (u8)(mac[j] >> 8);
            out[3] = (u8)(mac[j]);
            out += 4;
        }
        len += 20;

        /* pad: pad+1 bytes, each holding the value pad */
        pad = 15 - len % 16;
        for (j = 0; j <= pad; j++)
            *(out++) = pad;
        len += pad + 1;

        ciph_d[i].blocks = len / 16;
        len += 16;              /* account for explicit iv */

        /* arrange header */
        out0[0] = ((u8 *)key->md.data)[8];
        out0[1] = ((u8 *)key->md.data)[9];
        out0[2] = ((u8 *)key->md.data)[10];
        out0[3] = (u8)(len >> 8);
        out0[4] = (u8)(len);

        /* explicit iv */
        memcpy(ciph_d[i].iv, IVs, 16);
        memcpy(&out0[5], IVs, 16);

        ret += len + 5;

        if (i == 0)
            break;

        out = out0 - packlen;
        inp -= frag;
        IVs += 16;
    }

    aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

    OPENSSL_cleanse(blocks, sizeof(blocks));
    OPENSSL_cleanse(ctx, sizeof(*ctx));

    return ret;
}
/*
 * Multi-block record encryption for AES-CBC with HMAC-SHA256.
 *
 * The inp_len bytes at inp are split into x4 = 4 * n4x fragments (the last
 * one absorbs the remainder).  For every fragment a packet is laid out in
 * out as
 *
 *     5-byte header | 16-byte explicit IV | payload | 32-byte MAC | padding
 *
 * MACs are computed for all fragments in parallel with
 * sha256_multi_block(); the region after the explicit IV is encrypted with
 * aesni_multi_cbc_encrypt().  For large fragments, hashing and encryption
 * are interleaved in MAXCHUNKSIZE steps.
 *
 * key      supplies the hash states (key->md, key->tail), the cipher key
 *          schedule (key->ks) and, in key->md.data, the 8-byte sequence
 *          number followed by the 3 header bytes copied into each packet.
 * out      destination buffer; x4 packets of packlen bytes are written.
 * inp      plaintext.
 * inp_len  plaintext length.  NOTE(review): no minimum is checked here,
 *          yet 64-13 bytes are read unconditionally from every fragment -
 *          the caller is assumed to guarantee a large enough input.
 * n4x      1 or 2 (all per-lane arrays hold 8 entries).
 *
 * Returns the total number of bytes produced (headers included), or 0 if
 * RAND_bytes() fails.
 */
static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                                         unsigned char *out,
                                         const unsigned char *inp,
                                         size_t inp_len, int n4x)
{                               /* n4x is 1 or 2 */
    HASH_DESC hash_d[8], edges[8];
    CIPH_DESC ciph_d[8];
    unsigned char storage[sizeof(SHA256_MB_CTX) + 32];
    union {
        u64 q[16];
        u32 d[32];
        u8 c[128];
    } blocks[8];
    SHA256_MB_CTX *ctx;
    unsigned int frag, last, packlen, i, x4 = 4 * n4x, minblocks, processed = 0;
    size_t ret = 0;
    u8 *IVs;
# if defined(BSWAP8)
    u64 seqnum;
# endif

    /*
     * ask for IVs in bulk; blocks[] serves as scratch space for them until
     * they have been copied into the descriptors and the output below
     */
    if (RAND_bytes((IVs = blocks[0].c), 16 * x4) <= 0)
        return 0;

    /* align the multi-block context to 32 bytes inside storage[] */
    ctx = (SHA256_MB_CTX *) (storage + 32 - ((size_t)storage % 32));

    /*
     * frag is the size of the first x4-1 fragments, last the size of the
     * final one (it takes the remainder of the division)
     */
    frag = (unsigned int)inp_len >> (1 + n4x);
    last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
    if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
        /* move one byte per leading fragment out of the last fragment */
        frag++;
        last -= x4 - 1;
    }

    /* header + explicit IV + (payload + MAC + padding) rounded to 16 */
    packlen = 5 + 16 + ((frag + 32 + 16) & -16);

    /* populate descriptors with pointers and IVs */
    hash_d[0].ptr = inp;
    ciph_d[0].inp = inp;
    /* 5+16 is place for header and explicit IV */
    ciph_d[0].out = out + 5 + 16;
    /* the explicit IV occupies the 16 bytes right before the ciphertext */
    memcpy(ciph_d[0].out - 16, IVs, 16);
    memcpy(ciph_d[0].iv, IVs, 16);
    IVs += 16;

    for (i = 1; i < x4; i++) {
        ciph_d[i].inp = hash_d[i].ptr = hash_d[i - 1].ptr + frag;
        ciph_d[i].out = ciph_d[i - 1].out + packlen;
        memcpy(ciph_d[i].out - 16, IVs, 16);
        memcpy(ciph_d[i].iv, IVs, 16);
        IVs += 16;
    }

# if defined(BSWAP8)
    /* fetch the big-endian sequence number once; lanes add their index */
    memcpy(blocks[0].c, key->md.data, 8);
    seqnum = BSWAP8(blocks[0].q[0]);
# endif
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag);
# if !defined(BSWAP8)
        unsigned int carry, j;
# endif

        /* every lane starts from the hash state stored in key->md */
        ctx->A[i] = key->md.h[0];
        ctx->B[i] = key->md.h[1];
        ctx->C[i] = key->md.h[2];
        ctx->D[i] = key->md.h[3];
        ctx->E[i] = key->md.h[4];
        ctx->F[i] = key->md.h[5];
        ctx->G[i] = key->md.h[6];
        ctx->H[i] = key->md.h[7];

        /* fix seqnum: lane i uses the base sequence number plus i */
# if defined(BSWAP8)
        blocks[i].q[0] = BSWAP8(seqnum + i);
# else
        /*
         * byte-wise big-endian addition: start at the least significant
         * byte (index 7) and ripple the carry upwards
         */
        for (carry = i, j = 8; j--;) {
            blocks[i].c[j] = ((u8 *)key->md.data)[j] + carry;
            carry = (blocks[i].c[j] - carry) >> (sizeof(carry) * 8 - 1);
        }
# endif
        blocks[i].c[8] = ((u8 *)key->md.data)[8];
        blocks[i].c[9] = ((u8 *)key->md.data)[9];
        blocks[i].c[10] = ((u8 *)key->md.data)[10];
        /* fix length */
        blocks[i].c[11] = (u8)(len >> 8);
        blocks[i].c[12] = (u8)(len);

        /* complete the first 64-byte block with the start of the payload */
        memcpy(blocks[i].c + 13, hash_d[i].ptr, 64 - 13);
        hash_d[i].ptr += 64 - 13;
        /* number of whole 64-byte blocks left in this fragment */
        hash_d[i].blocks = (len - (64 - 13)) / 64;

        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* hash 13-byte headers and first 64-13 bytes of inputs */
    sha256_multi_block(ctx, edges, n4x);
    /* hash bulk inputs */
# define MAXCHUNKSIZE    2048
# if     MAXCHUNKSIZE%64
#  error  "MAXCHUNKSIZE is not divisible by 64"
# elif   MAXCHUNKSIZE
    /*
     * goal is to minimize pressure on L1 cache by moving in shorter steps,
     * so that hashed data is still in the cache by the time we encrypt it
     */
    /* whole blocks available in the shortest fragment bound the loop */
    minblocks = ((frag <= last ? frag : last) - (64 - 13)) / 64;
    if (minblocks > MAXCHUNKSIZE / 64) {
        for (i = 0; i < x4; i++) {
            edges[i].ptr = hash_d[i].ptr;
            edges[i].blocks = MAXCHUNKSIZE / 64;
            ciph_d[i].blocks = MAXCHUNKSIZE / 16;
        }
        do {
            /* hash one chunk per lane, then encrypt one chunk per lane */
            sha256_multi_block(ctx, edges, n4x);
            aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

            for (i = 0; i < x4; i++) {
                edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
                hash_d[i].blocks -= MAXCHUNKSIZE / 64;
                edges[i].blocks = MAXCHUNKSIZE / 64;
                ciph_d[i].inp += MAXCHUNKSIZE;
                ciph_d[i].out += MAXCHUNKSIZE;
                ciph_d[i].blocks = MAXCHUNKSIZE / 16;
                /* chain CBC: last ciphertext block is the next chunk's IV */
                memcpy(ciph_d[i].iv, ciph_d[i].out - 16, 16);
            }
            /* bytes per lane already hashed and encrypted in this loop */
            processed += MAXCHUNKSIZE;
            minblocks -= MAXCHUNKSIZE / 64;
        } while (minblocks > MAXCHUNKSIZE / 64);
    }
# endif
# undef  MAXCHUNKSIZE
    /* hash whatever whole blocks remain in each lane */
    sha256_multi_block(ctx, hash_d, n4x);

    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag),
            off = hash_d[i].blocks * 64;
        const unsigned char *ptr = hash_d[i].ptr + off;

        off = (len - processed) - (64 - 13) - off; /* remainder actually */
        memcpy(blocks[i].c, ptr, off);
        blocks[i].c[off] = 0x80;
        len += 64 + 13;         /* 64 is HMAC header */
        len *= 8;               /* convert to bits */
        if (off < (64 - 8)) {
            /* padding and bit length fit into a single block */
# ifdef BSWAP4
            blocks[i].d[15] = BSWAP4(len);
# else
            PUTU32(blocks[i].c + 60, len);
# endif
            edges[i].blocks = 1;
        } else {
            /* bit length spills into a second block */
# ifdef BSWAP4
            blocks[i].d[31] = BSWAP4(len);
# else
            PUTU32(blocks[i].c + 124, len);
# endif
            edges[i].blocks = 2;
        }
        edges[i].ptr = blocks[i].c;
    }

    /* hash input tails and finalize */
    sha256_multi_block(ctx, edges, n4x);

    /*
     * Second hash pass: the digest just computed becomes the message and
     * the lanes restart from the key->tail state.
     */
    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
# ifdef BSWAP4
        blocks[i].d[0] = BSWAP4(ctx->A[i]);
        ctx->A[i] = key->tail.h[0];
        blocks[i].d[1] = BSWAP4(ctx->B[i]);
        ctx->B[i] = key->tail.h[1];
        blocks[i].d[2] = BSWAP4(ctx->C[i]);
        ctx->C[i] = key->tail.h[2];
        blocks[i].d[3] = BSWAP4(ctx->D[i]);
        ctx->D[i] = key->tail.h[3];
        blocks[i].d[4] = BSWAP4(ctx->E[i]);
        ctx->E[i] = key->tail.h[4];
        blocks[i].d[5] = BSWAP4(ctx->F[i]);
        ctx->F[i] = key->tail.h[5];
        blocks[i].d[6] = BSWAP4(ctx->G[i]);
        ctx->G[i] = key->tail.h[6];
        blocks[i].d[7] = BSWAP4(ctx->H[i]);
        ctx->H[i] = key->tail.h[7];
        blocks[i].c[32] = 0x80;
        blocks[i].d[15] = BSWAP4((64 + 32) * 8);
# else
        PUTU32(blocks[i].c + 0, ctx->A[i]);
        ctx->A[i] = key->tail.h[0];
        PUTU32(blocks[i].c + 4, ctx->B[i]);
        ctx->B[i] = key->tail.h[1];
        PUTU32(blocks[i].c + 8, ctx->C[i]);
        ctx->C[i] = key->tail.h[2];
        PUTU32(blocks[i].c + 12, ctx->D[i]);
        ctx->D[i] = key->tail.h[3];
        PUTU32(blocks[i].c + 16, ctx->E[i]);
        ctx->E[i] = key->tail.h[4];
        PUTU32(blocks[i].c + 20, ctx->F[i]);
        ctx->F[i] = key->tail.h[5];
        PUTU32(blocks[i].c + 24, ctx->G[i]);
        ctx->G[i] = key->tail.h[6];
        PUTU32(blocks[i].c + 28, ctx->H[i]);
        ctx->H[i] = key->tail.h[7];
        blocks[i].c[32] = 0x80;
        PUTU32(blocks[i].c + 60, (64 + 32) * 8);
# endif
        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* finalize MACs */
    sha256_multi_block(ctx, edges, n4x);

    /* assemble the packets front to back; out tracks the packet start */
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag), pad, j;
        unsigned char *out0 = out;

        /* copy the payload that the chunk loop has not encrypted yet */
        memcpy(ciph_d[i].out, ciph_d[i].inp, len - processed);
        /* the final encryption call below works in place on the output */
        ciph_d[i].inp = ciph_d[i].out;

        out += 5 + 16 + len;

        /* write MAC */
        PUTU32(out + 0, ctx->A[i]);
        PUTU32(out + 4, ctx->B[i]);
        PUTU32(out + 8, ctx->C[i]);
        PUTU32(out + 12, ctx->D[i]);
        PUTU32(out + 16, ctx->E[i]);
        PUTU32(out + 20, ctx->F[i]);
        PUTU32(out + 24, ctx->G[i]);
        PUTU32(out + 28, ctx->H[i]);
        out += 32;
        len += 32;

        /* pad: pad+1 bytes, each holding the value pad */
        pad = 15 - len % 16;
        for (j = 0; j <= pad; j++)
            *(out++) = pad;
        len += pad + 1;

        /* only the part not handled by the chunk loop is left to encrypt */
        ciph_d[i].blocks = (len - processed) / 16;
        len += 16;              /* account for explicit iv */

        /* arrange header */
        out0[0] = ((u8 *)key->md.data)[8];
        out0[1] = ((u8 *)key->md.data)[9];
        out0[2] = ((u8 *)key->md.data)[10];
        out0[3] = (u8)(len >> 8);
        out0[4] = (u8)(len);

        ret += len + 5;
        /* NOTE(review): inp is not read again in this function */
        inp += frag;
    }

    aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

    /* wipe intermediate hash material from the stack */
    OPENSSL_cleanse(blocks, sizeof(blocks));
    OPENSSL_cleanse(ctx, sizeof(*ctx));

    return ret;
}
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256]) { u128 Z = { 0, 0}; const u8 *xi = (const u8 *)Xi+15; size_t rem, n = *xi; static const size_t rem_8bit[256] = { PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246), PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E), PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56), PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E), PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66), PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E), PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076), PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E), PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06), PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E), PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416), PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E), PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626), PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E), PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836), PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E), PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6), PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE), PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6), PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE), PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6), PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE), PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6), PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE), PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86), PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E), PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496), PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E), PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6), PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE), PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6), PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE), PACK(0xE100), PACK(0xE0C2), 
PACK(0xE284), PACK(0xE346), PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E), PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56), PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E), PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66), PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E), PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176), PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E), PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06), PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E), PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516), PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E), PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726), PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E), PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936), PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E), PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6), PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE), PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6), PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE), PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6), PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE), PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6), PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE), PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86), PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E), PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596), PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E), PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6), PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE), PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6), PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) }; while (1) { Z.hi ^= Htable[n].hi; Z.lo ^= Htable[n].lo; if ((u8 *)Xi==xi) break; n = *(--xi); rem = (size_t)Z.lo&0xff; Z.lo = (Z.hi<<56)|(Z.lo>>8); Z.hi = (Z.hi>>8); if (sizeof(size_t)==8) Z.hi ^= rem_8bit[rem]; else Z.hi ^= 
(u64)rem_8bit[rem]<<32; } if (BYTE_ORDER == LITTLE_ENDIAN) { #ifdef BSWAP8 Xi[0] = BSWAP8(Z.hi); Xi[1] = BSWAP8(Z.lo); #else u8 *p = (u8 *)Xi; u32 v; v = (u32)(Z.hi>>32); PUTU32(p,v); v = (u32)(Z.hi); PUTU32(p+4,v); v = (u32)(Z.lo>>32); PUTU32(p+8,v); v = (u32)(Z.lo); PUTU32(p+12,v); #endif }