示例#1
0
static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA1 *key,
	unsigned char *out, const unsigned char *inp, size_t inp_len,
	int n4x)	/* n4x is 1 or 2 */
{
	HASH_DESC	hash_d[8], edges[8];
	CIPH_DESC	ciph_d[8];
	unsigned char	storage[sizeof(SHA1_MB_CTX)+32];
	union {	u64	q[16];
		u32	d[32];
		u8	c[128];	} blocks[8];
	SHA1_MB_CTX	*ctx;
	unsigned int	frag, last, packlen, i, x4=4*n4x;
	size_t		ret = 0;
	u8		*IVs;

	ctx = (SHA1_MB_CTX *)(storage+32-((size_t)storage%32));	/* align */

	frag = (unsigned int)inp_len>>(1+n4x);
	last = (unsigned int)inp_len+frag-(frag<<(1+n4x));
	if (last>frag && ((last+13+9)%64)<(x4-1)) {
		frag++;
		last -= x4-1;
	}

	hash_d[0].ptr = inp;
	for (i=1;i<x4;i++)	hash_d[i].ptr = hash_d[i-1].ptr+frag;

	for (i=0;i<x4;i++) {
		unsigned int len = (i==(x4-1)?last:frag);

		ctx->A[i] = key->md.h0;
		ctx->B[i] = key->md.h1;
		ctx->C[i] = key->md.h2;
		ctx->D[i] = key->md.h3;
		ctx->E[i] = key->md.h4;

		/* fix seqnum */
#if defined(BSWAP8)
		blocks[i].q[0] = BSWAP8(BSWAP8(*(u64*)key->md.data)+i);
#else
		blocks[i].c[7] += ((u8*)key->md.data)[7]+i;
		if (blocks[i].c[7] < i) {
			int j;

			for (j=6;j>=0;j--) {
				if (blocks[i].c[j]=((u8*)key->md.data)[j]+1) break;
			}
		}
#endif
		blocks[i].c[8] = ((u8*)key->md.data)[8];
		blocks[i].c[9] = ((u8*)key->md.data)[9];
		blocks[i].c[10] = ((u8*)key->md.data)[10];
		/* fix length */
		blocks[i].c[11] = (u8)(len>>8);
		blocks[i].c[12] = (u8)(len);

		memcpy(blocks[i].c+13,hash_d[i].ptr,64-13);
		hash_d[i].ptr += 64-13;
		hash_d[i].blocks = (len-(64-13))/64;

		edges[i].ptr = blocks[i].c;
		edges[i].blocks = 1;
	}

	/* hash 13-byte headers and first 64-13 bytes of inputs */
	sha1_multi_block(ctx,edges,n4x);
	/* hash bulk inputs */
	sha1_multi_block(ctx,hash_d,n4x);

	memset(blocks,0,sizeof(blocks));
	for (i=0;i<x4;i++) {
		unsigned int		len = (i==(x4-1)?last:frag),
					off = hash_d[i].blocks*64;
		const unsigned char    *ptr = hash_d[i].ptr+off;

		off = len-(64-13)-off;	/* remainder actually */
		memcpy(blocks[i].c,ptr,off);
		blocks[i].c[off]=0x80;
		len += 64+13;		/* 64 is HMAC header */
		len *= 8;		/* convert to bits */
		if (off<(64-8)) {
			blocks[i].d[15] = BSWAP4(len);
			edges[i].blocks = 1;			
		} else {
			blocks[i].d[31] = BSWAP4(len);
			edges[i].blocks = 2;
		}
		edges[i].ptr = blocks[i].c;
	}

	/* hash input tails and finalize */
	sha1_multi_block(ctx,edges,n4x);

	memset(blocks,0,sizeof(blocks));
	for (i=0;i<x4;i++) {
		blocks[i].d[0] = BSWAP4(ctx->A[i]);	ctx->A[i] = key->tail.h0;
		blocks[i].d[1] = BSWAP4(ctx->B[i]);	ctx->B[i] = key->tail.h1;
		blocks[i].d[2] = BSWAP4(ctx->C[i]);	ctx->C[i] = key->tail.h2;
		blocks[i].d[3] = BSWAP4(ctx->D[i]);	ctx->D[i] = key->tail.h3;
		blocks[i].d[4] = BSWAP4(ctx->E[i]);	ctx->E[i] = key->tail.h4;
		blocks[i].c[20] = 0x80;
		blocks[i].d[15] = BSWAP4((64+20)*8);
		edges[i].ptr = blocks[i].c;
		edges[i].blocks = 1;
	}

	/* finalize MACs */
	sha1_multi_block(ctx,edges,n4x);

	packlen = 5+16+((frag+20+16)&-16);

	out += (packlen<<(1+n4x))-packlen;
	inp += (frag<<(1+n4x))-frag;

	RAND_bytes((IVs=blocks[0].c),16*x4);	/* ask for IVs in bulk */

	for (i=x4-1;;i--) {
		unsigned int len = (i==(x4-1)?last:frag), pad, j;
		unsigned char *out0 = out;

		out += 5+16;		/* place for header and explicit IV */
		ciph_d[i].inp = out;
		ciph_d[i].out = out;

		memmove(out,inp,len);
		out += len;

		/* write MAC */
		((u32 *)out)[0] = BSWAP4(ctx->A[i]);
		((u32 *)out)[1] = BSWAP4(ctx->B[i]);
		((u32 *)out)[2] = BSWAP4(ctx->C[i]);
		((u32 *)out)[3] = BSWAP4(ctx->D[i]);
		((u32 *)out)[4] = BSWAP4(ctx->E[i]);
		out += 20;
		len += 20;

		/* pad */
		pad = 15-len%16;
		for (j=0;j<=pad;j++) *(out++) = pad;
		len += pad+1;

		ciph_d[i].blocks = len/16;
		len += 16;	/* account for explicit iv */

		/* arrange header */
		out0[0] = ((u8*)key->md.data)[8];
		out0[1] = ((u8*)key->md.data)[9];
		out0[2] = ((u8*)key->md.data)[10];
		out0[3] = (u8)(len>>8);
		out0[4] = (u8)(len);

		/* explicit iv */
		memcpy(ciph_d[i].iv, IVs, 16);
		memcpy(&out0[5],     IVs, 16);

		ret += len+5;

		if (i==0) break;

		out = out0-packlen;
		inp -= frag;
		IVs += 16;
	}

	aesni_multi_cbc_encrypt(ciph_d,&key->ks,n4x);

	OPENSSL_cleanse(blocks,sizeof(blocks));
	OPENSSL_cleanse(ctx,sizeof(*ctx));

	return ret;
}
示例#2
0
static size_t tls1_1_multi_block_encrypt(EVP_AES_HMAC_SHA256 *key,
                                         unsigned char *out,
                                         const unsigned char *inp,
                                         size_t inp_len, int n4x)
{                               /* n4x is 1 or 2 */
    HASH_DESC hash_d[8], edges[8];
    CIPH_DESC ciph_d[8];
    unsigned char storage[sizeof(SHA256_MB_CTX) + 32];
    union {
        u64 q[16];
        u32 d[32];
        u8 c[128];
    } blocks[8];
    SHA256_MB_CTX *ctx;
    unsigned int frag, last, packlen, i, x4 = 4 * n4x, minblocks, processed =
        0;
    size_t ret = 0;
    u8 *IVs;
#   if defined(BSWAP8)
    u64 seqnum;
#   endif

    /* ask for IVs in bulk */
    if (RAND_bytes((IVs = blocks[0].c), 16 * x4) <= 0)
        return 0;

    /* align */
    ctx = (SHA256_MB_CTX *) (storage + 32 - ((size_t)storage % 32));

    frag = (unsigned int)inp_len >> (1 + n4x);
    last = (unsigned int)inp_len + frag - (frag << (1 + n4x));
    if (last > frag && ((last + 13 + 9) % 64) < (x4 - 1)) {
        frag++;
        last -= x4 - 1;
    }

    packlen = 5 + 16 + ((frag + 32 + 16) & -16);

    /* populate descriptors with pointers and IVs */
    hash_d[0].ptr = inp;
    ciph_d[0].inp = inp;
    /* 5+16 is place for header and explicit IV */
    ciph_d[0].out = out + 5 + 16;
    memcpy(ciph_d[0].out - 16, IVs, 16);
    memcpy(ciph_d[0].iv, IVs, 16);
    IVs += 16;

    for (i = 1; i < x4; i++) {
        ciph_d[i].inp = hash_d[i].ptr = hash_d[i - 1].ptr + frag;
        ciph_d[i].out = ciph_d[i - 1].out + packlen;
        memcpy(ciph_d[i].out - 16, IVs, 16);
        memcpy(ciph_d[i].iv, IVs, 16);
        IVs += 16;
    }

#   if defined(BSWAP8)
    memcpy(blocks[0].c, key->md.data, 8);
    seqnum = BSWAP8(blocks[0].q[0]);
#   endif
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag);
#   if !defined(BSWAP8)
        unsigned int carry, j;
#   endif

        ctx->A[i] = key->md.h[0];
        ctx->B[i] = key->md.h[1];
        ctx->C[i] = key->md.h[2];
        ctx->D[i] = key->md.h[3];
        ctx->E[i] = key->md.h[4];
        ctx->F[i] = key->md.h[5];
        ctx->G[i] = key->md.h[6];
        ctx->H[i] = key->md.h[7];

        /* fix seqnum */
#   if defined(BSWAP8)
        blocks[i].q[0] = BSWAP8(seqnum + i);
#   else
        for (carry = i, j = 8; j--;) {
            blocks[i].c[j] = ((u8 *)key->md.data)[j] + carry;
            carry = (blocks[i].c[j] - carry) >> (sizeof(carry) * 8 - 1);
        }
#   endif
        blocks[i].c[8] = ((u8 *)key->md.data)[8];
        blocks[i].c[9] = ((u8 *)key->md.data)[9];
        blocks[i].c[10] = ((u8 *)key->md.data)[10];
        /* fix length */
        blocks[i].c[11] = (u8)(len >> 8);
        blocks[i].c[12] = (u8)(len);

        memcpy(blocks[i].c + 13, hash_d[i].ptr, 64 - 13);
        hash_d[i].ptr += 64 - 13;
        hash_d[i].blocks = (len - (64 - 13)) / 64;

        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* hash 13-byte headers and first 64-13 bytes of inputs */
    sha256_multi_block(ctx, edges, n4x);
    /* hash bulk inputs */
#   define MAXCHUNKSIZE    2048
#   if     MAXCHUNKSIZE%64
#    error  "MAXCHUNKSIZE is not divisible by 64"
#   elif   MAXCHUNKSIZE
    /*
     * goal is to minimize pressure on L1 cache by moving in shorter steps,
     * so that hashed data is still in the cache by the time we encrypt it
     */
    minblocks = ((frag <= last ? frag : last) - (64 - 13)) / 64;
    if (minblocks > MAXCHUNKSIZE / 64) {
        for (i = 0; i < x4; i++) {
            edges[i].ptr = hash_d[i].ptr;
            edges[i].blocks = MAXCHUNKSIZE / 64;
            ciph_d[i].blocks = MAXCHUNKSIZE / 16;
        }
        do {
            sha256_multi_block(ctx, edges, n4x);
            aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

            for (i = 0; i < x4; i++) {
                edges[i].ptr = hash_d[i].ptr += MAXCHUNKSIZE;
                hash_d[i].blocks -= MAXCHUNKSIZE / 64;
                edges[i].blocks = MAXCHUNKSIZE / 64;
                ciph_d[i].inp += MAXCHUNKSIZE;
                ciph_d[i].out += MAXCHUNKSIZE;
                ciph_d[i].blocks = MAXCHUNKSIZE / 16;
                memcpy(ciph_d[i].iv, ciph_d[i].out - 16, 16);
            }
            processed += MAXCHUNKSIZE;
            minblocks -= MAXCHUNKSIZE / 64;
        } while (minblocks > MAXCHUNKSIZE / 64);
    }
#   endif
#   undef  MAXCHUNKSIZE
    sha256_multi_block(ctx, hash_d, n4x);

    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag),
            off = hash_d[i].blocks * 64;
        const unsigned char *ptr = hash_d[i].ptr + off;

        off = (len - processed) - (64 - 13) - off; /* remainder actually */
        memcpy(blocks[i].c, ptr, off);
        blocks[i].c[off] = 0x80;
        len += 64 + 13;         /* 64 is HMAC header */
        len *= 8;               /* convert to bits */
        if (off < (64 - 8)) {
#   ifdef BSWAP4
            blocks[i].d[15] = BSWAP4(len);
#   else
            PUTU32(blocks[i].c + 60, len);
#   endif
            edges[i].blocks = 1;
        } else {
#   ifdef BSWAP4
            blocks[i].d[31] = BSWAP4(len);
#   else
            PUTU32(blocks[i].c + 124, len);
#   endif
            edges[i].blocks = 2;
        }
        edges[i].ptr = blocks[i].c;
    }

    /* hash input tails and finalize */
    sha256_multi_block(ctx, edges, n4x);

    memset(blocks, 0, sizeof(blocks));
    for (i = 0; i < x4; i++) {
#   ifdef BSWAP4
        blocks[i].d[0] = BSWAP4(ctx->A[i]);
        ctx->A[i] = key->tail.h[0];
        blocks[i].d[1] = BSWAP4(ctx->B[i]);
        ctx->B[i] = key->tail.h[1];
        blocks[i].d[2] = BSWAP4(ctx->C[i]);
        ctx->C[i] = key->tail.h[2];
        blocks[i].d[3] = BSWAP4(ctx->D[i]);
        ctx->D[i] = key->tail.h[3];
        blocks[i].d[4] = BSWAP4(ctx->E[i]);
        ctx->E[i] = key->tail.h[4];
        blocks[i].d[5] = BSWAP4(ctx->F[i]);
        ctx->F[i] = key->tail.h[5];
        blocks[i].d[6] = BSWAP4(ctx->G[i]);
        ctx->G[i] = key->tail.h[6];
        blocks[i].d[7] = BSWAP4(ctx->H[i]);
        ctx->H[i] = key->tail.h[7];
        blocks[i].c[32] = 0x80;
        blocks[i].d[15] = BSWAP4((64 + 32) * 8);
#   else
        PUTU32(blocks[i].c + 0, ctx->A[i]);
        ctx->A[i] = key->tail.h[0];
        PUTU32(blocks[i].c + 4, ctx->B[i]);
        ctx->B[i] = key->tail.h[1];
        PUTU32(blocks[i].c + 8, ctx->C[i]);
        ctx->C[i] = key->tail.h[2];
        PUTU32(blocks[i].c + 12, ctx->D[i]);
        ctx->D[i] = key->tail.h[3];
        PUTU32(blocks[i].c + 16, ctx->E[i]);
        ctx->E[i] = key->tail.h[4];
        PUTU32(blocks[i].c + 20, ctx->F[i]);
        ctx->F[i] = key->tail.h[5];
        PUTU32(blocks[i].c + 24, ctx->G[i]);
        ctx->G[i] = key->tail.h[6];
        PUTU32(blocks[i].c + 28, ctx->H[i]);
        ctx->H[i] = key->tail.h[7];
        blocks[i].c[32] = 0x80;
        PUTU32(blocks[i].c + 60, (64 + 32) * 8);
#   endif
        edges[i].ptr = blocks[i].c;
        edges[i].blocks = 1;
    }

    /* finalize MACs */
    sha256_multi_block(ctx, edges, n4x);

    for (i = 0; i < x4; i++) {
        unsigned int len = (i == (x4 - 1) ? last : frag), pad, j;
        unsigned char *out0 = out;

        memcpy(ciph_d[i].out, ciph_d[i].inp, len - processed);
        ciph_d[i].inp = ciph_d[i].out;

        out += 5 + 16 + len;

        /* write MAC */
        PUTU32(out + 0, ctx->A[i]);
        PUTU32(out + 4, ctx->B[i]);
        PUTU32(out + 8, ctx->C[i]);
        PUTU32(out + 12, ctx->D[i]);
        PUTU32(out + 16, ctx->E[i]);
        PUTU32(out + 20, ctx->F[i]);
        PUTU32(out + 24, ctx->G[i]);
        PUTU32(out + 28, ctx->H[i]);
        out += 32;
        len += 32;

        /* pad */
        pad = 15 - len % 16;
        for (j = 0; j <= pad; j++)
            *(out++) = pad;
        len += pad + 1;

        ciph_d[i].blocks = (len - processed) / 16;
        len += 16;              /* account for explicit iv */

        /* arrange header */
        out0[0] = ((u8 *)key->md.data)[8];
        out0[1] = ((u8 *)key->md.data)[9];
        out0[2] = ((u8 *)key->md.data)[10];
        out0[3] = (u8)(len >> 8);
        out0[4] = (u8)(len);

        ret += len + 5;
        inp += frag;
    }

    aesni_multi_cbc_encrypt(ciph_d, &key->ks, n4x);

    OPENSSL_cleanse(blocks, sizeof(blocks));
    OPENSSL_cleanse(ctx, sizeof(*ctx));

    return ret;
}
示例#3
0
static void gcm_gmult_8bit(u64 Xi[2], const u128 Htable[256])
{
	u128 Z = { 0, 0};
	const u8 *xi = (const u8 *)Xi+15;
	size_t rem, n = *xi;
	static const size_t rem_8bit[256] = {
		PACK(0x0000), PACK(0x01C2), PACK(0x0384), PACK(0x0246),
		PACK(0x0708), PACK(0x06CA), PACK(0x048C), PACK(0x054E),
		PACK(0x0E10), PACK(0x0FD2), PACK(0x0D94), PACK(0x0C56),
		PACK(0x0918), PACK(0x08DA), PACK(0x0A9C), PACK(0x0B5E),
		PACK(0x1C20), PACK(0x1DE2), PACK(0x1FA4), PACK(0x1E66),
		PACK(0x1B28), PACK(0x1AEA), PACK(0x18AC), PACK(0x196E),
		PACK(0x1230), PACK(0x13F2), PACK(0x11B4), PACK(0x1076),
		PACK(0x1538), PACK(0x14FA), PACK(0x16BC), PACK(0x177E),
		PACK(0x3840), PACK(0x3982), PACK(0x3BC4), PACK(0x3A06),
		PACK(0x3F48), PACK(0x3E8A), PACK(0x3CCC), PACK(0x3D0E),
		PACK(0x3650), PACK(0x3792), PACK(0x35D4), PACK(0x3416),
		PACK(0x3158), PACK(0x309A), PACK(0x32DC), PACK(0x331E),
		PACK(0x2460), PACK(0x25A2), PACK(0x27E4), PACK(0x2626),
		PACK(0x2368), PACK(0x22AA), PACK(0x20EC), PACK(0x212E),
		PACK(0x2A70), PACK(0x2BB2), PACK(0x29F4), PACK(0x2836),
		PACK(0x2D78), PACK(0x2CBA), PACK(0x2EFC), PACK(0x2F3E),
		PACK(0x7080), PACK(0x7142), PACK(0x7304), PACK(0x72C6),
		PACK(0x7788), PACK(0x764A), PACK(0x740C), PACK(0x75CE),
		PACK(0x7E90), PACK(0x7F52), PACK(0x7D14), PACK(0x7CD6),
		PACK(0x7998), PACK(0x785A), PACK(0x7A1C), PACK(0x7BDE),
		PACK(0x6CA0), PACK(0x6D62), PACK(0x6F24), PACK(0x6EE6),
		PACK(0x6BA8), PACK(0x6A6A), PACK(0x682C), PACK(0x69EE),
		PACK(0x62B0), PACK(0x6372), PACK(0x6134), PACK(0x60F6),
		PACK(0x65B8), PACK(0x647A), PACK(0x663C), PACK(0x67FE),
		PACK(0x48C0), PACK(0x4902), PACK(0x4B44), PACK(0x4A86),
		PACK(0x4FC8), PACK(0x4E0A), PACK(0x4C4C), PACK(0x4D8E),
		PACK(0x46D0), PACK(0x4712), PACK(0x4554), PACK(0x4496),
		PACK(0x41D8), PACK(0x401A), PACK(0x425C), PACK(0x439E),
		PACK(0x54E0), PACK(0x5522), PACK(0x5764), PACK(0x56A6),
		PACK(0x53E8), PACK(0x522A), PACK(0x506C), PACK(0x51AE),
		PACK(0x5AF0), PACK(0x5B32), PACK(0x5974), PACK(0x58B6),
		PACK(0x5DF8), PACK(0x5C3A), PACK(0x5E7C), PACK(0x5FBE),
		PACK(0xE100), PACK(0xE0C2), PACK(0xE284), PACK(0xE346),
		PACK(0xE608), PACK(0xE7CA), PACK(0xE58C), PACK(0xE44E),
		PACK(0xEF10), PACK(0xEED2), PACK(0xEC94), PACK(0xED56),
		PACK(0xE818), PACK(0xE9DA), PACK(0xEB9C), PACK(0xEA5E),
		PACK(0xFD20), PACK(0xFCE2), PACK(0xFEA4), PACK(0xFF66),
		PACK(0xFA28), PACK(0xFBEA), PACK(0xF9AC), PACK(0xF86E),
		PACK(0xF330), PACK(0xF2F2), PACK(0xF0B4), PACK(0xF176),
		PACK(0xF438), PACK(0xF5FA), PACK(0xF7BC), PACK(0xF67E),
		PACK(0xD940), PACK(0xD882), PACK(0xDAC4), PACK(0xDB06),
		PACK(0xDE48), PACK(0xDF8A), PACK(0xDDCC), PACK(0xDC0E),
		PACK(0xD750), PACK(0xD692), PACK(0xD4D4), PACK(0xD516),
		PACK(0xD058), PACK(0xD19A), PACK(0xD3DC), PACK(0xD21E),
		PACK(0xC560), PACK(0xC4A2), PACK(0xC6E4), PACK(0xC726),
		PACK(0xC268), PACK(0xC3AA), PACK(0xC1EC), PACK(0xC02E),
		PACK(0xCB70), PACK(0xCAB2), PACK(0xC8F4), PACK(0xC936),
		PACK(0xCC78), PACK(0xCDBA), PACK(0xCFFC), PACK(0xCE3E),
		PACK(0x9180), PACK(0x9042), PACK(0x9204), PACK(0x93C6),
		PACK(0x9688), PACK(0x974A), PACK(0x950C), PACK(0x94CE),
		PACK(0x9F90), PACK(0x9E52), PACK(0x9C14), PACK(0x9DD6),
		PACK(0x9898), PACK(0x995A), PACK(0x9B1C), PACK(0x9ADE),
		PACK(0x8DA0), PACK(0x8C62), PACK(0x8E24), PACK(0x8FE6),
		PACK(0x8AA8), PACK(0x8B6A), PACK(0x892C), PACK(0x88EE),
		PACK(0x83B0), PACK(0x8272), PACK(0x8034), PACK(0x81F6),
		PACK(0x84B8), PACK(0x857A), PACK(0x873C), PACK(0x86FE),
		PACK(0xA9C0), PACK(0xA802), PACK(0xAA44), PACK(0xAB86),
		PACK(0xAEC8), PACK(0xAF0A), PACK(0xAD4C), PACK(0xAC8E),
		PACK(0xA7D0), PACK(0xA612), PACK(0xA454), PACK(0xA596),
		PACK(0xA0D8), PACK(0xA11A), PACK(0xA35C), PACK(0xA29E),
		PACK(0xB5E0), PACK(0xB422), PACK(0xB664), PACK(0xB7A6),
		PACK(0xB2E8), PACK(0xB32A), PACK(0xB16C), PACK(0xB0AE),
		PACK(0xBBF0), PACK(0xBA32), PACK(0xB874), PACK(0xB9B6),
		PACK(0xBCF8), PACK(0xBD3A), PACK(0xBF7C), PACK(0xBEBE) };

	while (1) {
		Z.hi ^= Htable[n].hi;
		Z.lo ^= Htable[n].lo;

		if ((u8 *)Xi==xi)	break;

		n = *(--xi);

		rem  = (size_t)Z.lo&0xff;
		Z.lo = (Z.hi<<56)|(Z.lo>>8);
		Z.hi = (Z.hi>>8);
		if (sizeof(size_t)==8)
			Z.hi ^= rem_8bit[rem];
		else
			Z.hi ^= (u64)rem_8bit[rem]<<32;
	}

	if (BYTE_ORDER == LITTLE_ENDIAN) {
#ifdef BSWAP8
		Xi[0] = BSWAP8(Z.hi);
		Xi[1] = BSWAP8(Z.lo);
#else
		u8 *p = (u8 *)Xi;
		u32 v;
		v = (u32)(Z.hi>>32);	PUTU32(p,v);
		v = (u32)(Z.hi);	PUTU32(p+4,v);
		v = (u32)(Z.lo>>32);	PUTU32(p+8,v);
		v = (u32)(Z.lo);	PUTU32(p+12,v);
#endif
	}