uint64_t siphash24(const void *src, unsigned long src_sz, const char key[16]) {
    const uint64_t *_key = (uint64_t *)key;
    uint64_t k0 = _le64toh(_key[0]);
    uint64_t k1 = _le64toh(_key[1]);
    uint64_t b = (uint64_t)src_sz << 56;
    const uint64_t *in = (uint64_t *)src;

    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;

    while (src_sz >= 8) {
        uint64_t mi = _le64toh(RD64p(in));
        in += 1;
        src_sz -= 8;
        v3 ^= mi;
        DOUBLE_ROUND(v0, v1, v2, v3);
        v0 ^= mi;
    }

    uint64_t t = 0;
    uint8_t *pt = (uint8_t *)&t;
    uint8_t *m = (uint8_t *)in;
    switch (src_sz) {
    case 7: pt[6] = m[6];
    case 6: pt[5] = m[5];
    case 5: pt[4] = m[4];
    case 4: *((uint32_t *)&pt[0]) = RD32p(&m[0]); break;
    case 3: pt[2] = m[2];
    case 2: pt[1] = m[1];
    case 1: pt[0] = m[0];
    }
    b |= _le64toh(t);

    v3 ^= b;
    DOUBLE_ROUND(v0, v1, v2, v3);
    v0 ^= b;
    v2 ^= 0xff;
    DOUBLE_ROUND(v0, v1, v2, v3);
    DOUBLE_ROUND(v0, v1, v2, v3);
    return (v0 ^ v1) ^ (v2 ^ v3);
}
uint64_t siphash24(const void *src, unsigned long src_sz, const struct sipkey *key) {
    uint64_t k0 = key->k0;
    uint64_t k1 = key->k1;
    uint64_t b = (uint64_t)src_sz << 56;
    const uint64_t *in = (uint64_t*)src;
    uint64_t t;
    uint8_t *pt, *m;

    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;

    while (src_sz >= 8) {
#ifdef UNALIGNED_OK
        uint64_t mi = _le64toh(*in);
#else
        uint64_t mi;
        memcpy(&mi, in, 8);
        mi = _le64toh(mi);
#endif
        in += 1;
        src_sz -= 8;
        v3 ^= mi;
        DOUBLE_ROUND(v0,v1,v2,v3);
        v0 ^= mi;
    }

    t = 0;
    pt = (uint8_t*)&t;
    m = (uint8_t*)in;
    switch (src_sz) {
    case 7: pt[6] = m[6];
    case 6: pt[5] = m[5];
    case 5: pt[4] = m[4];
#ifdef UNALIGNED_OK
    case 4: *((uint32_t*)&pt[0]) = *((uint32_t*)&m[0]); break;
#else
    case 4: pt[3] = m[3];
#endif
    case 3: pt[2] = m[2];
    case 2: pt[1] = m[1];
    case 1: pt[0] = m[0];
    }
    b |= _le64toh(t);

    v3 ^= b;
    DOUBLE_ROUND(v0,v1,v2,v3);
    v0 ^= b;
    v2 ^= 0xff;
    DOUBLE_ROUND(v0,v1,v2,v3);
    DOUBLE_ROUND(v0,v1,v2,v3);
    return (v0 ^ v1) ^ (v2 ^ v3);
}
void Camellia::Base::ProcessAndXorBlock(const byte *inBlock, const byte *xorBlock, byte *outBlock) const
{
#define KS(i, j) ks[i*4 + EFI(j/2)*2 + EFI(j%2)]

#define FL(klh, kll, krh, krl)      \
    ll ^= rotlFixed(lh & klh, 1);   \
    lh ^= (ll | kll);               \
    rh ^= (rl | krl);               \
    rl ^= rotlFixed(rh & krh, 1);

    word32 lh, ll, rh, rl;
    typedef BlockGetAndPut<word32, BigEndian> Block;
    Block::Get(inBlock)(lh)(ll)(rh)(rl);
    const word32 *ks = m_key.data();
    lh ^= KS(0,0); ll ^= KS(0,1); rh ^= KS(0,2); rl ^= KS(0,3);

    // timing attack countermeasure. see comments at top for more details
    const int cacheLineSize = GetCacheLineSize();
    unsigned int i;
    volatile word32 _u = 0;
    word32 u = _u;

    assert(IsAlignedOn(s1, GetAlignmentOf<word32>()));
    for (i=0; i<256; i+=cacheLineSize)
        u &= *(const word32 *)(void*)(s1+i);
    u &= *(const word32 *)(void*)(s1+252);
    lh |= u; ll |= u;

    SLOW_ROUND(lh, ll, rh, rl, KS(1,0), KS(1,1))
    SLOW_ROUND(rh, rl, lh, ll, KS(1,2), KS(1,3))
    for (i = m_rounds-1; i > 0; --i)
    {
        DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3))
        DOUBLE_ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1), KS(3,2), KS(3,3))
        FL(KS(4,0), KS(4,1), KS(4,2), KS(4,3));
        DOUBLE_ROUND(lh, ll, rh, rl, KS(5,0), KS(5,1), KS(5,2), KS(5,3))
        ks += 16;
    }
    DOUBLE_ROUND(lh, ll, rh, rl, KS(2,0), KS(2,1), KS(2,2), KS(2,3))
    ROUND(lh, ll, rh, rl, KS(3,0), KS(3,1))
    SLOW_ROUND(rh, rl, lh, ll, KS(3,2), KS(3,3))
    lh ^= KS(4,0); ll ^= KS(4,1); rh ^= KS(4,2); rl ^= KS(4,3);

    Block::Put(xorBlock, outBlock)(rh)(rl)(lh)(ll);
}
void Camellia::Base::UncheckedSetKey(const byte *key, unsigned int keylen, const NameValuePairs &)
{
    m_rounds = (keylen >= 24) ? 4 : 3;
    unsigned int kslen = (8 * m_rounds + 2);
    m_key.New(kslen*2);
    word32 *ks32 = m_key.data();
    int m=0, a=0;
    if (!IsForwardTransformation())
        m = -1, a = kslen-1;

    word32 kl0, kl1, kl2, kl3;
    GetBlock<word32, BigEndian> getBlock(key);
    getBlock(kl0)(kl1)(kl2)(kl3);
    word32 k0=kl0, k1=kl1, k2=kl2, k3=kl3;

#define CALC_ADDR2(base, i, j)  ((byte *)(base)+8*(i)+4*(j)+((-16*(i))&m))
#define CALC_ADDR(base, i)      CALC_ADDR2(base, i, 0)

#if 1
    word64 kwl, kwr;
    ks32 += 2*a;
#define PREPARE_KS_ROUNDS           \
    kwl = (word64(k0) << 32) | k1;  \
    kwr = (word64(k2) << 32) | k3
#define KS_ROUND_0(i)   \
    assert(IsAlignedOn(CALC_ADDR(ks32, i+EFI(0)),GetAlignmentOf<word64>()));    \
    assert(IsAlignedOn(CALC_ADDR(ks32, i+EFI(1)),GetAlignmentOf<word64>()));    \
    *(word64*)(void*)CALC_ADDR(ks32, i+EFI(0)) = kwl;   \
    *(word64*)(void*)CALC_ADDR(ks32, i+EFI(1)) = kwr
#define KS_ROUND(i, r, which)   \
    assert(IsAlignedOn(CALC_ADDR(ks32, i+EFI(r<64)),GetAlignmentOf<word64>())); \
    assert(IsAlignedOn(CALC_ADDR(ks32, i+EFI(r>64)),GetAlignmentOf<word64>())); \
    if (which & (1<<int(r<64))) *(word64*)(void*)CALC_ADDR(ks32, i+EFI(r<64)) = (kwr << (r%64)) | (kwl >> (64 - (r%64))); \
    if (which & (1<<int(r>64))) *(word64*)(void*)CALC_ADDR(ks32, i+EFI(r>64)) = (kwl << (r%64)) | (kwr >> (64 - (r%64)))
#else
    // SSE2 version is 30% faster on Intel Core 2. Doesn't seem worth the hassle of maintenance,
    // but left here #if'd out in case someone needs it.
    __m128i kw, kw2;
    __m128i *ks128 = (__m128i *)ks32+a/2;
    ks32 += 2*a;
#define PREPARE_KS_ROUNDS   \
    kw = _mm_set_epi32(k0, k1, k2, k3); \
    if (m) kw2 = kw, kw = _mm_shuffle_epi32(kw, _MM_SHUFFLE(1, 0, 3, 2));   \
    else kw2 = _mm_shuffle_epi32(kw, _MM_SHUFFLE(1, 0, 3, 2))
#define KS_ROUND_0(i)   \
    _mm_store_si128((__m128i *)CALC_ADDR(ks128, i), kw)
#define KS_ROUND(i, r, which)   {   \
    __m128i temp;   \
    if (r<64 && (which!=1 || m)) temp = _mm_or_si128(_mm_slli_epi64(kw, r%64), _mm_srli_epi64(kw2, 64-r%64));   \
    else temp = _mm_or_si128(_mm_slli_epi64(kw2, r%64), _mm_srli_epi64(kw, 64-r%64));   \
    if (which & 2) _mm_store_si128((__m128i *)CALC_ADDR(ks128, i), temp);   \
    else _mm_storel_epi64((__m128i*)CALC_ADDR(ks32, i+EFI(0)), temp);   \
    }
#endif

    if (keylen == 16)
    {
        // KL
        PREPARE_KS_ROUNDS;
        KS_ROUND_0(0);
        KS_ROUND(4, 15, 3);
        KS_ROUND(10, 45, 3);
        KS_ROUND(12, 60, 2);
        KS_ROUND(16, 77, 3);
        KS_ROUND(18, 94, 3);
        KS_ROUND(22, 111, 3);

        // KA
        k0=kl0, k1=kl1, k2=kl2, k3=kl3;
        DOUBLE_ROUND(k0, k1, k2, k3, 0xA09E667Ful, 0x3BCC908Bul, 0xB67AE858ul, 0x4CAA73B2ul);
        k0^=kl0, k1^=kl1, k2^=kl2, k3^=kl3;
        DOUBLE_ROUND(k0, k1, k2, k3, 0xC6EF372Ful, 0xE94F82BEul, 0x54FF53A5ul, 0xF1D36F1Cul);
        PREPARE_KS_ROUNDS;
        KS_ROUND_0(2);
        KS_ROUND(6, 15, 3);
        KS_ROUND(8, 30, 3);
        KS_ROUND(12, 45, 1);
        KS_ROUND(14, 60, 3);
        KS_ROUND(20, 94, 3);
        KS_ROUND(24, 47, 3);
    }
    else
    {
        // KL
        PREPARE_KS_ROUNDS;
        KS_ROUND_0(0);
        KS_ROUND(12, 45, 3);
        KS_ROUND(16, 60, 3);
        KS_ROUND(22, 77, 3);
        KS_ROUND(30, 111, 3);

        // KR
        word32 kr0, kr1, kr2, kr3;
        GetBlock<word32, BigEndian>(key+16)(kr0)(kr1);
        if (keylen == 24)
            kr2 = ~kr0, kr3 = ~kr1;
        else
            GetBlock<word32, BigEndian>(key+24)(kr2)(kr3);
        k0=kr0, k1=kr1, k2=kr2, k3=kr3;
        PREPARE_KS_ROUNDS;
        KS_ROUND(4, 15, 3);
        KS_ROUND(8, 30, 3);
        KS_ROUND(18, 60, 3);
        KS_ROUND(26, 94, 3);

        // KA
        k0^=kl0, k1^=kl1, k2^=kl2, k3^=kl3;
        DOUBLE_ROUND(k0, k1, k2, k3, 0xA09E667Ful, 0x3BCC908Bul, 0xB67AE858ul, 0x4CAA73B2ul);
        k0^=kl0, k1^=kl1, k2^=kl2, k3^=kl3;
        DOUBLE_ROUND(k0, k1, k2, k3, 0xC6EF372Ful, 0xE94F82BEul, 0x54FF53A5ul, 0xF1D36F1Cul);
        PREPARE_KS_ROUNDS;
        KS_ROUND(6, 15, 3);
        KS_ROUND(14, 45, 3);
        KS_ROUND(24, 77, 3);
        KS_ROUND(28, 94, 3);

        // KB
        k0^=kr0, k1^=kr1, k2^=kr2, k3^=kr3;
        DOUBLE_ROUND(k0, k1, k2, k3, 0x10E527FAul, 0xDE682D1Dul, 0xB05688C2ul, 0xB3E6C1FDul);
        PREPARE_KS_ROUNDS;
        KS_ROUND_0(2);
        KS_ROUND(10, 30, 3);
        KS_ROUND(20, 60, 3);
        KS_ROUND(32, 47, 3);
    }
}
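// A minimal usage sketch, not part of the original source: it assumes the
// surrounding Crypto++ library, where Camellia::Encryption exposes SetKey()
// and ProcessBlock() from the BlockCipher interface. The function name and
// the all-zero key/block contents are placeholders for illustration only.
#include <string.h>
#include "camellia.h"
#include "secblock.h"

void camellia_encrypt_one_block_example()
{
    using namespace CryptoPP;

    SecByteBlock key(Camellia::DEFAULT_KEYLENGTH);   // 16-byte key
    memset(key, 0, key.size());                      // all-zero key, illustration only

    byte block[Camellia::BLOCKSIZE] = {0};           // one 16-byte block, encrypted in place

    Camellia::Encryption enc;
    enc.SetKey(key, key.size());                     // drives UncheckedSetKey() above
    enc.ProcessBlock(block);                         // drives ProcessAndXorBlock() above
}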
uint64_t siphash24(const void *src, unsigned long src_sz, const struct sipkey *key) {
    uint64_t k0 = key->k0;
    uint64_t k1 = key->k1;
    uint64_t b = (uint64_t)src_sz << 56;
#ifdef UNALIGNED_OK
    const uint64_t *in = (uint64_t*)src;
#else
    /* On platforms where alignment matters, if 'in' is a pointer to a
     * datatype that must be aligned, the compiler is allowed to
     * generate code that assumes that it is aligned as such. */
    const uint8_t *in = (uint8_t *)src;
#endif
    uint64_t t;
    uint8_t *pt, *m;

    uint64_t v0 = k0 ^ 0x736f6d6570736575ULL;
    uint64_t v1 = k1 ^ 0x646f72616e646f6dULL;
    uint64_t v2 = k0 ^ 0x6c7967656e657261ULL;
    uint64_t v3 = k1 ^ 0x7465646279746573ULL;

    while (src_sz >= 8) {
#ifdef UNALIGNED_OK
        uint64_t mi = _le64toh(*in);
        in += 1;
#else
        uint64_t mi;
        memcpy(&mi, in, 8);
        mi = _le64toh(mi);
        in += 8;
#endif
        src_sz -= 8;
        v3 ^= mi;
        DOUBLE_ROUND(v0,v1,v2,v3);
        v0 ^= mi;
    }

    t = 0;
    pt = (uint8_t*)&t;
    m = (uint8_t*)in;
    switch (src_sz) {
    case 7: pt[6] = m[6];
    case 6: pt[5] = m[5];
    case 5: pt[4] = m[4];
#ifdef UNALIGNED_OK
    case 4: *((uint32_t*)&pt[0]) = *((uint32_t*)&m[0]); break;
#else
    case 4: pt[3] = m[3];
#endif
    case 3: pt[2] = m[2];
    case 2: pt[1] = m[1];
    case 1: pt[0] = m[0];
    }
    b |= _le64toh(t);

    v3 ^= b;
    DOUBLE_ROUND(v0,v1,v2,v3);
    v0 ^= b;
    v2 ^= 0xff;
    DOUBLE_ROUND(v0,v1,v2,v3);
    DOUBLE_ROUND(v0,v1,v2,v3);
    return (v0 ^ v1) ^ (v2 ^ v3);
}
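/* A minimal usage sketch, not part of the original source: it assumes it is
 * compiled together with the definitions above (struct sipkey with plain
 * k0/k1 members, _le64toh, DOUBLE_ROUND). The key halves and message are
 * arbitrary illustration values, not a published test vector. */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
    struct sipkey key;
    key.k0 = 0x0706050403020100ULL;   /* arbitrary key halves, illustration only */
    key.k1 = 0x0f0e0d0c0b0a0908ULL;

    const char msg[] = "hello world";
    uint64_t tag = siphash24(msg, strlen(msg), &key);
    printf("siphash24 = %016llx\n", (unsigned long long)tag);
    return 0;
}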
extern_c void crandom_chacha_expand(u_int64_t iv, u_int64_t ctr, int nr, int output_size,
                                    const unsigned char *key_, unsigned char *output_)
{
# if MIGHT_HAVE_SSE2
    if (HAVE(SSE2)) {
        ssereg *key = (ssereg *)key_;
        ssereg *output = (ssereg *)output_;
        ssereg a1 = key[0], a2 = a1, aa = a1,
               b1 = key[1], b2 = b1, bb = b1,
               c1 = {iv, ctr}, c2 = {iv, ctr+1}, cc = c1,
               d1 = {0x3320646e61707865ull, 0x6b20657479622d32ull}, d2 = d1, dd = d1,
               p = {0, 1};
        int i, r;
# if (NEED_XOP)
        if (HAVE(XOP)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_xop);
                OUTPUT_FUNCTION;
            }
            return;
        }
# endif
# if (NEED_SSSE3)
        if (HAVE(SSSE3)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_ssse3);
                OUTPUT_FUNCTION;
            }
            return;
        }
# endif
# if (NEED_SSE2)
        if (HAVE(SSE2)) {
            for (i=0; i<output_size; i+=128) {
                for (r=nr; r>0; r-=2)
                    DOUBLE_ROUND(quarter_round_sse2);
                OUTPUT_FUNCTION;
            }
            return;
        }
# endif
    }
# endif

# if NEED_CONV
    {
        const u_int32_t *key = (const u_int32_t *)key_;
        u_int32_t x[16],
            input[16] = {
                key[0], key[1], key[2], key[3],
                key[4], key[5], key[6], key[7],
                iv, iv>>32, ctr, ctr>>32,
                0x61707865, 0x3320646e, 0x79622d32, 0x6b206574
            },
            *output = (u_int32_t *)output_;
        int i, r;

        for (i=0; i<output_size; i+=64) {
            for (r=0; r<16; r++) {
                x[r] = input[r];
            }
            for (r=nr; r>0; r-=2) {
                quarter_round(&x[0], &x[4], &x[8],  &x[12]);
                quarter_round(&x[1], &x[5], &x[9],  &x[13]);
                quarter_round(&x[2], &x[6], &x[10], &x[14]);
                quarter_round(&x[3], &x[7], &x[11], &x[15]);
                quarter_round(&x[0], &x[5], &x[10], &x[15]);
                quarter_round(&x[1], &x[6], &x[11], &x[12]);
                quarter_round(&x[2], &x[7], &x[8],  &x[13]);
                quarter_round(&x[3], &x[4], &x[9],  &x[14]);
            }
            for (r=0; r<16; r++) {
                output[r] = x[r] + input[r];
            }
            output += 16;
            input[11]++;
            if (!input[11]) input[12]++;
        }
    }
# endif
}
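/* A hedged usage sketch, not from the original source: it assumes a 32-byte
 * key (eight 32-bit words, as consumed by the portable path above), an output
 * length that is a multiple of the 128-byte stride used by the vector paths,
 * and that nr is the ChaCha round count (20 for the standard ChaCha20
 * schedule). The key buffer is aligned to 16 bytes because the SSE paths read
 * it through vector registers; the function name here is hypothetical. */
#include <string.h>

void chacha_expand_example(unsigned char out[256])
{
    alignas(16) unsigned char key[32];
    memset(key, 0, sizeof key);           /* all-zero key, illustration only */

    /* iv = 0, counter starts at 0, 20 rounds, 256 bytes of keystream */
    crandom_chacha_expand(0, 0, 20, 256, key, out);
}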