unsigned long
tdb_hash_calc(const char *data, size_t len)
{
#define MUL sizeof(long)
    int i;
    unsigned long crc0 = 0, crc1 = 0, h;
    unsigned long *d = (unsigned long *)data;
    size_t n = (len / MUL) & ~1UL;

    for (i = 0; i < n; i += 2) {
        /* See linux/arch/x86/crypto/crc32c-intel.c for CRC32C. */
        crc0 = _mm_crc32_u64(crc0, d[i]);
        crc1 = _mm_crc32_u64(crc1, d[i + 1]);
    }
    if (n * MUL + MUL <= len) {
        crc0 = _mm_crc32_u64(crc0, d[n]);
        n++;
    }

    h = (crc1 << 32) | crc0;

    /*
     * Generate relatively small and dense hash tail values - they are good
     * for short strings in htrie which uses less significant bits at root,
     * however collisions are very probable.
     */
    n *= MUL;
    switch (len - n) {
    case 7:
        h += data[n] * n;
        ++n;
    case 6:
        h += data[n] * n;
        ++n;
    case 5:
        h += data[n] * n;
        ++n;
    case 4:
        h += data[n] * n;
        ++n;
    case 3:
        h += data[n] * n;
        ++n;
    case 2:
        h += data[n] * n;
        ++n;
    case 1:
        h += data[n] * n;
    }

    return h;
#undef MUL
}
int memkind_thread_get_arena(struct memkind *kind, unsigned int *arena, size_t size)
{
    int err = 0;
    unsigned int *arena_tsd;

    arena_tsd = pthread_getspecific(kind->arena_key);

    if (arena_tsd == NULL) {
        arena_tsd = jemk_malloc(sizeof(unsigned int));
        if (arena_tsd == NULL) {
            err = MEMKIND_ERROR_MALLOC;
        }
        if (!err) {
            *arena_tsd = _mm_crc32_u64(0, (uint64_t)pthread_self()) %
                         kind->arena_map_len;
            err = pthread_setspecific(kind->arena_key, arena_tsd) ?
                  MEMKIND_ERROR_PTHREAD : 0;
        }
    }
    if (!err) {
        *arena = kind->arena_map[*arena_tsd];
        if (*arena == UINT_MAX) {
            err = MEMKIND_ERROR_MALLCTL;
        }
    }
    return err;
}
SIMD_INLINE void Crc32c(size_t & crc, const size_t * p, const size_t * end)
{
    while (p < end)
    {
#ifdef SIMD_X64_ENABLE
        crc = _mm_crc32_u64(crc, *p++);
#else
        crc = _mm_crc32_u32(crc, *p++);
#endif
    }
}
size_t operator() (StringRef x) const
{
    const char * pos = x.data;
    size_t size = x.size;

    if (size == 0)
        return 0;

    if (size < 16)
    {
        return hashLessThan16(x.data, x.size);
    }

    const char * end = pos + size;
    const char * end_16 = pos + size / 16 * 16;

    size_t res0 = -1ULL;
    size_t res1 = -1ULL;

    do
    {
        UInt64 word0 = reinterpret_cast<const UInt64 *>(pos)[0];
        UInt64 word1 = reinterpret_cast<const UInt64 *>(pos)[1];
        res0 = _mm_crc32_u64(res0, word0);
        res1 = _mm_crc32_u64(res1, word1);
        pos += 16;
    } while (pos < end_16);

    UInt64 word0 = *reinterpret_cast<const UInt64 *>(end - 8);
    UInt64 word1 = *reinterpret_cast<const UInt64 *>(end - 16);

/*  return HashLen16(Rotate(word0 - word1, 43) + Rotate(res0, 30) + res1,
        word0 + Rotate(word1 ^ k3, 20) - res0 + size); */

    res0 = _mm_crc32_u64(res0, word0);
    res1 = _mm_crc32_u64(res1, word1);

    return hashLen16(res0, res1);
}
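The tail handling above is worth a note: instead of finishing with a byte-at-a-time loop, it re-reads the final 16 bytes through end - 8 and end - 16, so some bytes may be folded in twice. That is acceptable for a hash, though it would not produce a true CRC. A minimal sketch of the same overlapping-tail pattern, with an invented name and assuming size >= 8:

#include <nmmintrin.h>
#include <stdint.h>
#include <string.h>

static uint64_t hash_with_overlapping_tail(const char *data, size_t size) /* size >= 8 assumed */
{
    uint64_t crc = ~0ULL;
    const char *end = data + size;
    const char *p = data;

    /* fold all full 8-byte words */
    for (; p + 8 <= end; p += 8) {
        uint64_t w;
        memcpy(&w, p, 8);
        crc = _mm_crc32_u64(crc, w);
    }

    /* re-read the last 8 bytes so no byte-wise tail loop is needed; the
     * overlap means a few bytes may be hashed twice, which is fine for a
     * hash but would invalidate a real CRC */
    uint64_t tail;
    memcpy(&tail, end - 8, 8);
    return _mm_crc32_u64(crc, tail);
}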
int memkind_thread_get_arena(struct memkind *kind, unsigned int *arena, size_t size)
{
    static __thread unsigned int MEMKIND_TLS_MODEL arena_tls = UINT_MAX;
    int err = 0;

    if (arena_tls == UINT_MAX) {
        arena_tls = _mm_crc32_u64(0, (uint64_t)pthread_self());
    }
    if (kind->arena_map != NULL) {
        *arena = kind->arena_map[arena_tls % kind->arena_map_len];
    }
    else {
        err = MEMKIND_ERROR_RUNTIME;
    }
    return err;
}
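The same thread-to-arena idea can be exercised in isolation: hash the thread ID once with CRC32C, cache the bucket in thread-local storage, and reduce modulo the arena count. The following is a sketch, not memkind code; NUM_ARENAS and pick_arena() are invented names, and the cast of pthread_self() assumes an integer-like pthread_t, as the snippet above does. Compile with -msse4.2 -pthread.

#include <nmmintrin.h>
#include <pthread.h>
#include <stdint.h>
#include <limits.h>
#include <stdio.h>

#define NUM_ARENAS 8 /* hypothetical arena count */

static unsigned int pick_arena(void)
{
    /* UINT_MAX serves as the "not yet computed" sentinel, as above */
    static __thread unsigned int arena_tls = UINT_MAX;

    if (arena_tls == UINT_MAX)
        arena_tls = (unsigned int)_mm_crc32_u64(0, (uint64_t)pthread_self());
    return arena_tls % NUM_ARENAS;
}

int main(void)
{
    printf("this thread maps to arena %u\n", pick_arena());
    return 0;
}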
pg_crc32c
pg_comp_crc32c_sse42(pg_crc32c crc, const void *data, size_t len)
{
    const unsigned char *p = static_cast<const unsigned char *>(data);
    const unsigned char *pend = p + len;

    /*
     * Process eight bytes of data at a time.
     *
     * NB: We do unaligned accesses here. The Intel architecture allows that,
     * and performance testing didn't show any performance gain from aligning
     * the begin address.
     */
#ifdef __x86_64__
    while (p + 8 <= pend)
    {
        crc = (uint32) _mm_crc32_u64(crc, *((const uint64 *) p));
        p += 8;
    }

    /* Process remaining full four bytes if any */
    if (p + 4 <= pend)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
    }
#else
    /*
     * Process four bytes at a time. (The eight byte instruction is not
     * available on the 32-bit x86 architecture).
     */
    while (p + 4 <= pend)
    {
        crc = _mm_crc32_u32(crc, *((const unsigned int *) p));
        p += 4;
    }
#endif /* __x86_64__ */

    /* Process any remaining bytes one at a time. */
    while (p < pend)
    {
        crc = _mm_crc32_u8(crc, *p);
        p++;
    }

    return crc;
}
// Hash a string of up to 24 characters
int computeHash(const char* from, size_t l)
{
    if (l <= 8) {
        return _mm_crc32_u64(0, ((uint64_t*)from)[0] << (64 - 8*l));
    } else if (l <= 16) {
        return _mm_crc32_u64(_mm_crc32_u64(0, ((uint64_t*)from)[0]),
                             (((uint64_t*)from)[1]) << (128 - 8*l));
    } else {
        return _mm_crc32_u64(
                  _mm_crc32_u64(
                     _mm_crc32_u64(0, ((uint64_t*)from)[0]),
                     ((uint64_t*)from)[1]),
                  (((uint64_t*)from)[2]) << (128 + 64 - 8*l));
    }
    assert(false && "Missing a case in hash.");
}
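The left shifts are what make the trailing bytes of the full-word loads harmless: on a little-endian machine, shifting the loaded word left by 64 - 8*l (or 128 - 8*l, and so on) moves the l valid low-order bytes to the top and zeroes whatever lay beyond the string, so the CRC never sees the garbage. The 8-byte loads do assume the buffer is readable a few bytes past the end of the string, which the surrounding project presumably guarantees. A small self-contained check of the masking effect (illustrative only, little-endian assumed):

#include <assert.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
    /* same first three bytes, different trailing "garbage" */
    const char buf1[8] = { 'a', 'b', 'c', 'X', 'Y', 'Z', 'Q', 'W' };
    const char buf2[8] = { 'a', 'b', 'c', 0, 0, 0, 0, 0 };
    uint64_t w1, w2;
    memcpy(&w1, buf1, 8);
    memcpy(&w2, buf2, 8);

    size_t l = 3; /* only "abc" is meaningful */
    /* after the shift the two words agree even though the tails differ */
    assert((w1 << (64 - 8 * l)) == (w2 << (64 - 8 * l)));
    return 0;
}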
void metrohash128crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    static const uint64_t k0 = 0xC83A91E1;
    static const uint64_t k1 = 0x8648DBDB;
    static const uint64_t k2 = 0x7BDEC03B;
    static const uint64_t k3 = 0x2F5870A5;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;

    uint64_t v[4];

    v[0] = ((static_cast<uint64_t>(seed) - k0) * k3) + len;
    v[1] = ((static_cast<uint64_t>(seed) + k1) * k2) + len;

    if (len >= 32)
    {
        v[2] = ((static_cast<uint64_t>(seed) + k0) * k2) + len;
        v[3] = ((static_cast<uint64_t>(seed) - k1) * k3) + len;

        do
        {
            v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8;
            v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8;
            v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8;
            v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8;
        }
        while (ptr <= (end - 32));

        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 34) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 37) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 34) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 37) * k0;
    }

    if ((end - ptr) >= 16)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0], 34) * k3;
        v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1], 34) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 30) * k1;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 30) * k0;
    }

    if ((end - ptr) >= 8)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0], 36) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 23) * k1;
    }

    if ((end - ptr) >= 4)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 19) * k0;
    }

    if ((end - ptr) >= 2)
    {
        v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 13) * k1;
    }

    if ((end - ptr) >= 1)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u8(ptr));
        v[1] ^= rotate_right((v[1] * k3) + v[0], 17) * k0;
    }

    v[0] += rotate_right((v[0] * k0) + v[1], 11);
    v[1] += rotate_right((v[1] * k1) + v[0], 26);
    v[0] += rotate_right((v[0] * k0) + v[1], 11);
    v[1] += rotate_right((v[1] * k1) + v[0], 26);

    memcpy(out, v, 16);
}
void metrohash128crc_2(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    static const uint64_t k0 = 0xEE783E2F;
    static const uint64_t k1 = 0xAD07C493;
    static const uint64_t k2 = 0x797A90BB;
    static const uint64_t k3 = 0x2E4B2E1B;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;

    uint64_t v[4];

    v[0] = ((static_cast<uint64_t>(seed) - k0) * k3) + len;
    v[1] = ((static_cast<uint64_t>(seed) + k1) * k2) + len;

    if (len >= 32)
    {
        v[2] = ((static_cast<uint64_t>(seed) + k0) * k2) + len;
        v[3] = ((static_cast<uint64_t>(seed) - k1) * k3) + len;

        do
        {
            v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8;
            v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8;
            v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8;
            v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8;
        }
        while (ptr <= (end - 32));

        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 12) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 19) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 12) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 19) * k0;
    }

    if ((end - ptr) >= 16)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0], 41) * k3;
        v[1] += read_u64(ptr) * k2; ptr += 8; v[1] = rotate_right(v[1], 41) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 10) * k1;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 10) * k0;
    }

    if ((end - ptr) >= 8)
    {
        v[0] += read_u64(ptr) * k2; ptr += 8; v[0] = rotate_right(v[0], 34) * k3;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 22) * k1;
    }

    if ((end - ptr) >= 4)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u32(ptr)); ptr += 4;
        v[1] ^= rotate_right((v[1] * k3) + v[0], 14) * k0;
    }

    if ((end - ptr) >= 2)
    {
        v[0] ^= _mm_crc32_u64(v[1], read_u16(ptr)); ptr += 2;
        v[0] ^= rotate_right((v[0] * k2) + v[1], 15) * k1;
    }

    if ((end - ptr) >= 1)
    {
        v[1] ^= _mm_crc32_u64(v[0], read_u8(ptr));
        v[1] ^= rotate_right((v[1] * k3) + v[0], 18) * k0;
    }

    v[0] += rotate_right((v[0] * k0) + v[1], 15);
    v[1] += rotate_right((v[1] * k1) + v[0], 27);
    v[0] += rotate_right((v[0] * k0) + v[1], 15);
    v[1] += rotate_right((v[1] * k1) + v[0], 27);

    memcpy(out, v, 16);
}
unsigned long long test_mm_crc32_u64(unsigned long long CRC, unsigned long long V) {
  // CHECK-LABEL: test_mm_crc32_u64
  // CHECK: call i64 @llvm.x86.sse42.crc32.64.64(i64 %{{.*}}, i64 %{{.*}})
  return _mm_crc32_u64(CRC, V);
}
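For reference, the intrinsic itself is declared in <nmmintrin.h>, requires SSE4.2 (the 64-bit form only exists on x86-64 targets), and accumulates CRC-32C into the low 32 bits of its first operand. The following is a minimal standalone sketch, assuming the CPU supports SSE4.2 and compiling with -msse4.2; crc32c_sse42() is an invented helper name, not a library function.

#include <nmmintrin.h>   /* _mm_crc32_u64 / _mm_crc32_u8 (SSE4.2) */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint32_t crc32c_sse42(uint32_t crc, const void *data, size_t len)
{
    const uint8_t *p = (const uint8_t *)data;
    uint64_t c = crc;

    /* fold 8 bytes at a time into the running CRC */
    for (; len >= 8; len -= 8, p += 8) {
        uint64_t w;
        memcpy(&w, p, 8);        /* unaligned-safe load */
        c = _mm_crc32_u64(c, w);
    }
    /* byte-wise tail */
    for (; len; --len, ++p)
        c = _mm_crc32_u8((uint32_t)c, *p);
    return (uint32_t)c;
}

int main(void)
{
    const char msg[] = "123456789";
    /* CRC-32C convention: start from ~0 and invert the result at the end */
    uint32_t crc = crc32c_sse42(0xFFFFFFFF, msg, strlen(msg)) ^ 0xFFFFFFFF;
    printf("%08x\n", (unsigned)crc);  /* 0xe3069283 is the CRC-32C check value */
    return 0;
}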
void metrohash64crc_1(const uint8_t * key, uint64_t len, uint32_t seed, uint8_t * out)
{
    static const uint64_t k0 = 0xC83A91E1;
    static const uint64_t k1 = 0x8648DBDB;
    static const uint64_t k2 = 0x7BDEC03B;
    static const uint64_t k3 = 0x2F5870A5;

    const uint8_t * ptr = reinterpret_cast<const uint8_t*>(key);
    const uint8_t * const end = ptr + len;

    uint64_t hash = ((static_cast<uint64_t>(seed) + k2) * k0) + len;

    if (len >= 32)
    {
        uint64_t v[4];
        v[0] = hash;
        v[1] = hash;
        v[2] = hash;
        v[3] = hash;

        do
        {
            v[0] ^= _mm_crc32_u64(v[0], read_u64(ptr)); ptr += 8;
            v[1] ^= _mm_crc32_u64(v[1], read_u64(ptr)); ptr += 8;
            v[2] ^= _mm_crc32_u64(v[2], read_u64(ptr)); ptr += 8;
            v[3] ^= _mm_crc32_u64(v[3], read_u64(ptr)); ptr += 8;
        }
        while (ptr <= (end - 32));

        v[2] ^= rotate_right(((v[0] + v[3]) * k0) + v[1], 33) * k1;
        v[3] ^= rotate_right(((v[1] + v[2]) * k1) + v[0], 33) * k0;
        v[0] ^= rotate_right(((v[0] + v[2]) * k0) + v[3], 33) * k1;
        v[1] ^= rotate_right(((v[1] + v[3]) * k1) + v[2], 33) * k0;

        hash += v[0] ^ v[1];
    }

    if ((end - ptr) >= 16)
    {
        uint64_t v0 = hash + (read_u64(ptr) * k0); ptr += 8; v0 = rotate_right(v0, 33) * k1;
        uint64_t v1 = hash + (read_u64(ptr) * k1); ptr += 8; v1 = rotate_right(v1, 33) * k2;
        v0 ^= rotate_right(v0 * k0, 35) + v1;
        v1 ^= rotate_right(v1 * k3, 35) + v0;
        hash += v1;
    }

    if ((end - ptr) >= 8)
    {
        hash += read_u64(ptr) * k3; ptr += 8;
        hash ^= rotate_right(hash, 33) * k1;
    }

    if ((end - ptr) >= 4)
    {
        hash ^= _mm_crc32_u64(hash, read_u32(ptr)); ptr += 4;
        hash ^= rotate_right(hash, 15) * k1;
    }

    if ((end - ptr) >= 2)
    {
        hash ^= _mm_crc32_u64(hash, read_u16(ptr)); ptr += 2;
        hash ^= rotate_right(hash, 13) * k1;
    }

    if ((end - ptr) >= 1)
    {
        hash ^= _mm_crc32_u64(hash, read_u8(ptr));
        hash ^= rotate_right(hash, 25) * k1;
    }

    hash ^= rotate_right(hash, 33);
    hash *= k0;
    hash ^= rotate_right(hash, 33);

    memcpy(out, &hash, 8);
}
static int ptr_hash(void *ptr, int table_len)
{
    return _mm_crc32_u64(0, (size_t)ptr) % table_len;
}
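A tiny usage sketch of this kind of pointer-to-bucket mapping; the table size and driver below are hypothetical, not taken from the original project.

#include <nmmintrin.h>
#include <stddef.h>
#include <stdio.h>

#define TABLE_LEN 64 /* hypothetical bucket count */

static int ptr_hash(void *ptr, int table_len)
{
    return _mm_crc32_u64(0, (size_t)ptr) % table_len;
}

int main(void)
{
    int a, b;
    /* two distinct addresses usually land in different buckets */
    printf("&a -> bucket %d, &b -> bucket %d\n",
           ptr_hash(&a, TABLE_LEN), ptr_hash(&b, TABLE_LEN));
    return 0;
}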
void libmaus::digest::CRC32C_sse42::update(uint8_t const * t, size_t l)
{
#if defined(LIBMAUS_HAVE_SMMINTRIN_H) && defined(LIBMAUS_USE_ASSEMBLY) && defined(LIBMAUS_HAVE_x86_64) && defined(LIBMAUS_HAVE_i386)
    ctx = ~ctx;

    size_t const offset = reinterpret_cast<size_t>(t);

    // check for 3 LSB
    if ( offset & 7 )
    {
        // check for LSB
        if ( (offset & 1) && l )
        {
            ctx = _mm_crc32_u8(ctx, *t);
            t += 1;
            l -= 1;
        }
        // check for 2nd LSB
        if ( (offset & 2) && (l >= 2) )
        {
            ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
            t += 2;
            l -= 2;
        }
        // check for 3rd LSB
        if ( (offset & 4) && l >= 4 )
        {
            ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
            t += 4;
            l -= 4;
        }
    }

    uint64_t const * t64 = reinterpret_cast<uint64_t const *>(t);
    uint64_t const * const t64e = t64 + (l >> 3);
    while ( t64 != t64e )
        ctx = _mm_crc32_u64(ctx, *(t64++));

    l &= 7;
    t = reinterpret_cast<uint8_t const *>(t64);

    if ( l >= 4 )
    {
        ctx = _mm_crc32_u32(ctx, *reinterpret_cast<uint32_t const *>(t));
        t += 4;
        l -= 4;
    }
    if ( l >= 2 )
    {
        ctx = _mm_crc32_u16(ctx, *reinterpret_cast<uint16_t const *>(t));
        t += 2;
        l -= 2;
    }
    if ( l )
    {
        ctx = _mm_crc32_u8(ctx, *t);
    }

    ctx = ~ctx;
#endif
}
uint32_t
sse42_crc32c(uint32_t crc, const unsigned char *buf, unsigned len)
{
#ifdef __amd64__
    const size_t align = 8;
#else
    const size_t align = 4;
#endif
    const unsigned char *next, *end;
#ifdef __amd64__
    uint64_t crc0, crc1, crc2;
#else
    uint32_t crc0, crc1, crc2;
#endif

    next = buf;
    crc0 = crc;

    /* Compute the crc to bring the data pointer to an aligned boundary. */
    while (len && ((uintptr_t)next & (align - 1)) != 0) {
        crc0 = _mm_crc32_u8(crc0, *next);
        next++;
        len--;
    }

#if LONG > SHORT
    /*
     * Compute the crc on sets of LONG*3 bytes, executing three independent
     * crc instructions, each on LONG bytes -- this is optimized for the
     * Nehalem, Westmere, Sandy Bridge, and Ivy Bridge architectures, which
     * have a throughput of one crc per cycle, but a latency of three
     * cycles.
     */
    crc = 0;
    while (len >= LONG * 3) {
        crc1 = 0;
        crc2 = 0;
        end = next + LONG;
        do {
#ifdef __amd64__
            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
            crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + LONG));
            crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + (LONG * 2)));
#else
            crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
            crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + LONG));
            crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + (LONG * 2)));
#endif
            next += align;
        } while (next < end);

        /*-
         * Update the crc. Try to do it in parallel with the inner loop.
         * 'crc' is used to accumulate crc0 and crc1 produced by the inner
         * loop so that the next iteration of the loop doesn't depend on
         * anything except crc2.
         *
         * The full expression for the update is:
         *     crc = S*S*S*crc + S*S*crc0 + S*crc1
         * where the terms are polynomials modulo the CRC polynomial.
         * We regroup this subtly as:
         *     crc = S*S * (S*crc + crc0) + S*crc1.
         * This has an extra dependency which reduces possible parallelism
         * for the expression, but it turns out to be best to intentionally
         * delay evaluation of this expression so that it competes less with
         * the inner loop.
         *
         * We also intentionally reduce parallelism by feeding back crc2 to
         * the inner loop as crc0 instead of accumulating it in crc. This
         * synchronizes the loop with the crc update. CPU and/or compiler
         * schedulers produced bad order without this.
         *
         * Shifts take about 12 cycles each, so 3 here with 2 parallelizable
         * take about 24 cycles and the crc update takes slightly longer.
         * 8 dependent crc32 instructions can run in 24 cycles, so the 3-way
         * blocking is worse than useless for sizes less than
         * 8 * <word size> = 64 on amd64. In practice, SHORT = 32 confirms
         * these timing calculations by giving a small improvement starting
         * at size 96. Then the inner loop takes about 12 cycles and the crc
         * update about 24, but these are partly in parallel so the total
         * time is less than the 36 cycles that 12 dependent crc32
         * instructions would take.
         *
         * To have a chance of completely hiding the overhead for the crc
         * update, the inner loop must take considerably longer than 24
         * cycles. LONG = 64 makes the inner loop take about 24 cycles, so
         * is not quite large enough. LONG = 128 works OK. Unhideable
         * overheads are about 12 cycles per inner loop. All assuming timing
         * like Haswell.
         */
        crc = crc32c_shift(crc32c_long, crc) ^ crc0;
        crc1 = crc32c_shift(crc32c_long, crc1);
        crc = crc32c_shift(crc32c_2long, crc) ^ crc1;
        crc0 = crc2;
        next += LONG * 2;
        len -= LONG * 3;
    }
    crc0 ^= crc;
#endif /* LONG > SHORT */

    /*
     * Do the same thing, but now on SHORT*3 blocks for the remaining data
     * less than a LONG*3 block.
     */
    crc = 0;
    while (len >= SHORT * 3) {
        crc1 = 0;
        crc2 = 0;
        end = next + SHORT;
        do {
#ifdef __amd64__
            crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
            crc1 = _mm_crc32_u64(crc1, *(const uint64_t *)(next + SHORT));
            crc2 = _mm_crc32_u64(crc2, *(const uint64_t *)(next + (SHORT * 2)));
#else
            crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
            crc1 = _mm_crc32_u32(crc1, *(const uint32_t *)(next + SHORT));
            crc2 = _mm_crc32_u32(crc2, *(const uint32_t *)(next + (SHORT * 2)));
#endif
            next += align;
        } while (next < end);
        crc = crc32c_shift(crc32c_short, crc) ^ crc0;
        crc1 = crc32c_shift(crc32c_short, crc1);
        crc = crc32c_shift(crc32c_2short, crc) ^ crc1;
        crc0 = crc2;
        next += SHORT * 2;
        len -= SHORT * 3;
    }
    crc0 ^= crc;

    /* Compute the crc on the remaining bytes at native word size. */
    end = next + (len - (len & (align - 1)));
    while (next < end) {
#ifdef __amd64__
        crc0 = _mm_crc32_u64(crc0, *(const uint64_t *)next);
#else
        crc0 = _mm_crc32_u32(crc0, *(const uint32_t *)next);
#endif
        next += align;
    }
    len &= (align - 1);

    /* Compute the crc for any trailing bytes. */
    while (len) {
        crc0 = _mm_crc32_u8(crc0, *next);
        next++;
        len--;
    }

    return ((uint32_t)crc0);
}